In [None]:
from matplotlib import pyplot as plt  # type: ignore
from matplotlib import cm, colors  # type: ignore
from dotenv import load_dotenv
import geopandas as gpd  # type: ignore
import sqlalchemy as sq
import pandas as pd  # type: ignore
import numpy as np
import os, sys

sys.path.append("../")
from Shared.DataService import DataService

In [None]:
# Read the PostgreSQL connection settings from the process environment.
# load_dotenv() runs first so values from a local .env file are available too.
load_dotenv()
PG_DB, PG_ADDR, PG_PORT, PG_USER, PG_PW = (
    os.environ.get(setting_name)
    for setting_name in (
        "POSTGRES_DB",
        "POSTGRES_ADDR",
        "POSTGRES_PORT",
        "POSTGRES_USER",
        "POSTGRES_PW",
    )
)

In [None]:
# Fail fast when any of the five connection settings is missing, instead of
# letting the connection attempt fail later with a less obvious error.
if (
    PG_DB is None
    or PG_ADDR is None
    or PG_PORT is None
    or PG_USER is None
    or PG_PW is None
):
    raise ValueError("Environment variables not set")

# connecting to database
# int(PG_PORT) raises ValueError when the port setting is not a number.
# NOTE(review): DataService is project code; `conn` is presumably a
# SQLAlchemy-compatible connection since it is handed to pd.read_sql — confirm.
db = DataService(PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW)
conn = db.connect()

## Visualization before aggregation

In [None]:
# Load every row and column of the raw ergot sample table into a DataFrame.
query = sq.text("SELECT * FROM public.ergot_sample")
ergot_df = pd.read_sql(query, conn)

In [None]:
# Display the raw samples as loaded, before any column is added or dropped.
ergot_df

In [None]:
# Derive a single `district` code from each province's own `crop_district`:
#   MB -> crop_district + 4600
#   SK -> (crop_district - 1) + 4700
#   AB -> (crop_district * 10) + 4800
# NOTE(review): these presumably reproduce the `district` codes used by
# public.census_ag_regions — confirm against that table.
is_mb = ergot_df["province"] == "MB"
is_sk = ergot_df["province"] == "SK"
is_ab = ergot_df["province"] == "AB"

ergot_df.loc[is_mb, "district"] = ergot_df.loc[is_mb, "crop_district"] + 4600
ergot_df.loc[is_sk, "district"] = (ergot_df.loc[is_sk, "crop_district"] - 1) + 4700
ergot_df.loc[is_ab, "district"] = (ergot_df.loc[is_ab, "crop_district"] * 10) + 4800

In [None]:
# `crop_district` has been folded into `district`, and `sample_id` is not used
# by the analysis below, so both columns are removed.
ergot_df = ergot_df.drop(columns=["crop_district", "sample_id"])

In [None]:
# Convert `district` to a numeric dtype, asking pandas for the smallest integer
# dtype that can represent the values.
# NOTE(review): rows from provinces other than MB/SK/AB would have no district;
# in that case the column presumably stays floating point — confirm.
ergot_df["district"] = pd.to_numeric(ergot_df["district"], downcast="integer")
ergot_df

In [None]:
# Number of positive samples (incidence == True) per province and year.
# `== True` is kept deliberately: it also treats missing values as "not positive".
samples_df = (
    ergot_df[ergot_df["incidence"] == True]
    .groupby(["province", "year"])["incidence"]
    .count()
    .reset_index()
)
mb_df = samples_df[samples_df["province"] == "MB"]
ab_df = samples_df[samples_df["province"] == "AB"]
sk_df = samples_df[samples_df["province"] == "SK"]

# These names are kept because later cells read `year`.
year = mb_df["year"].tolist()
mb_incidence = mb_df["incidence"].tolist()
ab_incidence = ab_df["incidence"].tolist()
sk_incidence = sk_df["incidence"].tolist()

# Each province is plotted against its OWN years. Reusing Manitoba's year list
# for all three lines fails (or silently shifts points) whenever a province has
# no positive sample in some year and therefore has fewer rows.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_xlabel("Year")
ax.set_ylabel("Incidence")
for label, province_df, line_color in (
    ("Manitoba", mb_df, "blue"),
    ("Alberta", ab_df, "green"),
    ("Saskatchewan", sk_df, "red"),
):
    ax.plot(
        province_df["year"], province_df["incidence"], color=line_color, label=label
    )
ax.legend()
plt.show()

In [None]:
# Share of positive samples among all samples, per province and year.
# The positive counts (samples_df) only contain groups with at least one
# positive sample, so the two tables are joined on their keys rather than
# divided row-by-row by position; groups without positives get a ratio of 0.
ratio_df = (
    ergot_df.groupby(["province", "year"])["incidence"]
    .count()
    .reset_index(name="total")
    .merge(
        samples_df.rename(columns={"incidence": "positive"}),
        on=["province", "year"],
        how="left",
    )
)
ratio_df["ratio"] = ratio_df["positive"].fillna(0) / ratio_df["total"]
ratio_df = ratio_df.drop(columns=["positive", "total"])
ratio_df

In [None]:
# Min, max for each province
# The plain lists are kept for backward compatibility with the original cell.
mb_ratio = ratio_df[ratio_df["province"] == "MB"]["ratio"].tolist()
ab_ratio = ratio_df[ratio_df["province"] == "AB"]["ratio"].tolist()
sk_ratio = ratio_df[ratio_df["province"] == "SK"]["ratio"].tolist()

province_styles = (
    ("MB", "Manitoba", "blue"),
    ("AB", "Alberta", "green"),
    ("SK", "Saskatchewan", "red"),
)

# Plot each province against the years actually present in its own rows, so
# the x and y values always have matching lengths.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_xlabel("Year")
ax.set_ylabel("Incidence ratio")  # the y values are ratios, not raw counts
for code, label, line_color in province_styles:
    province_rows = ratio_df[ratio_df["province"] == code]
    ax.plot(
        province_rows["year"], province_rows["ratio"], color=line_color, label=label
    )
ax.legend()
plt.show()

# Report the extreme ratios together with the year stored on the same row.
# Deriving the year as "list position + 1995" is only right when every province
# has one row for every consecutive year starting in 1995.
for description, locate in (("Highest", "idxmax"), ("Lowest", "idxmin")):
    for code, _, _ in province_styles:
        province_rows = ratio_df[ratio_df["province"] == code]
        extreme_row = province_rows.loc[getattr(province_rows["ratio"], locate)()]
        print(
            "{} ratio in {}: {}, in year: {}".format(
                description,
                code,
                float(extreme_row["ratio"]),
                int(extreme_row["year"]),
            )
        )

In [None]:
# All samples per (year, district).
total_df = ergot_df.groupby(["year", "district"])["incidence"].count().reset_index()

# Positive samples per (province, year, district).
region_df = (
    ergot_df[ergot_df["incidence"] == True]
    .groupby(["province", "year", "district"])["incidence"]
    .count()
    .reset_index()
)

# The two tables are grouped by different keys and therefore have different
# row orders and lengths. Match the totals to the positives by (year, district)
# instead of dividing by positional index, which pairs unrelated rows.
region_df = region_df.merge(
    total_df.rename(columns={"incidence": "total"}),
    on=["year", "district"],
    how="left",
)
region_df["ratio"] = (region_df["incidence"] / region_df["total"]) * 100
region_df = region_df.drop(columns=["total"])
region_df

In [None]:
# Load the agricultural region polygons as a GeoDataFrame, declaring the
# geometry column and its coordinate reference system (EPSG:3347).
# The `color` column is selected but not read by the cells in this notebook.
regionQuery = sq.text("select district, color, geometry FROM public.census_ag_regions")
agRegions = gpd.GeoDataFrame.from_postgis(
    regionQuery, conn, crs="EPSG:3347", geom_col="geometry"
)

In [None]:
def color_map_color(
    value: float, cmap_name="Wistia", vmin: float = 0, vmax: float = 100
) -> str:
    """Translate a number into the hex colour of a Matplotlib colormap.

    Args:
        value: Number to colour; its absolute value is used.
        cmap_name: Name of a registered Matplotlib colormap.
        vmin: Value mapped to the start of the colormap.
        vmax: Value mapped to the end of the colormap.

    Returns:
        The colour as a ``#rrggbb`` hex string (alpha is discarded).
    """
    norm = colors.Normalize(vmin=vmin, vmax=vmax)
    # matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in
    # 3.9; pyplot.get_cmap performs the same lookup and is still supported.
    cmap = plt.get_cmap(cmap_name)
    rgb = cmap(norm(abs(value)))[:3]
    return colors.rgb2hex(rgb)

In [None]:
def get_color(ratio_year: pd.DataFrame) -> pd.Series:
    """Build one fill colour per row of the module-level ``agRegions`` frame.

    Args:
        ratio_year: Frame with ``district`` and ``ratio`` columns.

    Returns:
        Series of hex colours in ``agRegions`` row order. A district missing
        from ``ratio_year`` gets the colour for a ratio of 0.
    """
    # Build the district -> ratio lookup once, rather than converting the
    # column to a list and filtering the frame again for every region.
    ratio_by_district: dict = {}
    for district, ratio in zip(
        ratio_year["district"].tolist(), ratio_year["ratio"].tolist()
    ):
        # setdefault keeps the first ratio seen for a district.
        ratio_by_district.setdefault(district, ratio)

    return pd.Series(
        [
            color_map_color(ratio_by_district.get(district, 0))
            for district in agRegions["district"].tolist()
        ]
    )

In [None]:
def plot_map(color_map: pd.Series, year: int):
    """Draw the module-level ``agRegions`` polygons with the given fill colours.

    Args:
        color_map: Fill colours passed to ``agRegions.plot``.
        year: Year shown in the figure title.

    The axes are limited to the total bounds of ``agRegions`` and every region
    is labelled with its ``district`` value at the geometry's centroid.
    """
    fig, ax = plt.subplots(figsize=(20, 20))

    x_min, y_min, x_max, y_max = agRegions.total_bounds
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_title("Incident level for district in " + str(year))

    agRegions.plot(ax=ax, color=color_map, edgecolor="black")

    # Label each region at its centroid.
    for _, region in agRegions.iterrows():
        ax.annotate(
            text=region["district"],
            xy=region.geometry.centroid.coords[0],
            ha="center",
            color="black",
            size=10,
        )

    plt.show()

In [None]:
# Draw one district map per year from 1995 through 2022.
for currYear in range(1995, 2023):
    # Filter on the loop variable. `year` is a list of years created by an
    # earlier cell; comparing the column with it does not select a single year.
    ratio_year = region_df.loc[region_df["year"] == currYear]
    color = get_color(ratio_year)
    plot_map(color, currYear)

## Visualization for aggregated ergot

In [None]:
# Load the aggregated ergot table in full and display it.
query = sq.text("SELECT * FROM public.agg_ergot_samples")
agg_ergot_df = pd.read_sql(query, conn)
agg_ergot_df

In [None]:
# List the distinct values found in the `severity` column.
agg_ergot_df["severity"].unique()

In [None]:
def stats(data: list):
    """Compute the quartiles and the 1.5 * IQR outliers of a numeric sample.

    Args:
        data: One-dimensional sequence of numbers (list or NumPy array).

    Returns:
        Tuple ``(q1, q2, q3, outliers)``: the 25th, 50th and 75th percentiles
        and a NumPy array with the values lying below ``q1 - 1.5 * iqr`` or
        above ``q3 + 1.5 * iqr``.

    The interquartile range is also printed.
    """
    # A plain list supports neither element-wise comparison nor boolean-mask
    # indexing, so work on an array view of the input.
    values = np.asarray(data)

    q1, q2, q3 = np.percentile(values, [25, 50, 75])
    iqr = q3 - q1
    print("iqr: ", iqr)
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Keep only the values outside the two fences.
    outliers = values[(values < lower_bound) | (values > upper_bound)]
    return q1, q2, q3, outliers

In [None]:
# Quartiles and outliers of the per-row severity values.
data = np.array(agg_ergot_df["severity"])
# Pass the array itself: stats() compares and filters element-wise, which a
# plain Python list (the result of .tolist()) does not support.
q1, q2, q3, outliers = stats(data)
print("number of outliers:", len(outliers))
print("q1, q2, q3: {}, {}, {}".format(q1, q2, q3))
print(outliers)

In [None]:
# Severity values with every value reported as an outlier filtered out.
severity_only = pd.DataFrame(data, columns=["severity"])
is_outlier = severity_only["severity"].isin(outliers)
wo_outliers = severity_only[~is_outlier]
wo_outliers

In [None]:
# Box plot of the outlier-free severity values; the extra keyword arguments
# (showfliers, autorange) are forwarded to Matplotlib's boxplot.
wo_outliers.boxplot(column=["severity"], showfliers=False, autorange=True)

In [None]:
# Ergot presence information by district: the current-year flag, the flags for
# the previous three years and the neighbour flag, with duplicate rows removed.
presence_columns = [
    "year",
    "has_ergot",
    "district",
    "present_prev1",
    "present_prev2",
    "present_prev3",
    "present_in_neighbor",
]
present_df = agg_ergot_df.loc[:, presence_columns].drop_duplicates()
present_df

In [None]:
# For each presence flag, compute the share of ALL rows of present_df in which
# both `has_ergot` and that flag are True.
# NOTE(review): the printed labels read like conditional rates ("when prev year
# had ergot"), but the denominator is every row, so these are joint shares.
# They are also fractions in [0, 1], not percentages — confirm the intent.
row_count = len(present_df)
ergot_now = present_df["has_ergot"] == True

# previous year had ergot and this year has ergot
prev1_too = ergot_now & (present_df["present_prev1"] == True)
percent1 = len(present_df[prev1_too]) / row_count
print("Percent of having ergot when prev year had ergot: ", percent1)

# flag for two years back is set and this year has ergot
prev2_too = ergot_now & (present_df["present_prev2"] == True)
percent2 = len(present_df[prev2_too]) / row_count
print("Percent of having ergot when prev 2 year had ergot: ", percent2)

# flag for three years back is set and this year has ergot
prev3_too = ergot_now & (present_df["present_prev3"] == True)
percent3 = len(present_df[prev3_too]) / row_count
print("Percent of having ergot when prev 3 year had ergot: ", percent3)

# neighbour flag is set and this year has ergot
neighbor_too = ergot_now & (present_df["present_in_neighbor"] == True)
percent4 = len(present_df[neighbor_too]) / row_count
print("Percent of having ergot when neighbor is having ergot: ", percent4)

In [None]:
# Severity information by district: the current summed severity plus the
# values recorded for the previous three years, with duplicate rows removed.
severity_columns = [
    "year",
    "district",
    "has_ergot",
    "sum_severity",
    "severity_prev1",
    "severity_prev2",
    "severity_prev3",
]
severity_df = agg_ergot_df.loc[:, severity_columns].drop_duplicates()
severity_df

In [None]:
# Consistency check: rows flagged as having no ergot whose summed severity is
# nevertheless positive.
# NOTE(review): an empty result is presumably the expected outcome — confirm.
severity_df[(~severity_df["has_ergot"]) & (severity_df["sum_severity"] > 0)]

In [None]:
# Quartiles and outliers of the summed severity per district and year.
data = np.array(severity_df["sum_severity"])
# Pass the array itself: stats() compares and filters element-wise, which a
# plain Python list (the result of .tolist()) does not support.
q1, q2, q3, outliers = stats(data)
print("number of outliers:", len(outliers))
print("q1, q2, q3: {}, {}, {}".format(q1, q2, q3))
print(outliers)

In [None]:
# Optional: uncomment the next line to change the default figure size and DPI.
# plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':100})

# Box plot of the summed severity, with the flier points hidden.
severity_df.plot.box(title="Severity", column=["sum_severity"], showfliers=False)