In [None]:
import matplotlib.pyplot as plt  # type: ignore
from dotenv import load_dotenv
import sqlalchemy as sq  # type: ignore
import seaborn as sns  # type: ignore
import pandas as pd  # type: ignore
import numpy as np
import os, sys

sys.path.append("../")
from Shared.DataService import DataService

In [None]:
# Pull the Postgres connection settings from the environment (after loading
# any .env file).  A setting that is not defined comes back as None and is
# rejected by the validation cell that follows.
load_dotenv()
PG_DB, PG_ADDR, PG_PORT, PG_USER, PG_PW = (
    os.getenv(env_name)
    for env_name in (
        "POSTGRES_DB",
        "POSTGRES_ADDR",
        "POSTGRES_PORT",
        "POSTGRES_USER",
        "POSTGRES_PW",
    )
)

In [None]:
# Refuse to continue unless every connection setting was found.
if not (
    PG_DB is not None
    and PG_ADDR is not None
    and PG_PORT is not None
    and PG_USER is not None
    and PG_PW is not None
):
    raise ValueError("Environment variables not set")

# Open the connection that the query cells below read from.
db = DataService(
    PG_DB,
    PG_ADDR,
    int(PG_PORT),  # the port is read from the environment as text
    PG_USER,
    PG_PW,
)
conn = db.connect()

In [None]:
# One full-table query per province table (mb, sk, ab), built from a shared template.
mbQuery, skQuery, abQuery = (
    sq.text(f"SELECT * FROM public.{province}_hly_station_data")
    for province in ("mb", "sk", "ab")
)

# Read the three tables in the same order as the queries above.
mb_df, sk_df, ab_df = (
    pd.read_sql(query, conn) for query in (mbQuery, skQuery, abQuery)
)

# Stack the three province frames into a single frame (row labels are kept as read).
hlyDF = pd.concat([mb_df, sk_df, ab_df])

In [None]:
# Station -> district lookup; stations with no district are excluded by the query.
stationDataQuery = sq.text(
    """
    SELECT station_id, district FROM public.stations_hly
    WHERE district IS NOT NULL;
    """
)

# Read the lookup and cast district to int in a single chained expression.
stationData = pd.read_sql(stationDataQuery, conn).astype({"district": int})

In [None]:
# Attach a district to every row via its station_id.  merge() defaults to an
# inner join, so rows whose station has no entry in stationData are dropped.
#
# A district column left over from a previous run of this cell is removed
# first: without that, re-executing the cell would yield district_x /
# district_y instead of district and break the later merge on "district".
hlyDF = hlyDF.drop(columns="district", errors="ignore").merge(
    stationData, on="station_id"
)

In [None]:
# Load the whole aggregated ergot sample table into a frame.
ergotQuery = sq.text("SELECT * FROM public.agg_ergot_samples")
ergotDF = pd.read_sql_query(sql=ergotQuery, con=conn)

In [None]:
# All read_sql calls in this notebook happen in the cells above, so the
# database handle is no longer needed from here on.
# NOTE(review): presumably cleanup() closes the connection opened by
# db.connect() — confirm in Shared.DataService.
db.cleanup()

In [None]:
# Pair every station row with the ergot rows that share its year and district
# (default inner join: combinations present on only one side are dropped).
final_df = pd.merge(hlyDF, ergotDF, on=["year", "district"])

In [None]:
# Show the column labels of the merged frame.
final_df.columns

In [None]:
# Remove the identifier and date columns from the merged frame.
# drop() returns a new frame and errors="ignore" skips columns that are
# already gone, so this cell can be re-executed safely; the previous
# inplace=True version raised KeyError on a second run.
final_df = final_df.drop(
    columns=["id", "station_id", "year", "month", "day"],
    errors="ignore",
)

In [None]:
# Compute the pairwise correlation matrix of the merged frame.
# numeric_only=True is passed explicitly: pandas 2.x defaults it to False and
# raises if any non-numeric column is present, whereas earlier versions
# silently skipped such columns.  Being explicit gives the same result on both.
corr = final_df.corr(numeric_only=True)

In [None]:
# Display the correlation matrix computed in the previous cell.
corr

In [None]:
sns.set_theme(style="white")

# A correlation matrix is symmetric, so the diagonal and everything above it
# are masked out and only the lower triangle is drawn.
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Figure to draw on, plus a diverging palette for the cells.
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# The colour scale is centred on 0 and saturates at +/-0.1; cells are square.
heatmap_options = dict(
    mask=mask,
    cmap=cmap,
    vmin=-0.1,
    vmax=0.1,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
)
sns.heatmap(corr, **heatmap_options)

In [None]:
# Temperature histograms: one panel per statistic (min / mean / max), with the
# ab, mb and sk frames overlaid semi-transparently in each panel.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
fig.suptitle("Temperature (°C)")

# A loop over statistic x province replaces nine copy-pasted hist() calls; the
# drawing order (ab, mb, sk within min, mean, max) is unchanged.
for panel, stat in zip((ax1, ax2, ax3), ("min", "mean", "max")):
    for label, frame in (("ab", ab_df), ("mb", mb_df), ("sk", sk_df)):
        panel.hist(frame[f"{stat}_temp"], alpha=0.5, label=label)
    panel.title.set_text(stat)
    panel.legend(loc="upper left")

# Render explicitly, as the precipitation cell does, so the cell output is the
# figure alone rather than the repr of the last Legend object.
plt.show()

In [None]:
# Dew point histograms: one panel per statistic (min / mean / max), with the
# ab, mb and sk frames overlaid semi-transparently in each panel.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
fig.suptitle("Dew Point Temperature (°C)")

# A loop over statistic x province replaces nine copy-pasted hist() calls; the
# drawing order (ab, mb, sk within min, mean, max) is unchanged.
for panel, stat in zip((ax1, ax2, ax3), ("min", "mean", "max")):
    for label, frame in (("ab", ab_df), ("mb", mb_df), ("sk", sk_df)):
        panel.hist(frame[f"{stat}_dew_point_temp"], alpha=0.5, label=label)
    panel.title.set_text(stat)
    panel.legend(loc="upper left")

# Render explicitly, as the precipitation cell does, so the cell output is the
# figure alone rather than the repr of the last Legend object.
plt.show()

In [None]:
# Humidex histograms: one panel per statistic (min / mean / max), with the
# ab, mb and sk frames overlaid semi-transparently in each panel.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
fig.suptitle("Humidity Index (air temperature + humidity)")

# A loop over statistic x province replaces nine copy-pasted hist() calls; the
# drawing order (ab, mb, sk within min, mean, max) is unchanged.
for panel, stat in zip((ax1, ax2, ax3), ("min", "mean", "max")):
    for label, frame in (("ab", ab_df), ("mb", mb_df), ("sk", sk_df)):
        panel.hist(frame[f"{stat}_humidex"], alpha=0.5, label=label)
    panel.title.set_text(stat)
    panel.legend(loc="upper left")

# Render explicitly, as the precipitation cell does, so the cell output is the
# figure alone rather than the repr of the last Legend object.
plt.show()

In [None]:
# Overlay the total_precip distributions of the three province frames on one axes.
for label, frame in (("ab", ab_df), ("mb", mb_df), ("sk", sk_df)):
    plt.hist(frame["total_precip"], alpha=0.5, label=label)

plt.title("Total Precipitation (mm)")
plt.xlim(0, 2500)  # the x axis is restricted to the 0-2500 range
plt.legend(loc="upper right")
plt.show()

In [None]:
# Relative humidity histograms: one panel per statistic (min / mean / max),
# with the ab, mb and sk frames overlaid semi-transparently in each panel.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
fig.suptitle("Humidity (%)")

# A loop over statistic x province replaces nine copy-pasted hist() calls; the
# drawing order (ab, mb, sk within min, mean, max) is unchanged.
for panel, stat in zip((ax1, ax2, ax3), ("min", "mean", "max")):
    for label, frame in (("ab", ab_df), ("mb", mb_df), ("sk", sk_df)):
        panel.hist(frame[f"{stat}_rel_humid"], alpha=0.5, label=label)
    panel.title.set_text(stat)
    panel.legend(loc="upper left")

# Render explicitly, as the precipitation cell does, so the cell output is the
# figure alone rather than the repr of the last Legend object.
plt.show()

In [None]:
# Station pressure histograms: one panel per statistic (min / mean / max),
# with the ab, mb and sk frames overlaid semi-transparently in each panel.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
fig.suptitle("Station Pressure (kPa)")

# A loop over statistic x province replaces nine copy-pasted hist() calls; the
# drawing order (ab, mb, sk within min, mean, max) is unchanged.
for panel, stat in zip((ax1, ax2, ax3), ("min", "mean", "max")):
    for label, frame in (("ab", ab_df), ("mb", mb_df), ("sk", sk_df)):
        panel.hist(frame[f"{stat}_stn_press"], alpha=0.5, label=label)
    panel.title.set_text(stat)
    panel.legend(loc="upper left")

# Render explicitly, as the precipitation cell does, so the cell output is the
# figure alone rather than the repr of the last Legend object.
plt.show()

In [None]:
# Visibility histograms: one panel per statistic (min / mean / max), with the
# ab, mb and sk frames overlaid semi-transparently in each panel.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
fig.suptitle("Visibility (km)")

# A loop over statistic x province replaces nine copy-pasted hist() calls; the
# drawing order (ab, mb, sk within min, mean, max) is unchanged.
for panel, stat in zip((ax1, ax2, ax3), ("min", "mean", "max")):
    for label, frame in (("ab", ab_df), ("mb", mb_df), ("sk", sk_df)):
        panel.hist(frame[f"{stat}_visibility"], alpha=0.5, label=label)
    panel.title.set_text(stat)
    panel.legend(loc="upper left")

# Render explicitly, as the precipitation cell does, so the cell output is the
# figure alone rather than the repr of the last Legend object.
plt.show()

In [None]:
# Remove the identifier and date columns from each province frame so the
# describe() cells below do not summarise them.
# drop() returns a new frame and errors="ignore" skips columns that are
# already gone, so this cell can be re-executed safely; the previous
# inplace=True version raised KeyError on a second run.
ID_AND_DATE_COLUMNS = ["id", "station_id", "year", "month", "day"]

mb_df = mb_df.drop(columns=ID_AND_DATE_COLUMNS, errors="ignore")
sk_df = sk_df.drop(columns=ID_AND_DATE_COLUMNS, errors="ignore")
ab_df = ab_df.drop(columns=ID_AND_DATE_COLUMNS, errors="ignore")

In [None]:
# Summary statistics for the frame read from mb_hly_station_data
# (identifier and date columns were dropped in the cell above).
mb_df.describe()

In [None]:
# Summary statistics for the frame read from sk_hly_station_data
# (identifier and date columns were dropped in an earlier cell).
sk_df.describe()

In [None]:
# Summary statistics for the frame read from ab_hly_station_data
# (identifier and date columns were dropped in an earlier cell).
ab_df.describe()