In [None]:
import sqlalchemy as sq
import geopandas as gpd  # type: ignore
import pandas as pd  # type: ignore
import numpy as np
from dotenv import load_dotenv
import os, sys, calendar

sys.path.append("../")
from Shared.DataService import DataService

In [None]:
# inclusive month and year ranges that the rest of the notebook iterates over
MIN_MONTH, MAX_MONTH = 1, 12
MIN_YEAR, MAX_YEAR = 1995, 2022

# table name referenced by the (currently commented-out) to_sql export near the end
TABLENAME = "agg_dly_weather"

# load a .env file into the environment, then read the Postgres connection settings;
# each setting is None when its variable is not defined
load_dotenv()
PG_DB = os.environ.get("POSTGRES_DB")
PG_ADDR = os.environ.get("POSTGRES_ADDR")
PG_PORT = os.environ.get("POSTGRES_PORT")
PG_USER = os.environ.get("POSTGRES_USER")
PG_PW = os.environ.get("POSTGRES_PW")

In [None]:
# fail fast unless every Postgres setting was found in the environment
if not (
    PG_DB is not None
    and PG_ADDR is not None
    and PG_PORT is not None
    and PG_USER is not None
    and PG_PW is not None
):
    raise ValueError("Environment variables not set")

# the port arrives as a string from the environment, so convert it before connecting
db = DataService(PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW)
conn = db.connect()

In [None]:
# load the weather rows from the ab, mb and sk station tables into a single dataframe
# (UNION also drops exact duplicate rows; SELECT * relies on the three tables sharing
# the same column layout)
weatherDataSql = """
    SELECT * FROM public.ab_station_data
    UNION
    SELECT * FROM public.mb_station_data
    UNION
    SELECT * FROM public.sk_station_data;
    """
weatherDataQuery = sq.text(weatherDataSql)

weatherData = pd.read_sql(sql=weatherDataQuery, con=conn)

In [None]:
# load the station -> district mapping, skipping stations that have no district
stationDataSql = """
    SELECT station_id, district FROM public.stations_dly
    WHERE district IS NOT NULL;
    """
stationDataQuery = sq.text(stationDataSql)

stationData = pd.read_sql(sql=stationDataQuery, con=conn)

# districts are used as integer keys further down, so cast the column to int
stationData[["district"]] = stationData[["district"]].astype(int)

In [None]:
# attach each weather row's district by joining on station_id; merge defaults to an
# inner join, so weather rows whose station has no district entry are dropped
df = pd.merge(weatherData, stationData, on="station_id")

In [None]:
# show the merged weather + station dataframe as this cell's output
df

In [None]:
# Aggregate the merged rows per (district, date).
# The three temperature columns are reduced to their mean only; the remaining
# columns get a min, max and mean each.
meanOnlyCols = ["max_temp", "min_temp", "mean_temp"]
spreadCols = ["total_rain", "total_snow", "total_precip", "snow_on_grnd"]
spreadStats = ["min", "max", "mean"]

aggSpec = {name: "mean" for name in meanOnlyCols}
aggSpec.update({name: list(spreadStats) for name in spreadCols})

groupedByDistrictDate = df.groupby(["district", "date"])
agg_df = groupedByDistrictDate.agg(aggSpec).reset_index()

# flatten the two-level column labels produced by agg into single names such as
# "min_total_rain"; the order mirrors the order of aggSpec
agg_df.columns = (  # type: ignore
    ["district", "date"]
    + meanOnlyCols
    + [f"{stat}_{name}" for name in spreadCols for stat in spreadStats]
)

In [None]:
# show the per-district, per-date aggregated dataframe as this cell's output
agg_df

In [None]:
# Every calendar day in the configured month range, as zero-padded "MO-DA" strings.
# Month lengths are taken from 2001, which is not a leap year, so "02-29" is never
# produced and the full range yields 365 entries.
dates = [
    f"{month:02d}-{day:02d}"
    for month in range(MIN_MONTH, MAX_MONTH + 1)
    for day in range(1, calendar.monthrange(2001, month)[1] + 1)
]

In [90]:
# the inclusive year range to process, as ints
years = list(range(MIN_YEAR, MAX_YEAR + 1))

# every district that has at least one station
uniqueDistricts = stationData["district"].unique()

# the value columns of the aggregate frame, i.e. everything except the two grouping keys
cols = [
    name
    for name in agg_df.columns.tolist()  # type: ignore
    if name not in ("district", "date")
]

In [None]:
# Build one wide row per (year, district): a dictionary holding the year, the district
# and one "MO-DA:column" entry for every day in `dates` and every value column in `cols`.
# The rows are collected in a list so the dataframe can be created in a single call later.
#
# NOTE(review): a day with no matching aggregate row is stored as 0, which cannot be told
# apart from a genuine 0 reading (e.g. 0 degrees) - confirm this default is intended.
listForDF = []

# Split the aggregate frame by district once up front. The district filter does not depend
# on the year or the day, so repeating it inside the inner loop (as a full-frame boolean
# mask per day) only rescans the same rows over and over.
rowsByDistrict = {
    district: agg_df.loc[agg_df["district"] == district]
    for district in uniqueDistricts
}

for year in years:
    print(f"Processing year: {year}")

    # the concrete calendar dates of this year are the same for every district
    fullDates = [(date, np.datetime64(f"{str(year)}-{date}")) for date in dates]

    for district in uniqueDistricts:
        districtRows = rowsByDistrict[district]

        # for each year/district combination create a dictionary
        currData = {"year": year, "district": district}

        for date, fullDate in fullDates:
            # grab the aggregated row for this district on this exact date (if any)
            currRow = districtRows.loc[districtRows["date"] == fullDate]
            hasSingleRow = len(currRow) == 1

            for col in cols:
                # attribute name is MO-DA:column; value defaults to zero when no single
                # matching row exists
                currData[f"{date}:{col}"] = currRow[col].item() if hasSingleRow else 0

        listForDF.append(currData)

In [None]:
# turn the per-year/district dictionaries into a dataframe in one call (one row per dictionary)
final_df = pd.DataFrame(listForDF)

In [None]:
# show the wide year/district dataframe as this cell's output
final_df

In [None]:
# final_df.to_sql(TABLENAME, conn, schema="public", if_exists="append", index=False)

In [None]:
# done with the database: call the DataService cleanup hook
# NOTE(review): presumably this closes what db.connect() opened - confirm in Shared.DataService
db.cleanup()