In [None]:
import sqlalchemy as sq
import geopandas as gpd  # type: ignore
import pandas as pd  # type: ignore
import numpy as np
from dotenv import load_dotenv
import os, sys, calendar

sys.path.append("../")
from Shared.DataService import DataService

In [None]:
# Destination table name (the to_sql write in main() is currently commented out).
TABLENAME = "agg_hly_weather"

# Inclusive year bounds used when reshaping the aggregated data.
MIN_YEAR = 1995
MAX_YEAR = 2022

# Inclusive month bounds used when generating the MO-DA date labels.
MIN_MONTH = 1
MAX_MONTH = 12

# Database connection settings. load_dotenv() runs before the lookups so that
# any values it provides are visible; each setting is None when it is unset.
load_dotenv()
PG_DB = os.environ.get("POSTGRES_DB")
PG_ADDR = os.environ.get("POSTGRES_ADDR")
PG_PORT = os.environ.get("POSTGRES_PORT")
PG_USER = os.environ.get("POSTGRES_USER")
PG_PW = os.environ.get("POSTGRES_PW")

In [None]:
def pullWeatherData(conn: sq.engine.Connection) -> pd.DataFrame:
    """Load the hourly station data from the ab, mb and sk tables.

    The three tables are combined with UNION, so rows that are exact
    duplicates of one another are returned only once.

    Args:
        conn: connection the query is executed on.

    Returns:
        DataFrame with every column of the combined tables.
    """
    unionQuery = """
        SELECT * FROM public.ab_hly_station_data
        UNION
        SELECT * FROM public.mb_hly_station_data
        UNION
        SELECT * FROM public.sk_hly_station_data;
        """
    return pd.read_sql(sq.text(unionQuery), conn)

In [None]:
def pullStationData(conn: sq.engine.Connection) -> pd.DataFrame:
    """Load the station-to-district mapping from public.stations_dly.

    Stations without a district are excluded by the query itself.

    Args:
        conn: connection the query is executed on.

    Returns:
        DataFrame with the columns station_id and district, where district
        has been cast to int.
    """
    districtQuery = """
        SELECT station_id, district FROM public.stations_dly
        WHERE district IS NOT NULL;
        """
    stations = pd.read_sql(sq.text(districtQuery), conn)

    # cast the district column to int so it can be used as a grouping key
    districtCol = ["district"]
    stations[districtCol] = stations[districtCol].astype(int)

    return stations

In [None]:
def aggregateHlyData(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the station rows into one row per district and calendar day.

    Every min_/max_/mean_ measurement column is averaged across the rows of a
    group. total_precip is the exception: its minimum, maximum and mean are
    each reported, as min_precip, max_precip and mean_precip.

    Args:
        df: frame containing district, year, month, day, total_precip and the
            min_/max_/mean_ measurement columns listed below.

    Returns:
        DataFrame with flat column names: the four grouping keys followed by
        the aggregated measurements, in the order they are listed below.
    """
    groupKeys = ["district", "year", "month", "day"]

    # measurement columns that come before / after precipitation in the output
    leadingCols = [
        "min_temp",
        "max_temp",
        "mean_temp",
        "min_dew_point_temp",
        "max_dew_point_temp",
        "mean_dew_point_temp",
        "min_humidex",
        "max_humidex",
        "mean_humidex",
    ]
    trailingCols = [
        "min_rel_humid",
        "max_rel_humid",
        "mean_rel_humid",
        "min_stn_press",
        "max_stn_press",
        "mean_stn_press",
        "min_visibility",
        "max_visibility",
        "mean_visibility",
    ]
    precipStats = ["min", "max", "mean"]

    # insertion order of the spec determines the column order of the result
    aggSpec = {col: "mean" for col in leadingCols}
    aggSpec["total_precip"] = precipStats
    aggSpec.update((col, "mean") for col in trailingCols)

    summary = df.groupby(groupKeys).agg(aggSpec).reset_index()

    # the mixed spec yields two-level column labels; replace them with flat names
    precipCols = [f"{stat}_precip" for stat in precipStats]
    summary.columns = groupKeys + leadingCols + precipCols + trailingCols  # type: ignore

    return summary

In [None]:
def getDates() -> list:
    """Build the zero-padded "MO-DA" labels for the configured month range.

    Month lengths are taken from 2001, a non-leap year, so February always
    contributes 28 days; with the full January-December range the result has
    365 entries.

    Returns:
        List of "MO-DA" strings in calendar order, covering MIN_MONTH through
        MAX_MONTH inclusive.
    """
    referenceYear = 2001  # non-leap year used only to look up month lengths
    dates = []

    for monthNum in range(MIN_MONTH, MAX_MONTH + 1):
        daysInMonth = calendar.monthrange(referenceYear, monthNum)[1]
        dates.extend(
            f"{monthNum:02d}-{dayNum:02d}" for dayNum in range(1, daysInMonth + 1)
        )

    return dates

In [None]:
def reshapeHlyData(
    dates: list,
    agg_df: pd.DataFrame,
    stationData: pd.DataFrame,
    minYear=None,
    maxYear=None,
) -> list:
    """Pivot the per-day aggregates into one wide record per year and district.

    Each record holds "year", "district" and, for every date in ``dates`` and
    every measurement column of ``agg_df``, a "MO-DA:column" entry.

    Args:
        dates: "MO-DA" strings (month and day separated by "-").
        agg_df: aggregated frame containing district, year, month and day plus
            the measurement columns to spread out.
        stationData: frame whose "district" column lists the districts that
            should each get a record.
        minYear: first year to emit (inclusive); defaults to MIN_YEAR.
        maxYear: last year to emit (inclusive); defaults to MAX_YEAR.

    Returns:
        A list of dicts (one per year/district pair, years outermost), suitable
        for passing to ``pd.DataFrame``.

    Notes:
        A value is taken from ``agg_df`` only when exactly one row matches the
        year/month/day/district combination. Missing or ambiguous
        combinations are filled with 0, not NaN.
    """
    firstYear = MIN_YEAR if minYear is None else minYear
    lastYear = MAX_YEAR if maxYear is None else maxYear

    uniqueDistricts = stationData["district"].unique()

    # every column except the keys is spread out as MO-DA:column
    cols = agg_df.columns.tolist()  # type: ignore
    cols.remove("district")
    cols.remove("year")
    cols.remove("month")
    cols.remove("day")

    # Build the lookup once instead of masking the whole frame for every
    # year/district/date combination. keep=False discards every key that
    # occurs more than once, so only unambiguous rows are retrievable.
    keyCols = ["year", "month", "day", "district"]
    unambiguous = agg_df.drop_duplicates(subset=keyCols, keep=False)
    lookup = unambiguous.set_index(keyCols)[cols].to_dict(orient="index")

    # the month/day parts of each label do not change between iterations
    parsedDates = []
    for date in dates:
        dateComponents = date.split("-")
        parsedDates.append((date, int(dateComponents[0]), int(dateComponents[1])))

    listForDF = []

    for year in range(firstYear, lastYear + 1):
        print(f"Processing year: {year}")

        for district in uniqueDistricts:
            currData = {"year": year, "district": district}

            for date, monthInt, dayInt in parsedDates:
                currRow = lookup.get((year, monthInt, dayInt, district))

                for col in cols:
                    # defaults to zero in case the row does not exist
                    currData[f"{date}:{col}"] = 0 if currRow is None else currRow[col]

            listForDF.append(currData)

    return listForDF

In [None]:
def main():
    """Run the pipeline: pull, merge, aggregate and reshape the weather data.

    The final write to the database is currently commented out, so the
    reshaped frame is built but not persisted.

    Raises:
        ValueError: if any of the POSTGRES_* settings is missing.
    """
    if (
        PG_DB is None
        or PG_ADDR is None
        or PG_PORT is None
        or PG_USER is None
        or PG_PW is None
    ):
        raise ValueError("Environment variables not set")

    db = DataService(PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW)
    conn = db.connect()

    # cleanup must run even when a query or a transform raises, otherwise the
    # database connection is left open
    try:
        weatherData = pullWeatherData(conn)
        stationData = pullStationData(conn)

        # merge both the weather station data and the station data together
        df = weatherData.merge(stationData, on="station_id")

        agg_df = aggregateHlyData(df)
        dates = getDates()
        listForDF = reshapeHlyData(dates, agg_df, stationData)
        final_df = pd.DataFrame(listForDF)

        # final_df.to_sql(TABLENAME, conn, schema="public", if_exists="append", index=False)
    finally:
        db.cleanup()

In [None]:
# Run the pipeline only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()