In [14]:
import sqlalchemy as sq
import geopandas as gpd  # type: ignore
import pandas as pd  # type: ignore
import numpy as np
from dotenv import load_dotenv
import os, sys, calendar

sys.path.append("../")
from Shared.DataService import DataService

In [15]:
# Inclusive month window used when generating the MO-DA day keys.
MIN_MONTH, MAX_MONTH = 1, 12

# Inclusive range of years that are reshaped into one record per district.
MIN_YEAR, MAX_YEAR = 1995, 2022

# Table name for the aggregated output (only referenced by the commented-out
# to_sql call in main).
TABLENAME = "agg_dly_weather"

# Load a .env file (if any) into the environment, then read the Postgres
# connection settings; any of them may be None and main() validates that.
load_dotenv()
PG_DB, PG_ADDR, PG_PORT, PG_USER, PG_PW = (
    os.getenv(envKey)
    for envKey in (
        "POSTGRES_DB",
        "POSTGRES_ADDR",
        "POSTGRES_PORT",
        "POSTGRES_USER",
        "POSTGRES_PW",
    )
)

In [16]:
def pullWeatherData(conn: sq.engine.Connection) -> pd.DataFrame:
    """Load the daily station observations for all three provinces.

    The Alberta, Manitoba and Saskatchewan station tables are combined with
    UNION (so exact duplicate rows are collapsed by the database) and returned
    as a single DataFrame.

    Args:
        conn: open SQLAlchemy connection the query is executed on.

    Returns:
        DataFrame holding every column of the three station-data tables.
    """
    combinedStationsQuery = sq.text(
        """
        SELECT * FROM public.ab_station_data
        UNION
        SELECT * FROM public.mb_station_data
        UNION
        SELECT * FROM public.sk_station_data;
        """
    )

    weatherFrame = pd.read_sql(combinedStationsQuery, conn)
    return weatherFrame

In [17]:
def pullStationData(conn: sq.engine.Connection) -> pd.DataFrame:
    """Load the station -> district mapping for stations that have a district.

    Args:
        conn: open SQLAlchemy connection the query is executed on.

    Returns:
        DataFrame with the columns station_id and district, where district
        has been cast to int.
    """
    districtLookupQuery = sq.text(
        """
        SELECT station_id, district FROM public.stations_dly
        WHERE district IS NOT NULL;
        """
    )

    stations = pd.read_sql(districtLookupQuery, conn)

    # NULL districts are filtered out in SQL, so the integer cast cannot hit NaN
    return stations.astype({"district": int})

In [18]:
def aggregateDlyData(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse station-level rows into one row per (district, date).

    Temperature columns (max_temp, min_temp, mean_temp) are averaged and keep
    their names. Each of total_rain, total_snow, total_precip and snow_on_grnd
    yields three columns named "<stat>_<column>" for stat in min, max, mean.

    Args:
        df: frame containing district, date and the seven measurement columns.

    Returns:
        Frame with district, date and the 15 aggregated columns, in that order.
    """
    # temperatures: a single mean per group, output name equals input name
    namedAggs = {
        tempCol: (tempCol, "mean")
        for tempCol in ("max_temp", "min_temp", "mean_temp")
    }

    # precipitation / snow measures: min, max and mean per group
    for measure in ("total_rain", "total_snow", "total_precip", "snow_on_grnd"):
        for stat in ("min", "max", "mean"):
            namedAggs[f"{stat}_{measure}"] = (measure, stat)

    grouped = df.groupby(["district", "date"])
    return grouped.agg(**namedAggs).reset_index()

In [19]:
def getDates() -> list:
    """Build the zero-padded "MO-DA" key for every day in the month window.

    Month lengths are taken from 2001, a non-leap year, so February always
    contributes 28 days; with the full January-December window this gives
    365 entries and "02-29" is never produced.

    Returns:
        List of "MO-DA" strings in calendar order for MIN_MONTH..MAX_MONTH
        (both inclusive).
    """
    return [
        f"{monthNum:02d}-{dayNum:02d}"
        for monthNum in range(MIN_MONTH, MAX_MONTH + 1)
        for dayNum in range(1, calendar.monthrange(2001, monthNum)[1] + 1)
    ]

In [20]:
def reshapeDlyData(
    dates: list, agg_df: pd.DataFrame, stationData: pd.DataFrame
) -> list:
    """Pivot the per-day aggregates into one wide record per (year, district).

    For every year in MIN_YEAR..MAX_YEAR (inclusive) and every distinct
    district in stationData, a dictionary is produced holding "year",
    "district" and one "MO-DA:<column>" entry for each day in `dates` and each
    value column of agg_df (every column except district and date).

    A value is copied from agg_df only when exactly one row matches the
    (district, date) pair; when no row, or more than one row, matches the
    entry is 0.

    NOTE(review): the 0 default makes a missing observation indistinguishable
    from a genuine reading of 0 - confirm that downstream consumers expect it.

    The aggregate is indexed once up front so each day is a dictionary lookup
    rather than a boolean-mask scan over the whole frame. Dates in agg_df are
    normalised with pd.to_datetime; values that cannot be parsed become NaT
    and therefore never match.

    Args:
        dates: "MO-DA" strings, e.g. as returned by getDates().
        agg_df: aggregated frame containing district, date and value columns.
        stationData: frame whose district column lists the districts to emit.

    Returns:
        List of dictionaries (year-major, then district order), ready to be
        passed to pd.DataFrame in one go.

    Raises:
        ValueError: if agg_df lacks a district or date column, or if a
            "<year>-<MO-DA>" string is not a valid date.
    """
    # each row is kept as its own dictionary so the final frame is built once
    listForDF = []

    # the year range we want to pull data from - ints
    years = range(MIN_YEAR, MAX_YEAR + 1)
    uniqueDistricts = stationData["district"].unique()

    # get the columns we will want to pull information from
    cols = agg_df.columns.tolist()  # type: ignore
    cols.remove("district")
    cols.remove("date")

    # pull the key and value columns out as plain Python lists (native scalars)
    keyDistricts = agg_df["district"].tolist()
    keyDates = pd.to_datetime(agg_df["date"], errors="coerce").tolist()
    valueColumns = [agg_df[col].tolist() for col in cols]

    # (district, date) -> row position, kept only for keys that occur exactly once
    rowByKey = {}
    ambiguousKeys = set()
    for rowPos, key in enumerate(zip(keyDistricts, keyDates)):
        if key in rowByKey:
            ambiguousKeys.add(key)
        else:
            rowByKey[key] = rowPos
    for key in ambiguousKeys:
        del rowByKey[key]

    for year in years:
        print(f"Processing year: {year}")

        # the calendar dates of this year are the same for every district, so
        # parse them once; np.datetime64 keeps the strict ISO validation
        yearDays = [
            (date, pd.Timestamp(np.datetime64(f"{str(year)}-{date}")))
            for date in dates
        ]

        for district in uniqueDistricts:
            # for each year/district combination create a dictionary
            currData = {"year": year, "district": district}

            # every day contributes one MO-DA:attribute entry per value column
            for date, fullDate in yearDays:
                rowPos = rowByKey.get((district, fullDate))

                for colPos, col in enumerate(cols):
                    # defaults to zero in case the value does not exist
                    currData[f"{date}:{col}"] = (
                        0 if rowPos is None else valueColumns[colPos][rowPos]
                    )

            listForDF.append(currData)

    return listForDF

In [21]:
def main():
    """Run the pipeline: pull, merge, aggregate, reshape and write the CSV.

    Steps:
        1. Validate that all Postgres settings were found in the environment.
        2. Pull the weather observations and the station -> district mapping.
        3. Inner-join them on station_id and aggregate per (district, date).
        4. Reshape into one wide row per (year, district).
        5. Write the result to data/aggregatedDly.csv.

    A failure while writing the CSV is reported on stdout rather than raised.
    db.cleanup() is always called once the connection has been opened, even
    if an earlier step raises.

    Raises:
        ValueError: if any of the required environment variables is missing.
    """
    if (
        PG_DB is None
        or PG_ADDR is None
        or PG_PORT is None
        or PG_USER is None
        or PG_PW is None
    ):
        raise ValueError("Environment variables not set")

    outputPath = 'data/aggregatedDly.csv'

    db = DataService(PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW)
    conn = db.connect()

    try:
        weatherData = pullWeatherData(conn)
        stationData = pullStationData(conn)

        # merge both the weather station data and the station data together
        df = weatherData.merge(stationData, on="station_id")

        agg_df = aggregateDlyData(df)
        dates = getDates()
        listForDF = reshapeDlyData(dates, agg_df, stationData)
        final_df = pd.DataFrame(listForDF)

        try:
            # make sure the target folder exists so the reshaped result is not
            # lost to a missing-directory error
            os.makedirs(os.path.dirname(outputPath), exist_ok=True)
            final_df.to_csv(path_or_buf=outputPath, sep=',', columns=final_df.columns.tolist())
            # final_df.to_sql(TABLENAME, conn, schema="public", if_exists="append", index=False)
        except Exception as e:
            print('[ERROR]')
            print(e)
    finally:
        # presumably releases the database connection - always run it, even
        # when one of the steps above raised
        db.cleanup()

In [22]:
# Entry point: run the whole pipeline only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Processing year: 1995
Processing year: 1996
Processing year: 1997
Processing year: 1998
Processing year: 1999
Processing year: 2000
Processing year: 2001
Processing year: 2002
Processing year: 2003
Processing year: 2004
Processing year: 2005
Processing year: 2006
Processing year: 2007
Processing year: 2008
Processing year: 2009
Processing year: 2010
Processing year: 2011
Processing year: 2012
Processing year: 2013
Processing year: 2014
Processing year: 2015
Processing year: 2016
Processing year: 2017
Processing year: 2018
Processing year: 2019
Processing year: 2020
Processing year: 2021
Processing year: 2022
