# aggregateDly.ipynb
After loading the [daily weather stations](https://github.com/ChromaticPanic/CGC_Grain_Outcome_Predictions#stations_dly) and the [daily weather station data](https://github.com/ChromaticPanic/CGC_Grain_Outcome_Predictions#ab_dly_station_data), the following functions can be used to calculate, per district and day, the mean of the temperature attributes and the minimum, mean and maximum of the rain, snow, precipitation and snow-on-ground attributes

##### Output:
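A CSV file with the expected output columns: one row per year and district, one column per day and attribute (`MO-DA:attribute`). It is saved to `data/aggregatedDly.csv`, relative to the directory the notebook is run from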

##### Remarks: 
- As the number of days changes per year, the list of days (`MO-DA`) used for the output columns is generated from the year 2001 (not a leap year), so February 29 is never included
- Although a table name is assigned, the results are not written to the database: the column limit in Postgres hinders this option

In [None]:
from dotenv import load_dotenv
import sqlalchemy as sq
import geopandas as gpd  # type: ignore
import pandas as pd
import numpy as np
import os, sys, calendar

sys.path.append("../")
from Shared.DataService import DataService

In [None]:
DLY_STATIONS = "stations_dly"  # table that contains the daily weather stations

AB_DLY_TABLE = "ab_dly_station_data"  # table that contains Alberta's daily station data
MB_DLY_TABLE = "mb_dly_station_data"  # table that contains Manitoba's daily station data
SK_DLY_TABLE = "sk_dly_station_data"  # table that contains Saskatchewan's daily station data

MIN_MONTH = 1  # The first month to aggregate (inclusive)
MAX_MONTH = 12  # The last month to aggregate (inclusive)

MIN_YEAR = 1995  # The first year to aggregate (inclusive)
MAX_YEAR = 2022  # The last year to aggregate (inclusive)
# Name of the table where results would be stored in the database
TABLENAME = "agg_dly_weather"


# Load the database connection environment variables located in the docker folder
# (each value is None when the corresponding variable is not set)
load_dotenv("../docker/.env")
PG_DB = os.getenv("POSTGRES_DB")
PG_ADDR = os.getenv("POSTGRES_ADDR")
PG_PORT = os.getenv("POSTGRES_PORT")
PG_USER = os.getenv("POSTGRES_USER")
PG_PW = os.getenv("POSTGRES_PW")

In [None]:
def main():
    """
    Purpose:
    Pull the daily weather data, aggregate it per district and date, reshape it into one row
    per year/district and save the result as a CSV

    Output:
    - data/aggregatedDly.csv (relative to the current working directory)

    Pseudocode:
    - Check that the database environment variables are set
    - Connect to the database
    - Load the weather station data and the weather stations, then merge them on station_id
    - Aggregate the merged data by district and date
    - Reshape the aggregates into one row per year/district
    - Save the result as a CSV
    - Clean up the database connection (always, even when a previous step fails)

    Raises:
    ValueError if any of the database environment variables is not set

    Remarks: a failure while writing the CSV is reported on stdout and is not re-raised
    """
    if (
        PG_DB is None
        or PG_ADDR is None
        or PG_PORT is None
        or PG_USER is None
        or PG_PW is None
    ):
        raise ValueError("Environment variables not set")

    db = DataService(PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW)
    conn = db.connect()

    # everything that uses the connection runs inside try/finally so that the cleanup
    # below is not skipped when pulling, aggregating or reshaping raises
    try:
        weatherData = pullWeatherData(conn)
        stationData = pullStationData(conn)

        # merge both the weather station data and the station data together
        df = weatherData.merge(stationData, on="station_id")

        agg_df = aggregateDlyData(df)
        dates = getDates()
        listForDF = reshapeDlyData(dates, agg_df, stationData)
        final_df = pd.DataFrame(listForDF)

        try:
            final_df.to_csv(
                path_or_buf="data/aggregatedDly.csv",
                sep=",",
                columns=final_df.columns.tolist(),
            )
            # final_df.to_sql(TABLENAME, conn, schema="public", if_exists="append", index=False)
        except Exception as e:
            # best effort: report the failure without raising
            print("[ERROR]")
            print(e)
    finally:
        db.cleanup()

In [None]:
def pullWeatherData(conn: sq.engine.Connection) -> pd.DataFrame:
    """
    Purpose:
    Reads the daily weather station data of all three provinces into a single DataFrame

    Tables:
    - [ab_dly_station_data](https://github.com/ChromaticPanic/CGC_Grain_Outcome_Predictions#ab_dly_station_data)
    - [mb_dly_station_data](https://github.com/ChromaticPanic/CGC_Grain_Outcome_Predictions#mb_dly_station_data)
    - [sk_dly_station_data](https://github.com/ChromaticPanic/CGC_Grain_Outcome_Predictions#sk_dly_station_data)

    Pseudocode:
    - Build one SQL query that UNIONs the three provincial tables
    - [Run the query and load the result directly into a DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.read_sql.html)

    Remarks: UNION (as opposed to UNION ALL) drops rows that are exact duplicates
    """
    unionQuery = sq.text(
        f"""
        SELECT * FROM public.{AB_DLY_TABLE}
        UNION
        SELECT * FROM public.{MB_DLY_TABLE}
        UNION
        SELECT * FROM public.{SK_DLY_TABLE};
        """
    )
    provincialData = pd.read_sql(unionQuery, conn)

    return provincialData

In [None]:
def pullStationData(conn: sq.engine.Connection) -> pd.DataFrame:
    """
    Purpose:
    Reads the id and district of every daily weather station that has a district assigned

    Tables:
    - [stations_dly](https://github.com/ChromaticPanic/CGC_Grain_Outcome_Predictions#stations_dly)

    Pseudocode:
    - Build the SQL query (stations without a district are filtered out)
    - [Run the query and load the result directly into a DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.read_sql.html)
    - [Convert the district column to integers](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.astype.html)

    Remarks: later steps throw errors when district has not been cast to an integer
    """
    districtQuery = sq.text(
        f"""
        SELECT station_id, district FROM public.{DLY_STATIONS}
        WHERE district IS NOT NULL;
        """
    )

    stations = pd.read_sql(districtQuery, conn)

    districtAsInt = stations[["district"]].astype(int)
    stations[["district"]] = districtAsInt

    return stations

In [None]:
def aggregateDlyData(df: pd.DataFrame) -> pd.DataFrame:
    """
    Purpose:
    Collapse the daily weather station data into one row per district and date

    Pseudocode:
    - Describe which statistics are computed for which attribute
        - temperature attributes: mean only
        - rain, snow, precipitation and snow on ground: min, max and mean
    - Group by district and date and apply the statistics
    - Flatten the resulting two level column index into plain column names
    """
    spreadStats = ["min", "max", "mean"]
    temperatureAttrs = ["max_temp", "min_temp", "mean_temp"]
    spreadAttrs = ["total_rain", "total_snow", "total_precip", "snow_on_grnd"]

    # dictionary order decides the column order of the aggregated DataFrame
    aggSpec = {}
    for attr in temperatureAttrs:
        aggSpec[attr] = "mean"
    for attr in spreadAttrs:
        aggSpec[attr] = list(spreadStats)

    grouped = df.groupby(["district", "date"]).agg(aggSpec)
    flattened = grouped.reset_index()

    # temperatures keep their name, every other attribute becomes <statistic>_<attribute>
    flatNames = ["district", "date"] + temperatureAttrs
    for attr in spreadAttrs:
        for stat in spreadStats:
            flatNames.append(f"{stat}_{attr}")

    flattened.columns = flatNames  # type: ignore

    return flattened

In [None]:
def getDates() -> list:
    """
    Purpose:
    List every day between MIN_MONTH and MAX_MONTH (both inclusive) as a zero padded MO-DA string

    Pseudocode:
    - For each month look up how many days it has in 2001 (not a leap year)
    - Add one MO-DA string per day of that month, in calendar order
    """
    monthDays = []  # every day as MO-DA - strings

    for month in range(MIN_MONTH, MAX_MONTH + 1):
        _, daysInMonth = calendar.monthrange(2001, month)
        monthDays.extend(
            f"{month:02d}-{day:02d}" for day in range(1, daysInMonth + 1)
        )

    return monthDays

In [None]:
def reshapeDlyData(
    dates: list, agg_df: pd.DataFrame, stationData: pd.DataFrame
) -> list:
    """
    Purpose:
    Reshape the per district/date aggregates into one dictionary per year/district where every
    day and attribute combination is its own key (MO-DA:attribute)

    Parameters:
    - dates: the days to create keys for, as MO-DA strings
    - agg_df: the aggregated data, must contain the columns district and date, every other
      column is treated as an attribute
    - stationData: the weather stations, its district column supplies the districts to process

    Returns:
    A list of dictionaries (one per year between MIN_YEAR and MAX_YEAR inclusive and district)
    which can be passed directly to pd.DataFrame

    Pseudocode:
    - Build a dictionary that maps (district, date) to the attribute values of that row
    - For each year and district create a dictionary holding the year and the district
    - For each day look up the attribute values and add them as MO-DA:attribute
      (zero is used when there is no data for that district and day)

    Remarks:
    - The lookup dictionary is built once, this avoids filtering the entire DataFrame for
      every year/district/day combination
    - A value is only used when exactly one row matches the district and date, otherwise it
      defaults to zero
    - The date column is normalised with pd.to_datetime before matching
    """
    # the year range we want to pull data from - ints
    years = range(MIN_YEAR, MAX_YEAR + 1)
    uniqueDistricts = stationData["district"].unique()

    # get the columns we will want to pull information from
    cols = [col for col in agg_df.columns.tolist() if col not in ("district", "date")]

    keyed = agg_df[["district", "date", *cols]].copy()
    keyed["date"] = pd.to_datetime(keyed["date"])

    # a district/date that appears more than once is treated as missing (defaults to zero)
    keyed = keyed[~keyed.duplicated(subset=["district", "date"], keep=False)]

    # (district, date) -> attribute values in the same order as cols
    lookup = {
        (row[0], row[1]): row[2:] for row in keyed.itertuples(index=False, name=None)
    }

    # loads all data where each row is its own dictionary so that it may be added to a dataframe later (fast processing)
    listForDF = []

    for year in years:
        print(f"Processing year: {year}")

        # the full dates only depend on the year, so they are calculated once per year
        fullDates = [pd.Timestamp(f"{year}-{date}") for date in dates]

        for district in uniqueDistricts:
            # for each year/district combination create a dictionary
            currData = {"year": year, "district": district}

            # for each day we want to grab all attributes and establish them as columns i.e MO-DA:attribute
            for date, fullDate in zip(dates, fullDates):
                values = lookup.get((district, fullDate))

                for pos, col in enumerate(cols):
                    # defaults as zero in case it does not exist
                    currData[f"{date}:{col}"] = 0 if values is None else values[pos]

            listForDF.append(currData)

    return listForDF

In [None]:
if __name__ == "__main__":
    main()