In [17]:
import sqlalchemy as sq
import geopandas as gpd  # type: ignore
import pandas as pd  # type: ignore
from dotenv import load_dotenv
import os, sys, calendar

sys.path.append("../")
from Shared.DataService import DataService  # type: ignore

In [18]:
# Table name constant; presumably the destination table for the aggregated output
# (it is not referenced in the cells visible here) — TODO confirm.
TABLENAME = "agg_station_dly"

# Read the Postgres connection settings from the environment. load_dotenv() runs
# first so values from a local .env file are available to os.getenv; any variable
# that is not set comes back as None.
load_dotenv()
PG_DB, PG_ADDR, PG_PORT, PG_USER, PG_PW = map(
    os.getenv,
    ("POSTGRES_DB", "POSTGRES_ADDR", "POSTGRES_PORT", "POSTGRES_USER", "POSTGRES_PW"),
)

In [27]:
# Open the database connection used by every pd.read_sql call below.
# DataService is the project-local helper imported from Shared.DataService; it is
# given the Postgres settings read from the environment.
# NOTE(review): no matching close/cleanup of `conn` is visible in this section —
# confirm it is released later in the notebook.
db = DataService(PG_DB, PG_ADDR, PG_PORT, PG_USER, PG_PW)
conn = db.connect()

In [29]:
# Pull the weather station data from all three provincial tables (ab_, mb_ and
# sk_station_data) and stack them into one frame.
# NOTE: UNION (unlike UNION ALL) removes duplicate rows from the combined result,
# which requires the three tables to share the same column layout.
query = sq.text(
    """
    SELECT * FROM public.ab_station_data
    UNION
    SELECT * FROM public.mb_station_data
    UNION
    SELECT * FROM public.sk_station_data;
    """
)

weatherData = pd.read_sql(query, conn)

In [33]:
# Look up the district of every station that has one assigned (stations with a
# NULL district are excluded by the query) and cast the district code to int.
query = sq.text(
    """
    SELECT station_id, district FROM public.stations_dly
    WHERE district IS NOT NULL;
    """
)

stationData = pd.read_sql(query, conn).astype({"district": int})

In [34]:
# Attach the district to every weather reading. merge() defaults to an inner join,
# so readings from stations absent from stationData (i.e. without a district) are dropped.
df = weatherData.merge(stationData, on="station_id")

In [35]:
# Display the merged weather/district frame (pandas truncates the rendering for large frames).
df

Unnamed: 0,station_id,date,year,month,day,max_temp,min_temp,mean_temp,total_rain,total_snow,total_precip,snow_on_grnd,district
0,3010010,2002-11-26,2002,11,26,5.1,-13.2,-4.1,0.0,0.0,0.0,0.0,4860
1,3010010,2002-11-27,2002,11,27,7.6,-2.8,2.4,0.0,0.0,0.0,0.0,4860
2,3010010,2002-11-28,2002,11,28,8.7,1.7,5.2,0.0,0.0,0.0,0.0,4860
3,3010010,2002-11-29,2002,11,29,3.5,-3.6,-0.1,0.0,0.0,0.0,0.0,4860
4,3010010,2002-11-30,2002,11,30,7.8,-6.3,0.8,0.0,0.0,0.0,0.0,4860
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3387796,506B047,2005-09-04,2005,9,4,21.1,11.6,16.4,1.0,0.0,1.0,0.0,4612
3387797,506B047,2005-09-05,2005,9,5,22.3,9.3,15.8,9.5,0.0,9.5,0.0,4612
3387798,506B047,2005-09-06,2005,9,6,16.1,11.4,13.8,0.0,0.0,0.0,0.0,4612
3387799,506B047,2005-09-07,2005,9,7,17.8,9.6,13.7,0.0,0.0,0.0,0.0,4612


In [36]:
# Collapse the station-level readings into one row per (district, date).
# The three temperature columns are averaged across the stations in the group;
# rain, snow and precipitation each keep their min, max and mean.
# Named aggregation ties every output column name to its source column and
# statistic, so no positional renaming is needed afterwards.
# NOTE(review): snow_on_grnd is not aggregated here, although the wide-table column
# list built further down in this notebook contains snow_on_grnd columns — confirm
# that this omission is intended.
final_df = (
    df.groupby(["district", "date"])
    .agg(
        max_temp=("max_temp", "mean"),
        min_temp=("min_temp", "mean"),
        mean_temp=("mean_temp", "mean"),
        min_rain=("total_rain", "min"),
        max_rain=("total_rain", "max"),
        mean_rain=("total_rain", "mean"),
        min_snow=("total_snow", "min"),
        max_snow=("total_snow", "max"),
        mean_snow=("total_snow", "mean"),
        min_precip=("total_precip", "min"),
        max_precip=("total_precip", "max"),
        mean_precip=("total_precip", "mean"),
    )
    .reset_index()
)

In [37]:
# Display the per-district daily aggregates (one row per district and date).
final_df

Unnamed: 0,district,date,max_temp,min_temp,mean_temp,min_rain,max_rain,mean_rain,min_snow,max_snow,mean_snow,min_precip,max_precip,mean_precip
0,4601,1992-12-01,2.000000,-6.200000,-2.100000,0.0,0.0,0.0,0.0,0.0,0.000000,1.1,1.1,1.100000
1,4601,1992-12-02,-5.900000,-15.700000,-10.800000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
2,4601,1992-12-05,-3.000000,-16.000000,-9.500000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
3,4601,1992-12-06,-7.500000,-20.600000,-14.100000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
4,4601,1992-12-07,-9.500000,-19.400000,-14.500000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
921681,4870,2022-12-27,-16.286667,-20.503333,-18.380000,0.0,0.0,0.0,0.0,4.0,0.246667,0.0,7.2,2.946667
921682,4870,2022-12-28,-13.950000,-19.213333,-16.590000,0.0,0.0,0.0,0.0,0.4,0.013333,0.0,4.0,0.553333
921683,4870,2022-12-29,-11.080000,-20.066667,-15.566667,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.8,0.123333
921684,4870,2022-12-30,-9.356667,-18.296667,-13.830000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.3,0.010000


In [42]:
# Calendar window covered by the wide table; both ends are inclusive.
MIN_MONTH = 1
MAX_MONTH = 12

MIN_YEAR = 1995
MAX_YEAR = 2023

years = list(range(MIN_YEAR, MAX_YEAR + 1))  # the year range we want to pull data from
months = list(range(MIN_MONTH, MAX_MONTH + 1))  # the month range we want to pull data from

# Attribute suffixes generated for every calendar day, in output column order.
DAILY_ATTRIBUTES = (
    'max_temp',
    'min_temp',
    'mean_temp',
    'min_total_rain',
    'max_total_rain',
    'mean_total_rain',
    'min_total_snow',
    'max_total_snow',
    'mean_total_snow',
    'min_total_precip',
    'max_total_precip',
    'mean_total_precip',
    'min_snow_on_grnd',
    'max_snow_on_grnd',
    'mean_snow_on_grnd',
)

# Build the wide-table header: 'year' and 'district' first, then one
# '<year>/<month>/<day>:<attribute>' column per attribute for every day in the
# window. `dates` collects the bare '<year>/<month>/<day>' labels (not zero-padded)
# in the same order.
columns = ['year', 'district']
dates = []
for year in years:
    for month in months:
        # monthrange() returns (weekday of the 1st, days in month); leap years are handled.
        numDays = calendar.monthrange(year, month)[1]

        for day in range(1, numDays + 1):
            stamp = f'{year}/{month}/{day}'
            dates.append(stamp)
            columns.extend(f'{stamp}:{attribute}' for attribute in DAILY_ATTRIBUTES)

In [43]:
# Create the empty wide-format frame: no rows yet, only the header built above
# ('year', 'district', then one column per day and attribute).
aggregated = pd.DataFrame(columns=columns)

In [44]:
# Display the still-empty wide frame to check its header.
aggregated

Unnamed: 0,year,district,1995/1/1:max_temp,1995/1/1:min_temp,1995/1/1:mean_temp,1995/1/1:min_total_rain,1995/1/1:max_total_rain,1995/1/1:mean_total_rain,1995/1/1:min_total_snow,1995/1/1:max_total_snow,...,2023/12/31:mean_total_rain,2023/12/31:min_total_snow,2023/12/31:max_total_snow,2023/12/31:mean_total_snow,2023/12/31:min_total_precip,2023/12/31:max_total_precip,2023/12/31:mean_total_precip,2023/12/31:min_snow_on_grnd,2023/12/31:max_snow_on_grnd,2023/12/31:mean_snow_on_grnd
