In [59]:
import pandas as pd
import numpy as np

In [60]:
def get_mean_for_region(region,attribute,climate_df):
    return climate_df.groupby('Prov_or_Ter')[attribute].mean()[region]

In [61]:
all_processed_df = pd.DataFrame()

for year in range(2001,2024):
    for month in range(1,13):
        month = f'{month:>02}'  # to get months as 01, 02, 03, etc

        print("Processing",month,year)
        climate_df = pd.read_csv(f"./datasets/climate_data/en_climate_summaries_All_{month}-{year}.csv")
        climate_df.drop(['DwTm', 'DwTx', 'DwTn', 'DwS', 'DwP', 'DwBS'],axis=1,inplace=True)
        climate_df.drop(climate_df[climate_df["Prov_or_Ter"].isin(['NT','NU','YT','NL','NS','NB','PE'])].index, inplace=True)
        climate_df['P'] = climate_df['P'].replace('######',np.nan).replace('',np.nan).astype(float)
        climate_df['S%N'] = climate_df['S%N'].replace('######',np.nan).replace('',np.nan).astype(float)
        climate_df['P%N'] = climate_df['P%N'].replace('######',np.nan).replace('',np.nan).astype(float)

        processed_df = pd.DataFrame()
        processed_df['Province'] = climate_df['Prov_or_Ter'].unique()
        processed_df['Tm'] = processed_df['Province'].apply(get_mean_for_region,args=('Tm',climate_df))
        processed_df['Tn'] = processed_df['Province'].apply(get_mean_for_region,args=('Tn',climate_df))
        processed_df['Tx'] = processed_df['Province'].apply(get_mean_for_region,args=('Tx',climate_df))
        processed_df['D'] = processed_df['Province'].apply(get_mean_for_region,args=('D',climate_df))
        processed_df['S'] = processed_df['Province'].apply(get_mean_for_region,args=('S',climate_df))
        processed_df['S%N'] = processed_df['Province'].apply(get_mean_for_region,args=('S%N',climate_df))
        processed_df['P'] = processed_df['Province'].apply(get_mean_for_region,args=('P',climate_df))
        processed_df['P%N'] = processed_df['Province'].apply(get_mean_for_region,args=('P%N',climate_df))
        processed_df['S_G'] = processed_df['Province'].apply(get_mean_for_region,args=('S_G',climate_df))
        processed_df['Pd'] = processed_df['Province'].apply(get_mean_for_region,args=('Pd',climate_df))
        processed_df['BS'] = processed_df['Province'].apply(get_mean_for_region,args=('BS',climate_df))
        processed_df['BS%'] = processed_df['Province'].apply(get_mean_for_region,args=('BS%',climate_df))
        processed_df['HDD'] = processed_df['Province'].apply(get_mean_for_region,args=('HDD',climate_df))
        processed_df['CDD'] = processed_df['Province'].apply(get_mean_for_region,args=('CDD',climate_df))
        processed_df['Month'] = f'{year}-{month}'

        cols = processed_df.columns.tolist()
        cols = cols[-1:] + cols[:-1]

        processed_df = processed_df[cols] # move month to start of list
        all_processed_df = pd.concat([all_processed_df,processed_df])        

all_processed_df.head()

Processing 01 2001
Processing 02 2001
Processing 03 2001
Processing 04 2001
Processing 05 2001
Processing 06 2001
Processing 07 2001
Processing 08 2001
Processing 09 2001
Processing 10 2001
Processing 11 2001
Processing 12 2001
Processing 01 2002
Processing 02 2002
Processing 03 2002
Processing 04 2002
Processing 05 2002
Processing 06 2002
Processing 07 2002
Processing 08 2002
Processing 09 2002
Processing 10 2002
Processing 11 2002
Processing 12 2002
Processing 01 2003
Processing 02 2003
Processing 03 2003
Processing 04 2003
Processing 05 2003
Processing 06 2003
Processing 07 2003
Processing 08 2003
Processing 09 2003
Processing 10 2003
Processing 11 2003
Processing 12 2003
Processing 01 2004
Processing 02 2004
Processing 03 2004
Processing 04 2004
Processing 05 2004
Processing 06 2004
Processing 07 2004
Processing 08 2004
Processing 09 2004
Processing 10 2004
Processing 11 2004
Processing 12 2004
Processing 01 2005
Processing 02 2005
Processing 03 2005
Processing 04 2005
Processing 0

Unnamed: 0,Month,Province,Tm,Tn,Tx,D,S,S%N,P,P%N,S_G,Pd,BS,BS%,HDD,CDD
0,2001-01,BC,0.351884,-8.668497,8.602305,1.8915,18.861538,36.088106,107.797911,63.116071,19.377926,11.515625,47.826667,92.857143,519.116812,0.0
1,2001-01,AB,-4.457322,-18.166109,8.743096,6.969307,4.954146,16.724138,5.833195,22.02521,6.722222,1.784232,105.425,117.428571,683.53431,0.0
2,2001-01,SK,-8.665789,-24.729032,4.388312,6.291262,6.587407,34.563636,7.125,35.327586,28.449541,2.68125,116.233333,118.5,824.227632,0.0
3,2001-01,MB,-12.049573,-29.186325,2.558974,5.515873,10.712821,43.716049,10.385294,45.918605,41.987179,3.875,80.857143,84.571429,931.437607,0.0
4,2001-01,ON,-7.284188,-24.331624,2.815385,1.8008,42.97641,87.454545,46.653304,72.245902,35.673469,10.585903,61.79375,76.538462,756.844017,0.0


In [62]:
# generate Climate_ID surrogate key 
all_processed_df.insert(0, 'Climate_ID', range(1000, 1000 + len(all_processed_df)))

In [63]:
# rename columns
renamed_columns = {
    'Tm': 'Mean_Temp', 
    'Tn': 'Min_Temp',
    'Tx': 'Max_Temp',
    'D': 'Mean_Diff_from_Normal',
    'S': 'Snowfall',
    'S%N': 'Percent_of_Normal_Snowfall',
    'P': 'Total_Precip',
    'P%N': 'Percent_of_Normal_Precip',
    'S_G': 'Snow_on_Ground',
    'Pd': 'Days_with_Precip',
    'BS': 'Bright_Sunshine_Days',
    'BS%': 'Percent_of_Normal_Bright_Sunshine',
    'HDD': 'Heating_Degree_Days',
    'CDD': 'Cooling_Degree_Days',
    'Heating Degree Days': 'Heating_Degree_Days',
    'Cooling Degree Days': 'Cooling_Degree_Days',
    'Percent of Normal Snowfall': 'Percent_of_Normal_Snowfall',
    'Percent of Normal Precip': 'Percent_of_Normal_Precip',
}


all_processed_df.rename(columns=renamed_columns, inplace=True)

In [64]:
all_processed_df.head()

Unnamed: 0,Climate_ID,Month,Province,Mean_Temp,Min_Temp,Max_Temp,Mean_Diff_from_Normal,Snowfall,Percent_of_Normal_Snowfall,Total_Precip,Percent_of_Normal_Precip,Snow_on_Ground,Days_with_Precip,Bright_Sunshine_Days,Percent_of_Normal_Bright_Sunshine,Heating_Degree_Days,Cooling_Degree_Days
0,1000,2001-01,BC,0.351884,-8.668497,8.602305,1.8915,18.861538,36.088106,107.797911,63.116071,19.377926,11.515625,47.826667,92.857143,519.116812,0.0
1,1001,2001-01,AB,-4.457322,-18.166109,8.743096,6.969307,4.954146,16.724138,5.833195,22.02521,6.722222,1.784232,105.425,117.428571,683.53431,0.0
2,1002,2001-01,SK,-8.665789,-24.729032,4.388312,6.291262,6.587407,34.563636,7.125,35.327586,28.449541,2.68125,116.233333,118.5,824.227632,0.0
3,1003,2001-01,MB,-12.049573,-29.186325,2.558974,5.515873,10.712821,43.716049,10.385294,45.918605,41.987179,3.875,80.857143,84.571429,931.437607,0.0
4,1004,2001-01,ON,-7.284188,-24.331624,2.815385,1.8008,42.97641,87.454545,46.653304,72.245902,35.673469,10.585903,61.79375,76.538462,756.844017,0.0


In [65]:

all_processed_df.isnull().sum(axis = 0)

# missing Percent_of_Normal_Snowfall are set to 100 as they occur in months of July and August where normal Snowfall is 0 

Climate_ID                              0
Month                                   0
Province                                0
Mean_Temp                               0
Min_Temp                                0
Max_Temp                                0
Mean_Diff_from_Normal                   0
Snowfall                                0
Percent_of_Normal_Snowfall            133
Total_Precip                            0
Percent_of_Normal_Precip                0
Snow_on_Ground                          0
Days_with_Precip                        0
Bright_Sunshine_Days                  970
Percent_of_Normal_Bright_Sunshine    1034
Heating_Degree_Days                     0
Cooling_Degree_Days                     0
dtype: int64

In [66]:
len(all_processed_df)

1656

In [67]:
pd.set_option('display.max_rows', 10)
all_processed_df[all_processed_df['Percent_of_Normal_Snowfall'].isnull()]


Unnamed: 0,Climate_ID,Month,Province,Mean_Temp,Min_Temp,Max_Temp,Mean_Diff_from_Normal,Snowfall,Percent_of_Normal_Snowfall,Total_Precip,Percent_of_Normal_Precip,Snow_on_Ground,Days_with_Precip,Bright_Sunshine_Days,Percent_of_Normal_Bright_Sunshine,Heating_Degree_Days,Cooling_Degree_Days
4,1046,2001-08,ON,20.705882,7.802092,34.581933,1.862097,0.0,,67.999200,83.704000,0.000000,8.052000,243.155556,111.083333,13.444958,96.027311
4,1118,2002-08,ON,19.875688,8.042466,31.880734,1.023009,0.0,,48.264192,58.447368,0.000000,6.126638,255.264286,112.555556,19.259174,76.976606
4,1190,2003-08,ON,20.123474,6.841784,30.916432,1.274074,0.0,,78.631416,95.574074,0.000000,8.477876,90.450000,,19.194366,83.791549
4,1262,2004-08,ON,17.163158,5.175120,28.348571,-1.578846,0.0,,73.681818,89.596154,0.000000,9.590909,242.300000,94.166667,59.804785,33.696651
4,1334,2005-08,ON,20.108738,8.021256,31.905825,1.405208,0.0,,80.449774,106.673684,0.000000,8.692308,246.500000,103.333333,17.764078,82.006796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,2625,2023-07,QC,18.068889,7.993333,30.870476,1.800000,0.0,,147.157055,179.416667,0.000000,12.564417,,,47.913333,51.867407
0,2626,2023-08,BC,18.361321,7.709906,31.405140,1.716949,0.0,,32.328934,66.666667,0.012821,4.497462,,,34.571698,44.233962
3,2629,2023-08,MB,17.606452,5.904762,29.788710,0.550000,0.0,,46.860000,69.750000,0.000000,6.800000,,,40.701613,30.403226
4,2630,2023-08,ON,17.664634,5.695122,28.050000,-0.656000,0.0,,84.843506,117.625000,0.000000,8.974026,,,39.761585,29.334146


In [68]:
all_processed_df[all_processed_df['Bright_Sunshine_Days'].isnull()][['Month','Province','Bright_Sunshine_Days','Percent_of_Normal_Bright_Sunshine']]

Unnamed: 0,Month,Province,Bright_Sunshine_Days,Percent_of_Normal_Bright_Sunshine
3,2006-04,MB,,
3,2006-05,MB,,
3,2006-06,MB,,
3,2006-07,MB,,
3,2006-10,MB,,
...,...,...,...,...
1,2023-12,AB,,
2,2023-12,SK,,
3,2023-12,MB,,
4,2023-12,ON,,


In [69]:
# Handle nan values .
# missing Percent_of_Normal_Snowfall are set to 0 as they occur in months of July and August where Snowfall is 0 
all_processed_df['Percent_of_Normal_Snowfall'].fillna(0,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_processed_df['Percent_of_Normal_Snowfall'].fillna(0,inplace=True)


In [70]:
# dropping attributes with large number of missing values 
all_processed_df.drop(['Bright_Sunshine_Days','Percent_of_Normal_Bright_Sunshine'],axis=1,inplace=True)

In [71]:
all_processed_df.reset_index(inplace=True)
all_processed_df.head(20)

Unnamed: 0,index,Climate_ID,Month,Province,Mean_Temp,Min_Temp,Max_Temp,Mean_Diff_from_Normal,Snowfall,Percent_of_Normal_Snowfall,Total_Precip,Percent_of_Normal_Precip,Snow_on_Ground,Days_with_Precip,Heating_Degree_Days,Cooling_Degree_Days
0,0,1000,2001-01,BC,0.351884,-8.668497,8.602305,1.891500,18.861538,36.088106,107.797911,63.116071,19.377926,11.515625,519.116812,0.000000
1,1,1001,2001-01,AB,-4.457322,-18.166109,8.743096,6.969307,4.954146,16.724138,5.833195,22.025210,6.722222,1.784232,683.534310,0.000000
2,2,1002,2001-01,SK,-8.665789,-24.729032,4.388312,6.291262,6.587407,34.563636,7.125000,35.327586,28.449541,2.681250,824.227632,0.000000
3,3,1003,2001-01,MB,-12.049573,-29.186325,2.558974,5.515873,10.712821,43.716049,10.385294,45.918605,41.987179,3.875000,931.437607,0.000000
4,4,1004,2001-01,ON,-7.284188,-24.331624,2.815385,1.800800,42.976410,87.454545,46.653304,72.245902,35.673469,10.585903,756.844017,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,3,1015,2001-03,MB,-6.758261,-23.648696,5.978261,0.433871,19.719328,113.679012,21.252113,93.023256,29.957746,4.183099,763.560000,0.000000
16,4,1016,2001-03,ON,-2.883333,-18.515385,9.482479,-0.541270,37.885427,140.604839,42.961373,76.360000,20.985714,8.446352,628.831624,0.000000
17,5,1017,2001-03,QC,-5.451273,-27.953091,8.161455,-0.596875,59.890050,164.954955,60.131873,98.027027,64.041026,8.406375,707.307636,0.000000
18,0,1018,2001-04,BC,6.320000,-4.087572,19.440000,-0.707960,6.649401,70.548718,77.856555,94.172566,3.100000,10.442159,325.337391,0.002029


In [74]:
all_processed_df.drop('index',axis=1,inplace=True)
all_processed_df.head(20)

Unnamed: 0,Climate_ID,Month,Province,Mean_Temp,Min_Temp,Max_Temp,Mean_Diff_from_Normal,Snowfall,Percent_of_Normal_Snowfall,Total_Precip,Percent_of_Normal_Precip,Snow_on_Ground,Days_with_Precip,Heating_Degree_Days,Cooling_Degree_Days
0,1000,2001-01,BC,0.351884,-8.668497,8.602305,1.8915,18.861538,36.088106,107.797911,63.116071,19.377926,11.515625,519.116812,0.0
1,1001,2001-01,AB,-4.457322,-18.166109,8.743096,6.969307,4.954146,16.724138,5.833195,22.02521,6.722222,1.784232,683.53431,0.0
2,1002,2001-01,SK,-8.665789,-24.729032,4.388312,6.291262,6.587407,34.563636,7.125,35.327586,28.449541,2.68125,824.227632,0.0
3,1003,2001-01,MB,-12.049573,-29.186325,2.558974,5.515873,10.712821,43.716049,10.385294,45.918605,41.987179,3.875,931.437607,0.0
4,1004,2001-01,ON,-7.284188,-24.331624,2.815385,1.8008,42.97641,87.454545,46.653304,72.245902,35.673469,10.585903,756.844017,0.0
5,1005,2001-01,QC,-11.848165,-27.704128,-0.005505,0.778947,46.938514,86.672269,43.582915,62.791667,44.892617,11.251256,904.394037,0.0
6,1006,2001-02,BC,-1.307781,-13.054441,9.0,-1.39799,15.914201,95.544248,47.26276,52.769912,18.104235,7.651042,526.445533,0.0
7,1007,2001-02,AB,-12.235021,-30.208439,8.502532,-3.784694,16.21746,92.045045,15.730332,91.123894,12.285714,4.559242,829.360759,0.0
8,1008,2001-02,SK,-17.148026,-34.496753,3.194156,-5.204902,11.06015,91.376147,10.78141,87.426087,32.785047,4.025641,983.901974,0.0
9,1009,2001-02,MB,-18.767521,-35.649573,-1.538462,-4.688889,13.370435,88.353659,12.841667,81.62069,47.30137,4.681818,1016.544444,0.0


In [75]:
pd.set_option('display.max_rows', 2000)
all_processed_df.isnull().sum(axis = 0)

Climate_ID                    0
Month                         0
Province                      0
Mean_Temp                     0
Min_Temp                      0
Max_Temp                      0
Mean_Diff_from_Normal         0
Snowfall                      0
Percent_of_Normal_Snowfall    0
Total_Precip                  0
Percent_of_Normal_Precip      0
Snow_on_Ground                0
Days_with_Precip              0
Heating_Degree_Days           0
Cooling_Degree_Days           0
dtype: int64

In [76]:
all_processed_df.to_csv('datasets/climate_data/climate_dimension.csv',index=False)