In [96]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.layers import Dropout

In [2]:
# this csv contains all the data from canadian weather stations from 1917-2017
df = pd.read_csv('./data/canada_all_df.csv')

In [4]:
# the csv was saved with an index droping the index column
df.drop(columns='Unnamed: 0', inplace = True)


KeyError: "['Unnamed: 0'] not found in axis"

In [None]:
# makes a new column that called decade. The value reflects how many decades the after 1910 the data was recorded  

dec ={'1910':0, '1920':1, '1930':2, '1940':3, '1950':4, '1960':5, '1970':6, '1980':7,
       '1990':8, '2000':9, '2010':10}
df['decade'] = df['Year']//10 *10
df['decade'] = df['decade'].astype(str)
df['decade']=   df['decade'].map(dec)

In [122]:
# shows the baseline of each decade
df['decade'].value_counts(normalize=True)

7     0.148362
6     0.145253
8     0.131075
5     0.123600
9     0.099980
4     0.098478
3     0.079141
2     0.063546
1     0.049516
10    0.049088
0     0.011960
Name: decade, dtype: float64

In [6]:
# Makes the dataframe contain only stations with more than 30 years worth of observations



# https://stackoverflow.com/questions/29836836/how-do-i-filter-a-pandas-dataframe-based-on-value-counts


df = df.groupby("Clim_ID").filter(lambda x: len(x) > 360)

In [7]:
# sets the features for the model to be all numerical values in our dataset, and sets out target to what decade the data came from

X= df.drop(columns= ['Stn_Name', 'Prov', 'Clim_ID', 'decade', 'Year'])
y = df['decade']

X_train, X_test,y_train,y_test = train_test_split(X,y, stratify=y)

In [8]:
# scales the data
sc = StandardScaler()

X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [10]:
# did a quick randomforest model with max depth of 10
rf =RandomForestClassifier(max_depth=10)
rf.fit(X_train_sc,y_train)
rf.score(X_train_sc,y_train), rf.score(X_test_sc,y_test)

(0.2388341578227841, 0.22741238372639294)

In [23]:
# made a random forest model with a max depth of 100
# the score 
rf =RandomForestClassifier(max_depth=100)
rf.fit(X_train_sc,y_train)
rf.score(X_train_sc,y_train), rf.score(X_test_sc,y_test)

(0.9999874721450976, 0.41590247110777034)

In [62]:
# sets X and y and transform y into an arry to show which decade its in.

X= df.drop(columns= ['Stn_Name', 'Prov', 'Clim_ID', 'decade', 'Year'])
y = df['decade']

y = to_categorical(y)

# splits X and y into train and test
X_train, X_test,y_train,y_test = train_test_split(X,y, stratify=y)


# scales X and y
sc = StandardScaler()

X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [71]:
# creates a neural nets  model
nn= Sequential()

#  2 hidden layers
nn.add(Dense(32, input_dim=X_train_sc.shape[1], activation='relu'))
nn.add(Dense(32, activation='relu'))

# output layer
nn.add(Dense(11, activation='softmax'))

# compiles model
nn.compile(loss ='categorical_crossentropy', optimizer= 'adam', metrics=['acc'])

In [72]:
# fits our model
results = nn.fit(X_train_sc, y_train,
                    batch_size=32,
                    epochs=10,
                    validation_data=(X_test_sc,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [79]:
# let's create our model
nn= Sequential()
# hidden layers
nn.add(Dense(512, input_dim=X_train_sc.shape[1], activation='relu'))
nn.add(Dense(512, activation='relu'))
nn.add(Dense(512, activation='relu'))

# output layer
nn.add(Dense(11, activation='softmax'))


#compiles model
nn.compile(loss ='categorical_crossentropy', optimizer= 'adam', metrics=['acc'])

results = nn.fit(X_train_sc, y_train,
                    batch_size=256,
                    epochs=16,
                    validation_data=(X_test_sc,y_test))

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [80]:
# let's create our model
nn= Sequential()
# hidden layers
nn.add(Dense(512, input_dim=X_train_sc.shape[1], activation='relu'))
nn.add(Dense(512, activation='relu'))
nn.add(Dense(512, activation='relu'))
nn.add(Dense(512, activation='relu'))

# output layer
nn.add(Dense(11, activation='softmax'))

# compiles model
nn.compile(loss ='categorical_crossentropy', optimizer= 'adam', metrics=['acc'])

results = nn.fit(X_train_sc, y_train,
                    batch_size=256,
                    epochs=16,
                    validation_data=(X_test_sc,y_test))

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [86]:
# transform the data with PCA
pca=PCA()
pca.fit(X_train_sc)
Z_train =pca.transform(X_train_sc)
Z_test = pca.transform(X_test_sc)

In [123]:
# transform the data with PCA

pca=PCA(n_components=10)
pca.fit(X_train_sc)
Z_train =pca.transform(X_train_sc)
Z_test = pca.transform(X_test_sc)
var_exp =pca.explained_variance_ratio_
print(f'Explained variance for each component): {var_exp}')

Explained variance for each component): [0.29250123 0.24555788 0.13175641 0.08162141 0.05747355 0.05360277
 0.04149689 0.03025271 0.02440467 0.01465115]


In [90]:
# Makes a neural net model with the PCA transformed data
nn= Sequential()
# hidden layers
nn.add(Dense(512, input_dim=Z_train.shape[1], activation='relu'))
nn.add(Dense(512, activation='relu'))
nn.add(Dense(512, activation='relu'))


# output layer
nn.add(Dense(11, activation='softmax'))

# compiles model
nn.compile(loss ='categorical_crossentropy', optimizer= 'adam', metrics=['acc'])

results1 = nn.fit(Z_train, y_train,
                    batch_size=256,
                    epochs=25,
                    validation_data=(Z_test,y_test))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [99]:
# just using features that record temp
X= df[['Tx','Tn','Tm']]
y = df['decade']

y = to_categorical(y)
X_train_t, X_test_t,y_train_t,y_test_t = train_test_split(X,y, stratify=y)

sc_t = StandardScaler()

X_train_t_sc = sc_t.fit_transform(X_train_t)
X_test_t_sc = sc_t.transform(X_test_t)

In [100]:
# creates a model that uses features that record temp
nn_t= Sequential()
# 1st hidden layer
nn_t.add(Dense(512, input_dim=X_train_t_sc.shape[1], activation='relu'))
nn_t.add(Dense(512, activation='relu'))
nn_t.add(Dense(512, activation='relu'))

# output layer
nn_t.add(Dense(11, activation='softmax'))

nn_t.compile(loss ='categorical_crossentropy', optimizer= 'adam', metrics=['acc'])

results_t = nn_t.fit(X_train_t_sc, y_train_t,
                    batch_size=256,
                    epochs=16,
                    validation_data=(X_test_t_sc,y_test_t))

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [101]:
# create a model using dropout
nn_dropout= Sequential()
# hidden layer
nn_dropout.add(Dense(1024, input_dim=X_train_sc.shape[1], activation='relu'))

nn_dropout.add(Dropout(.5))

nn_dropout.add(Dense(1024, activation='relu'))

nn_dropout.add(Dropout(.5))

nn_dropout.add(Dense(1024, activation='relu'))

nn_dropout.add(Dropout(.5))
# output layer
nn_dropout.add(Dense(11, activation='softmax'))

nn_dropout.compile(loss ='categorical_crossentropy', optimizer= 'adam', metrics=['acc'])

results1 = nn_dropout.fit(X_train_sc, y_train,
                    batch_size=256,
                    epochs=16,
                    validation_data=(X_test_sc,y_test))

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [111]:
# adds a new column that multiplies longitude and lattitude together
df['LxL'] = df['Long'] * df['Lat']

In [117]:
# adds the column 'LxL' to the feature used to model with
X= df.drop(columns= ['Stn_Name', 'Prov', 'Clim_ID', 'decade', 'Year'])
y = df['decade']

y = to_categorical(y)



X_train_l, X_test_l,y_train_l,y_test_l = train_test_split(X,y, stratify=y)

In [118]:
sc_l = StandardScaler()

X_train_sc_l = sc_l.fit_transform(X_train_l)
X_test_sc_l = sc_l.transform(X_test_l)

In [119]:
# creates a model using the new feature 'LxL'
nn3= Sequential()
# 1st hidden layer
nn3.add(Dense(512, input_dim=X_train_sc_l.shape[1], activation='relu'))
nn3.add(Dense(512, activation='relu'))
nn3.add(Dense(512, activation='relu'))

# output layer
nn3.add(Dense(11, activation='softmax'))

# compiles model
nn3.compile(loss ='categorical_crossentropy', optimizer= 'adam', metrics=['acc'])

results3 = nn3.fit(X_train_sc_l, y_train_l,
                    batch_size=256,
                    epochs=30,
                    validation_data=(X_test_sc_l,y_test_l))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [125]:
# creates our model using the origonal features (lacks 'lxL')
nn= Sequential()
# hidden layers
nn.add(Dense(512, input_dim=X_train_sc.shape[1], activation='relu'))
nn.add(Dense(512, activation='relu'))
nn.add(Dense(512, activation='relu'))

# output layer
nn.add(Dense(11, activation='softmax'))


#compiles model
nn.compile(loss ='categorical_crossentropy', optimizer= 'adam', metrics=['acc'])

results = nn.fit(X_train_sc, y_train,
                    batch_size=256,
                    epochs=30,
                    validation_data=(X_test_sc,y_test))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
