In [24]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error  

In [25]:
%store -r dates
%store -r snow_water_equivalent
%store -r change_snow_water_equivalent
%store -r snow_depth
%store -r change_snow_depth
%store -r air_temp
%store -r discharge
%store -r stage
%store -r CO_df

In [26]:
# splitting train and test data

split = int(len(stage) * 0.80)
train_x = CO_df[0:split]
test_x = CO_df[split:len(stage)]
train_y = stage[0:split]
test_y = stage[split:len(stage)]
print('Observations: x = %d and y = %d' % (len(stage), len(CO_df)))
print('Training Observations: %d' % (len(train_x)))
print('Testing Observations: %d' % (len(test_x)))

Observations: x = 3650 and y = 3650
Training Observations: 2920
Testing Observations: 730


In [27]:
# drop any labels?
#train_data = CO_stations.drop(columns=['Date'],axis=1)
#test_data = CO_stations.drop(columns=['Date'],axis=1)
#print(train_data.tail(3))

# shape of the dataset
print('\nShape of training data :',train_x.shape)
print('\nShape of testing data :',test_x.shape)
print('\nTraining model with {} dimensions.'.format(train_x.shape[1]))

# create object of model
model = LinearRegression()

# fit the model with the training data
model.fit(train_x,train_y)

# predict the target on the train dataset
predict_train = model.predict(train_x)

# Accuray Score on train dataset
rmse_train = mean_squared_error(train_y,predict_train)**(0.5)
print('\nRMSE on train dataset : ', rmse_train)

# predict the target on the test dataset
predict_test = model.predict(test_x)

# Accuracy Score on test dataset
rmse_test = mean_squared_error(test_y,predict_test)**(0.5)
print('\nRMSE on test dataset : ', rmse_test)


Shape of training data : (2920, 535)

Shape of testing data : (730, 535)

Training model with 535 dimensions.

RMSE on train dataset :  150.17778019420678

RMSE on test dataset :  209.99777957234355


In [28]:
# create the PCA (Principal Component Analysis) model
# reduce the dimensions of the data to d

#Some parameters to possibly add are : svd_solver, iterated_power??
#https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

d=200
model_pca = PCA(n_components=d)

new_train = model_pca.fit_transform(train_x)
new_test  = model_pca.fit_transform(test_x)

print('\nTraining model with {} dimensions.'.format(new_train.shape[1]))

# create object of model
model_new = LinearRegression()

# fit the model with the training data
model_new.fit(new_train,train_y)

# predict the target on the new train dataset
predict_train_pca = model_new.predict(new_train)

# Accuray Score on train dataset
rmse_train_pca = mean_squared_error(train_y,predict_train_pca)**(0.5)
print('\nRMSE on new train dataset : ', rmse_train_pca)

# predict the target on the new test dataset
predict_test_pca = model_new.predict(new_test)

# Accuracy Score on test dataset
rmse_test_pca = mean_squared_error(test_y,predict_test_pca)**(0.5)
print('\nRMSE on new test dataset : ', rmse_test_pca)


Training model with 200 dimensions.

RMSE on new train dataset :  163.65757092162326

RMSE on new test dataset :  141.4364076045244
