In [8]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error  

In [2]:
%store -r dates
%store -r snow_water_equivalent
%store -r change_snow_water_equivalent
%store -r snow_depth
%store -r change_snow_depth
%store -r air_temp
%store -r discharge
%store -r stage
%store -r df_CO

In [3]:
# check if dates align (manually looking for now, how to refer to 'datetimeUTC' in target dataframe?)
print(discharge) #note: as of now, this is the 75th street station in Boulder
df_CO['Date']

                           USGS:06730200:00060:00003
datetimeUTC                                         
2011-04-20 00:00:00+00:00                       17.9
2011-04-21 00:00:00+00:00                       23.4
2011-04-22 00:00:00+00:00                       26.2
2011-04-23 00:00:00+00:00                       23.9
2011-04-24 00:00:00+00:00                       23.9
...                                              ...
2021-04-15 00:00:00+00:00                       14.7
2021-04-16 00:00:00+00:00                       31.8
2021-04-17 00:00:00+00:00                       23.6
2021-04-18 00:00:00+00:00                       18.0
2021-04-19 00:00:00+00:00                       23.9

[3653 rows x 1 columns]


0       2011-04-21
1       2011-04-22
2       2011-04-23
3       2011-04-24
4       2011-04-25
           ...    
3648    2021-04-16
3649    2021-04-17
3650    2021-04-18
3651    2021-04-19
3652    2021-04-20
Name: Date, Length: 3653, dtype: object

In [4]:
# assign data and drop dates

X = df_CO.drop(columns = ['Date'], axis = 1)
Y = discharge
#print(X.tail(3))

In [6]:
# splitting train and test data

length = len(X)

split = int(length * 0.80)
train_x = X[0:split]
test_x = X[split:length]
train_y = Y[0:split]
test_y = Y[split:length]
print('Observations: x = %d and y = %d' % (length, len(Y)))
print('Training Observations: %d' % (len(train_x)))
print('Testing Observations: %d' % (len(test_x)))

Observations: x = 3653 and y = 3653
Training Observations: 2922
Testing Observations: 731


In [9]:
# shape of the dataset
print('\nShape of training data :',train_x.shape)
print('\nShape of testing data :',test_x.shape)
print('\nTraining model with {} dimensions.'.format(train_x.shape[1]))

# create object of model
model = LinearRegression()

# fit the model with the training data
model.fit(train_x,train_y)

# predict the target on the train dataset
predict_train = model.predict(train_x)

# Accuray Score on train dataset
rmse_train = mean_squared_error(train_y,predict_train)**(0.5)
print('\nRMSE on train dataset : ', rmse_train)

# predict the target on the test dataset
predict_test = model.predict(test_x)

# Accuracy Score on test dataset
rmse_test = mean_squared_error(test_y,predict_test)**(0.5)
print('\nRMSE on test dataset : ', rmse_test)


Shape of training data : (2922, 530)

Shape of testing data : (731, 530)

Training model with 530 dimensions.

RMSE on train dataset :  150.19915998444506

RMSE on test dataset :  187.32503158699626


In [10]:
# create the PCA (Principal Component Analysis) model
# reduce the dimensions of the data to d

#Some parameters to possibly add are : svd_solver, iterated_power??
#https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

d=200
model_pca = PCA(n_components=d)

new_train = model_pca.fit_transform(train_x)
new_test  = model_pca.fit_transform(test_x)

print('\nTraining model with {} dimensions.'.format(new_train.shape[1]))

# create object of model
model_new = LinearRegression()

# fit the model with the training data
model_new.fit(new_train,train_y)

# predict the target on the new train dataset
predict_train_pca = model_new.predict(new_train)

# Accuray Score on train dataset
rmse_train_pca = mean_squared_error(train_y,predict_train_pca)**(0.5)
print('\nRMSE on new train dataset : ', rmse_train_pca)

# predict the target on the new test dataset
predict_test_pca = model_new.predict(new_test)

# Accuracy Score on test dataset
rmse_test_pca = mean_squared_error(test_y,predict_test_pca)**(0.5)
print('\nRMSE on new test dataset : ', rmse_test_pca)


Training model with 200 dimensions.

RMSE on new train dataset :  163.66862029109282

RMSE on new test dataset :  143.1307160661241
