In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from preprocessor import prepare_data, get_features 
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Load the pre-split data: prepare_data() comes from the local `preprocessor`
# module and returns four objects, unpacked here as train/test inputs and targets.
# NOTE(review): the train/test timestamp ranges shown in this cell's output are
# presumably printed by prepare_data() and suggest a chronological split — confirm.
X_train, X_test, y_train, y_test = prepare_data()

# Load the feature names; they are used further down as the row labels of the
# Lasso coefficient table.
# NOTE(review): assumed to be in the same order as the columns of X_train — confirm.
features = get_features()

Train data(range):
2019-04-11 21:49:48
2019-06-14 19:06:40
Test data(range):
2019-06-14 19:07:10
2019-08-22 19:05:05


## Linear Regression Model(s)

In [23]:
# Baseline: ordinary least squares, no regularization penalty.
# fit() returns the estimator itself, so `lm` is the fitted model.
lm = LinearRegression().fit(X_train, y_train)

# Predict on both splits with the fitted model.
y_pred_train, y_pred_test = lm.predict(X_train), lm.predict(X_test)

# Train-split metrics: heading, then MSE, then R^2, one per line.
MSE, r2 = mean_squared_error(y_train, y_pred_train), r2_score(y_train, y_pred_train)
print("Train Scores:", MSE, r2, sep="\n")

# Test-split metrics; MSE and r2 are overwritten, so after this cell they
# hold the test-split values.
MSE, r2 = mean_squared_error(y_test, y_pred_test), r2_score(y_test, y_pred_test)
print("Test Scores:", MSE, r2, sep="\n")

0.8468011993043935
Train Scores:
103.75570367036494
0.8468011993043935
Test Scores:
105.18463105412509
0.8160356246149084


In [17]:
# Linear model with an L2 (Ridge) penalty; alpha sets the penalty strength.
# fit() returns the estimator itself, so `rr` is the fitted model.
rr = Ridge(alpha=0.02).fit(X_train, y_train)

# Predict on both splits with the fitted model.
y_pred_train, y_pred_test = rr.predict(X_train), rr.predict(X_test)

# Train-split metrics: heading, then MSE, then R^2, one per line.
MSE, r2 = mean_squared_error(y_train, y_pred_train), r2_score(y_train, y_pred_train)
print("Train Scores:", MSE, r2, sep="\n")

# Test-split metrics; MSE and r2 are overwritten, so after this cell they
# hold the test-split values.
MSE, r2 = mean_squared_error(y_test, y_pred_test), r2_score(y_test, y_pred_test)
print("Test Scores:", MSE, r2, sep="\n")

Train Scores:
103.71239514613167
0.8468651458031181
Test Scores:
104.81110578176316
0.8166889077298651


In [6]:
# Linear Regression model (with Lasso / L1 regularization)
# alpha sets the L1 penalty strength; tol is the optimizer's convergence tolerance.
# The L1 penalty can shrink coefficients to exactly zero, so the coefficient
# table printed at the end doubles as a rough feature-selection summary.
lso = Lasso(alpha=0.1, tol=0.001)
lso.fit(X_train, y_train)

# Compute predictions
y_pred_train = lso.predict(X_train)
y_pred_test = lso.predict(X_test)

# Compute metrics (train split)
MSE = mean_squared_error(y_train, y_pred_train)
r2 = r2_score(y_train, y_pred_train)

print("Train Scores:")
print(MSE)
print(r2)

# Compute metrics (test split); MSE and r2 are overwritten with test values
MSE = mean_squared_error(y_test, y_pred_test)
r2 = r2_score(y_test, y_pred_test)

print("Test Scores:")
print(MSE)
print(r2)

# One row per feature name, holding that feature's fitted Lasso coefficient.
# `index=` is spelled out so it is clear that `features` labels the rows.
# NOTE(review): assumes `features` is ordered like the columns of X_train — confirm.
coef_df = pd.DataFrame(lso.coef_, index=features, columns=['Coefficient'])
print(coef_df)

Train Scores:
104.2247066086749
0.8461087006259587
Test Scores:
103.23472472565295
0.8194459450788575
[-4.71113821e+00  0.00000000e+00  0.00000000e+00  2.43473156e-01
 -8.12242197e-02  1.63427830e+01 -0.00000000e+00  1.14589404e-01
  0.00000000e+00 -7.37736798e-01 -8.75434856e-01 -2.03611605e+00
  8.28721969e-01  0.00000000e+00 -1.20418571e+00 -2.16899273e+00
  0.00000000e+00  1.02273392e+00 -0.00000000e+00 -8.68587672e-01
 -2.33477317e+00  0.00000000e+00 -0.00000000e+00 -4.38681611e-01
 -3.35841875e-03  9.08554934e-02  8.74378573e-01 -6.83235248e-01
  9.19977034e-01 -5.56385280e-01  0.00000000e+00  2.02260358e+00
  4.64805124e-03  5.51920223e-01  5.00044459e-01 -1.62249142e+00
 -1.10477939e-01 -5.13597684e-01  5.32336786e-02 -9.08742891e-02
 -6.76461577e-01 -2.47164051e+00  0.00000000e+00 -1.28094949e-01
 -0.00000000e+00  8.29539789e+00  0.00000000e+00  0.00000000e+00
 -3.38977618e+00  0.00000000e+00  8.42790836e-01  0.00000000e+00
  3.88170240e-01  4.66989676e+00 -1.58547027e+00  1.3