In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import operator
import math
import preprocessor

from preprocessor import prepare_data, get_features
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Fetch the train/test partitions produced by the project's preprocessor
(X_train, X_test, y_train, y_test) = prepare_data()

# Feature names; used further down as the index of each coefficient table
features = get_features()

Train data(range):
2019-04-11 21:49:48
2019-06-14 19:06:40
Test data(range):
2019-06-14 19:07:10
2019-08-22 19:05:05


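## Linear Regression Model(s)

Three linear models are fit on the same train/test split: ordinary least squares, Ridge (L2 penalty) and Lasso (L1 penalty). Note that the unregularized fit assigns coefficients of order 1e11–1e13 to `originpop`, `destinationpop` and `distance`, whereas the regularized fits keep them small while the train/test scores stay nearly identical. This suggests those features are close to collinear, so the unregularized coefficients should not be interpreted individually.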

In [3]:
# Linear Regression model (without regularization)
# Ordinary least squares baseline; its scores are the reference point for the
# regularized models.
lm = LinearRegression()
lm.fit(X_train, y_train)

# Compute predictions
y_pred_train = lm.predict(X_train)
y_pred_test = lm.predict(X_test)


# Compute upper and lower bounds on actual response variable values
# NOTE(review): neither bound is read again in this cell; kept because other
# cells may rely on them. What the second argument (10) means is defined in
# `preprocessor` -- confirm there.
upper_bound = preprocessor.get_upper_bounds(y_train, 10)
lower_bound = preprocessor.get_lower_bounds(y_train, 10)

# Compute metrics
# Split-specific names keep the train metrics available after the test
# metrics have been computed.
mse_train = mean_squared_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)

print("Train Scores:")
print(mse_train)
print(r2_train)

mse_test = mean_squared_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)

print("Test Scores:")
print(mse_test)
print(r2_test)

# Legacy names: as before, `MSE` and `r2` end up holding the test-split values.
MSE, r2 = mse_test, r2_test

# `coef_` is (1, n_features) when the target is a single-column 2-D array and
# (n_features,) when it is 1-D. np.ravel handles both; `coef_[0]` on a 1-D
# `coef_` would yield a scalar that pandas silently broadcasts to every row.
# NOTE(review): the stored output shows coefficients of order 1e11-1e13 for
# originpop/destinationpop/distance -- likely near-collinear features; treat
# individual coefficients of this unregularized fit with caution.
coef_df = pd.DataFrame(np.ravel(lm.coef_), features, columns=['Coefficient'])
coef_df

Train Scores:
103.75570367036494
0.8468011993043935
Test Scores:
105.18463105412509
0.8160356246149084


Unnamed: 0,Coefficient
travel_mins,-5.456833
originpop,746361200000.0
destinationpop,2837532000000.0
days_to_holiday,0.3526014
days_from_holiday,-0.2405141
distance,75666630000000.0
month,0.3051612
date,0.303461
hour,0.1126601
minute,-0.827709


In [5]:
# Linear Regression model (with Ridge Regression)
# L2-penalized least squares; `alpha` is the penalty strength.
# NOTE(review): how alpha=0.02 was chosen is not shown in this notebook.
rr = Ridge(alpha=0.02)
rr.fit(X_train, y_train)

# Compute predictions
y_pred_train = rr.predict(X_train)
y_pred_test = rr.predict(X_test)

# Compute metrics
# Split-specific names keep the train metrics available after the test
# metrics have been computed.
mse_train = mean_squared_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)

print("Train Scores:")
print(mse_train)
print(r2_train)

mse_test = mean_squared_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)

print("Test Scores:")
print(mse_test)
print(r2_test)

# Legacy names: as before, `MSE` and `r2` end up holding the test-split values.
MSE, r2 = mse_test, r2_test

# `coef_` is (1, n_features) when the target is a single-column 2-D array and
# (n_features,) when it is 1-D. np.ravel handles both; `coef_[0]` on a 1-D
# `coef_` would yield a scalar that pandas silently broadcasts to every row.
coef_df = pd.DataFrame(np.ravel(rr.coef_), features, columns=['Coefficient'])
coef_df

Train Scores:
103.71239514613167
0.8468651458031181
Test Scores:
104.81110578176316
0.8166889077298651


Unnamed: 0,Coefficient
travel_mins,-5.45718
originpop,2.931526
destinationpop,3.027909
days_to_holiday,0.349106
days_from_holiday,-0.242399
distance,7.296336
month,0.284688
date,0.312428
hour,0.118838
minute,-0.827938


In [3]:
# Linear Regression model (with Lasso)
# L1-penalized fit. `fit` returns the estimator itself, so construction and
# training are chained into one statement.
lso = Lasso(alpha=0.1, tol=0.001).fit(X_train, y_train)

# Predictions on both splits
y_pred_train, y_pred_test = lso.predict(X_train), lso.predict(X_test)

# Train-split metrics, printed one value per line
MSE, r2 = (
    mean_squared_error(y_train, y_pred_train),
    r2_score(y_train, y_pred_train),
)
print("Train Scores:", MSE, r2, sep="\n")

# Test-split metrics (these are the values `MSE` and `r2` hold afterwards)
MSE, r2 = (
    mean_squared_error(y_test, y_pred_test),
    r2_score(y_test, y_pred_test),
)
print("Test Scores:", MSE, r2, sep="\n")

# One row per feature; the stored output shows several coefficients at exactly
# zero for this fit.
coef_df = pd.DataFrame(data=lso.coef_, index=features, columns=['Coefficient'])
coef_df

Train Scores:
104.2247066086749
0.8461087006259587
Test Scores:
103.23472472565295
0.8194459450788575


Unnamed: 0,Coefficient
travel_mins,-4.711138
originpop,0.0
destinationpop,0.0
days_to_holiday,0.243473
days_from_holiday,-0.081224
distance,16.342783
month,-0.0
date,0.114589
hour,0.0
minute,-0.737737
