In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import operator
import math
import preprocessor

from preprocessor import prepare_data, get_features
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Load the pre-split data. prepare_data() (from the local preprocessor module)
# returns four objects, unpacked here in the order
# (X_train, X_test, y_train, y_test).
# NOTE(review): the date ranges printed below this cell suggest the split is
# chronological (train period ends before the test period starts) — confirm
# in preprocessor.prepare_data.
X_train, X_test, y_train, y_test = prepare_data()

# Load the feature names. Later cells use them as the row index of the
# coefficient tables, so presumably they line up with the columns of
# X_train — TODO confirm in preprocessor.get_features.
features = get_features()

Train data(range):
2019-04-11 21:50:04
2019-06-14 19:34:41
Test data(range):
2019-06-14 19:38:11
2019-08-22 19:02:45


## Linear Regression Model(s)

In [21]:
# Baseline: ordinary least squares, no regularization penalty.
# fit() returns the estimator itself, so construction and fitting are chained.
lm = LinearRegression().fit(X_train, y_train)

# Predictions for both splits
y_pred_train, y_pred_test = lm.predict(X_train), lm.predict(X_test)

# Upper / lower bounds around the actual response values, as produced by the
# preprocessor helpers. The second argument (5) is passed through unchanged;
# presumably a tolerance — confirm in preprocessor.
y_train_upper_bound, y_train_lower_bound = (
    preprocessor.get_upper_bounds(y_train, 5),
    preprocessor.get_lower_bounds(y_train, 5),
)
y_test_upper_bound, y_test_lower_bound = (
    preprocessor.get_upper_bounds(y_test, 5),
    preprocessor.get_lower_bounds(y_test, 5),
)

# Training-split metrics: header, then MSE, then R^2, one per line
MSE, r2 = mean_squared_error(y_train, y_pred_train), r2_score(y_train, y_pred_train)
print("Train Scores:", MSE, r2, sep="\n")

# Test-split metrics. MSE and r2 are rebound here, so after this cell they
# hold the test values.
MSE, r2 = mean_squared_error(y_test, y_pred_test), r2_score(y_test, y_pred_test)
print("Test Scores:", MSE, r2, sep="\n")

# Coefficient table indexed by feature name. coef_[0] takes the first entry of
# the fitted coefficient array (presumably the row for the single target).
coef_df = pd.DataFrame(data=lm.coef_[0], index=features, columns=["Coefficient"])
coef_df

Unnamed: 0,values
travel_mins,0.089645
originpop,0.840395
destinationpop,1.0
days_to_holiday,0.054545
days_from_holiday,0.037736
distance,1.0
month,0.0
date,0.8
hour,0.25
minute,0.431034


In [22]:
# Compute accuracy metrics for model comparison
train_preds_acc = []
for i in range(len(y_train)):
    if (y_train_lower_bound.iloc[i].price <= y_pred_train[i] <= y_train_upper_bound.iloc[i].price):
        train_preds_acc.append(1)
    else:
        train_preds_acc.append(0)
        
test_preds_acc = []
for i in range(len(y_test)):
    if (y_test_lower_bound.iloc[i].price <= y_pred_test[i] <= y_test_upper_bound.iloc[i].price):
        test_preds_acc.append(1)
    else:
        test_preds_acc.append(0)

In [7]:
# Share of predictions inside the bounds: train first, then test, one per line
print(np.mean(train_preds_acc), np.mean(test_preds_acc), sep="\n")

0.31096
0.27584


In [23]:
# Linear model with an L2 (Ridge) penalty, alpha = 0.1.
# fit() returns the estimator itself, so construction and fitting are chained.
rr = Ridge(alpha=0.1).fit(X_train, y_train)

# Predictions for both splits
y_pred_train, y_pred_test = rr.predict(X_train), rr.predict(X_test)

# Upper / lower bounds around the actual response values, as produced by the
# preprocessor helpers. The second argument (5) is passed through unchanged;
# presumably a tolerance — confirm in preprocessor.
y_train_upper_bound, y_train_lower_bound = (
    preprocessor.get_upper_bounds(y_train, 5),
    preprocessor.get_lower_bounds(y_train, 5),
)
y_test_upper_bound, y_test_lower_bound = (
    preprocessor.get_upper_bounds(y_test, 5),
    preprocessor.get_lower_bounds(y_test, 5),
)

# Training-split metrics: header, then MSE, then R^2, one per line
MSE, r2 = mean_squared_error(y_train, y_pred_train), r2_score(y_train, y_pred_train)
print("Train Scores:", MSE, r2, sep="\n")

# Test-split metrics. MSE and r2 are rebound here, so after this cell they
# hold the test values.
MSE, r2 = mean_squared_error(y_test, y_pred_test), r2_score(y_test, y_pred_test)
print("Test Scores:", MSE, r2, sep="\n")

# Coefficient table indexed by feature name. coef_[0] takes the first entry of
# the fitted coefficient array (presumably the row for the single target).
coef_df = pd.DataFrame(data=rr.coef_[0], index=features, columns=["Coefficient"])
coef_df

Train Scores:
101.74142591646259
0.8492489256570518
Test Scores:
107.90427199672502
0.8038054558330674


Unnamed: 0,Coefficient
travel_mins,-37.30694
originpop,7.551013
destinationpop,7.99854
days_to_holiday,0.950183
days_from_holiday,-1.956525
distance,20.926199
month,1.94493
date,1.592954
hour,0.64614
minute,-3.236306


In [24]:
# Compute accuracy metrics for model comparison.
# A prediction scores 1 when it lies inside the closed interval
# [lower bound, upper bound] of its sample's price, and 0 otherwise.
#
# Vectorised: one array comparison per split instead of a Python loop that
# builds a row Series with .iloc[i] for every sample. np.ravel flattens the
# predictions so both (n,) and (n, 1) prediction arrays are handled.
# Assumes the bound frames are row-aligned with the predictions (same length
# and order) — the original loop indexed both positionally as well.
# The results stay plain lists of 0/1 ints, as before.
train_preds_acc = (
    (np.asarray(y_train_lower_bound["price"]) <= np.ravel(y_pred_train))
    & (np.ravel(y_pred_train) <= np.asarray(y_train_upper_bound["price"]))
).astype(int).tolist()

test_preds_acc = (
    (np.asarray(y_test_lower_bound["price"]) <= np.ravel(y_pred_test))
    & (np.ravel(y_pred_test) <= np.asarray(y_test_upper_bound["price"]))
).astype(int).tolist()

In [25]:
# Share of predictions inside the bounds: train first, then test, one per line
print(np.mean(train_preds_acc), np.mean(test_preds_acc), sep="\n")

0.30989333333333335
0.26512


In [16]:
# Linear model with an L1 (Lasso) penalty, alpha = 0.10 and solver tolerance 0.001.
# fit() returns the estimator itself, so construction and fitting are chained.
lso = Lasso(alpha=0.10, tol=0.001).fit(X_train, y_train)

# Predictions for both splits
y_pred_train, y_pred_test = lso.predict(X_train), lso.predict(X_test)

# Training-split metrics: header, then MSE, then R^2, one per line
MSE, r2 = mean_squared_error(y_train, y_pred_train), r2_score(y_train, y_pred_train)
print("Train Scores:", MSE, r2, sep="\n")

# Test-split metrics. MSE and r2 are rebound here, so after this cell they
# hold the test values.
MSE, r2 = mean_squared_error(y_test, y_pred_test), r2_score(y_test, y_pred_test)
print("Test Scores:", MSE, r2, sep="\n")

# Coefficient table indexed by feature name. Unlike the other model cells,
# coef_ is used directly here, without a [0] index.
coef_df = pd.DataFrame(data=lso.coef_, index=features, columns=["Coefficient"])
coef_df

Train Scores:
109.6564082259782
0.837521233855832
Test Scores:
114.75624720899619
0.79134700420239


Unnamed: 0,Coefficient
travel_mins,-18.802707
originpop,0.293206
destinationpop,0.0
days_to_holiday,0.0
days_from_holiday,-0.0
distance,25.365063
month,-0.0
date,0.0
hour,0.0
minute,-1.675469
