In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import operator
import math
import preprocessor
import base_preprocessor

from preprocessor import prepare_data, get_features
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Pre-split train/test partitions come straight from the preprocessor module
(X_train, X_test, y_train, y_test) = prepare_data()

# Feature names; used further down as the index of the coefficient tables
features = get_features()

Train data(range):
2019-04-11 21:49:48
2019-06-18 23:27:30
Test data(range):
2019-06-18 23:27:49
2019-08-22 19:05:30


## Linear Regression Model(s)

In [2]:
# Ordinary least-squares baseline (no regularization); fit() returns the
# estimator itself, so construction and fitting are chained.
lm = LinearRegression().fit(X_train, y_train)

# Predictions for both splits
y_pred_train, y_pred_test = lm.predict(X_train), lm.predict(X_test)

# Upper and lower bounds around the actual response values.
# NOTE(review): the second argument (5) is presumably a tolerance width —
# confirm its meaning/units in preprocessor.
y_train_upper_bound, y_train_lower_bound = (
    preprocessor.get_upper_bounds(y_train, 5),
    preprocessor.get_lower_bounds(y_train, 5),
)
y_test_upper_bound, y_test_lower_bound = (
    preprocessor.get_upper_bounds(y_test, 5),
    preprocessor.get_lower_bounds(y_test, 5),
)

# Report MSE and R^2 for the train split first, then the test split.
# MSE / r2 end up holding the test-split values once the loop finishes.
for split_label, y_true, y_hat in (
    ("Train Scores:", y_train, y_pred_train),
    ("Test Scores:", y_test, y_pred_test),
):
    MSE = mean_squared_error(y_true, y_hat)
    r2 = r2_score(y_true, y_hat)
    print(split_label)
    print(MSE)
    print(r2)

# One coefficient per feature; coef_[0] takes the first (only expected) row
# of the coefficient matrix — assumes y_train is a single-column 2-D target.
coef_df = pd.DataFrame(data=lm.coef_[0], index=features, columns=['Coefficient'])
coef_df

Train Scores:
104.68806297902465
0.8459680372939303
Test Scores:
102.92700806192381
0.8082335378078275


Unnamed: 0,Coefficient
travel_mins,-37.74485
originpop,-2551987000000.0
destinationpop,-9548511000000.0
days_to_holiday,1.499004
days_from_holiday,-3.537871
distance,-19271920000000.0
month,1.246316
date,1.725967
hour,0.5572149
minute,-2.899624


In [3]:
# Compute accuracy metrics for model comparison.
# A prediction scores 1 when it lies inside [lower bound, upper bound] of the
# actual price (both ends inclusive) and 0 otherwise.
#
# This is vectorized with NumPy rather than looping over rows with
# `.iloc[i].price`, which pays pandas indexing overhead on every row and is
# very slow on large splits. The resulting 0/1 lists are unchanged.
#
# np.ravel flattens the predictions in case predict() returned an (n, 1)
# column (presumably because the target is a one-column DataFrame — confirm).
train_pred_values = np.ravel(y_pred_train)
train_in_bounds = (
    (y_train_lower_bound["price"].to_numpy() <= train_pred_values)
    & (train_pred_values <= y_train_upper_bound["price"].to_numpy())
)
train_preds_acc = train_in_bounds.astype(int).tolist()

test_pred_values = np.ravel(y_pred_test)
test_in_bounds = (
    (y_test_lower_bound["price"].to_numpy() <= test_pred_values)
    & (test_pred_values <= y_test_upper_bound["price"].to_numpy())
)
test_preds_acc = test_in_bounds.astype(int).tolist()

In [4]:
# Fraction of predictions inside the bounds: train on the first line, test on the second
print(np.mean(train_preds_acc), np.mean(test_preds_acc), sep="\n")

0.3204266666666667
0.32064


In [3]:
# Linear model with L2 (Ridge) regularization, alpha = 0.05; fit() returns
# the estimator itself, so construction and fitting are chained.
rr = Ridge(alpha=0.05).fit(X_train, y_train)

# Predictions for both splits
y_pred_train, y_pred_test = rr.predict(X_train), rr.predict(X_test)

# Upper and lower bounds around the actual response values.
# NOTE(review): the second argument (5) is presumably a tolerance width —
# confirm its meaning/units in preprocessor.
y_train_upper_bound, y_train_lower_bound = (
    preprocessor.get_upper_bounds(y_train, 5),
    preprocessor.get_lower_bounds(y_train, 5),
)
y_test_upper_bound, y_test_lower_bound = (
    preprocessor.get_upper_bounds(y_test, 5),
    preprocessor.get_lower_bounds(y_test, 5),
)

# Report MSE and R^2 for the train split first, then the test split.
# MSE / r2 end up holding the test-split values once the loop finishes.
for split_label, y_true, y_hat in (
    ("Train Scores:", y_train, y_pred_train),
    ("Test Scores:", y_test, y_pred_test),
):
    MSE = mean_squared_error(y_true, y_hat)
    r2 = r2_score(y_true, y_hat)
    print(split_label)
    print(MSE)
    print(r2)

# One coefficient per feature; coef_[0] takes the first (only expected) row
# of the coefficient matrix — assumes y_train is a single-column 2-D target.
coef_df = pd.DataFrame(data=rr.coef_[0], index=features, columns=['Coefficient'])
coef_df

Train Scores:
104.05014617080548
0.846967068394818
Test Scores:
102.05351099113891
0.8071526863479617


Unnamed: 0,Coefficient
travel_mins,-38.810866
originpop,7.716373
destinationpop,8.141137
days_to_holiday,1.35615
days_from_holiday,-3.774059
distance,21.111043
month,1.139687
date,1.598039
hour,0.931256
minute,-2.923148


In [4]:
# Compute accuracy metrics for model comparison.
# A prediction scores 1 when it lies inside [lower bound, upper bound] of the
# actual price (both ends inclusive) and 0 otherwise.
#
# This is vectorized with NumPy rather than looping over rows with
# `.iloc[i].price`, which pays pandas indexing overhead on every row and is
# very slow on large splits. The resulting 0/1 lists are unchanged.
#
# np.ravel flattens the predictions in case predict() returned an (n, 1)
# column (presumably because the target is a one-column DataFrame — confirm).
train_pred_values = np.ravel(y_pred_train)
train_in_bounds = (
    (y_train_lower_bound["price"].to_numpy() <= train_pred_values)
    & (train_pred_values <= y_train_upper_bound["price"].to_numpy())
)
train_preds_acc = train_in_bounds.astype(int).tolist()

test_pred_values = np.ravel(y_pred_test)
test_in_bounds = (
    (y_test_lower_bound["price"].to_numpy() <= test_pred_values)
    & (test_pred_values <= y_test_upper_bound["price"].to_numpy())
)
test_preds_acc = test_in_bounds.astype(int).tolist()

In [5]:
# Fraction of predictions inside the bounds: train on the first line, test on the second
print(np.mean(train_preds_acc), np.mean(test_preds_acc), sep="\n")

0.3033125
0.26535


In [6]:
# Linear model with L1 (Lasso) regularization; fit() returns the estimator
# itself, so construction and fitting are chained.
lso = Lasso(alpha=0.10, tol=0.001).fit(X_train, y_train)

# Predictions for both splits
y_pred_train, y_pred_test = lso.predict(X_train), lso.predict(X_test)

# Report MSE and R^2 for the train split first, then the test split.
# MSE / r2 end up holding the test-split values once the loop finishes.
for split_label, y_true, y_hat in (
    ("Train Scores:", y_train, y_pred_train),
    ("Test Scores:", y_test, y_pred_test),
):
    MSE = mean_squared_error(y_true, y_hat)
    r2 = r2_score(y_true, y_hat)
    print(split_label)
    print(MSE)
    print(r2)

# One coefficient per feature; unlike the cells that index coef_[0], the
# Lasso coefficients are passed to the DataFrame as-is.
coef_df = pd.DataFrame(data=lso.coef_, index=features, columns=['Coefficient'])
coef_df

Train Scores:
117.11266072455227
0.8230154790800928
Test Scores:
115.0506710009415
0.8193966912524189


Unnamed: 0,Coefficient
month,-5.41161
date,0.0
hour,-0.0
minute,-1.394424
origin_BARCELONA,30.377277
origin_MADRID,-0.0
origin_PONFERRADA,-5.502852
origin_SEVILLA,0.004431
origin_VALENCIA,-7.046778
destination_BARCELONA,30.45717
