In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import operator
import math
import preprocessor
import base_preprocessor

from preprocessor import prepare_data, get_features
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Load in pre-split data.
# prepare_data() is unpacked into four objects, in this order:
# train features, test features, train response, test response.
# NOTE(review): the import cell also pulls `prepare_data` / `get_features`
# from `preprocessor`, but the calls here go through `base_preprocessor` --
# confirm which module is the intended source.
X_train, X_test, y_train, y_test = base_preprocessor.prepare_data()

# Load in features (used further down as the row labels of the coefficient tables)
features = base_preprocessor.get_features()

Train data(range):
2019-04-11 21:50:04
2019-08-22 19:06:26
Test data(range):
2019-04-11 21:51:08
2019-08-22 18:05:20


## Linear Regression Model(s)

In [2]:
# Baseline model: ordinary least squares, no regularization penalty
lm = LinearRegression().fit(X_train, y_train)

# Predictions for the train and test splits
y_pred_train, y_pred_test = lm.predict(X_train), lm.predict(X_test)

# Upper / lower bounds on the actual response values (second argument: 5).
# These are read by the accuracy-comparison cell that follows.
# NOTE(review): the meaning/units of the 5 are defined inside
# preprocessor.get_*_bounds -- confirm there.
y_train_upper_bound, y_train_lower_bound = (
    preprocessor.get_upper_bounds(y_train, 5),
    preprocessor.get_lower_bounds(y_train, 5),
)
y_test_upper_bound, y_test_lower_bound = (
    preprocessor.get_upper_bounds(y_test, 5),
    preprocessor.get_lower_bounds(y_test, 5),
)

# Train-split metrics: mean squared error, then R^2
MSE, r2 = mean_squared_error(y_train, y_pred_train), r2_score(y_train, y_pred_train)
print("Train Scores:", MSE, r2, sep="\n")

# Test-split metrics (MSE / r2 are rebound, so they hold the test values afterwards)
MSE, r2 = mean_squared_error(y_test, y_pred_test), r2_score(y_test, y_pred_test)
print("Test Scores:", MSE, r2, sep="\n")

# One coefficient per feature; coef_[0] picks the first row of coef_
# (presumably coef_ is (1, n_features) because the response is a single-column
# frame -- TODO confirm).
# NOTE(review): the recorded output shows coefficients around 1e13 on the
# origin_* / destination_* columns, which usually indicates perfectly collinear
# one-hot columns -- worth checking the encoding in the preprocessor.
coef_df = pd.DataFrame(lm.coef_[0], index=features, columns=['Coefficient'])
coef_df

Train Scores:
109.44350795704166
0.8346053560414902
Test Scores:
107.477385745875
0.8312850215268515


Unnamed: 0,Coefficient
month,-6.464719
date,0.4618538
hour,-0.7250102
minute,-3.033412
origin_BARCELONA,-16020650000000.0
origin_MADRID,-43351580000000.0
origin_PONFERRADA,-16020650000000.0
origin_SEVILLA,-16020650000000.0
origin_VALENCIA,-16020650000000.0
destination_BARCELONA,-25040280000000.0


In [3]:
# Compute accuracy metrics for model comparison
train_preds_acc = []
for i in range(len(y_train)):
    if (y_train_lower_bound.iloc[i].price <= y_pred_train[i] <= y_train_upper_bound.iloc[i].price):
        train_preds_acc.append(1)
    else:
        train_preds_acc.append(0)
        
test_preds_acc = []
for i in range(len(y_test)):
    if (y_test_lower_bound.iloc[i].price <= y_pred_test[i] <= y_test_upper_bound.iloc[i].price):
        test_preds_acc.append(1)
    else:
        test_preds_acc.append(0)

In [4]:
# Share of predictions inside their price bounds: train split first, then test split
print(np.mean(train_preds_acc), np.mean(test_preds_acc), sep="\n")

0.3204266666666667
0.32064


In [7]:
# Linear model with an L2 (Ridge) penalty, alpha=0.1
rr = Ridge(alpha=0.1).fit(X_train, y_train)

# Predictions for the train and test splits
y_pred_train, y_pred_test = rr.predict(X_train), rr.predict(X_test)

# Upper / lower bounds on the actual response values (second argument: 5).
# These are read by the accuracy-comparison cell that follows.
# NOTE(review): the meaning/units of the 5 are defined inside
# preprocessor.get_*_bounds -- confirm there.
y_train_upper_bound, y_train_lower_bound = (
    preprocessor.get_upper_bounds(y_train, 5),
    preprocessor.get_lower_bounds(y_train, 5),
)
y_test_upper_bound, y_test_lower_bound = (
    preprocessor.get_upper_bounds(y_test, 5),
    preprocessor.get_lower_bounds(y_test, 5),
)

# Train-split metrics: mean squared error, then R^2
MSE, r2 = mean_squared_error(y_train, y_pred_train), r2_score(y_train, y_pred_train)
print("Train Scores:", MSE, r2, sep="\n")

# Test-split metrics (MSE / r2 are rebound, so they hold the test values afterwards)
MSE, r2 = mean_squared_error(y_test, y_pred_test), r2_score(y_test, y_pred_test)
print("Test Scores:", MSE, r2, sep="\n")

# One coefficient per feature; coef_[0] picks the first row of coef_
# (presumably coef_ is (1, n_features) because the response is a single-column
# frame -- TODO confirm).
coef_df = pd.DataFrame(rr.coef_[0], index=features, columns=['Coefficient'])
coef_df

Train Scores:
107.89615406083588
0.8420639577670788
Test Scores:
117.30773779149654
0.7988021682142848


Unnamed: 0,Coefficient
month,-5.45787
date,-0.520288
hour,-0.152271
minute,-3.501229
origin_BARCELONA,26.050089
origin_MADRID,-0.365143
origin_PONFERRADA,-9.490526
origin_SEVILLA,-3.671067
origin_VALENCIA,-12.523353
destination_BARCELONA,27.108946


In [5]:
# Compute accuracy metrics for model comparison
train_preds_acc = []
for i in range(len(y_train)):
    if (y_train_lower_bound.iloc[i].price <= y_pred_train[i] <= y_train_upper_bound.iloc[i].price):
        train_preds_acc.append(1)
    else:
        train_preds_acc.append(0)
        
test_preds_acc = []
for i in range(len(y_test)):
    if (y_test_lower_bound.iloc[i].price <= y_pred_test[i] <= y_test_upper_bound.iloc[i].price):
        test_preds_acc.append(1)
    else:
        test_preds_acc.append(0)

In [25]:
# Share of predictions inside their price bounds: train split first, then test split
print(np.mean(train_preds_acc), np.mean(test_preds_acc), sep="\n")

0.30989333333333335
0.26512


In [6]:
# Linear model with an L1 (Lasso) penalty, alpha=0.10 and tolerance 0.001
lso = Lasso(alpha=0.10, tol=0.001).fit(X_train, y_train)

# Predictions for the train and test splits
y_pred_train, y_pred_test = lso.predict(X_train), lso.predict(X_test)

# Train-split metrics: mean squared error, then R^2
MSE, r2 = mean_squared_error(y_train, y_pred_train), r2_score(y_train, y_pred_train)
print("Train Scores:", MSE, r2, sep="\n")

# Test-split metrics (MSE / r2 are rebound, so they hold the test values afterwards)
MSE, r2 = mean_squared_error(y_test, y_pred_test), r2_score(y_test, y_pred_test)
print("Test Scores:", MSE, r2, sep="\n")

# One coefficient per feature. Unlike the two cells above, coef_ is used
# directly here rather than coef_[0].
coef_df = pd.DataFrame(lso.coef_, index=features, columns=['Coefficient'])
coef_df

Train Scores:
117.11266072455227
0.8230154790800928
Test Scores:
115.0506710009415
0.8193966912524189


Unnamed: 0,Coefficient
month,-5.41161
date,0.0
hour,-0.0
minute,-1.394424
origin_BARCELONA,30.377277
origin_MADRID,-0.0
origin_PONFERRADA,-5.502852
origin_SEVILLA,0.004431
origin_VALENCIA,-7.046778
destination_BARCELONA,30.45717
