In [36]:
import pandas as pd
import numpy as np
import pickle
import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score
from sklearn import linear_model

In [18]:
trafic_index = dict(pd.read_csv('data/processed/Full_data_set.csv', parse_dates = [8]).Reservation_Time.dt.hour.value_counts())

df = pd.read_csv('data/processed/OctNov.csv', index_col=0, parse_dates=[1])

df = df[df.next_customer]
df.drop(columns=['prev_customer', 'next_customer'], inplace = True)

df['weekend'] = df.time.dt.weekday//5
df['hour_index'] = df.time.dt.hour.map(trafic_index)/10000

# Remove zones with too little support
df = df[~df.leave_zone.isin((df.leave_zone.value_counts() < 30).index[df.leave_zone.value_counts() < 30])]

df = pd.get_dummies(df, columns = ['engine','leave_zone'], prefix=['eng','lz'])


y = df.time_to_reservation
df.drop(columns=['time_to_reservation', 'time', 'park_location_lat', 'park_location_long', 'leave_location_lat', 'leave_location_long', 'leave_fuel', 'park_zone', 'moved', 'movedTF'], inplace = True)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2)

In [30]:
cv_select = GridSearchCV(
    estimator=linear_model.LinearRegression(), 
    param_grid={}, 
    scoring='neg_mean_squared_error', # Use MSE
    n_jobs=-2,
    return_train_score=True,
    verbose=2, 
    cv=10
)

cv_select.fit(X_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END .................................................... total time=  10.9s
[CV] END .................................................... total time=  10.9s
[CV] END .................................................... total time=  11.0s
[CV] END .................................................... total time=  11.1s
[CV] END .................................................... total time=  11.2s
[CV] END .................................................... total time=  11.2s
[CV] END .................................................... total time=  11.2s
[CV] END .................................................... total time=   4.0s
[CV] END .................................................... total time=   4.0s
[CV] END .................................................... total time=   4.0s


GridSearchCV(cv=10, estimator=LinearRegression(), n_jobs=-2, param_grid={},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=2)

In [38]:
r2_score(y_test,cv_select.predict(X_test))

0.12583861771471416

In [26]:
def cv(model, parameters, X_train, y_train, cf = 5):    
    """
    Performs paramter tunning using cross-validation on a specifed pipe object.
    """
    # perform cross validaiton over the input parameters
    cv_select = GridSearchCV(
        estimator=model, 
        param_grid=parameters, 
        scoring='neg_mean_squared_error', # Use MSE
        n_jobs=-2,
        return_train_score=True,
        verbose=2, 
        cv=cf
    )
    cv_select.fit(X_train, y_train)
    
    return(cv_select)

In [27]:
elastic_net_model = linear_model.ElasticNet(fit_intercept = True)

parameters = {
    'alpha': np.linspace(0.01, 1, 10),
    'l1_ratio': np.linspace(0, 1, 10)
}

elastic_net_cv = cv(elastic_net_model, parameters, X_train, y_train, cf = 5)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END ............alpha=0.01, l1_ratio=0.1111111111111111; total time=   7.8s
[CV] END ............alpha=0.01, l1_ratio=0.1111111111111111; total time=  10.7s
[CV] END ............alpha=0.01, l1_ratio=0.1111111111111111; total time=   9.9s
[CV] END ............alpha=0.01, l1_ratio=0.1111111111111111; total time=   7.3s
[CV] END ............alpha=0.01, l1_ratio=0.1111111111111111; total time=   5.4s
[CV] END ............alpha=0.01, l1_ratio=0.2222222222222222; total time=   9.2s
[CV] END ............alpha=0.01, l1_ratio=0.2222222222222222; total time=   6.1s
[CV] END ............alpha=0.01, l1_ratio=0.2222222222222222; total time=   6.0s
[CV] END ............alpha=0.01, l1_ratio=0.2222222222222222; total time=   5.0s
[CV] END ............alpha=0.01, l1_ratio=0.2222222222222222; total time=   4.3s
[CV] END ............alpha=0.01, l1_ratio=0.3333333333333333; total time=   7.3s
[CV] END ............alpha=0.01, l1_ratio=0.33

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time= 4.6min


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time= 4.6min


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time= 4.6min
[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time= 4.6min


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.01, l1_ratio=0.0; total time= 4.6min
[CV] END ............alpha=0.12, l1_ratio=0.1111111111111111; total time=   2.3s
[CV] END ............alpha=0.12, l1_ratio=0.1111111111111111; total time=   2.5s
[CV] END ............alpha=0.12, l1_ratio=0.1111111111111111; total time=   2.4s
[CV] END ............alpha=0.12, l1_ratio=0.1111111111111111; total time=   2.3s
[CV] END ............alpha=0.12, l1_ratio=0.1111111111111111; total time=   2.2s
[CV] END ............alpha=0.12, l1_ratio=0.2222222222222222; total time=   2.2s
[CV] END ............alpha=0.12, l1_ratio=0.2222222222222222; total time=   2.1s
[CV] END ............alpha=0.12, l1_ratio=0.2222222222222222; total time=   2.1s
[CV] END ............alpha=0.12, l1_ratio=0.2222222222222222; total time=   2.3s
[CV] END ............alpha=0.12, l1_ratio=0.2222222222222222; total time=   2.3s
[CV] END ............alpha=0.12, l1_ratio=0.3333333333333333; total time=   2.2s
[CV] END ............alpha=0

  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.12, l1_ratio=0.0; total time= 4.1min


  model = cd_fast.enet_coordinate_descent(


[CV] END ...........................alpha=0.12, l1_ratio=0.0; total time= 4.1min


KeyboardInterrupt: 

In [24]:
np.linspace(0.01, 1, 25)

array([0.01   , 0.05125, 0.0925 , 0.13375, 0.175  , 0.21625, 0.2575 ,
       0.29875, 0.34   , 0.38125, 0.4225 , 0.46375, 0.505  , 0.54625,
       0.5875 , 0.62875, 0.67   , 0.71125, 0.7525 , 0.79375, 0.835  ,
       0.87625, 0.9175 , 0.95875, 1.     ])