## Group 3:
## Predicting CTA Station Usage Using Ridership, Demographic, and Socioeconomic Data 

 

#### Lucy Chavez, DePaul University, School of Computing, lchave28@depaul.edu 
#### Denvir Gama, DePaul University, School of Computing, dlnu1@depaul.edu 
#### Zach Hollis, DePaul University, School of Computing, zhollis@depaul.edu 
#### Danielle Martin, DePaul University, School of Computing, dmart164@depaul.edu 

### Load libraries

In [33]:
import warnings
warnings.filterwarnings('ignore')

In [34]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import pandas as pd
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
pd.set_option('display.max_colwidth', 10000)
pd.set_option('display.max_rows', 1000)

In [35]:
from sklearn import datasets
from sklearn import linear_model, ensemble, tree
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import datetime

### Load full dataset

In [36]:
df = pd.read_csv("../data/train_data_with_lag.csv")

In [37]:
df['Year'].unique()

array([2016, 2017, 2018, 2019], dtype=int64)

### Identify numerical vs. categorical variables

In [38]:
NUM_VARS = ['prev_year_no_of_sales', 'prev_year_avg_age',
       'prev_year_price_p_sf', 'prev_year_price_p_house', 'prev_year_avg_sf',
       'Total Population', 'Median Age', 'Median HH Income',
       'Total Housing Units', 'Median Number of Rooms', 'Median Year Built',
       'Median Gross Rent', 'Mean HH Size', 'Percent White', 'Percent Black',
       'Percent HH with Children', 'Percent Housing Vacant',
       'crime_count', 'crimes_per_capita', 'distance_miles']

CAT_VARS = ['pri_neigh', 'sec_neigh', 'side', 'station_name', 'public_schools']

### Identify features and target for model training

In [39]:
FEATURES = ['Norm prev_year_no_of_sales',
 'Norm prev_year_avg_age',
 'Norm prev_year_price_p_sf',
 'Norm prev_year_price_p_house',
 'Norm prev_year_avg_sf',
 'Norm Total Population',
 'Norm Median Age',
 'Norm Median HH Income',
 'Norm Total Housing Units',
 'Norm Median Number of Rooms',
 'Norm Median Year Built',
 'Norm Median Gross Rent',
 'Norm Mean HH Size',
 'Norm Percent White',
 'Norm Percent Black',
 'Norm Percent HH with Children',
 'Norm Percent Housing Vacant',
 'Norm crime_count',
 'Norm crimes_per_capita',
 'Norm distance_miles',
 'pri_neigh_Albany Park',
 'pri_neigh_Andersonville',
 'pri_neigh_Archer Heights',
 'pri_neigh_Armour Square',
 'pri_neigh_Ashburn',
 'pri_neigh_Auburn Gresham',
 'pri_neigh_Austin',
 'pri_neigh_Avalon Park',
 'pri_neigh_Avondale',
 'pri_neigh_Belmont Cragin',
 'pri_neigh_Beverly',
 'pri_neigh_Boystown',
 'pri_neigh_Bridgeport',
 'pri_neigh_Brighton Park',
 'pri_neigh_Bucktown',
 'pri_neigh_Burnside',
 'pri_neigh_Calumet Heights',
 'pri_neigh_Chatham',
 'pri_neigh_Chicago Lawn',
 'pri_neigh_Chinatown',
 'pri_neigh_Clearing',
 'pri_neigh_Douglas',
 'pri_neigh_Dunning',
 'pri_neigh_East Side',
 'pri_neigh_East Village',
 'pri_neigh_Edgewater',
 'pri_neigh_Edison Park',
 'pri_neigh_Englewood',
 'pri_neigh_Fuller Park',
 'pri_neigh_Gage Park',
 'pri_neigh_Galewood',
 'pri_neigh_Garfield Park',
 'pri_neigh_Garfield Ridge',
 'pri_neigh_Gold Coast',
 'pri_neigh_Grand Boulevard',
 'pri_neigh_Grand Crossing',
 'pri_neigh_Greektown',
 'pri_neigh_Hegewisch',
 'pri_neigh_Hermosa',
 'pri_neigh_Humboldt Park',
 'pri_neigh_Hyde Park',
 'pri_neigh_Irving Park',
 'pri_neigh_Jefferson Park',
 'pri_neigh_Kenwood',
 'pri_neigh_Lake View',
 'pri_neigh_Lincoln Park',
 'pri_neigh_Lincoln Square',
 'pri_neigh_Little Italy, UIC',
 'pri_neigh_Little Village',
 'pri_neigh_Logan Square',
 'pri_neigh_Loop',
 'pri_neigh_Lower West Side',
 'pri_neigh_Mckinley Park',
 'pri_neigh_Montclare',
 'pri_neigh_Morgan Park',
 'pri_neigh_Mount Greenwood',
 'pri_neigh_Near South Side',
 'pri_neigh_New City',
 'pri_neigh_North Center',
 'pri_neigh_North Lawndale',
 'pri_neigh_North Park',
 'pri_neigh_Norwood Park',
 "pri_neigh_O'Hare",
 'pri_neigh_Oakland',
 'pri_neigh_Old Town',
 'pri_neigh_Portage Park',
 'pri_neigh_Printers Row',
 'pri_neigh_Pullman',
 'pri_neigh_River North',
 'pri_neigh_Riverdale',
 'pri_neigh_Rogers Park',
 'pri_neigh_Roseland',
 'pri_neigh_Rush & Division',
 'pri_neigh_Sauganash,Forest Glen',
 'pri_neigh_Sheffield & DePaul',
 'pri_neigh_South Chicago',
 'pri_neigh_South Deering',
 'pri_neigh_South Shore',
 'pri_neigh_Streeterville',
 'pri_neigh_Ukrainian Village',
 'pri_neigh_United Center',
 'pri_neigh_Uptown',
 'pri_neigh_Washington Heights',
 'pri_neigh_Washington Park',
 'pri_neigh_West Elsdon',
 'pri_neigh_West Lawn',
 'pri_neigh_West Loop',
 'pri_neigh_West Pullman',
 'pri_neigh_West Ridge',
 'pri_neigh_West Town',
 'pri_neigh_Wicker Park',
 'pri_neigh_Woodlawn',
 'pri_neigh_Wrigleyville',
 'side_Central',
 'side_North',
 'side_South',
 'side_West',
 'public_schools_0.0',
 'public_schools_1.0',
 'public_schools_2.0',
 'public_schools_3.0',
 'public_schools_4.0',
 'public_schools_5.0']

TARGETS = 'price_p_house'

### Create functions for data processing

In [40]:
def process_bool_and_missing(train, test, features):
    
    for f in features:
        if train[f].dtype == 'bool':
            #print(f, "is bool, converting to int")
            train[f] = train[f].astype(int)
            test[f] = test[f].astype(int)
            
        if train[f].dtype in ('float64', 'int64'):   
            #print(f, "training data's mean:", train[f].mean(),
                #"will replace missing values of", f)
            train[f][train[f].isna()] = train[f].mean()
            test[f][test[f].isna()] = train[f].mean()

    return train, test


def normalize_features(train, test, features):
    
    for feature in features:
        scaler = StandardScaler()
        scaler.fit(pd.DataFrame(train.loc[:, feature]))
        n_feature = 'Norm ' + feature
        train[n_feature] = scaler.transform(pd.DataFrame(train.loc[:, feature]))
        test[n_feature] = scaler.transform(pd.DataFrame(test.loc[:, feature]))

    return train, test


def one_hot_encoding_features(train, test, features, prefix):

    train = pd.get_dummies(train, columns = features, prefix = prefix)
    test = pd.get_dummies(test, columns = features, prefix = prefix)
    
    for v in test.columns:
        if v not in train.columns:
            test = test.drop(columns=[v])
        
    for v in train.columns:
            if v not in test.columns:
                test[v] = 0

    return train, test


def prepare_train_test(train, test, num, cat):

    train, test = process_bool_and_missing(train, test, num)
    train, test = normalize_features(train, test, num)
    train, test = one_hot_encoding_features(train, test, cat, cat)

    return train, test


def temporal_train_test_split(df, train_yr, test_yr, num, cat):
    
    train = df.loc[df.Year.isin(train_yr), :]
    test = df.loc[df.Year.isin(test_yr), :]
    #print('TRAIN: ', train_yr, 'TEST: ', test_yr)
    #print('Training size: ', train.shape) 
    #print('Testing size: ', test.shape) 
    train, test = prepare_train_test(train, test, num, cat)
    
    return train, test

### Create functions for building and evaluating regressor

In [41]:
def build_regressors(train, features, targets, model, params):
       
    #print("Training model:", model, "|", params) 
    model.set_params(**params)
    train_features = train[features]
    train_targets = train[targets]
    model.fit(train_features, train_targets)

    return model  


def evaluate_regressors(df, features, targets, model):
    
    actual = df[targets]
    predict = model.predict(df[features])      
    #score = model.score(df[features], df[targets])
    mse = mean_squared_error(actual, predict)
            
    return mse

### Create functions for Cross Validation

In [42]:
def k_fold_CV(train, features, targets, model_class, params, cv=5, scoring='neg_mean_squared_error'):
    
    start = datetime.datetime.now() # Begin timer 
    
    grid_model = GridSearchCV(
                    estimator=model_class, 
                    param_grid=params, 
                    cv=cv,
                    scoring=scoring,
                    return_train_score=True,
                    refit=True)
       
    grid_model_results = grid_model.fit(train[features], train[targets])
    ranked_cv = pd.DataFrame(grid_model.cv_results_).sort_values(by=['rank_test_score'])
    cv_results = ranked_cv[['params','rank_test_score', 'mean_train_score', 'mean_test_score']]
    
    stop = datetime.datetime.now() # End timer
    print("ALL FOLDS Time Elapsed:", stop - start)  
    
    return cv_results, grid_model

### Create functions for Forward Chaining

In [43]:
def forward_chaining(df, features, targets, model_class, params, num, cat):
    
    start = datetime.datetime.now() # Begin timer 
    
    year_list = df['Year'].unique().tolist()
    results = pd.DataFrame()

    # splits the dataframe into train/test set using forward chaining
    for idx, yr in enumerate(year_list[:-1]):
        train_yr = year_list[:idx+1]
        test_yr = [year_list[idx+1]]
        print("TRAIN YEARS: ", train_yr)
        print("TEST YEARS: ", test_yr)
        
        train, test = temporal_train_test_split(df, train_yr, test_yr, num, cat)
        cv_results, grid_model = k_fold_CV(train, features, targets, model_class, params)
        
        cv_results = cv_results.rename(
            columns={"rank_test_score": "rank_by_CV_mean_test_rmse",
                     "mean_train_score": "CV_mean_train_rmse",
                     "mean_test_score": "CV_mean_test_rmse"})
        
        cv_results['CV_mean_train_rmse'] = np.sqrt(-cv_results['CV_mean_train_rmse'])
        cv_results['CV_mean_test_rmse'] = np.sqrt(-cv_results['CV_mean_test_rmse'])
        cv_results['mean_train_rmse'] = 0
        cv_results['mean_test_rmse'] = 0

        cv_results['Train Years'] = str(train_yr)
        cv_results['Test Years'] = str(test_yr)
        
        for i in range(len(cv_results['params'])):
            p = cv_results['params'].iloc[i]
            refit_model = build_regressors(train, features, targets, model_class, p)
            mean_train_mse = evaluate_regressors(train, features, targets, refit_model)
            mean_test_mse = evaluate_regressors(test, features, targets, refit_model)
            
            cv_results['mean_train_rmse'].iloc[i] = np.sqrt(mean_train_mse)
            cv_results['mean_test_rmse'].iloc[i] = np.sqrt(mean_test_mse)
        
        results = results.append(cv_results, ignore_index=True)
    
    stop = datetime.datetime.now() # End timer
    print("ALL TEMPORAL SPLITS Time Elapsed:", stop - start)  
    
    return results

### Configure the models and params to tune

In [49]:
MODELS = {
    'LinearRegression': linear_model.LinearRegression(), 
    'Ridge': linear_model.Ridge(),
    'Lasso': linear_model.Lasso(), 
    'ElasticNet': linear_model.ElasticNet(),
    'DecisionTree': tree.DecisionTreeRegressor(),
    'RandomForest': ensemble.RandomForestRegressor(),
    'Boosting': ensemble.GradientBoostingRegressor()
}

GRID = {
    'LinearRegression': {},
    
    'Ridge': {'max_iter': [10000], 'random_state': [0],
              'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]},  
    
    'Lasso': {'max_iter': [10000], 'random_state': [0],
              'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]},
    
    'ElasticNet': {'max_iter': [10000], 'random_state': [0],
                   'alpha': [0.01, 0.1, 1, 10, 100, 1000],
                   'l1_ratio': [0, 0.2, 0.4, 0.6, 0.8, 1]},
    
    'DecisionTree': {'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                     'max_features': ['auto', 'sqrt'],
                     'min_samples_leaf': [1, 2, 4],
                     'min_samples_split': [2, 5, 10]},
    
    'RandomForest': {'max_depth': [20, 40, 60, 80, 100],
                     'max_features': ['auto', 'sqrt'],
                     'min_samples_split': [2, 5, 10],
                     'n_estimators': [10, 100]},
    
    'Boosting': {"learning_rate": [0.01,0.1,0.9],
                 "min_samples_split": np.linspace(0.1, 0.5, 3),
                 "min_samples_leaf": np.linspace(0.1, 0.5, 3),
                 "max_depth": [3, 5, 8],
                 "max_features": ['auto', 'sqrt'],
                 "criterion": ['friedman_mse'],
                 "subsample": [0.5, 0.75, 1.0],
                 "n_estimators": [10, 100, 500]}
} 

#### Linear Regression

In [13]:
linear = forward_chaining(df, FEATURES, TARGETS, MODELS['LinearRegression'],
                          GRID['LinearRegression'], NUM_VARS, CAT_VARS)

TRAIN YEARS:  [2016]
TEST YEARS:  [2017]
ALL FOLDS Time Elapsed: 0:00:00.207416
TRAIN YEARS:  [2016, 2017]
TEST YEARS:  [2018]
ALL FOLDS Time Elapsed: 0:00:00.256428
TRAIN YEARS:  [2016, 2017, 2018]
TEST YEARS:  [2019]
ALL FOLDS Time Elapsed: 0:00:00.301209
ALL TEMPORAL SPLITS Time Elapsed: 0:00:01.953930


In [14]:
linear

Unnamed: 0,params,rank_by_CV_mean_test_rmse,CV_mean_train_rmse,CV_mean_test_rmse,mean_train_rmse,mean_test_rmse,Train Years,Test Years
0,{},1,115024.102404,1.064346e+16,116783.913855,112810.487544,[2016],[2017]
1,{},1,111231.797098,3563200000000000.0,112679.57929,123131.921637,"[2016, 2017]",[2018]
2,{},1,113020.187417,3525372000000000.0,114506.622179,105074.562296,"[2016, 2017, 2018]",[2019]


#### Ridge Regression

In [15]:
ridge = forward_chaining(df, FEATURES, TARGETS, MODELS['Ridge'],
                          GRID['Ridge'], NUM_VARS, CAT_VARS)   

TRAIN YEARS:  [2016]
TEST YEARS:  [2017]
ALL FOLDS Time Elapsed: 0:00:00.734852
TRAIN YEARS:  [2016, 2017]
TEST YEARS:  [2018]
ALL FOLDS Time Elapsed: 0:00:01.138967
TRAIN YEARS:  [2016, 2017, 2018]
TEST YEARS:  [2019]
ALL FOLDS Time Elapsed: 0:00:01.246403
ALL TEMPORAL SPLITS Time Elapsed: 0:00:04.682405


In [16]:
ridge

Unnamed: 0,params,rank_by_CV_mean_test_rmse,CV_mean_train_rmse,CV_mean_test_rmse,mean_train_rmse,mean_test_rmse,Train Years,Test Years
0,"{'alpha': 0.1, 'max_iter': 10000, 'random_state': 0}",1,115033.383808,158368.029406,116789.927505,112753.062497,[2016],[2017]
1,"{'alpha': 0.01, 'max_iter': 10000, 'random_state': 0}",2,115024.281769,158532.329745,116783.994517,112802.643696,[2016],[2017]
2,"{'alpha': 0.001, 'max_iter': 10000, 'random_state': 0}",3,115024.183486,158557.098438,116783.932287,112808.25976,[2016],[2017]
3,"{'alpha': 1, 'max_iter': 10000, 'random_state': 0}",4,115548.613412,159355.598866,117186.34617,112690.27151,[2016],[2017]
4,"{'alpha': 1000, 'max_iter': 10000, 'random_state': 0}",5,140376.611143,163102.678413,141288.51094,131894.812536,[2016],[2017]
5,"{'alpha': 10, 'max_iter': 10000, 'random_state': 0}",6,121519.549415,163989.536524,123062.870372,116666.543925,[2016],[2017]
6,"{'alpha': 100, 'max_iter': 10000, 'random_state': 0}",7,131365.363808,164025.923901,133561.419623,124643.886085,[2016],[2017]
7,"{'alpha': 10000, 'max_iter': 10000, 'random_state': 0}",8,172607.206771,194023.460898,169687.175526,162285.493488,[2016],[2017]
8,"{'alpha': 1, 'max_iter': 10000, 'random_state': 0}",1,111393.262363,149290.800688,112795.131296,123116.912019,"[2016, 2017]",[2018]
9,"{'alpha': 0.1, 'max_iter': 10000, 'random_state': 0}",2,111234.019496,150066.603348,112681.063031,123121.646003,"[2016, 2017]",[2018]


#### Lasso Regression

In [17]:
lasso = forward_chaining(df, FEATURES, TARGETS, MODELS['Lasso'],
                          GRID['Lasso'], NUM_VARS, CAT_VARS)   

TRAIN YEARS:  [2016]
TEST YEARS:  [2017]
ALL FOLDS Time Elapsed: 0:00:43.639994
TRAIN YEARS:  [2016, 2017]
TEST YEARS:  [2018]
ALL FOLDS Time Elapsed: 0:02:02.175074
TRAIN YEARS:  [2016, 2017, 2018]
TEST YEARS:  [2019]
ALL FOLDS Time Elapsed: 0:04:58.175731
ALL TEMPORAL SPLITS Time Elapsed: 0:10:01.382627


In [18]:
lasso

Unnamed: 0,params,rank_by_CV_mean_test_rmse,CV_mean_train_rmse,CV_mean_test_rmse,mean_train_rmse,mean_test_rmse,Train Years,Test Years
0,"{'alpha': 10, 'max_iter': 10000, 'random_state': 0}",1,115039.766047,160067.075297,116800.757894,112706.360651,[2016],[2017]
1,"{'alpha': 0.1, 'max_iter': 10000, 'random_state': 0}",2,115024.183926,160226.139334,116783.933156,112807.745798,[2016],[2017]
2,"{'alpha': 1, 'max_iter': 10000, 'random_state': 0}",3,115024.343786,160400.390972,116784.10971,112797.086028,[2016],[2017]
3,"{'alpha': 100, 'max_iter': 10000, 'random_state': 0}",4,116198.216446,163196.153057,118034.186701,113181.591764,[2016],[2017]
4,"{'alpha': 1000, 'max_iter': 10000, 'random_state': 0}",5,131025.124897,167647.484731,135141.482969,125472.888948,[2016],[2017]
5,"{'alpha': 10000, 'max_iter': 10000, 'random_state': 0}",6,139682.875423,170629.857828,141960.23277,129267.883935,[2016],[2017]
6,"{'alpha': 0.01, 'max_iter': 10000, 'random_state': 0}",7,115024.182498,171230.608854,116783.931669,112808.788419,[2016],[2017]
7,"{'alpha': 0.001, 'max_iter': 10000, 'random_state': 0}",8,115024.182485,173328.473626,116783.931655,112808.88131,[2016],[2017]
8,"{'alpha': 10, 'max_iter': 10000, 'random_state': 0}",1,111249.224008,149220.896484,112698.037602,123069.118635,"[2016, 2017]",[2018]
9,"{'alpha': 1, 'max_iter': 10000, 'random_state': 0}",2,111231.972831,149609.457465,112679.807471,123124.034389,"[2016, 2017]",[2018]


#### Elastic Net

In [19]:
enet = forward_chaining(df, FEATURES, TARGETS, MODELS['ElasticNet'],
                          GRID['ElasticNet'], NUM_VARS, CAT_VARS)   

TRAIN YEARS:  [2016]
TEST YEARS:  [2017]
ALL FOLDS Time Elapsed: 0:01:47.389453
TRAIN YEARS:  [2016, 2017]
TEST YEARS:  [2018]
ALL FOLDS Time Elapsed: 0:04:47.330931
TRAIN YEARS:  [2016, 2017, 2018]
TEST YEARS:  [2019]
ALL FOLDS Time Elapsed: 0:08:58.602658
ALL TEMPORAL SPLITS Time Elapsed: 0:21:30.217476


In [20]:
enet

Unnamed: 0,params,rank_by_CV_mean_test_rmse,CV_mean_train_rmse,CV_mean_test_rmse,mean_train_rmse,mean_test_rmse,Train Years,Test Years
0,"{'alpha': 10, 'l1_ratio': 1, 'max_iter': 10000, 'random_state': 0}",1,115039.766047,160067.075297,116800.757894,112706.360651,[2016],[2017]
1,"{'alpha': 0.1, 'l1_ratio': 1, 'max_iter': 10000, 'random_state': 0}",2,115024.183926,160226.139334,116783.933156,112807.745798,[2016],[2017]
2,"{'alpha': 1, 'l1_ratio': 1, 'max_iter': 10000, 'random_state': 0}",3,115024.343786,160400.390972,116784.10971,112797.086028,[2016],[2017]
3,"{'alpha': 0.01, 'l1_ratio': 0.8, 'max_iter': 10000, 'random_state': 0}",4,117675.658972,162065.393871,119794.012437,114272.895657,[2016],[2017]
4,"{'alpha': 1, 'l1_ratio': 0.6, 'max_iter': 10000, 'random_state': 0}",5,138308.845688,162288.909729,140664.050956,131195.205015,[2016],[2017]
5,"{'alpha': 1, 'l1_ratio': 0.8, 'max_iter': 10000, 'random_state': 0}",6,135362.116949,162306.512523,137981.849602,128329.122862,[2016],[2017]
6,"{'alpha': 100, 'l1_ratio': 1, 'max_iter': 10000, 'random_state': 0}",7,116198.216446,163196.153057,118034.186701,113181.591764,[2016],[2017]
7,"{'alpha': 0.1, 'l1_ratio': 0, 'max_iter': 10000, 'random_state': 0}",8,133161.837694,163252.516293,135976.450572,126519.694398,[2016],[2017]
8,"{'alpha': 1, 'l1_ratio': 0.4, 'max_iter': 10000, 'random_state': 0}",9,140779.824435,163316.322313,142948.501838,133765.827416,[2016],[2017]
9,"{'alpha': 0.01, 'l1_ratio': 0.6, 'max_iter': 10000, 'random_state': 0}",10,120050.982549,163466.914368,122494.804959,116241.459832,[2016],[2017]


#### Decision Tree

In [21]:
dtree = forward_chaining(df, FEATURES, TARGETS, MODELS['DecisionTree'],
                          GRID['DecisionTree'], NUM_VARS, CAT_VARS) 

TRAIN YEARS:  [2016]
TEST YEARS:  [2017]
ALL FOLDS Time Elapsed: 0:01:07.621150
TRAIN YEARS:  [2016, 2017]
TEST YEARS:  [2018]
ALL FOLDS Time Elapsed: 0:01:16.505232
TRAIN YEARS:  [2016, 2017, 2018]
TEST YEARS:  [2019]
ALL FOLDS Time Elapsed: 0:02:00.826139
ALL TEMPORAL SPLITS Time Elapsed: 0:05:29.270660


In [22]:
dtree

Unnamed: 0,params,rank_by_CV_mean_test_rmse,CV_mean_train_rmse,CV_mean_test_rmse,mean_train_rmse,mean_test_rmse,Train Years,Test Years
0,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10}",1,59586.203169,135262.57785,61518.440861,137160.3035,[2016],[2017]
1,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 10}",2,67586.205876,136019.246238,69738.271905,122281.524652,[2016],[2017]
2,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10}",3,82513.021878,136186.619657,82011.229783,125883.301621,[2016],[2017]
3,"{'max_depth': 80, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10}",4,47398.436391,136861.526945,46424.919785,138422.348373,[2016],[2017]
4,"{'max_depth': 100, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10}",5,47398.436391,137128.151164,46424.919785,140165.812528,[2016],[2017]
5,"{'max_depth': 40, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10}",6,47398.436391,137942.649204,46424.919785,139856.295039,[2016],[2017]
6,"{'max_depth': 70, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10}",7,47398.436391,138161.776137,46424.919785,139232.628572,[2016],[2017]
7,"{'max_depth': 70, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10}",8,79316.537118,138273.14978,77787.164512,128023.366081,[2016],[2017]
8,"{'max_depth': 40, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10}",9,79316.537118,138284.560243,77787.164512,127384.732121,[2016],[2017]
9,"{'max_depth': 30, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10}",10,79316.537118,138342.455993,77787.164512,128012.410894,[2016],[2017]


#### Random Forest

In [23]:
rf = forward_chaining(df, FEATURES, TARGETS, MODELS['RandomForest'],
                          GRID['RandomForest'], NUM_VARS, CAT_VARS) 

TRAIN YEARS:  [2016]
TEST YEARS:  [2017]
ALL FOLDS Time Elapsed: 0:05:39.487751
TRAIN YEARS:  [2016, 2017]
TEST YEARS:  [2018]
ALL FOLDS Time Elapsed: 0:13:19.805606
TRAIN YEARS:  [2016, 2017, 2018]
TEST YEARS:  [2019]
ALL FOLDS Time Elapsed: 0:21:27.427924
ALL TEMPORAL SPLITS Time Elapsed: 0:50:54.574490


In [24]:
rf

Unnamed: 0,params,rank_by_CV_mean_test_rmse,CV_mean_train_rmse,CV_mean_test_rmse,mean_train_rmse,mean_test_rmse,Train Years,Test Years
0,"{'max_depth': 60, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 10}",1,48162.248988,125956.169587,56529.009428,113237.926733,[2016],[2017]
1,"{'max_depth': 100, 'max_features': 'auto', 'min_samples_split': 5, 'n_estimators': 100}",2,53724.658931,127513.56565,53252.687268,105352.367339,[2016],[2017]
2,"{'max_depth': 40, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 100}",3,44539.169041,128319.424098,43535.863715,103327.299832,[2016],[2017]
3,"{'max_depth': 40, 'max_features': 'auto', 'min_samples_split': 10, 'n_estimators': 100}",4,62735.160998,128417.9162,63120.85337,106956.419718,[2016],[2017]
4,"{'max_depth': 100, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 100}",5,43780.609272,128528.94694,44704.941195,103337.644853,[2016],[2017]
5,"{'max_depth': 60, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 100}",6,43015.627487,128560.4743,44202.068007,105254.389894,[2016],[2017]
6,"{'max_depth': 60, 'max_features': 'auto', 'min_samples_split': 5, 'n_estimators': 10}",7,57526.421773,128598.79482,59375.43069,111595.808702,[2016],[2017]
7,"{'max_depth': 60, 'max_features': 'auto', 'min_samples_split': 5, 'n_estimators': 100}",8,53425.543408,128800.577674,54297.571846,104758.919572,[2016],[2017]
8,"{'max_depth': 20, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 100}",9,43939.088682,129058.507413,43010.886513,102876.94183,[2016],[2017]
9,"{'max_depth': 80, 'max_features': 'auto', 'min_samples_split': 10, 'n_estimators': 100}",10,62721.769292,129099.375786,62690.707131,106221.951023,[2016],[2017]


#### Boosting

In [50]:
boost = forward_chaining(df, FEATURES, TARGETS, MODELS['Boosting'],
                          GRID['Boosting'], NUM_VARS, CAT_VARS) 

TRAIN YEARS:  [2016]
TEST YEARS:  [2017]
ALL FOLDS Time Elapsed: 1:11:35.432372
TRAIN YEARS:  [2016, 2017]
TEST YEARS:  [2018]
ALL FOLDS Time Elapsed: 2:19:15.139496
TRAIN YEARS:  [2016, 2017, 2018]
TEST YEARS:  [2019]
ALL FOLDS Time Elapsed: 3:40:02.263513
ALL TEMPORAL SPLITS Time Elapsed: 17:13:21.645677


In [51]:
boost

Unnamed: 0,params,rank_by_CV_mean_test_rmse,CV_mean_train_rmse,CV_mean_test_rmse,mean_train_rmse,mean_test_rmse,Train Years,Test Years
0,"{'criterion': 'friedman_mse', 'learning_rate': 0.1, 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 0.1, 'n_estimators': 500, 'subsample': 1.0}",1,93634.047341,134915.422446,96789.594272,112269.628600,[2016],[2017]
1,"{'criterion': 'friedman_mse', 'learning_rate': 0.1, 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 0.1, 'min_samples_split': 0.5, 'n_estimators': 500, 'subsample': 1.0}",2,92963.917295,135720.365810,96309.540457,112490.819778,[2016],[2017]
2,"{'criterion': 'friedman_mse', 'learning_rate': 0.1, 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 0.1, 'min_samples_split': 0.5, 'n_estimators': 500, 'subsample': 1.0}",2,92963.917295,135720.365810,96309.540457,112490.819778,[2016],[2017]
3,"{'criterion': 'friedman_mse', 'learning_rate': 0.1, 'max_depth': 3, 'max_features': 'auto', 'min_samples_leaf': 0.1, 'min_samples_split': 0.1, 'n_estimators': 100, 'subsample': 1.0}",4,108922.653581,135815.537645,110965.020295,114776.412957,[2016],[2017]
4,"{'criterion': 'friedman_mse', 'learning_rate': 0.1, 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 0.1, 'min_samples_split': 0.1, 'n_estimators': 100, 'subsample': 1.0}",5,107049.106591,135956.789979,109974.625706,114936.773510,[2016],[2017]
...,...,...,...,...,...,...,...,...
4369,"{'criterion': 'friedman_mse', 'learning_rate': 0.9, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 0.5, 'min_samples_split': 0.1, 'n_estimators': 10, 'subsample': 0.5}",1454,217275.261266,237978.922703,219316.394417,202837.993421,"[2016, 2017, 2018]",[2019]
4370,"{'criterion': 'friedman_mse', 'learning_rate': 0.9, 'max_depth': 3, 'max_features': 'auto', 'min_samples_leaf': 0.30000000000000004, 'min_samples_split': 0.1, 'n_estimators': 10, 'subsample': 0.5}",1455,217280.463992,237982.778186,219275.273818,202481.112171,"[2016, 2017, 2018]",[2019]
4371,"{'criterion': 'friedman_mse', 'learning_rate': 0.9, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 0.30000000000000004, 'min_samples_split': 0.30000000000000004, 'n_estimators': 10, 'subsample': 0.5}",1456,217278.615485,238092.022789,219270.075001,202388.504467,"[2016, 2017, 2018]",[2019]
4372,"{'criterion': 'friedman_mse', 'learning_rate': 0.9, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 0.30000000000000004, 'min_samples_split': 0.30000000000000004, 'n_estimators': 100, 'subsample': 0.5}",1457,217276.337891,238405.267091,219271.285590,202414.712916,"[2016, 2017, 2018]",[2019]


#### Combine results

In [56]:
linear['model'] = 'Linear'
ridge['model'] = 'Ridge'
lasso['model'] = 'Lasso'
enet['model'] = 'ElasticNet'
dtree['model'] = 'DecisionTree'
rf['model'] = 'RandomForest'
boost['model'] = 'Boosting'

In [57]:
df = pd.concat([linear, ridge, lasso, enet, dtree, rf, boost], sort=False)

In [58]:
df = df.rename(columns={'Test Years': 'Validate Year'})

In [59]:
df.head()

Unnamed: 0,params,rank_by_CV_mean_test_rmse,CV_mean_train_rmse,CV_mean_test_rmse,mean_train_rmse,mean_test_rmse,Train Years,Validate Year,model
0,{},1,115024.102404,1.064346e+16,116783.913855,112810.487544,[2016],[2017],Linear
1,{},1,111231.797098,3563200000000000.0,112679.57929,123131.921637,"[2016, 2017]",[2018],Linear
2,{},1,113020.187417,3525372000000000.0,114506.622179,105074.562296,"[2016, 2017, 2018]",[2019],Linear
0,"{'alpha': 0.1, 'max_iter': 10000, 'random_state': 0}",1,115033.383808,158368.0,116789.927505,112753.062497,[2016],[2017],Ridge
1,"{'alpha': 0.01, 'max_iter': 10000, 'random_state': 0}",2,115024.281769,158532.3,116783.994517,112802.643696,[2016],[2017],Ridge


In [60]:
df.to_csv('../data/all_results.csv', index=False)