In [34]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Lasso

from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


In [35]:
X_train = pd.read_csv('X_train')
X_test = pd.read_csv('X_test')
y_train = pd.read_csv('y_train')
y_test = pd.read_csv('y_test')


### Multiple Linear Regression

In [20]:
def run_multipleLinearRegressor(X_train, X_test, y_train, y_test):
    slr = LinearRegression()
    slr.fit(X_train, y_train)
    print('Train set')
    pred = slr.predict(X_train)
    print('Multiple Linear Regression R2: {}'.format(slr.score(X_train, y_train)))
    print('Multiple Linear Regression MSE: {}'.format(mean_squared_error(y_train, pred)))
    
    print('Test set')
    pred = slr.predict(X_test)
    print('Multiple Linear Regression R2: {}'.format(slr.score(X_test, y_test)))
    print('Multiple Linear Regression MSE: {}'.format(mean_squared_error(y_test, pred)))


In [21]:
run_multipleLinearRegressor(X_train,
                  X_test,
                  y_train, y_test)

Train set
Simple Linear Regression R2: 0.6682823372108286
Simple Linear Regression MSE: 332550167.96296227
Test set
Simple Linear Regression R2: 0.6674402655508858
Simple Linear Regression MSE: 290766593.0625662


### Lasso Regression

In [22]:
def run_lassoRegressor(X_train, X_test, y_train, y_test):
    lr = Lasso()
    lr.fit(X_train, y_train)
    print('Train set')
    pred = lr.predict(X_train)
    print('Simple Linear Regression R2: {}'.format(lr.score(X_train, y_train)))
    print('Simple Linear Regression MSE: {}'.format(mean_squared_error(y_train, pred)))
    
    print('Test set')
    pred = lr.predict(X_test)
    print('Simple Linear Regression R2: {}'.format(lr.score(X_test, y_test)))
    print('Simple Linear Regression MSE: {}'.format(mean_squared_error(y_test, pred)))


In [23]:
run_lassoRegressor(X_train,
                  X_test,
                  y_train, y_test)

Train set
Simple Linear Regression R2: 0.6682821104364218
Simple Linear Regression MSE: 332550395.30650026
Test set
Simple Linear Regression R2: 0.6675021644041192
Simple Linear Regression MSE: 290712473.1051427


### Nearest Neighbors

In [24]:
def run_nearestNeighbors(X_train, X_test, y_train, y_test):
    neigh = KNeighborsRegressor(n_neighbors=50)
    neigh.fit(X_train, y_train)
    print('Train set')
    pred = neigh.predict(X_train)
    print('Nearest Neighbors Regression R2: {}'.format(neigh.score(X_train, y_train)))
    print('Nearest Neighbors Regression MSE: {}'.format(mean_squared_error(y_train, pred)))
    
    print('Test set')
    pred = neigh.predict(X_test)
    print('Nearest Neighbors Regression R2: {}'.format(neigh.score(X_test, y_test)))
    print('Nearest Neighbors Regression MSE: {}'.format(mean_squared_error(y_test, pred)))


In [25]:
run_nearestNeighbors(X_train,
                  X_test,
                  y_train, y_test)

Train set
Simple Linear Regression R2: 0.5674981115540114
Simple Linear Regression MSE: 433587329.77213967
Test set
Simple Linear Regression R2: 0.5588932563108957
Simple Linear Regression MSE: 385672382.29205626


### Decision Tree

In [26]:
def run_decisionTree(X_train, X_test, y_train, y_test):
    dt = DecisionTreeRegressor()
    dt.fit(X_train, y_train)
    print('Train set')
    pred = dt.predict(X_train)
    print('Decision Tree R2: {}'.format(dt.score(X_train, y_train)))
    print('Decision Tree MSE: {}'.format(mean_squared_error(y_train, pred)))
    
    print('Test set')
    pred = dt.predict(X_test)
    print('Decision Tree R2: {}'.format(dt.score(X_test, y_test)))
    print('Decision Tree MSE: {}'.format(mean_squared_error(y_test, pred)))



In [27]:
run_decisionTree(X_train,
        X_test,
        y_train, y_test)

Train set
Decision Tree R2: 0.9999999895392025
Decision Tree MSE: 10.487050739957716
Test set
Decision Tree R2: 0.34668806046803224
Decision Tree MSE: 571209521.740444


### Random Forest

In [36]:
def run_randomForests(X_train, X_test, y_train, y_test):
    rf = RandomForestRegressor(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)
    print('Train set')
    pred = rf.predict(X_train)
    print('Random Forests R2: {}'.format(r2_score(y_train, pred)))
    print('Random Forests MSE: {}'.format(mean_squared_error(y_train, pred)))
    
    print('Test set')
    pred = rf.predict(X_test)
    print('Random Forests R2: {}'.format(r2_score(y_test, pred)))
    print('Random Forests MSE: {}'.format(mean_squared_error(y_test, pred)))


In [37]:
run_randomForests(X_train,
                  X_test,
                  y_train, y_test)

  This is separate from the ipykernel package so we can avoid doing imports until


Train set
Random Forests R2: 0.8123712843455874
Random Forests MSE: 188099603.68377063
Test set
Random Forests R2: 0.7417169755753901
Random Forests MSE: 225824317.49364617


### Gradient Boosting

In [28]:
def run_gradientboosting(X_train, X_test, y_train, y_test):
    gb = GradientBoostingRegressor(n_estimators=100,random_state=0)
    gb.fit(X_train, y_train)
    print('Train set')
    pred = gb.predict(X_train)
    print('Gradient Boosting R2: {}'.format(r2_score(y_train, pred)))
    print('Gradient Boosting MSE: {}'.format(mean_squared_error(y_train, pred)))
    
    print('Test set')
    pred = gb.predict(X_test)
    print('Gradient Boosting R2: {}'.format(r2_score(y_test, pred)))
    print('Gradient Boosting MSE: {}'.format(mean_squared_error(y_test, pred)))

    

In [29]:
run_gradientboosting(X_train,
                  X_test,
                  y_train, y_test)

  y = column_or_1d(y, warn=True)


Train set
Random Forests R2: 0.9284394227452683
Random Forests MSE: 71740171.40206507
Test set
Random Forests R2: 0.7816665543994454
Random Forests MSE: 190895245.4332629


### Tuning Hyperparameters in the Random Forest model

In [39]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)


{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [40]:

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 16.4min finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [41]:
rf_random.best_params_


{'bootstrap': True,
 'max_depth': 70,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 1400}

In [42]:
def evaluate(model, test_features, test_labels):
    pred = model.predict(test_features)
    print('Model Performance')
    print('R2: {}'.format(r2_score(test_labels, pred)))
    print('MSE: {}'.format(mean_squared_error(test_labels, pred)))
    
    return 

base_model = RandomForestRegressor(n_estimators=200, random_state=39, max_depth=4)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)



  # Remove the CWD from sys.path while we load stuff.


Model Performance
R2: 0.7417169755753901
MSE: 225824317.49364617
Model Performance
R2: 0.7910890511885029
MSE: 182656884.00314087


In [46]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
     'max_depth': [60, 70, 80],
     'max_features': ['auto'],
     'min_samples_leaf': [1,2],
     'min_samples_split': [2, 3],
     'n_estimators': [1300, 1400, 1500]
}

# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [47]:
grid_search.fit(X_train, y_train)

grid_search.best_params_


Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 10.4min finished
  self.best_estimator_.fit(X, y, **fit_params)


{'bootstrap': True,
 'max_depth': 80,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 3,
 'n_estimators': 1400}

In [49]:
print('Original')
base_model = RandomForestRegressor(n_estimators=200, random_state=39, max_depth=4)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

print('Random Search')
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

print('Grid Search')
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)



Original


  This is separate from the ipykernel package so we can avoid doing imports until


Model Performance
R2: 0.7417169755753901
MSE: 225824317.49364617
Random Search
Model Performance
R2: 0.7910890511885029
MSE: 182656884.00314087
Grid Search
Model Performance
R2: 0.7930482714981615
MSE: 180943881.02807504


### Tuning Hyperparameters in the Gradient Boosting model

In [52]:
# Loss function to be optimized
loss=['ls', 'lad']
# Number of boosting stages to perform
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(2, 50, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'loss': loss}
print(random_grid)


{'n_estimators': [100, 311, 522, 733, 944, 1155, 1366, 1577, 1788, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [2, 6, 11, 16, 21, 26, 30, 35, 40, 45, 50, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'loss': ['ls', 'lad']}


In [53]:

# Use the random grid to search for best hyperparameters
# First create the base model to tune
gb = GradientBoostingRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
gb_random = RandomizedSearchCV(estimator = gb, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
gb_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 47.9min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 100.2min finished
  y = column_or_1d(y, warn=True)


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                       criterion='friedman_mse',
                                                       init=None,
                                                       learning_rate=0.1,
                                                       loss='ls', max_depth=3,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                             

In [54]:
gb_random.best_params_


{'loss': 'lad',
 'max_depth': 45,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 2000}

In [55]:
def evaluate(model, test_features, test_labels):
    pred = model.predict(test_features)
    print('Model Performance')
    print('R2: {}'.format(r2_score(test_labels, pred)))
    print('MSE: {}'.format(mean_squared_error(test_labels, pred)))
    
    return 

base_model = GradientBoostingRegressor(n_estimators=100,random_state=0)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

best_random = gb_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

  y = column_or_1d(y, warn=True)


Model Performance
R2: 0.7816665543994454
MSE: 190895245.4332629
Model Performance
R2: 0.7681318740901667
MSE: 202729007.83459216


In [None]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
     'max_depth': [95, 100, 105],
     'max_features': ['sqrt'],
     'min_samples_leaf': [1,2],
     'min_samples_split': [5, 6],
     'n_estimators': [1980, 2000, 2020]
}

# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
grid_search.fit(X_train, y_train)

grid_search.best_params_

In [None]:
base_model = GradientBoostingRegressor(n_estimators=100,random_state=0)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

best_random = gb_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)