In [20]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, median_absolute_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV


In [2]:
path = 'https://raw.githubusercontent.com/Codecademy/datasets/master/streeteasy/queens.csv'
df = pd.read_csv(path)
df.head(3)

Unnamed: 0,rental_id,rent,bedrooms,bathrooms,size_sqft,min_to_subway,floor,building_age_yrs,no_fee,has_roofdeck,has_washer_dryer,has_doorman,has_elevator,has_dishwasher,has_patio,has_gym,neighborhood,borough
0,10234,3000,3.0,1,1000,4,1.0,106,0,0,0,0,0,0,0,0,Astoria,Queens
1,10524,1950,1.0,1,950,1,3.0,83,0,0,0,0,0,0,0,0,Forest Hills,Queens
2,5465,3500,1.0,1,996,9,7.0,3,1,0,0,0,0,0,0,0,Astoria,Queens


In [3]:
y = df['rent']
X = df[['bedrooms', 'bathrooms', 'size_sqft', 'min_to_subway', 'floor', 'building_age_yrs', 'no_fee']]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=2019)



# Random Forest

In [7]:
from sklearn.ensemble import RandomForestRegressor

In [13]:
model = RandomForestRegressor(random_state=2019, max_depth=10, n_estimators=20)

In [14]:
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
           oob_score=False, random_state=2019, verbose=0, warm_start=False)

In [15]:
y_pred = model.predict(X_train)

In [16]:
mean_absolute_error(y_train, y_pred)

146.54426509599716

In [17]:
y_pred2 = model.predict(X_test)

In [18]:
mean_absolute_error(y_test, y_pred2)

272.8788839652479

## Hyperparam Optimisation

In [31]:
scorer = make_scorer(mean_absolute_error)

In [42]:
grid = GridSearchCV(model, param_grid={'n_estimators':[5,10,20], 'max_depth':[3, 5,10]}, scoring=scorer, cv=5)

In [43]:
grid.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
           oob_score=False, random_state=2019, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [5, 10, 20], 'max_depth': [3, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(mean_absolute_error), verbose=0)

In [44]:
grid.best_score_

421.56531394260776

In [45]:
pd.DataFrame(grid.cv_results_)[['params', 'mean_test_score']]



Unnamed: 0,params,mean_test_score
0,"{'max_depth': 3, 'n_estimators': 5}",421.565314
1,"{'max_depth': 3, 'n_estimators': 10}",407.624248
2,"{'max_depth': 3, 'n_estimators': 20}",405.734071
3,"{'max_depth': 5, 'n_estimators': 5}",367.581864
4,"{'max_depth': 5, 'n_estimators': 10}",355.444531
5,"{'max_depth': 5, 'n_estimators': 20}",342.966774
6,"{'max_depth': 10, 'n_estimators': 5}",357.606401
7,"{'max_depth': 10, 'n_estimators': 10}",343.806571
8,"{'max_depth': 10, 'n_estimators': 20}",334.831724


# Homework

[Titanik on Kaggle](https://www.kaggle.com/c/titanic)