In [23]:
import sys, os
sys.path.insert(0, os.path.dirname(os.path.abspath('..')))
import import_ipynb
from utils import data_set, pipelines, feature_engineering
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer, KNNImputer


# models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor

random_state = 1

In [2]:
original_data = data_set.get_dataset('listings')
listings_data = data_set.get_dataset('listings')

In [3]:
#calendar_data = data_set.get_dataset('calendar')
#neighbourhoods_data = data_set.get_dataset('neighbourhoods')
#reviews_data = data_set.get_dataset('reviews')

In [4]:
data_set.drop_useless_columns_listings(listings_data)
data_set.convert_column_types(listings_data)

In [5]:
feature_engineering.make_total_characters_feature(listings_data)
feature_engineering.make_total_amenities_feature(listings_data)
feature_engineering.make_specific_amenity_features(listings_data)
listings_data.drop('amenities', axis = 1, inplace = True)

In [6]:
prices = listings_data.pop('price')

In [7]:
x_train, x_test, y_train, y_test = train_test_split(listings_data, prices, test_size = .25, random_state = random_state)

In [8]:
x_train.head()

Unnamed: 0,host_is_superhost,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,number_of_reviews,review_scores_rating,total_amenities,has_pool,has_wifi,has_kitchen
18040,False,40.76171,-73.99968,Entire serviced apartment,Entire home/apt,3,1.0,1.0,2.0,0,,37,True,True,True
5443,False,40.81051,-73.9444,Private room in rental unit,Private room,1,1.0,1.0,1.0,0,,10,False,True,True
31071,False,40.59924,-73.96201,Entire condominium (condo),Entire home/apt,2,1.0,1.0,1.0,0,,21,False,True,True
20721,False,40.69424,-73.97312,Entire rental unit,Entire home/apt,5,1.0,2.0,3.0,72,4.7,27,False,True,True
35641,False,40.58424,-73.93516,Entire residential home,Entire home/apt,6,2.5,3.0,3.0,5,5.0,37,False,True,True


preprocessor = pipelines.make_regressor_pipeline('dummy')
out = pd.DataFrame(preprocessor.fit_transform(x_train))
out.head()

Types of models: 
-  Linear Regression
-  Random Forest
-  SVR
-  KNN
-  XGBRegressor
 

Create a dictionary of pipelines to be used for cross validation.

In [37]:
pipelines_dict = dict()

pipelines_dict['mlp']               = pipelines.make_regressor_pipeline(MLPRegressor())
pipelines_dict['linear_regression'] = pipelines.make_regressor_pipeline(LinearRegression())
pipelines_dict['random_forest']     = pipelines.make_regressor_pipeline(RandomForestRegressor())
pipelines_dict['knn']               = pipelines.make_regressor_pipeline(KNeighborsRegressor())
pipelines_dict['xgb_regressor']     = pipelines.make_regressor_pipeline(XGBRegressor())


Create dictionary of parameter grids for each type of model.

In [51]:
params_dict = dict()

params_dict['linear_regression'] = [{}]

params_dict['random_forest'] = [{
    'regressor__criterion' : ['mse'], 
    'regressor__max_features' : ['auto', 'sqrt', 'log2'], 
    'regressor__random_state' : [random_state] }]

params_dict['mlp'] = [{
    'regressor__hidden_layer_sizes' : [100],
    'regressor__solver' : ['adam'],
    'regressor__activation' : ['identity', 'logistic', 'tanh', 'relu'],
    'regressor__random_state' : [random_state],
    'regressor__max_iter' : [600],
    'regressor__tol' : [.1]
}]

params_dict['knn'] = [{
    'regressor__n_neighbors':[5, 10, 15], 
    'regressor__weights': ['uniform', 'distance']}]

params_dict['xgb_regressor'] = [{
    'regressor__random_state':[random_state], 
    'regressor__n_estimators':[100, 150],
    'regressor__max_depth':[2,3,5]}]


Create the grid search object for each model.

In [52]:
grid_searches = dict()
for model in pipelines_dict:
    grid_searches[model] = GridSearchCV(
        estimator = pipelines_dict[model],
        param_grid = params_dict[model],
        scoring = 'neg_mean_absolute_error',
        cv = 4
    )

Fit and run the grid searches using the training data.

In [53]:
results = dict()
for model in grid_searches:
    print("Working on " + model, end = "... ")
    results[model] = grid_searches[model].fit(x_train, y_train)
    print("Done")
    

Working on mlp... Done
Working on linear_regression... Done
Working on random_forest... Done
Working on knn... Done
Working on xgb_regressor... Done


View the final results.

In [95]:
columns = ['mean_test_score', 'std_test_score', 'mean_fit_time', 'params', 'mean_score_time', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score']
all_results = pd.DataFrame()
for model in results:    
    model_results = pd.DataFrame(results[model].cv_results_)[columns]
    model_name = pd.DataFrame([model] * model_results.shape[0], columns = ["model"]) 
    model_results = model_name.join(model_results)
    all_results = pd.concat([all_results, model_results])
    
test = ['model', 'mean_test_score', 'std_test_score', 'mean_fit_time', 'params']
all_results[test].sort_values(by = ['mean_test_score', 'std_test_score'], ascending = [False, True])

Unnamed: 0,model,mean_test_score,std_test_score,mean_fit_time,params
1,random_forest,-68.530616,2.633288,3.474727,"{'regressor__criterion': 'mse', 'regressor__ma..."
2,random_forest,-68.530616,2.633288,3.439618,"{'regressor__criterion': 'mse', 'regressor__ma..."
0,random_forest,-72.021097,2.626137,11.581788,"{'regressor__criterion': 'mse', 'regressor__ma..."
2,xgb_regressor,-73.24258,2.470464,0.867647,"{'regressor__max_depth': 3, 'regressor__n_esti..."
1,xgb_regressor,-73.481531,2.175416,0.898843,"{'regressor__max_depth': 2, 'regressor__n_esti..."
0,xgb_regressor,-73.486789,1.67884,0.654504,"{'regressor__max_depth': 2, 'regressor__n_esti..."
3,xgb_regressor,-74.083422,1.90645,1.251145,"{'regressor__max_depth': 3, 'regressor__n_esti..."
4,xgb_regressor,-74.392244,1.285609,1.431904,"{'regressor__max_depth': 5, 'regressor__n_esti..."
5,xgb_regressor,-75.425794,1.218554,2.039066,"{'regressor__max_depth': 5, 'regressor__n_esti..."
5,knn,-76.044501,2.486293,0.077938,"{'regressor__n_neighbors': 15, 'regressor__wei..."


Save results as a csv file.

In [71]:
fp = os.getcwd() + "/results/all_results.csv"
all_results.to_csv(fp)

In [104]:
best_estimator = grid_searches['random_forest'].best_estimator_
np.mean(abs(best_estimator.predict(x_test) - y_test))

62.38455616960464

In [107]:
y_test.shape

(9181,)