# Model Creation Process

First, let's import the necessary libraries and load the data.

In [6]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)


In [47]:
# Load the preprocessed listings and preview the first ten rows.
df = pd.read_csv(filepath_or_buffer="datasets/NY-House-Dataset-Preprocessed.csv")
df.head(n=10)

Unnamed: 0,PRICE,BEDS,BATH,PROPERTYSQFT,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,265000,1,1,750.0,True,False,False,False,False
1,375000,2,1,850.0,True,False,False,False,False
2,350000,1,1,700.0,True,False,False,False,False
3,199900,1,1,2184.207862,True,False,False,False,False
4,600000,2,2,2184.207862,True,False,False,False,False
5,1295000,4,2,1995.0,True,False,False,False,False
6,1249000,3,2,2184.207862,True,False,False,False,False
7,650000,3,2,1638.0,True,False,False,False,False
8,90000,1,1,450.0,True,False,False,False,False
9,549000,5,3,496.0,True,False,False,False,False


Let's define our dataframe X and Y. X will be the features and Y will be the target variable.

In [8]:
# Target: the PRICE column. Features: every remaining column.
Y = df['PRICE']
X = df.drop('PRICE', axis=1)

Let's split the data into training and testing sets.

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=10,
)

Let's form our Linear Regression model and train it with the training data.

In [10]:
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training are chained.
lr_clf = LinearRegression().fit(X_train, Y_train)
# Score the baseline on the held-out split.
lr_clf.score(X_test, Y_test)

0.5436968167278664

The model reaches an R² score of 0.54 on the test set (for regressors, `score` returns the coefficient of determination, not a classification accuracy), which is not a good fit. Let's evaluate it more robustly using cross-validation.

In [11]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Five random 80/20 shuffles of the full data set, seeded for repeatability.
cv = ShuffleSplit(
    n_splits=5,
    test_size=0.2,
    random_state=0,
)
cross_val_score(estimator=LinearRegression(), X=X, y=Y, cv=cv)

array([0.52071553, 0.46100182, 0.47584741, 0.48607382, 0.50625285])

The five cross-validation R² scores average about 0.49, which is not a good fit. Let's try to improve the score by using other models.

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def find_best_model_using_gridsearchcv(X, Y, random_state=0):
    """Grid-search several regressors and report the best CV result of each.

    Every candidate is tuned with ``GridSearchCV`` over its own parameter grid
    using the same 5-split, 20%-test ``ShuffleSplit``. Estimators that are
    sensitive to feature scale (linear models, SVR, KNN) are wrapped in a
    ``StandardScaler`` pipeline so scaling is fitted inside each CV fold.

    Args:
        X: Feature matrix.
        Y: Target values.
        random_state: Seed used for the CV splitter and for every estimator
            that accepts one. Seeding the estimators makes the returned table
            reproducible between runs; the default keeps the original CV
            splits.

    Returns:
        A DataFrame with one row per algorithm and the columns ``model``,
        ``best_score`` (GridSearchCV's ``best_score_``) and ``best_params``.
    """
    algos = {
        'linear_regression': {
            'model': Pipeline([
                ('scaler', StandardScaler()),
                ('lr', LinearRegression())
            ]),
            'params': {
                'lr__fit_intercept': [True, False]
            }
        },
        'lasso': {
            'model': Pipeline([
                ('scaler', StandardScaler()),
                # The seed only matters for selection='random'.
                ('lasso', Lasso(max_iter=10000, random_state=random_state))
            ]),
            'params': {
                'lasso__alpha': [100, 200, 500],
                'lasso__selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(random_state=random_state),
            'params': {
                'criterion': ['squared_error', 'friedman_mse'],
                'splitter': ['best', 'random']
            }
        },
        'random_forest': {
            'model': RandomForestRegressor(random_state=random_state),
            'params': {
                'n_estimators': [10, 50, 100],
                'criterion': ['squared_error', 'friedman_mse'],
                'max_features': ['sqrt', 'log2', None]
            }
        },
        'svr': {
            'model': Pipeline([
                ('scaler', StandardScaler()),
                ('svr', SVR())
            ]),
            'params': {
                'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                'svr__C': [0.1, 1, 10],
                'svr__gamma': ['scale', 'auto']
            }
        },
        'gradient_boosting': {
            'model': GradientBoostingRegressor(random_state=random_state),
            'params': {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7]
            }
        },
        'knn': {
            'model': Pipeline([
                ('scaler', StandardScaler()),
                ('knn', KNeighborsRegressor())
            ]),
            'params': {
                'knn__n_neighbors': [3, 5, 7],
                'knn__weights': ['uniform', 'distance'],
                'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
            }
        }
    }
    scores = []
    # One shared splitter so every algorithm is scored on identical folds.
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state)
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X, Y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

find_best_model_using_gridsearchcv(X, Y)


Unnamed: 0,model,best_score,best_params
0,linear_regression,0.490308,{'lr__fit_intercept': True}
1,lasso,0.490015,"{'lasso__alpha': 500, 'lasso__selection': 'cyc..."
2,decision_tree,0.596572,"{'criterion': 'squared_error', 'splitter': 'be..."
3,random_forest,0.716603,"{'criterion': 'squared_error', 'max_features':..."
4,svr,-0.071964,"{'svr__C': 10, 'svr__gamma': 'scale', 'svr__ke..."
5,gradient_boosting,0.734576,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti..."
6,knn,0.68296,"{'knn__algorithm': 'ball_tree', 'knn__n_neighb..."


We can see that the best-scoring model is the Gradient Boosting Regressor, with a cross-validated R² of 0.73. Let's try to improve the score by tuning the hyperparameters of the model.

In [29]:
def find_best_params_for_gradient_boosting(X, Y, random_state=0):
    """Grid-search GradientBoostingRegressor and return its best parameters.

    Args:
        X: Feature matrix.
        Y: Target values.
        random_state: Seed for the CV splitter and for the regressor itself,
            so repeated runs select the same parameters. The default keeps
            the original CV splits.

    Returns:
        The ``best_params_`` dict of the fitted ``GridSearchCV`` with the keys
        ``n_estimators``, ``learning_rate`` and ``max_depth``.
    """
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state)
    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    }
    gs = GridSearchCV(
        GradientBoostingRegressor(random_state=random_state),
        param_grid,
        cv=cv,
        return_train_score=False,
    )
    gs.fit(X, Y)
    return gs.best_params_

# NOTE(review): the search runs on all of X/Y, which includes the rows held out
# as X_test; a hold-out score computed with these parameters is therefore not
# fully independent. Consider tuning on X_train/Y_train instead.
best_params = find_best_params_for_gradient_boosting(X, Y)
best_params


{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

Using these parameters, let's check the model's score on the test set.

In [30]:
# Build the regressor from the three tuned settings and train it in one chain
# (fit() returns the estimator itself).
model = GradientBoostingRegressor(
    **{name: best_params[name] for name in ('n_estimators', 'learning_rate', 'max_depth')}
).fit(X_train, Y_train)
# Score on the held-out split.
model.score(X_test, Y_test)

0.7148037661770301

The Gradient Boosting Regressor reaches a cross-validated R² of about 0.73 (0.71 on our held-out test set). Let's see whether XGBoost can do better.

In [32]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

def find_best_params_for_xgboost(X, Y):
    """Run a randomized hyperparameter search for XGBRegressor.

    Fifty parameter combinations are sampled and each is scored on five
    shuffled 80/20 splits; both the splitter and the sampler are seeded with 0.

    Args:
        X: Feature matrix.
        Y: Target values.

    Returns:
        The ``best_params_`` dict of the fitted ``RandomizedSearchCV``.
    """
    # scipy conventions: randint(low, high) excludes `high`;
    # uniform(loc, scale) samples from [loc, loc + scale].
    search_space = {
        'n_estimators': randint(50, 200),
        'learning_rate': uniform(0.01, 0.19),
        'max_depth': randint(3, 7),
        'subsample': uniform(0.8, 0.2),
        'colsample_bytree': uniform(0.8, 0.2),
    }
    splitter = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    search = RandomizedSearchCV(
        xgb.XGBRegressor(),
        search_space,
        n_iter=50,
        cv=splitter,
        random_state=0,
        n_jobs=-1,
    )
    return search.fit(X, Y).best_params_

best_params = find_best_params_for_xgboost(X=X, Y=Y)
best_params

{'colsample_bytree': 0.8947200838693316,
 'learning_rate': 0.04540314523208439,
 'max_depth': 5,
 'n_estimators': 78,
 'subsample': 0.9136867897737297}

Using these parameters, let's check the model's score on the test set.

In [34]:
# Build the XGBoost regressor from the five tuned settings and train it in one
# chain (fit() returns the estimator itself).
model = xgb.XGBRegressor(
    **{
        name: best_params[name]
        for name in ('n_estimators', 'learning_rate', 'max_depth', 'subsample', 'colsample_bytree')
    }
).fit(X_train, Y_train)
# Score on the held-out split.
model.score(X_test, Y_test)

0.767428994178772

Now let's do the prediction using the XGBoost model.

In [40]:
# Display the feature columns in order; predict_price builds its input vector by these positions.
X.columns

Index(['BEDS', 'BATH', 'PROPERTYSQFT', 'Bronx', 'Brooklyn', 'Manhattan',
       'Queens', 'Staten Island'],
      dtype='object')

In [53]:
def predict_price(sublocality, sqft, bath, beds):
    """Predict a listing price with the trained global ``model``.

    The input row follows the order of ``X.columns``: positions 0-2 hold
    beds, bath and sqft, and the remaining positions are the one-hot
    sublocality indicator columns.

    Args:
        sublocality: Name of the sublocality; must match one of the indicator
            columns of ``X`` exactly (case-sensitive), e.g. ``'Staten Island'``.
        sqft: Property area, written to position 2.
        bath: Number of bathrooms, written to position 1.
        beds: Number of bedrooms, written to position 0.

    Returns:
        The first (only) element of ``model.predict`` for the constructed row.

    Raises:
        ValueError: If ``sublocality`` is not one of the indicator columns.
    """
    n_numeric = 3  # beds, bath and sqft occupy the first three slots
    matches = np.where(X.columns == sublocality)[0]

    # Previously matches[0] was taken unconditionally: an unknown name crashed
    # with an opaque IndexError (the later `>= 0` guard could never be False),
    # and a numeric column name would have overwritten a feature value.
    if matches.size == 0 or matches[0] < n_numeric:
        valid = ', '.join(map(str, X.columns[n_numeric:]))
        raise ValueError(
            f"Unknown sublocality {sublocality!r}; expected one of: {valid}"
        )

    x = np.zeros(len(X.columns))
    x[0] = beds
    x[1] = bath
    x[2] = sqft
    x[matches[0]] = 1

    return model.predict([x])[0]

In [54]:
predict_price(sublocality='Staten Island', sqft=1000, bath=2, beds=2)

696619.1

In [55]:
predict_price(sublocality='Staten Island', sqft=1000, bath=1, beds=1)

403649.22

We can observe that the predictions make sense: the listing with more bedrooms and bathrooms gets the higher predicted price.

In [56]:
import pickle

# Serialize the trained model to disk.
with open('ny_house_price_model.pickle', mode='wb') as model_file:
    pickle.dump(model, model_file)

In [57]:
import json

# Save the lower-cased feature names, in column order, as JSON.
columns = {'data_columns': [name.lower() for name in X.columns]}
with open("columns.json", "w") as columns_file:
    json.dump(columns, columns_file)