# General imports

In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import statsmodels.api as sm
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score

from lazypredict.Supervised import LazyRegressor
from sklearn import datasets
from sklearn.utils import shuffle

# Load the dataset

In [18]:
# Read the for-sale listings CSV into `df` and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where that file exists. Consider a configurable data directory.
csv_path = '/Users/Admin/Experiments/df_dubai_final_for_sale.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,location,property_type,price_aed,beds,baths,size_sqft,pool,balcony,maid,gym,brand_new,burj_view,furnished,ready,freehold,concierge,security,intercom,sea_view,beach
0,Palm Jumeirah,Apartment,4600000.0,3,4,2266.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,Dubai Marina,Apartment,2000000.0,2,2,1350.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,Arjan,Apartment,740000.0,1,2,776.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,Downtown Dubai,Apartment,9999900.0,3,5,1993.0,0,0,0,0,0,0,1,1,0,0,0,0,0,0
4,Business Bay,Apartment,2450000.0,2,3,1250.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [19]:
# Remove amenity/status flags that are not used as model inputs.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell re-runnable: executing it a second time no longer raises KeyError
# because the columns are already gone.
columns_to_drop = ['freehold', 'ready', 'concierge', 'security', 'intercom']
df = df.drop(columns=columns_to_drop, errors='ignore')

# Categorical variables

In [20]:
# One-hot encode every remaining categorical column.
df = pd.get_dummies(df)

# Report the resulting width of the frame. Note that this count still
# includes the target column `price_aed`, which is split off later.
n_columns = df.shape[-1]
print(f"Now we have {n_columns} features!")
df

Now we have 120 features!


Unnamed: 0,price_aed,beds,baths,size_sqft,pool,balcony,maid,gym,brand_new,burj_view,...,location_Umm Suqeim,location_Wadi Al Safa 2,location_Wasl Gate,location_World Trade Centre,location_Za'abeel,property_type_Apartment,property_type_Hotel Apartment,property_type_Penthouse,property_type_Townhouse,property_type_Villa
0,4600000.00,3,4,2266.00,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,2000000.00,2,2,1350.00,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,740000.00,1,2,776.00,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,9999900.00,3,5,1993.00,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2450000.00,2,3,1250.00,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51223,5999999.00,4,5,4882.00,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
51224,1260000.00,2,3,1438.00,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
51225,930000.00,3,3,1208.00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
51226,5000000.00,4,5,5200.00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Train/Test splits

In [21]:
# Separate the target (price_aed) from the predictor columns.
y = df['price_aed'].values
X = df.drop(columns='price_aed')
print(f"X: {X.shape}, Y: {y.shape}")

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps
# the partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12345,
)
n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f" Size of train: {n_train} \n Size of test: {n_test}")

X: (51228, 119), Y: (51228,)
 Size of train: 40982 
 Size of test: 10246


In [22]:
# Random Forest

# Baseline forest with default hyper-parameters. The seed is fixed (same value
# as the train/test split) so the cross-validation score below -- and any later
# search that reuses `rf` -- is reproducible from a fresh kernel.
rf = RandomForestRegressor(random_state=12345)

# 3-fold cross-validated R2 on the training split only; the test split is
# not touched here.
score = cross_val_score(rf, X_train, y_train, scoring='r2', cv=3, n_jobs=3)
print(f"Mean of CV R2 score: {np.mean(score)}")

Mean of CV R2 score: 0.9384704764867551


In [23]:
# Small grid over the number of trees; the split criterion is pinned to
# squared error. A sweep over max_features (1.0, 'sqrt', 'log2') was drafted
# but is left disabled -- note scikit-learn spells the logarithmic option 'log2'.
parameters = dict(
    n_estimators=[10, 100],
    criterion=['squared_error'],
)

# Exhaustive search with 3-fold CV, ranked by R2, fitted on the training split.
gs = GridSearchCV(estimator=rf, param_grid=parameters, scoring='r2', cv=3)
gs.fit(X_train, y_train)

# Keep the winning estimator and its CV score for the cells that follow.
best_rf_model, best_rf_score = gs.best_estimator_, gs.best_score_
print(f"Best parameters: {gs.best_params_}\nBest score: {best_rf_score}")

Best parameters: {'criterion': 'squared_error', 'n_estimators': 100}
Best score: 0.9384313087383692


In [24]:
# Score the tuned forest on the held-out test split (R2 for regressors).
test_r2 = best_rf_model.score(X_test, y_test)
print(f"Test R2 score is: {test_r2}")

Test R2 score is: 0.940507471442091


In [25]:
import pickle

# Persist the tuned forest to disk. The context manager guarantees the file
# handle is flushed and closed before any later cell reads the file back;
# the bare open(...) used previously never closed it explicitly.
filename = 'final_model.p'
with open(filename, 'wb') as model_file:
    pickle.dump(best_rf_model, model_file)

In [26]:
# Sanity Check

# Load the model back from disk and re-score it on the test split; the printed
# value should match the test R2 of the in-memory model.
# NOTE: pickle.load can execute arbitrary code, so only use it on trusted
# files such as the one this notebook writes itself.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, y_test)
print(result)

0.940507471442091


0.9477132261851665