In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np

from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

from joblib import dump, load

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/digipodium/Datasets/main/house_pricing.csv')
df.info()
df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 814 entries, 0 to 813
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        814 non-null    object 
 1   Type        814 non-null    object 
 2   Beds        814 non-null    int64  
 3   Baths       814 non-null    int64  
 4   SquareFeet  814 non-null    int64  
 5   Price       814 non-null    float64
dtypes: float64(1), int64(3), object(2)
memory usage: 38.3+ KB


Unnamed: 0,City,Type,Beds,Baths,SquareFeet,Price
0,SACRAMENTO,Residential,2,1,836,138159.85
1,SACRAMENTO,Residential,3,1,1167,167541.46


In [3]:
X = df[['Beds','Baths','SquareFeet']]
y = df['Price']
xtrain,xtest,ytrain,ytest = train_test_split(X,y,test_size=.2, random_state=1)

In [4]:
print('Random Forest')
model2 = RandomForestRegressor()
model2.fit(xtrain,ytrain)
print("score:", model2.score(xtest,ytest) * 100)
pred = model2.predict(X)
print("mse:",mean_squared_error(y,pred))
print("mae:",mean_absolute_error(y,pred))

Random Forest
score: 71.84479403656894
mse: 886771684.4967629
mae: 15308.941511423434


grid search

In [5]:
# we are going to create a dictionary with all the parameter and their value options

In [6]:
params = {
    'n_estimators' : list(range(100,501,200)),
    'criterion' : ['squared_error','absolute_error','poisson'],
    'max_depth' : list(range(5,51,25)),
}
params

{'n_estimators': [100, 300, 500],
 'criterion': ['squared_error', 'absolute_error', 'poisson'],
 'max_depth': [5, 30]}

In [7]:
grid = GridSearchCV(estimator=RandomForestRegressor(),param_grid=params,cv=3,n_jobs=-1,verbose=2)

In [8]:
grid.fit(X,y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'criterion': ['squared_error', 'absolute_error',
                                       'poisson'],
                         'max_depth': [5, 30],
                         'n_estimators': [100, 300, 500]},
             verbose=2)

In [9]:
gf = pd.DataFrame(grid.cv_results_)

In [10]:
gf.sort_values(by='rank_test_score',inplace=True)
gf

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
7,3.696626,0.060646,0.093399,0.011905,absolute_error,5,300,"{'criterion': 'absolute_error', 'max_depth': 5...",0.825199,0.766921,0.61455,0.735556,0.088811,1
8,5.883177,0.085739,0.173124,0.017975,absolute_error,5,500,"{'criterion': 'absolute_error', 'max_depth': 5...",0.824304,0.767459,0.614042,0.735268,0.088806,2
6,1.278573,0.083738,0.038693,0.0033,absolute_error,5,100,"{'criterion': 'absolute_error', 'max_depth': 5...",0.822383,0.767525,0.61381,0.734573,0.08828,3
1,1.405331,0.031563,0.116416,0.003303,squared_error,5,300,"{'criterion': 'squared_error', 'max_depth': 5,...",0.803843,0.767904,0.609854,0.7272,0.084264,4
0,0.53071,0.01606,0.039696,0.003773,squared_error,5,100,"{'criterion': 'squared_error', 'max_depth': 5,...",0.79823,0.765024,0.617658,0.72697,0.078475,5
2,2.375352,0.01272,0.168453,0.010538,squared_error,5,500,"{'criterion': 'squared_error', 'max_depth': 5,...",0.799472,0.766913,0.613418,0.726601,0.081129,6
11,8.693505,0.07233,0.204813,0.009037,absolute_error,30,500,"{'criterion': 'absolute_error', 'max_depth': 3...",0.760777,0.757983,0.575691,0.69815,0.086599,7
10,5.19202,0.046266,0.115749,0.017606,absolute_error,30,300,"{'criterion': 'absolute_error', 'max_depth': 3...",0.763154,0.756123,0.574352,0.697876,0.087392,8
9,1.738567,0.039231,0.042364,0.006604,absolute_error,30,100,"{'criterion': 'absolute_error', 'max_depth': 3...",0.76459,0.761569,0.567296,0.697819,0.092301,9
3,0.644124,0.027152,0.050369,0.009111,squared_error,30,100,"{'criterion': 'squared_error', 'max_depth': 30...",0.75453,0.754013,0.581251,0.696598,0.081563,10


In [11]:
grid.best_estimator_

RandomForestRegressor(criterion='absolute_error', max_depth=5, n_estimators=300)

In [12]:
from joblib import dump

In [13]:
dump(grid.best_estimator_,'house_pricing_model_73.pkl')

['house_pricing_model_73.pkl']