In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
df= pd.read_csv('new_cat_processed_data.csv')
df

Unnamed: 0,Address,City,Price,Bedroom,Bathroom,Floors,Parking,Face,Year,Area,Road Width,Road Type,Build Area,Amenities
0,Budhanilkantha,Kathmandu,90000000,6,3,2.0,10,West,2073.0,5476.000,20.0,Blacktopped,98568.000,16
1,Budhanilkantha,Kathmandu,80000000,5,3,2.0,9,East,2073.0,7187.250,20.0,Blacktopped,7187.250,15
2,Dhapasi,Kathmandu,70000000,5,3,2.0,12,East,2071.0,5818.250,20.0,Blacktopped,55102.250,16
3,Budhanilkantha,Kathmandu,50000000,5,4,2.0,9,East,2073.0,20963.000,20.0,Blacktopped,20963.000,16
4,Other,Kathmandu,32500000,6,4,2.5,1,South East,2065.0,1369.000,12.0,Concrete,819346.500,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1602,Other,Others,6400000,3,1,1.0,0,North,2074.0,133.350,15.0,Soil Stabilized,133.350,4
1603,Other,Others,2200000,0,0,3.0,0,North,2074.0,23137.000,20.0,Gravelled,23137.000,0
1604,Other,Others,25000000,10,4,3.0,0,East,2074.0,155.575,20.0,Gravelled,155.575,6
1605,Other,Others,10000000,0,0,3.0,0,East,2074.0,21305.250,25.0,,21305.250,0


In [3]:
df.columns

Index(['Address', 'City', 'Price', 'Bedroom', 'Bathroom', 'Floors', 'Parking',
       'Face', 'Year', 'Area', 'Road Width', 'Road Type', 'Build Area',
       'Amenities'],
      dtype='object')

In [4]:
y = df.Price
y.head(3)

0    90000000
1    80000000
2    70000000
Name: Price, dtype: int64

In [5]:
X = df.drop(['Price'],axis='columns')
X.head(3)

Unnamed: 0,Address,City,Bedroom,Bathroom,Floors,Parking,Face,Year,Area,Road Width,Road Type,Build Area,Amenities
0,Budhanilkantha,Kathmandu,6,3,2.0,10,West,2073.0,5476.0,20.0,Blacktopped,98568.0,16
1,Budhanilkantha,Kathmandu,5,3,2.0,9,East,2073.0,7187.25,20.0,Blacktopped,7187.25,15
2,Dhapasi,Kathmandu,5,3,2.0,12,East,2071.0,5818.25,20.0,Blacktopped,55102.25,16


In [6]:
print(X.shape, y.shape)

(1607, 13) (1607,)


In [7]:
df.dtypes

Address        object
City           object
Price           int64
Bedroom         int64
Bathroom        int64
Floors        float64
Parking         int64
Face           object
Year          float64
Area          float64
Road Width    float64
Road Type      object
Build Area    float64
Amenities       int64
dtype: object

In [8]:
# determine categorical and numerical features
numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns

In [9]:
numerical_ix

Index(['Bedroom', 'Bathroom', 'Floors', 'Parking', 'Year', 'Area',
       'Road Width', 'Build Area', 'Amenities'],
      dtype='object')

In [10]:
categorical_ix

Index(['Address', 'City', 'Face', 'Road Type'], dtype='object')

In [11]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
# define the data preparation for the columns
t = [('cat', OneHotEncoder(handle_unknown = 'ignore'), categorical_ix), ('num', MinMaxScaler(), numerical_ix)]
col_transform = ColumnTransformer(transformers=t)
col_transform

ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore'),
                                 Index(['Address', 'City', 'Face', 'Road Type'], dtype='object')),
                                ('num', MinMaxScaler(),
                                 Index(['Bedroom', 'Bathroom', 'Floors', 'Parking', 'Year', 'Area',
       'Road Width', 'Build Area', 'Amenities'],
      dtype='object'))])

In [12]:
from sklearn.ensemble import RandomForestRegressor
# define the model
# rf_model = RandomForestRegressor(bootstrap = True, max_depth= 10, max_features = 'auto',
#  min_samples_leaf = 4,
#  min_samples_split = 5,
#  n_estimators = 200)
rf_model = RandomForestRegressor(bootstrap = False, max_depth= None, max_features = 'sqrt',
 min_samples_leaf = 2,
 min_samples_split = 5,
 n_estimators = 1400)
# rf_model = RandomForestRegressor(bootstrap = True, max_depth= None, max_features = 'auto',
#  min_samples_leaf = 2,
#  min_samples_split = 2,
#  n_estimators = 1800)
# define the data preparation and modeling pipeline
from sklearn.pipeline import Pipeline
pipeline = Pipeline(steps=[('prep',col_transform), ('m', rf_model)])

In [14]:
from numpy import std
from numpy import absolute
from statistics import mean
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# define the model cross-validation configuration
cv = KFold(n_splits=10, shuffle=True, random_state=1)
# evaluate the pipeline using cross validation and calculate MAE
scores = cross_val_score(pipeline, X, y, scoring='neg_mean_absolute_error', cv=cv,
n_jobs=-1)
# convert MAE scores to positive values
scores = absolute(scores)
# summarize the model performance
print('MAE: %.3f (%.3f)' % (mean(scores), std(scores)))

MAE: 6092321.977 (610840.008)


In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [16]:
X_train.shape

(1285, 13)

In [17]:
X_train

Unnamed: 0,Address,City,Bedroom,Bathroom,Floors,Parking,Face,Year,Area,Road Width,Road Type,Build Area,Amenities
989,Samakhusi,Kathmandu,0,0,3.0,0,West,2074.0,2053.500,10.0,Gravelled,2053.500,0
1564,Other,Others,0,0,0.0,0,South,2074.0,4920.000,32.0,Blacktopped,4920.000,0
935,Other,Kathmandu,0,0,3.0,0,North,2074.0,2395.750,20.0,,2395.750,0
1129,Other,Kathmandu,0,0,3.0,0,East,2074.0,6383.000,16.0,,6383.000,0
723,Budhanilkantha,Kathmandu,0,0,3.0,0,North,2074.0,4791.500,13.0,Soil Stabilized,4791.500,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1393,Imadol,Lalitpur,2,1,1.0,0,East,2074.0,11964.750,13.0,Gravelled,11964.750,0
1344,Bhaisepati,Lalitpur,0,0,3.0,0,North East,2074.0,9343.500,16.0,Blacktopped,9343.500,3
527,Other,Kathmandu,5,6,3.0,3,East,2076.0,2224.625,15.0,Blacktopped,2224.625,10
1149,Other,Pokhara,0,0,3.0,0,South West,2074.0,7187.250,30.0,Blacktopped,7187.250,0


In [18]:
pipeline.fit(X_train,y_train)
pipeline

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  Index(['Address', 'City', 'Face', 'Road Type'], dtype='object')),
                                                 ('num', MinMaxScaler(),
                                                  Index(['Bedroom', 'Bathroom', 'Floors', 'Parking', 'Year', 'Area',
       'Road Width', 'Build Area', 'Amenities'],
      dtype='object'))])),
                ('m',
                 RandomForestRegressor(bootstrap=False, max_features='sqrt',
                                       min_samples_leaf=2, min_samples_split=5,
                                       n_estimators=1400))])

In [19]:
print("Training accuracy:" ,pipeline.score(X_train,y_train))

Training accuracy: 0.9158480141812436


In [20]:
prediction = pipeline.predict(X_test)

In [21]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, prediction)
# rmse = mse**.5
print("MSE", mse)
from sklearn.metrics import mean_absolute_error
print("MAE",mean_absolute_error(y_test,prediction))
print("RMSE",np.sqrt(mean_squared_error(y_test,prediction)))
print("RMSLE",np.log(np.sqrt(mean_squared_error(y_test,prediction))))
from sklearn.metrics import r2_score
r2 = r2_score(y_test,prediction)
print("R2 score(COD)",r2)

MSE 58157308622458.89
MAE 5578873.895300892
RMSE 7626093.929559148
RMSLE 15.847086336339155
R2 score(COD) 0.8025355551033202


In [22]:
from sklearn.ensemble import GradientBoostingRegressor
# Fit the Gradient Boosting Regressor with the best parameters
best_gb = GradientBoostingRegressor(learning_rate=0.039122914019804195,
                                max_depth=5,
                                max_features='log2',
                                min_samples_leaf=2,
                                min_samples_split=5,
                                n_estimators=575)
pipeline2 = Pipeline(steps=[('prep',col_transform), ('m', best_gb)])

In [23]:
pipeline2.fit(X_train,y_train)
pipeline2

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  Index(['Address', 'City', 'Face', 'Road Type'], dtype='object')),
                                                 ('num', MinMaxScaler(),
                                                  Index(['Bedroom', 'Bathroom', 'Floors', 'Parking', 'Year', 'Area',
       'Road Width', 'Build Area', 'Amenities'],
      dtype='object'))])),
                ('m',
                 GradientBoostingRegressor(learning_rate=0.039122914019804195,
                                           max_depth=5, max_features='log2',
                                           min_samples_leaf=2,
                                           min_samples_split=5,
                                           n_estimators=575))])

In [24]:
print("Training accuracy:" ,pipeline2.score(X_train,y_train))

Training accuracy: 0.9014447420358216


In [25]:
prediction = pipeline2.predict(X_test)

In [26]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, prediction)
# rmse = mse**.5
print("MSE", mse)
from sklearn.metrics import mean_absolute_error
print("MAE",mean_absolute_error(y_test,prediction))
print("RMSE",np.sqrt(mean_squared_error(y_test,prediction)))
print("RMSLE",np.log(np.sqrt(mean_squared_error(y_test,prediction))))
from sklearn.metrics import r2_score
r2 = r2_score(y_test,prediction)
print("R2 score(COD)",r2)

MSE 59579030262351.625
MAE 5698153.162754291
RMSE 7718745.381365525
RMSLE 15.859162393426745
R2 score(COD) 0.7977083118716656


In [29]:
import pickle
pickle.dump(pipeline, open('pipeline.pkl', 'wb'))