In [3246]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection

In [3247]:
data = pd.read_csv("./vadodara_house_price_dataset_new.csv")
data.head()

Unnamed: 0,h_type,location,society,size,bathroom,balcony,total_sqft,yr_built,furniture,sale_type,...,college,hospital,population,railway,airport,on_road,air_quality,restaurant,park,price
0,apartment,Maneja,Bakeri Swara,3 BHK,3,1,1550.0,2011.0,0,new,...,0,1,2,0,0,1,1,1,1,4361705
1,apartment,Maneja,Bakeri Swara,3 BHK,3,1,1550.0,2012.0,1,new,...,0,1,2,0,0,1,1,1,1,5001905
2,apartment,Maneja,Bakeri Swara,3 BHK,3,1,1860.0,2010.0,1,new,...,0,1,2,0,0,1,1,1,1,5588795
3,apartment,Maneja,Bakeri Swara,2 BHK,2,1,1015.0,2016.0,1,new,...,0,1,2,0,0,1,1,1,1,3184740
4,apartment,Maneja,Bakeri Swara,2 BHK,2,1,1210.0,2019.0,0,new,...,0,1,2,0,0,1,1,1,1,3498895


In [3248]:
data = data.drop(['society', 'yr_built', 'furniture', 'sale_type', 'amenities', 'market', 'office', 'school', 'college', 'hospital', 'population', 'railway', 'airport', 'on_road', 'air_quality', 'restaurant', 'park'], axis=1)
data.head()

Unnamed: 0,h_type,location,size,bathroom,balcony,total_sqft,price
0,apartment,Maneja,3 BHK,3,1,1550.0,4361705
1,apartment,Maneja,3 BHK,3,1,1550.0,5001905
2,apartment,Maneja,3 BHK,3,1,1860.0,5588795
3,apartment,Maneja,2 BHK,2,1,1015.0,3184740
4,apartment,Maneja,2 BHK,2,1,1210.0,3498895


In [3249]:
data.isnull().sum()

h_type        0
location      0
size          0
bathroom      0
balcony       0
total_sqft    0
price         0
dtype: int64

In [3250]:
df = data.iloc[ : , :-1].values
df

array([['apartment', 'Maneja', '3 BHK', 3, 1, 1550.0],
       ['apartment', 'Maneja', '3 BHK', 3, 1, 1550.0],
       ['apartment', 'Maneja', '3 BHK', 3, 1, 1860.0],
       ...,
       ['apartment', 'Gotri', '3 BHK', 3, 3, 1550.0],
       ['apartment', 'Gotri', '3 BHK', 3, 2, 1750.0],
       ['apartment', 'Gotri', '2 BHK', 2, 3, 1200.0]], dtype=object)

In [3251]:
label_encoder = LabelEncoder()

In [3252]:
df[ : ,0] = label_encoder.fit_transform(df[ : ,0])
df

array([[0, 'Maneja', '3 BHK', 3, 1, 1550.0],
       [0, 'Maneja', '3 BHK', 3, 1, 1550.0],
       [0, 'Maneja', '3 BHK', 3, 1, 1860.0],
       ...,
       [0, 'Gotri', '3 BHK', 3, 3, 1550.0],
       [0, 'Gotri', '3 BHK', 3, 2, 1750.0],
       [0, 'Gotri', '2 BHK', 2, 3, 1200.0]], dtype=object)

In [3253]:
mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
mapping

{'apartment': 0,
 'duplex': 1,
 'pent house': 2,
 'tenament': 3,
 'triplex': 4,
 'villa': 5}

In [3254]:
df[ : ,1] = label_encoder.fit_transform(df[ : ,1])
df

array([[0, 15, '3 BHK', 3, 1, 1550.0],
       [0, 15, '3 BHK', 3, 1, 1550.0],
       [0, 15, '3 BHK', 3, 1, 1860.0],
       ...,
       [0, 8, '3 BHK', 3, 3, 1550.0],
       [0, 8, '3 BHK', 3, 2, 1750.0],
       [0, 8, '2 BHK', 2, 3, 1200.0]], dtype=object)

In [3255]:
mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
mapping

{'Ajwa Road': 0,
 'Akota': 1,
 'Alkapuri': 2,
 'Atladra': 3,
 'Bhayli': 4,
 'Chhani': 5,
 'Fatehgunj': 6,
 'Gorwa': 7,
 'Gotri': 8,
 'Harni': 9,
 'Karelibaug': 10,
 'Khodiyar Nagar': 11,
 'Laxmipura': 12,
 'Madhav Pura': 13,
 'Mandvi': 14,
 'Maneja': 15,
 'Manjalpur': 16,
 'Navapura': 17,
 'New Alkapuri': 18,
 'New Karelibaugh': 19,
 'New Sama': 20,
 'New VIP Road': 21,
 'Sama': 22,
 'Sayajipura': 23,
 'Soma Talav': 24,
 'Vasant Vihar': 25,
 'Vasna Road': 26,
 'Vasna-Bhayli Road': 27,
 'Waghodia Road': 28}

In [3256]:
df[ : ,2] = label_encoder.fit_transform(df[ : ,2])
df

array([[0, 15, 2, 3, 1, 1550.0],
       [0, 15, 2, 3, 1, 1550.0],
       [0, 15, 2, 3, 1, 1860.0],
       ...,
       [0, 8, 2, 3, 3, 1550.0],
       [0, 8, 2, 3, 2, 1750.0],
       [0, 8, 1, 2, 3, 1200.0]], dtype=object)

In [3257]:
mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
mapping

{'1 BHK': 0, '2 BHK': 1, '3 BHK': 2, '4 BHK': 3, '5 BHK': 4}

In [3258]:
y = data.price.values
X = df

In [3259]:
X_train, X_val, y_train, y_val = train_test_split(X, y,test_size=0.2)

In [3260]:
standard_x = StandardScaler()

In [3261]:
X_train = standard_x.fit_transform(X_train)
X_val = standard_x.transform(X_val)

In [3262]:
print('Train_Shape: ',X_train.shape)
print("\nX_train:")
X_train

Train_Shape:  (272, 6)

X_train:


array([[-0.42808807, -1.40957467,  0.32060177, -0.66571012, -0.58368999,
         0.3085257 ],
       [-0.42808807,  0.62525268,  0.32060177, -0.66571012,  1.62136109,
        -0.2147163 ],
       [-0.42808807,  0.19686797,  0.32060177,  0.39942607, -0.58368999,
         0.1486462 ],
       ...,
       [-0.42808807, -0.44570909,  0.32060177,  0.39942607,  0.51883555,
        -0.5577305 ],
       [-0.42808807, -0.76699761, -0.85782636, -0.66571012, -0.58368999,
        -0.8687688 ],
       [ 1.16697981,  0.83944503,  0.32060177,  0.39942607,  0.51883555,
         1.60209621]])

In [3263]:
print('Val_Shape: ',X_val.shape)
print("\nX_val:")
X_val

Val_Shape:  (68, 6)

X_val:


array([[ 0.36944587,  1.26782973,  0.32060177,  0.39942607, -0.58368999,
         1.05705246],
       [-0.42808807, -0.33861291,  1.49902989,  2.52969847,  3.82641217,
         2.95671162],
       [-0.42808807, -1.08828614, -0.85782636, -0.66571012, -0.58368999,
        -0.4327338 ],
       [-0.42808807, -0.33861291, -0.85782636, -0.66571012,  0.51883555,
         0.11230995],
       [-0.42808807,  0.9465412 , -0.85782636, -0.66571012, -0.58368999,
        -1.09696045],
       [-0.42808807,  1.48202208, -0.85782636, -0.66571012,  0.51883555,
        -0.49813905],
       [-0.42808807, -0.44570909, -0.85782636, -0.66571012, -1.68621553,
        -0.28012155],
       [-0.42808807,  0.5181565 ,  1.49902989,  1.46456227,  0.51883555,
         2.11080371],
       [ 2.76204769,  0.30396415,  1.49902989,  0.39942607,  0.51883555,
         1.09338871],
       [-0.42808807, -0.44570909, -0.85782636, -0.66571012, -0.58368999,
        -0.18710075],
       [-0.42808807,  1.37492591, -0.85782636, -0.

In [3264]:
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)

In [3265]:
regressor.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [3266]:
filename = 'vadodara_house_model.pkl'

In [3267]:
pickle.dump(regressor, open(filename, 'wb'))

In [3268]:
loaded_model = pickle.load(open(filename, 'rb'))

In [3269]:
accuracy = loaded_model.score(X_val, y_val)

In [3270]:
print(accuracy*100, '%')

82.23599039594002 %


In [3271]:
predictions = loaded_model.predict(X_val)

In [3272]:
predictions

array([ 7509500. , 17865000. ,  2515100. ,  3264400. ,  2557800. ,
        2582600. ,  3740100. , 11065500. ,  8630300. ,  3122200. ,
        2547600. ,  8291579.5,  3280100. ,  1458100. ,  4014600. ,
        1495300. ,  4082561. ,  7977908.4,  2921800. ,  3707100. ,
        5898800. ,  2850100. ,  3639900. ,  6982100. ,  3227400. ,
        7245100. ,  2322000. ,  2750800. ,  4105200. ,  4054600. ,
        3060100. ,  1875500. ,  3776600. ,  9735000. ,  2897400. ,
        6126100. ,  4488100. ,  2615300. ,  5655000. , 28390300. ,
        3532200. ,  5002500. ,  5419541.5,  3290000. ,  5767000. ,
        1257300. ,  3674000. ,  3919000. ,  1430100. ,  3080000. ,
        2075000. ,  3080000. ,  4570100. , 12801300. , 24500600. ,
        6260300. ,  3261200. ,  7190000. ,  5895200. ,  2572600. ,
        6898000. ,  8050000. ,  4220200. ,  2921800. , 14437200. ,
        3488689.5,  4973238.5,  2838500. ])