# Importing important packages

In [44]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
import pickle

# Reading the CSV file to a dataset

In [45]:
df=pd.read_csv('/content/drive/MyDrive/Datasets/Real estate.csv')
df

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.59470,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.98450,5,24.98746,121.54391,47.3
3,4,2013.500,13.3,561.98450,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.56840,5,24.97937,121.54245,43.1
...,...,...,...,...,...,...,...,...
409,410,2013.000,13.7,4082.01500,0,24.94155,121.50381,15.4
410,411,2012.667,5.6,90.45606,9,24.97433,121.54310,50.0
411,412,2013.250,18.8,390.96960,7,24.97923,121.53986,40.6
412,413,2013.000,8.1,104.81010,5,24.96674,121.54067,52.5


# Check for missing values

In [46]:
df.isna().sum()

No                                        0
X1 transaction date                       0
X2 house age                              0
X3 distance to the nearest MRT station    0
X4 number of convenience stores           0
X5 latitude                               0
X6 longitude                              0
Y house price of unit area                0
dtype: int64

# Checking all datas are numerical

In [47]:
df.dtypes

No                                          int64
X1 transaction date                       float64
X2 house age                              float64
X3 distance to the nearest MRT station    float64
X4 number of convenience stores             int64
X5 latitude                               float64
X6 longitude                              float64
Y house price of unit area                float64
dtype: object

# Dropping unnecessary columns No and X1 transaction date

In [48]:
df.drop(['No','X1 transaction date'],axis=1,inplace=True)

# Splitting the dataset into input and output variables

In [49]:
x=df.iloc[:,:-1]
x

Unnamed: 0,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude
0,32.0,84.87882,10,24.98298,121.54024
1,19.5,306.59470,9,24.98034,121.53951
2,13.3,561.98450,5,24.98746,121.54391
3,13.3,561.98450,5,24.98746,121.54391
4,5.0,390.56840,5,24.97937,121.54245
...,...,...,...,...,...
409,13.7,4082.01500,0,24.94155,121.50381
410,5.6,90.45606,9,24.97433,121.54310
411,18.8,390.96960,7,24.97923,121.53986
412,8.1,104.81010,5,24.96674,121.54067


In [50]:
y=df.iloc[:,-1]
y

0      37.9
1      42.2
2      47.3
3      54.8
4      43.1
       ... 
409    15.4
410    50.0
411    40.6
412    52.5
413    63.9
Name: Y house price of unit area, Length: 414, dtype: float64

# Scaling the input variables

In [51]:
mms=MinMaxScaler()
x_new=mms.fit_transform(x)
x_new

array([[0.73059361, 0.00951267, 1.        , 0.61694135, 0.71932284],
       [0.44520548, 0.04380939, 0.9       , 0.5849491 , 0.71145137],
       [0.30365297, 0.08331505, 0.5       , 0.67123122, 0.75889584],
       ...,
       [0.42922374, 0.05686115, 0.7       , 0.57149782, 0.71522536],
       [0.18493151, 0.0125958 , 0.5       , 0.42014057, 0.72395946],
       [0.14840183, 0.0103754 , 0.9       , 0.51211827, 0.75016174]])

# Splitting the dataset into training and testing datas

In [52]:
x_train,x_test,y_train,y_test=train_test_split(x_new,y,test_size=0.3,random_state=5)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((289, 5), (125, 5), (289,), (125,))

# Performing Regressor Algorithms

In [53]:
knn=KNeighborsRegressor(n_neighbors=5)
dtr=DecisionTreeRegressor(max_depth=6,random_state=1)
rfr=RandomForestRegressor(max_depth=8,random_state=1)
abr=AdaBoostRegressor(random_state=1)
xgb=XGBRegressor(max_depth=6,random_state=1)
names=['KNeighbors Regressor', 'Decision Tree Regressor', 'Random Forest Regressor','AdaBoost Regressor','XGBRegressor']
models=[knn,dtr,rfr,abr,xgb]
print("R2_Score")
for model in models:
  model.fit(x_train,y_train)
  i=models.index(model)
  y_pred=model.predict(x_test)
  print(names[i],"=",r2_score(y_test,y_pred))

R2_Score
KNeighbors Regressor = 0.7031704607250019
Decision Tree Regressor = 0.589966936707681
Random Forest Regressor = 0.7547796026646103
AdaBoost Regressor = 0.6966528030589547
XGBRegressor = 0.74999111356986


# Calculating r2_Score of RandomForestRegressor

In [54]:
rfr1=RandomForestRegressor(max_depth=8,random_state=1)
rfr1.fit(x_train,y_train)
y_pred1=rfr1.predict(x_test)
print('R2_Score of XGBRegressor =',r2_score(y_test,y_pred1))

R2_Score of XGBRegressor = 0.7547796026646103


In [56]:
# The best model is RandomForestRegressor with max_depth=8,random_state=1
# r2_score=0.7547796026646103

#  Saving this regressor model and its regressor for deploy representation


In [55]:
pickle.dump(rfr1,open('model_realestate1.sav','wb'))
pickle.dump(mms,open('scaler_realestate1.sav','wb'))