<a href="https://colab.research.google.com/github/Aparna0112/Aparna0112/blob/main/regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **PROJECT : REAL ESTATE PREDICTION**

**Import required libraries**

In [None]:
import pickle

import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [None]:
# Read the real-estate CSV into a DataFrame and display it.
# NOTE(review): the path assumes Google Drive is mounted at /content/drive
# (the usual Colab mount point) — confirm before running elsewhere.
csv_path = '/content/drive/MyDrive/Datasets/Real estate.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.59470,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.98450,5,24.98746,121.54391,47.3
3,4,2013.500,13.3,561.98450,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.56840,5,24.97937,121.54245,43.1
...,...,...,...,...,...,...,...,...
409,410,2013.000,13.7,4082.01500,0,24.94155,121.50381,15.4
410,411,2012.667,5.6,90.45606,9,24.97433,121.54310,50.0
411,412,2013.250,18.8,390.96960,7,24.97923,121.53986,40.6
412,413,2013.000,8.1,104.81010,5,24.96674,121.54067,52.5


**Drop unused columns**

In [None]:
# Remove the row counter and the transaction date before building the
# feature matrix. Rebinding `df` (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: the in-place drop raised
# KeyError on a second execution because the columns were already gone.
df = df.drop(columns=['No', 'X1 transaction date'], errors='ignore')
df

Unnamed: 0,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,32.0,84.87882,10,24.98298,121.54024,37.9
1,19.5,306.59470,9,24.98034,121.53951,42.2
2,13.3,561.98450,5,24.98746,121.54391,47.3
3,13.3,561.98450,5,24.98746,121.54391,54.8
4,5.0,390.56840,5,24.97937,121.54245,43.1
...,...,...,...,...,...,...
409,13.7,4082.01500,0,24.94155,121.50381,15.4
410,5.6,90.45606,9,24.97433,121.54310,50.0
411,18.8,390.96960,7,24.97923,121.53986,40.6
412,8.1,104.81010,5,24.96674,121.54067,52.5


**Check for missing values**

In [None]:
# Count the missing entries in each column.
missing_per_column = df.isna().sum()
missing_per_column

X2 house age                              0
X3 distance to the nearest MRT station    0
X4 number of convenience stores           0
X5 latitude                               0
X6 longitude                              0
Y house price of unit area                0
dtype: int64

In [None]:
# Show the dtype pandas assigned to each remaining column.
column_types = df.dtypes
column_types

X2 house age                              float64
X3 distance to the nearest MRT station    float64
X4 number of convenience stores             int64
X5 latitude                               float64
X6 longitude                              float64
Y house price of unit area                float64
dtype: object

**Separating the input and output**

In [None]:
# Feature matrix: every column except the last one, selected by position.
n_feature_cols = df.shape[1] - 1
X = df.iloc[:, :n_feature_cols]
X

Unnamed: 0,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude
0,32.0,84.87882,10,24.98298,121.54024
1,19.5,306.59470,9,24.98034,121.53951
2,13.3,561.98450,5,24.98746,121.54391
3,13.3,561.98450,5,24.98746,121.54391
4,5.0,390.56840,5,24.97937,121.54245
...,...,...,...,...,...
409,13.7,4082.01500,0,24.94155,121.50381
410,5.6,90.45606,9,24.97433,121.54310
411,18.8,390.96960,7,24.97923,121.53986
412,8.1,104.81010,5,24.96674,121.54067


In [None]:
# Target vector: the last column of the frame, selected by position.
last_col = df.shape[1] - 1
y = df.iloc[:, last_col]
y

0      37.9
1      42.2
2      47.3
3      54.8
4      43.1
       ... 
409    15.4
410    50.0
411    40.6
412    52.5
413    63.9
Name: Y house price of unit area, Length: 414, dtype: float64

**Normalizing the data using MinMaxScaler**

In [None]:
# Min-max scale every feature column and display the scaled array.
# NOTE(review): the scaler is fitted on the full feature matrix here,
# i.e. before the train/test split performed further down.
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
X_scaled

array([[0.73059361, 0.00951267, 1.        , 0.61694135, 0.71932284],
       [0.44520548, 0.04380939, 0.9       , 0.5849491 , 0.71145137],
       [0.30365297, 0.08331505, 0.5       , 0.67123122, 0.75889584],
       ...,
       [0.42922374, 0.05686115, 0.7       , 0.57149782, 0.71522536],
       [0.18493151, 0.0125958 , 0.5       , 0.42014057, 0.72395946],
       [0.14840183, 0.0103754 , 0.9       , 0.51211827, 0.75016174]])

**Split the dataset into training and testing sets**

In [None]:
# Split the raw features first, then fit the scaler on the training rows only
# and apply it to both partitions. Previously the models were trained on the
# unscaled `X` (so `X_scaled` was never used and the pickled scaler did not
# match the pickled model), and the scaler had also seen the test rows.
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=5,test_size=0.3)
scaler=MinMaxScaler().fit(X_train)
# From here on X_train / X_test are scaled NumPy arrays, matching what
# scaler.transform() will produce for new inputs at inference time.
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)

**Model Creation**: KNN, DecisionTree, RandomForest, AdaBoost, XGBRegressor (SVC is not used: it is a classifier, and no SVM model is fitted below)

In [None]:
# K-nearest-neighbours regressor averaging over the 7 closest training rows.
K_NEIGHBORS = 7
knn = KNeighborsRegressor(n_neighbors=K_NEIGHBORS)
knn.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test rows and display the result.
y_pred_knn = knn.predict(X_test)
y_pred_knn

array([25.08571429, 48.84285714, 30.55714286, 30.9       , 37.1       ,
       49.14285714, 43.9       , 44.12857143, 37.14285714, 52.62857143,
       44.22857143, 24.97142857, 40.72857143, 43.11428571, 48.12857143,
       37.84285714, 40.14285714, 38.02857143, 58.2       , 46.41428571,
       37.6       , 18.02857143, 42.92857143, 48.84285714, 49.98571429,
       40.14285714, 39.72857143, 40.04285714, 29.47142857, 45.28571429,
       55.3       , 40.62857143, 40.1       , 51.05714286, 34.7       ,
       18.9       , 50.45714286, 55.3       , 30.3       , 30.04285714,
       24.98571429, 23.58571429, 30.3       , 50.21428571, 51.22857143,
       39.72857143, 28.91428571, 31.34285714, 37.18571429, 37.65714286,
       18.9       , 43.71428571, 42.84285714, 18.02857143, 18.9       ,
       37.65714286, 37.14285714, 38.94285714, 28.64285714, 30.9       ,
       51.94285714, 26.21428571, 49.38571429, 28.24285714, 26.67142857,
       37.3       , 41.94285714, 38.04285714, 51.17142857, 40.14

In [None]:
# Score the KNN predictions against the held-out targets.
# r2_score / mean_absolute_error / mean_squared_error are imported in the
# notebook's top import cell rather than here, so all dependencies are
# declared in one place.
print('r2_score:',r2_score(y_test,y_pred_knn))
print('mean_absolute_error:',mean_absolute_error(y_test,y_pred_knn))
print('mean_squared_error:',mean_squared_error(y_test,y_pred_knn))


r2_score: 0.6228478610170531
mean_absolute_error: 5.390857142857143
mean_squared_error: 58.213740408163254


In [None]:
# Decision-tree regressor, scored on the held-out test set.
# random_state is pinned (same seed value as the train/test split) because
# scikit-learn permutes the candidate features at every split, so an
# unseeded tree can give different scores from one run to the next.
dt=DecisionTreeRegressor(random_state=5)
dt.fit(X_train,y_train)
y_pred_dt=dt.predict(X_test)
print('r2_score:',r2_score(y_test,y_pred_dt))
print('mean_absolute_error:',mean_absolute_error(y_test,y_pred_dt))
print('mean_squared_error:',mean_squared_error(y_test,y_pred_dt))

r2_score: 0.600863899690833
mean_absolute_error: 5.5722
mean_squared_error: 61.60698277777777


In [None]:
# Random-forest regressor, scored on the held-out test set.
# random_state is pinned (same seed value as the train/test split) so the
# bootstrap samples and feature draws -- and therefore the reported
# metrics -- are reproducible on Restart & Run All.
rf=RandomForestRegressor(random_state=5)
rf.fit(X_train,y_train)
y_pred_rf=rf.predict(X_test)
print('r2_score:',r2_score(y_test,y_pred_rf))
print('mean_absolute_error:',mean_absolute_error(y_test,y_pred_rf))
print('mean_squared_error:',mean_squared_error(y_test,y_pred_rf))

r2_score: 0.7413284225177909
mean_absolute_error: 4.296405082539683
mean_squared_error: 39.92616906038621


In [None]:
# AdaBoost regressor, scored on the held-out test set.
# random_state is pinned (same seed value as the train/test split) because
# each boosting round resamples the training data; without a seed the
# reported metrics change between runs.
ab=AdaBoostRegressor(random_state=5)
ab.fit(X_train,y_train)
y_pred_ab=ab.predict(X_test)
print('r2_score:',r2_score(y_test,y_pred_ab))
print('mean_absolute_error:',mean_absolute_error(y_test,y_pred_ab))
print('mean_squared_error:',mean_squared_error(y_test,y_pred_ab))

r2_score: 0.6806199633907605
mean_absolute_error: 5.275716880167228
mean_squared_error: 49.296569264746


In [None]:
# Gradient-boosted trees (XGBoost) with default hyper-parameters,
# scored on the held-out test set with the same three metrics as above.
xg = XGBRegressor()
xg.fit(X_train, y_train)
y_pred_xg = xg.predict(X_test)
for label, metric in (
    ('r2_score:', r2_score),
    ('mean_absolute_error:', mean_absolute_error),
    ('mean_squared_error:', mean_squared_error),
):
    print(label, metric(y_test, y_pred_xg))

r2_score: 0.74999111356986
mean_absolute_error: 4.618350502014161
mean_squared_error: 38.58907562774343


In [None]:
# Persist the fitted XGBoost model and the fitted scaler.
# `with` guarantees each file is flushed and closed; the bare open() calls
# used before left both handles open for the lifetime of the kernel.
# NOTE(review): the two file names use different extensions ('.save' vs
# '.sav'); they are kept as-is because whatever loads them may depend on
# the exact names — confirm and unify if possible.
with open('xg_estate.save','wb') as model_file:
    pickle.dump(xg,model_file)
with open('scaler_estate.sav','wb') as scaler_file:
    pickle.dump(scaler,scaler_file)