In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('gurgaonpropertyfeatureselection.csv')

In [5]:
df = df.drop('Unnamed: 0', axis=1)

In [6]:
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,Built Up area,store room,servant room,furnishing_type,luxury_category,floor_category,price_in_crores
0,0.0,72.0,2.0,2.0,1.0,3.0,1000.0,0,0,1,1.0,2.0,0.45
1,0.0,34.0,2.0,2.0,1.0,2.0,722.0,0,0,0,1.0,1.0,0.5
2,0.0,100.0,2.0,2.0,3.0,1.0,661.0,0,0,1,1.0,0.0,0.4
3,0.0,64.0,2.0,2.0,2.0,1.0,1333.0,0,0,1,2.0,1.0,1.47
4,0.0,95.0,2.0,2.0,3.0,4.0,1217.0,0,0,1,1.0,2.0,0.7


In [7]:
x = df.drop('price_in_crores', axis=1)
y = df['price_in_crores']

In [8]:
x.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,Built Up area,store room,servant room,furnishing_type,luxury_category,floor_category
0,0.0,72.0,2.0,2.0,1.0,3.0,1000.0,0,0,1,1.0,2.0
1,0.0,34.0,2.0,2.0,1.0,2.0,722.0,0,0,0,1.0,1.0
2,0.0,100.0,2.0,2.0,3.0,1.0,661.0,0,0,1,1.0,0.0
3,0.0,64.0,2.0,2.0,2.0,1.0,1333.0,0,0,1,2.0,1.0
4,0.0,95.0,2.0,2.0,3.0,4.0,1217.0,0,0,1,1.0,2.0


In [9]:
y.head()

0    0.45
1    0.50
2    0.40
3    1.47
4    0.70
Name: price_in_crores, dtype: float64

In [33]:
## Applying the log1p transformation to the target variable

y_transformed = np.log1p(y)

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y_transformed, test_size=0.2, random_state=42)

In [36]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

In [37]:
columns_to_encode = ['sector','balcony','agePossession','furnishing_type','luxury_category','floor_category']

In [44]:
# Creating a column transformer for preprocessing

preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['property_type','bedRoom','bathroom','Built Up area','servant room','store room']),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), columns_to_encode)

    ], remainder='passthrough'
)

In [45]:
# Creating a pipeline

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR())
])

In [46]:
# k-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='r2')



In [47]:
scores.mean()

np.float64(0.8734215595588424)

In [48]:
scores.std()

np.float64(0.019855118464938884)

In [49]:
pipeline.fit(X_train,y_train)

In [50]:
y_pred = pipeline.predict(X_test)

In [51]:
y_pred = np.expm1(y_pred)

In [52]:
y_pred

array([ 3.23617047,  1.15357456,  0.56183399,  1.73918769,  1.5979483 ,
        0.68954145,  3.16629875,  1.91858137,  1.57518576,  3.53067986,
        1.52597449,  1.98347548,  0.77310961,  0.83588221,  1.19843493,
        8.98083664,  2.86312376,  2.08142089,  0.64767972,  1.35082059,
        2.166494  ,  0.28642794,  6.12570604,  5.88533042,  0.39473945,
        0.79390465,  1.93041748,  1.11081159,  0.97208289,  1.66518715,
        0.76102625,  0.42318573,  4.54475206,  6.3286906 ,  1.49842387,
        2.07724374,  1.05813224,  3.66279811,  5.01447352,  2.01334249,
       12.90975645, 10.02505438,  6.84769747,  2.36388215,  2.79925883,
        0.94045901,  1.80148376,  5.76651923,  1.58233059,  0.87670555,
        0.6580732 ,  2.8233214 ,  2.05677588,  1.78871819,  5.25984268,
        1.35834469,  0.74034333,  1.70125594,  1.0922522 , 11.43551587,
       14.04998336,  4.41104176,  1.54154888,  2.43102773,  2.96401388,
        1.68812326,  7.82900069,  2.06673111,  2.61686702,  1.92

In [53]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(np.expm1(y_test), y_pred)

0.48360579879664556