In [24]:
import pandas as pd
import numpy as np

In [25]:
# Load the dataset into a DataFrame.
# NOTE(review): '/content/...' looks like a Colab-local path - adjust when running elsewhere.
df = pd.read_csv(filepath_or_buffer='/content/feature_selection.1.csv')

In [26]:
# Show every column when a DataFrame is displayed (no truncation with "...").
pd.options.display.max_columns = None

In [27]:
# Dataset dimensions as (rows, columns).
df.shape

(3554, 13)

In [28]:
# List the column names; 'price' is later split off as the target.
df.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category', 'price'],
      dtype='object')

In [29]:
# Preview five random rows. The seed keeps the preview identical across
# Restart & Run All, so the displayed rows stay reproducible.
df.sample(5, random_state=42)

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
837,0.0,5.0,3,3,4.0,3.0,2500.0,1,0,0,0.0,1.0,1.9
2351,0.0,51.0,4,3,3.0,0.0,10227.0,1,0,0,1.0,2.0,8.0
330,0.0,38.0,2,2,3.0,1.0,1100.0,0,0,0,1.0,2.0,0.71
318,1.0,61.0,6,4,0.0,0.0,240.0,0,0,0,1.0,2.0,1.0
823,0.0,72.0,4,6,4.0,3.0,2842.0,1,0,0,0.0,0.0,3.1


In [30]:
# Separate the target ('price') from the predictors (every other column).
# `df` itself is left untouched.
y = df['price']
x = df.drop('price', axis=1)

In [31]:
# Feature matrix shape: same rows as df, one column fewer ('price' removed).
x.shape

(3554, 12)

In [32]:
# Target vector shape: one price per row.
y.shape

(3554,)

In [48]:
from sklearn.model_selection import KFold , cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder , StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

In [49]:
# Columns that the preprocessing step one-hot encodes.
columns_to_encode = [
    'sector',
    'balcony',
    'agePossession',
    'furnishing_type',
    'luxury_category',
    'floor_category',
]

In [50]:
# Log-transform the target: log1p(y) = log(1 + y).
# Predictions made on this scale are mapped back to prices with np.expm1.
y_transformed = y.pipe(np.log1p)

In [51]:
# Column-wise preprocessing applied before the regressor:
#   * 'num' - standardise the listed numeric columns (zero mean, unit variance).
#   * 'cat' - one-hot encode the columns in `columns_to_encode`, dropping the
#             first level of each to avoid a redundant column.
# handle_unknown='ignore' stops transform() from raising when a category (for
# example a rare sector) appears only in a validation fold or the test split.
# Such rows get all-zero indicators for that feature. Combining it with
# drop='first' needs scikit-learn >= 1.0.
# remainder='passthrough' forwards any column not listed above unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['property_type', 'bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), columns_to_encode)
    ],
    remainder='passthrough'
)

In [52]:
# Full model: preprocessing followed by a support vector regressor with an
# RBF kernel. Keeping both steps in one Pipeline means the preprocessing is
# re-fitted on each training fold during cross-validation.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', SVR(kernel='rbf')),
    ]
)

In [53]:
# 10-fold cross-validation with shuffling and a fixed seed for repeatable folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# One R^2 value per fold, computed on the log1p-transformed target.
score = cross_val_score(
    estimator=pipeline,
    X=x,
    y=y_transformed,
    scoring='r2',
    cv=kfold,
)

In [54]:
# Mean R^2 over the 10 folds for the SVR pipeline (log1p-transformed target).
score.mean()

0.8838108478421324

In [55]:
# Spread of the per-fold R^2 scores; a small value means stable folds.
score.std()

0.014710362471609763

In [56]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for a final check. The target stays on the log1p
# scale here, so y_test must be passed through np.expm1 before it is compared
# with prices.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y_transformed,
    test_size=0.2,
    random_state=42,
)

In [57]:
# Fit preprocessing and regressor on the training split only.
pipeline.fit(X=x_train, y=y_train)

In [58]:
# Predict on the held-out rows and map the predictions back from the log1p
# scale to price units in a single step.
# The two steps are kept in one cell on purpose: a separate
# `y_pred = np.expm1(y_pred)` cell is not idempotent, and re-running it would
# apply expm1 twice and silently corrupt the predictions.
y_pred = np.expm1(pipeline.predict(x_test))

In [47]:
# NOTE(review): stale cell. Its execution count (47) precedes the cells that
# build the SVR pipeline, and per the author's note the output below came from
# an earlier run that used linear regression. Re-running it now would report
# the SVR error instead.
from sklearn.metrics import mean_absolute_error
# Mean absolute error in price units (y_test is mapped back with expm1).
# The author reads the ~0.65 result as an average error of roughly 64 lakh,
# e.g. a property really priced at 1 cr could be predicted at about 0.4 cr or
# 1.6 cr.
mean_absolute_error(np.expm1(y_test),y_pred)

0.6483595563797564

In [60]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the SVM (Support Vector Machine) pipeline in price
# units: y_test is mapped back from the log1p scale and y_pred is already in
# price units. The author reads the ~0.54 result as an average error of
# roughly 53 lakh, e.g. a property really priced at 1 cr could be predicted at
# about 0.47 cr or 1.53 cr.
mean_absolute_error(y_true=np.expm1(y_test), y_pred=y_pred)

0.5351463989072823