In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [42]:
df = pd.read_csv('/content/99Acres_gurgaon_hyderabad_Secunderabad_SelectedFeatures.csv')

In [43]:
pd.set_option('display.max_columns', None)

In [44]:
X = df.drop(columns=['PRICE'])
y = df['PRICE']/10000000

In [45]:
df.head(3)

Unnamed: 0,PROPERTY_TYPE,OWNTYPE,BEDROOM_NUM,FACING,AGE,TOTAL_FLOOR,PRICE_SQFT,AREA,BALCONY_NUM,FLOOR_NUM,BHK,LATITUDE,LONGITUDE,LUXURY,SWIMMING_POOL,POWER_BACKUP,CLUB_HOUSE,ATM,GYM,WASTE_DISPOSAL,GAS_PIPELINE,PRICE
0,2.0,1,3.0,3,5,30,5428.0,1658.0,3,9,3,17.580482,78.936868,1.0,1,0,1,1,1,0,0,9000000.0
1,1.0,1,2.0,2,3,2,7964.0,565.0,1,1,2,17.5316,78.26729,1.0,0,0,0,0,0,0,0,4500000.0
2,2.0,1,2.0,3,1,5,4771.0,1205.0,1,4,2,17.620662,77.953509,1.0,1,0,0,0,0,1,0,57500000.0


In [46]:
columns_to_encode = ['AGE', 'OWNTYPE', 'FACING', 'PROPERTY_TYPE', 'LUXURY', 'SWIMMING_POOL','SWIMMING_POOL','CLUB_HOUSE','ATM','GYM','WASTE_DISPOSAL','GAS_PIPELINE']

In [47]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

In [48]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['BEDROOM_NUM', 'AGE', 'TOTAL_FLOOR', 'PRICE_SQFT', 'AREA', 'BALCONY_NUM','FLOOR_NUM','BHK','LATITUDE','LONGITUDE']),
        ('cat', OneHotEncoder(drop='first'), columns_to_encode)
    ],
    remainder='passthrough')

In [49]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='rbf'))
])

In [50]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [51]:
scores.mean()

0.8149568852111322

In [52]:
scores.std()

0.03476284559106954

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [54]:
pipeline.fit(X_train,y_train)

In [55]:
y_pred = pipeline.predict(X_test)

In [56]:
y_pred = np.expm1(y_pred)

In [57]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.3915842497938895