In [5]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [3]:
df = pd.read_csv('C:/Users/abhil/OneDrive/Desktop/real_estate_project/Real_Estate_ML_Project/data/feature_selection/post_feature_selection_data.csv')

In [4]:
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

# dropping these values becuase there is only one record related to this sector type because of which ordinalencoding
#throw error i.e we are removing these rows
index_of_sector17a = df[df['sector'].str.contains("sector 17a")].index
df.drop(index=index_of_sector17a,inplace=True)

#same reason as above only one value of record containing sector 37 as sector type
df.drop(index=893,inplace=True)

X = df.drop(columns=['price'])
y = df['price']
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)
### Ordinal Encoding

# Creating a column transformer for preprocessing
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [6]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [7]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [8]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [9]:
scores = cross_val_score(pipeline,X,y_transformed,cv=kfold,scoring='r2')

In [10]:
scores.mean()

np.float64(0.898070793273811)

In [13]:
pipeline_2 = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',XGBRegressor(n_estimators=500))
])

In [14]:
kfold = KFold(n_splits=10,shuffle=True,random_state=42)

In [15]:
scores_2 = cross_val_score(pipeline_2,X,y_transformed,cv=kfold,scoring='r2')

In [16]:
scores_2.mean()

np.float64(0.9002438284287173)

In [17]:
pipeline.fit(X,y_transformed)

In [18]:
y_hat = np.expm1(pipeline.predict(X))

In [19]:
mae = mean_absolute_error(y,y_hat)

In [20]:
mae

np.float64(0.17583693360513936)

In [23]:
pipeline_2.fit(X,y_transformed)

In [24]:
y_hat_xg_boost = np.expm1(pipeline_2.predict(X))

In [27]:
mae = mean_absolute_error(y,y_hat_xg_boost)
print(mae)

0.04602724725016076
