In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, ShuffleSplit

In [2]:
df = pd.read_csv("data/cleaned_data.csv")
df.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Devarachikkanahalli,1250.0,2.0,40.0,2
1,Devarachikkanahalli,1200.0,2.0,83.0,2
2,Devarachikkanahalli,1170.0,2.0,40.0,2
3,Devarachikkanahalli,1425.0,2.0,65.0,3
4,Devarachikkanahalli,947.0,2.0,43.0,2


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7375 entries, 0 to 7374
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    7375 non-null   object 
 1   total_sqft  7375 non-null   float64
 2   bath        7375 non-null   float64
 3   price       7375 non-null   float64
 4   bhk         7375 non-null   int64  
dtypes: float64(3), int64(1), object(1)
memory usage: 288.2+ KB


In [4]:
numerical_features = ['total_sqft','bath','bhk']
categorical_features = ['location']

In [5]:
transformer = ColumnTransformer(
    transformers = [
        ('encoder',OneHotEncoder(),categorical_features),
        ('scaler',StandardScaler(),numerical_features)
    ]
)

In [6]:
x = df.drop('price',axis=1)
y = df['price']

In [7]:
scaled_x = transformer.fit_transform(x)

In [8]:
x_train,x_test,y_train,y_test = train_test_split(scaled_x,y,test_size=0.2,random_state=42)

In [9]:
models = {
    'LinearRegression':LinearRegression(),
    'Ridge':Ridge(),
    'Lasso':Lasso(),
    'DecisionTreeRegressor':DecisionTreeRegressor(),
    'CatBoostRegressor':CatBoostRegressor(verbose=False),
    'RandomForestRegressor':RandomForestRegressor(),
}

In [10]:
params = {
    'LinearRegression':{},
    'Ridge':{
        'alpha': np.logspace(-3, 2, 6)
    },
    'Lasso':{
        'alpha': np.logspace(-3, 2, 6)
    },
    'DecisionTreeRegressor':{
        'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson']
    },
    'CatBoostRegressor':{
        'depth': [6,8,10],
        'learning_rate': [0.01, 0.05, 0.1],
        'iterations': [30, 50, 100]
    },
    'RandomForestRegressor':{
        'n_estimators': [8,16,32,64,128,256]
    }
}

In [11]:
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
model_dict = {}
for model_name,model_obj in models.items():
    param = params[model_name]
    gs = GridSearchCV(estimator=model_obj,cv=cv,param_grid=param)
    gs.fit(x_train,y_train)
    model_obj.set_params(**gs.best_params_)
    model_obj.fit(x_train,y_train)
    pred = model_obj.predict(x_test)
    score = r2_score(y_test,pred)
    model_dict[model_name] = [score,gs.best_params_]

In [23]:
models_df = pd.DataFrame(list(zip(model_dict.keys(),model_dict.values())),columns=['Model','r2_score'])

In [24]:
models_df.sort_values(by='r2_score' ,ascending=False)

Unnamed: 0,Model,r2_score
2,Lasso,"[0.836634583289142, {'alpha': 0.001}]"
0,LinearRegression,"[0.8365160034613472, {}]"
4,CatBoostRegressor,"[0.8362414975992027, {'depth': 8, 'iterations'..."
1,Ridge,"[0.8338540513388483, {'alpha': 0.01}]"
5,RandomForestRegressor,"[0.7796670206741712, {'n_estimators': 128}]"
3,DecisionTreeRegressor,"[0.5943272121696499, {'criterion': 'squared_er..."


In [27]:
import pickle
with open('model.pkl','wb') as file:
    pickle.dump(models['Lasso'],file)

In [28]:
with open('preprocessor.pkl','wb') as file:
    pickle.dump(transformer,file)

In [36]:
with open('location.txt','w') as file:
    for line in list(df['location'].unique()):
        file.write(line+"\n")

In [39]:
import json
locations = {'location' : list(df.location.unique())}
with open('locations.json','w') as file:
    file.write(json.dumps(locations))