## Setup

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score  
import joblib
import os 

from src.utils import preprocess_data
from src.utils import feature_engg


## Data Ingestion

In [2]:
model_df = pd.read_csv(r"data\processed\cleaned.csv")

## Linear Regression Model

In [3]:
X= model_df[['area_type', 'total_sqft', 'bath', 'balcony', 'extract']]
Y = model_df['price']

# x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=50)

column_trans = ColumnTransformer(transformers=
                                 [('onehot', OneHotEncoder(), ['area_type', 'extract']),
                                  ('scaler', StandardScaler(), ['total_sqft', 'bath', 'balcony'])],
                                  remainder='passthrough')

pipeline = make_pipeline(column_trans, LinearRegression())

pipeline.fit(X, Y)

# y_pred = pipeline.predict(x_test)

filename = os.path.join("models", "1st_model_LR.joblib")
joblib.dump(pipeline, filename)

loaded_model_LR = joblib.load(filename)
result = loaded_model_LR.score(X, Y)
print(result)

0.6067557185468784


## Random Forest Regressor Model

In [4]:
X = model_df[['area_type', 'total_sqft', 'bath', 'balcony', 'extract']]
Y = model_df['price']

# x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=50)

column_trans = ColumnTransformer(transformers=
                                 [('onehot', OneHotEncoder(), ['area_type', 'extract']),
                                  ('scaler', StandardScaler(), ['total_sqft', 'bath', 'balcony'])],
                                  remainder='passthrough')

pipeline = make_pipeline(column_trans, RandomForestRegressor())

pipeline.fit(X, Y)

# y_pred = pipeline.predict(x_test)

filename = os.path.join("models", "1st_model_RF.joblib")
joblib.dump(pipeline, filename)

loaded_model_RF1 = joblib.load(filename)
result = loaded_model_RF1.score(X, Y)
print(result)


0.9008407235508414


In [5]:
X = model_df[['area_type','size','total_sqft', 'bath', 'balcony', 'extract']]
Y = model_df['price']

# x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=50)

column_trans = ColumnTransformer(transformers=
                                 [('onehot', OneHotEncoder(), ['area_type', 'extract']),
                                  ('scaler', StandardScaler(), ['size','total_sqft', 'bath', 'balcony'])],
                                  remainder='passthrough')

pipeline = make_pipeline(column_trans, RandomForestRegressor())

pipeline.fit(X, Y)

# y_pred = pipeline.predict(x_test)

filename = os.path.join("models", "2nd_model_RF.joblib")
joblib.dump(pipeline, filename)

loaded_model_RF2 = joblib.load(filename)
result = loaded_model_RF2.score(X, Y)
print(result)

0.9176388795419264


## Evaluation

In [6]:
test_data = pd.read_csv(r"data/raw/Test.csv")
clean_test_data = preprocess_data(test_data)
clean_test_data = feature_engg(clean_test_data)

In [7]:
clean_test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1480 entries, 0 to 1479
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     1480 non-null   object 
 1   availability  1480 non-null   object 
 2   location      1480 non-null   object 
 3   size          1480 non-null   float64
 4   society       1480 non-null   object 
 5   total_sqft    1480 non-null   float64
 6   bath          1480 non-null   float64
 7   balcony       1480 non-null   float64
 8   price         0 non-null      float64
 9   month         316 non-null    object 
 10  ready         1164 non-null   object 
 11  extract       1480 non-null   object 
dtypes: float64(5), object(7)
memory usage: 138.9+ KB


### Linear  reg

In [8]:
x_test = clean_test_data[['area_type','total_sqft', 'bath', 'balcony', 'extract']]
y_pred = clean_test_data['price']

y_pred_lr = loaded_model_LR.predict(x_test)
result = loaded_model_LR.score(x_test, y_pred_lr)
print("R-squared value on test data:", result)

y_pred_rf1 = loaded_model_RF1.predict(x_test)
result = loaded_model_RF1.score(x_test, y_pred_rf1)
print("R-squared value on test data:", result)

R-squared value on test data: 1.0


R-squared value on test data: 1.0


In [9]:
x_test = clean_test_data[['area_type','size','total_sqft', 'bath', 'balcony', 'extract']]
y_pred = clean_test_data['price']

y_pred_rf2 = loaded_model_RF2.predict(x_test)
result = loaded_model_RF2.score(x_test, y_pred_rf2)
print("R-squared value on test data:", result)

R-squared value on test data: 1.0
