In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_squared_error,r2_score

In [2]:
# Load the raw listings from the CSV next to the notebook and preview the first rows.
df = pd.read_csv(filepath_or_buffer="Bengaluru_House_Data.csv")
df.head(5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
# Show per-column dtypes and non-null counts for the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [4]:
# Impute missing values by assigning the filled Series back to the frame.
# `df[col].fillna(..., inplace=True)` is chained assignment: it warns on
# pandas 2.x and leaves `df` untouched once Copy-on-Write is active.
df['location'] = df['location'].fillna(df['location'].mode().values[0])  # most frequent location

# Numeric columns are filled with their median.
df['bath'] = df['bath'].fillna(df['bath'].median())
df['balcony'] = df['balcony'].fillna(df['balcony'].median())

In [5]:
# Re-check non-null counts after filling location, bath and balcony.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13320 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13320 non-null  float64
 7   balcony       13320 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


## area_type

In [6]:
# Number of rows in each area_type category.
df['area_type'].value_counts()

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64

In [7]:
# One-hot encode area_type: the column is replaced by one indicator column
# per category, named "area_type_<category>".
df = pd.get_dummies(data=df, columns=["area_type"])
df.head(5)

Unnamed: 0,availability,location,size,society,total_sqft,bath,balcony,price,area_type_Built-up Area,area_type_Carpet Area,area_type_Plot Area,area_type_Super built-up Area
0,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07,0,0,0,1
1,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0,0,0,1,0
2,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0,1,0,0,0
3,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0,0,0,0,1
4,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0,0,0,0,1


## availability

In [8]:
# Binary-encode availability: 1 for the two "available now" labels, 0 for
# every other label seen in the column (e.g. '19-Dec').
# The dict is kept as a module-level name because it is exported later.
availability = {label: 0 for label in df['availability'].unique().tolist()}
availability['Ready To Move'] = 1
availability['Immediate Possession'] = 1

# Every value in the column is a key of the dict (it was built from
# `unique()`), so `map` encodes all rows. Assign the result back instead of
# calling `replace(..., inplace=True)` on the column selection, which is
# chained assignment and does not update `df` under Copy-on-Write.
df['availability'] = df['availability'].map(availability)
df.head()

Unnamed: 0,availability,location,size,society,total_sqft,bath,balcony,price,area_type_Built-up Area,area_type_Carpet Area,area_type_Plot Area,area_type_Super built-up Area
0,0,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07,0,0,0,1
1,1,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0,0,0,1,0
2,1,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0,1,0,0,0
3,1,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0,0,0,0,1
4,1,Kothanur,2 BHK,,1200,2.0,1.0,51.0,0,0,0,1


In [9]:
# Encode each location as its frequency rank: 0 for the most frequent
# location, 1 for the next, and so on (the order `value_counts()` returns).
# The dict is kept as a module-level name because it is exported later.
location = {
    name: rank
    for rank, name in enumerate(df['location'].value_counts().index)
}

# `map` does one dict lookup per row, unlike `replace` with a dict holding one
# key per distinct location. Assign back rather than using `inplace=True` on
# the column selection, which does not update `df` under Copy-on-Write.
df['location'] = df['location'].map(location)
    

In [10]:
# Remove the society column from the feature frame.
df = df.drop(columns=["society"])

In [11]:
# Convert size labels such as '2 BHK' / '4 Bedroom' into a numeric count.
# NOTE: the same label -> number table is repeated as the `size` dict in the
# following cell; keep the two in sync.
bedroom_counts = {
    '2 BHK': 2, '3 BHK': 3, '4 Bedroom': 4, '4 BHK': 4, '3 Bedroom': 3,
    '1 BHK': 1, '2 Bedroom': 2, '5 Bedroom': 5, '6 Bedroom': 6, '1 Bedroom': 1,
    '8 Bedroom': 8, '7 Bedroom': 7, '5 BHK': 5, '9 Bedroom': 9, '6 BHK': 6, '7 BHK': 7,
    '1 RK': 0, '10 Bedroom': 10, '9 BHK': 9, '8 BHK': 8, '11 BHK': 11, '11 Bedroom': 11,
    '10 BHK': 10, '14 BHK': 14, '13 BHK': 13, '12 Bedroom': 12, '27 BHK': 27, '43 Bedroom': 43,
    '16 BHK': 16, '19 BHK': 19, '18 Bedroom': 18,
}

# Values without an entry (NaN, or numbers if this cell is re-run) pass through
# unchanged. `to_numeric` then produces a numeric column and raises on any
# label missing from the table instead of silently leaving a string behind.
# The result is assigned back: `replace(..., inplace=True)` on `df["size"]` is
# chained assignment and does not update `df` under Copy-on-Write.
df["size"] = pd.to_numeric(
    df["size"].map(lambda label: bedroom_counts.get(label, label))
)

In [12]:
# Size labels in the order they are stored in the exported mapping.
_SIZE_LABELS = [
    '2 BHK', '3 BHK', '4 Bedroom', '4 BHK', '3 Bedroom',
    '1 BHK', '2 Bedroom', '5 Bedroom', '6 Bedroom', '1 Bedroom',
    '8 Bedroom', '7 Bedroom', '5 BHK', '9 Bedroom', '6 BHK', '7 BHK',
    '1 RK', '10 Bedroom', '9 BHK', '8 BHK', '11 BHK', '11 Bedroom',
    '10 BHK', '14 BHK', '13 BHK', '12 Bedroom', '27 BHK', '43 Bedroom',
    '16 BHK', '19 BHK', '18 Bedroom',
]

# Label -> number lookup: the leading integer of the label, except that the
# 'RK' label is encoded as 0.
size = {
    label: 0 if label.endswith('RK') else int(label.split()[0])
    for label in _SIZE_LABELS
}

In [13]:
def convert(x):
    """Parse a ``total_sqft`` entry into a single float.

    Accepts a plain number (``"1056"``, or an already-numeric value) or a
    two-sided range written with a hyphen (``"1000 - 1200"``), which is
    reduced to the midpoint of its two bounds.

    Parameters
    ----------
    x : str or number
        Raw cell value.

    Returns
    -------
    float or None
        The parsed value, or ``None`` when ``x`` is neither a number nor a
        hyphen-separated pair of numbers.
    """
    # Try the plain-number case first so that values which merely contain a
    # hyphen ("-5", "1e-3") are not mistaken for ranges, and so that numeric
    # input (e.g. an already converted column) passes straight through.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    # Only strings can be split into range bounds.
    if not isinstance(x, str):
        return None

    tokel = x.split("-")
    if len(tokel) == 2:
        try:
            return (float(tokel[0]) + float(tokel[1])) / 2
        except ValueError:
            # A hyphen between non-numeric parts is not a usable range.
            return None
    return None

In [14]:
# Run convert() over every total_sqft entry and store the result in place of
# the original column.
df['total_sqft'] = df['total_sqft'].map(convert)

In [15]:
# Fill entries that could not be parsed with the column median. Assign the
# result back: `fillna(..., inplace=True)` on `df['total_sqft']` is chained
# assignment and does not update `df` under Copy-on-Write.
df['total_sqft'] = df['total_sqft'].fillna(df['total_sqft'].median())

In [16]:
# Check dtypes and remaining nulls after encoding and conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 11 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   availability                    13320 non-null  int64  
 1   location                        13320 non-null  int64  
 2   size                            13304 non-null  float64
 3   total_sqft                      13320 non-null  float64
 4   bath                            13320 non-null  float64
 5   balcony                         13320 non-null  float64
 6   price                           13320 non-null  float64
 7   area_type_Built-up  Area        13320 non-null  uint8  
 8   area_type_Carpet  Area          13320 non-null  uint8  
 9   area_type_Plot  Area            13320 non-null  uint8  
 10  area_type_Super built-up  Area  13320 non-null  uint8  
dtypes: float64(5), int64(2), uint8(4)
memory usage: 780.6 KB


In [17]:
# Fill the remaining missing sizes with the most frequent value. Assign the
# result back: `fillna(..., inplace=True)` on `df['size']` is chained
# assignment and does not update `df` under Copy-on-Write.
df['size'] = df['size'].fillna(df['size'].mode().values[0])

## dictionaries

## Model Training

In [18]:
# Target is price; every other column is a feature.
y = df['price']
x = df.drop(columns=['price'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=1
)

In [19]:
# (rows, feature columns) of the training split.
x_train.shape

(10656, 10)

In [20]:
# Fit a linear regression on the training split. fit() returns the estimator,
# so the last line also displays it.
LR = LinearRegression()
LR.fit(X=x_train, y=y_train)

LinearRegression()

In [21]:
# Predictions for the rows the model was fitted on (used for the training R^2).
y_pred_train=LR.predict(x_train)

In [22]:
# R^2 on the training split.
r2 = r2_score(y_true=y_train, y_pred=y_pred_train)
r2

0.4361034940503178

In [23]:
# R^2 on the held-out split; `r2` is overwritten with the test score.
y_pred_test = LR.predict(X=x_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred_test)
r2

0.3459515708808102

## Create dictionary, JSON, and pickle file

In [24]:
# Bundle the encoding tables and the feature column order used for training.
project_dict = dict(
    availability=availability,
    location=location,
    size=size,
    column=x.columns.tolist(),
)
project_dict

{'availability': {'19-Dec': 0,
  'Ready To Move': 1,
  '18-May': 0,
  '18-Feb': 0,
  '18-Nov': 0,
  '20-Dec': 0,
  '17-Oct': 0,
  '21-Dec': 0,
  '19-Sep': 0,
  '20-Sep': 0,
  '18-Mar': 0,
  '20-Feb': 0,
  '18-Apr': 0,
  '20-Aug': 0,
  '18-Oct': 0,
  '19-Mar': 0,
  '17-Sep': 0,
  '18-Dec': 0,
  '17-Aug': 0,
  '19-Apr': 0,
  '18-Jun': 0,
  '22-Dec': 0,
  '22-Jan': 0,
  '18-Aug': 0,
  '19-Jan': 0,
  '17-Jul': 0,
  '18-Jul': 0,
  '21-Jun': 0,
  '20-May': 0,
  '19-Aug': 0,
  '18-Sep': 0,
  '17-May': 0,
  '17-Jun': 0,
  '21-May': 0,
  '18-Jan': 0,
  '20-Mar': 0,
  '17-Dec': 0,
  '16-Mar': 0,
  '19-Jun': 0,
  '22-Jun': 0,
  '19-Jul': 0,
  '21-Feb': 0,
  'Immediate Possession': 1,
  '19-May': 0,
  '17-Nov': 0,
  '20-Oct': 0,
  '20-Jun': 0,
  '19-Feb': 0,
  '21-Oct': 0,
  '21-Jan': 0,
  '17-Mar': 0,
  '17-Apr': 0,
  '22-May': 0,
  '19-Oct': 0,
  '21-Jul': 0,
  '21-Nov': 0,
  '21-Mar': 0,
  '16-Dec': 0,
  '22-Mar': 0,
  '20-Jan': 0,
  '21-Sep': 0,
  '21-Aug': 0,
  '14-Nov': 0,
  '19-Nov': 0,
  '

In [25]:
import json

# Write the encoding tables and column list to LR_Model.json.
with open("LR_Model.json", mode='w') as json_file:
    json.dump(project_dict, json_file)

In [26]:
import pickle

# Serialize the fitted estimator to LR_Model.pkl (binary mode is required by pickle).
with open("LR_Model.pkl", mode='wb') as model_file:
    pickle.dump(LR, model_file)