In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [11]:
data = pd.read_csv('MagicBricks.csv')

In [12]:
data.head(10)

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Price,Status,Transaction,Type,Per_Sqft
0,800.0,3,2.0,Semi-Furnished,Rohini Sector 25,1.0,6500000,Ready_to_move,New_Property,Builder_Floor,
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,5000000,Ready_to_move,New_Property,Apartment,6667.0
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,15500000,Ready_to_move,Resale,Apartment,6667.0
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,4200000,Ready_to_move,Resale,Builder_Floor,6667.0
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,Ready_to_move,New_Property,Builder_Floor,6667.0
5,1300.0,4,3.0,Semi-Furnished,Rohini Sector 24,1.0,15500000,Ready_to_move,New_Property,Builder_Floor,6667.0
6,1350.0,4,3.0,Semi-Furnished,Rohini Sector 24,1.0,10000000,Ready_to_move,Resale,Builder_Floor,6667.0
7,650.0,2,2.0,Semi-Furnished,"Delhi Homes, Rohini Sector 24",1.0,4000000,Ready_to_move,New_Property,Apartment,6154.0
8,985.0,3,3.0,Unfurnished,Rohini Sector 21,1.0,6800000,Almost_ready,New_Property,Builder_Floor,6154.0
9,1300.0,4,4.0,Semi-Furnished,Rohini Sector 22,1.0,15000000,Ready_to_move,New_Property,Builder_Floor,6154.0


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Area         1259 non-null   float64
 1   BHK          1259 non-null   int64  
 2   Bathroom     1257 non-null   float64
 3   Furnishing   1254 non-null   object 
 4   Locality     1259 non-null   object 
 5   Parking      1226 non-null   float64
 6   Price        1259 non-null   int64  
 7   Status       1259 non-null   object 
 8   Transaction  1259 non-null   object 
 9   Type         1254 non-null   object 
 10  Per_Sqft     1018 non-null   float64
dtypes: float64(4), int64(2), object(5)
memory usage: 108.3+ KB


### There are few missing values in the data so let's find out and perform preprocessing on the dataframe.

## Preprocessing

In [48]:
#function for onehot encoding to tackle furnishing (code analysis must be from bottom to top)
#functions are created on top after getting analysis in the bottom cells
def onehotenoder(df, column):
    df = df.copy()
    dummies = pd.get_dummies(df[column], prefix=column)
    df = pd.concat([df, dummies], axis =1)
    df = df.drop(column, axis=1)
    return df

In [49]:
#create a function to create a copy of the dataframe
def preprocessing_inputs(df):
    df = df.copy()
    #dropping the missing values in the target column 'Per_Sqft'
    missing_target_rows = df.loc[df['Per_Sqft'].isna(), :].index
    df = df.drop(missing_target_rows, axis=0).reset_index(drop=True)
    #fill missing values in the categorical features
    for column in ['Parking', 'Type']:
        df[column] = df[column].fillna(df[column].mode()[0])
    #first do binary encoding
    df['status'] = df['Status'].replace({'Almost_Ready': 0, 'Ready_to_move':1})
    df['Transaction'] = df['Transaction'].replace({'New_Property': 0, 'Resale':1})
    df['Type'] = df['Type'].replace({'Apartment': 0, 'Builder_Floor':1})
    return df

In [50]:
X = preprocessing_inputs(data)

In [51]:
X.head(10)

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Price,Status,Transaction,Type,Per_Sqft
0,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,5000000,Ready_to_move,New_Property,Apartment,6667.0
1,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,15500000,Ready_to_move,Resale,Apartment,6667.0
2,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,4200000,Ready_to_move,Resale,Builder_Floor,6667.0
3,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,Ready_to_move,New_Property,Builder_Floor,6667.0
4,1300.0,4,3.0,Semi-Furnished,Rohini Sector 24,1.0,15500000,Ready_to_move,New_Property,Builder_Floor,6667.0
5,1350.0,4,3.0,Semi-Furnished,Rohini Sector 24,1.0,10000000,Ready_to_move,Resale,Builder_Floor,6667.0
6,650.0,2,2.0,Semi-Furnished,"Delhi Homes, Rohini Sector 24",1.0,4000000,Ready_to_move,New_Property,Apartment,6154.0
7,985.0,3,3.0,Unfurnished,Rohini Sector 21,1.0,6800000,Almost_ready,New_Property,Builder_Floor,6154.0
8,1300.0,4,4.0,Semi-Furnished,Rohini Sector 22,1.0,15000000,Ready_to_move,New_Property,Builder_Floor,6154.0
9,1100.0,3,2.0,Semi-Furnished,Rohini Sector 20,1.0,6200000,Ready_to_move,New_Property,Builder_Floor,6154.0


In [52]:
#Find out the missing values
X.isna()

#if there will be any missing values, it will return True as it is a function of matrix of the dataframe

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Price,Status,Transaction,Type,Per_Sqft
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
1013,False,False,False,False,False,False,False,False,False,False,False
1014,False,False,False,False,False,False,False,False,False,False,False
1015,False,False,False,False,False,False,False,False,False,False,False
1016,False,False,False,False,False,False,False,False,False,False,False


In [53]:
#Total count in all the columns
X.isna().sum()

Area           0
BHK            0
Bathroom       0
Furnishing     4
Locality       0
Parking        0
Price          0
Status         0
Transaction    0
Type           0
Per_Sqft       0
dtype: int64

In [54]:
#check for one column
X['Per_Sqft'].isna()

0       False
1       False
2       False
3       False
4       False
        ...  
1013    False
1014    False
1015    False
1016    False
1017    False
Name: Per_Sqft, Length: 1018, dtype: bool

In [55]:
X['Per_Sqft'].isna().sum()

0

#### So we dropped all those rows which contained missing values in the target column 'Per_Sqft'

## Let's handle rest of the columns

In [56]:
{column: len(X[column].unique()) for column in X.select_dtypes('object').columns} #To see unique values in these cols

{'Furnishing': 4, 'Locality': 311, 'Status': 2, 'Transaction': 2, 'Type': 2}

In [57]:
{column: list(X[column].unique()) for column in X.select_dtypes('object').columns.drop('Locality')}

{'Furnishing': ['Semi-Furnished', 'Furnished', 'Unfurnished', nan],
 'Status': ['Ready_to_move', 'Almost_ready'],
 'Transaction': ['New_Property', 'Resale'],
 'Type': ['Apartment', 'Builder_Floor']}

### Above 'Status', 'Transaction', 'Type'(if we remove nan) are the binary features so easy to encode

In [58]:
#Handle 'Furnishing' with One-hot encoder for 3 values inside it and that will take care of missing value also
pd.get_dummies(X['Furnishing'])

Unnamed: 0,Furnished,Semi-Furnished,Unfurnished
0,0,1,0
1,1,0,0
2,0,1,0
3,0,1,0
4,0,1,0
...,...,...,...
1013,0,0,1
1014,0,1,0
1015,0,1,0
1016,0,0,1
