In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
# Load the input CSV (path is relative to the notebook's working directory)
# and preview the first rows.
df = pd.read_csv('../datasets/df_dropped_unnecessary.csv')
df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(13320, 5)

In [4]:
# Column dtypes and non-null counts, used to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [5]:
# Independent copy of df; the BHK column is later built on this copy.
df_copy = df.copy()

In [6]:
# Show the rows whose location is missing.
df_copy[df_copy['location'].isnull()]

Unnamed: 0,location,size,total_sqft,bath,price
568,,3 BHK,1600,3.0,86.0


In [7]:
# Most frequent location; this is the value used to fill the missing entry.
df['location'].mode()

0    Whitefield
Name: location, dtype: object

**There is one NaN value in the `location` column. We replace it with the mode of `location` (Whitefield).**

In [8]:
# Fill the missing location with the most frequent location in each frame.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: the in-place form is
# chained assignment, which emits a FutureWarning in pandas 2.x and does not
# update the parent frame under Copy-on-Write.
df_copy['location'] = df_copy['location'].fillna(df_copy['location'].mode()[0])
df['location'] = df['location'].fillna(df['location'].mode()[0])

In [9]:
# Missing values per column after the location fill.
df.isnull().sum()

location       0
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [10]:
# Re-check dtypes and non-null counts after the location fill.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [11]:
# Median of the bath column (inspection only; the value is not used by any
# later cell visible in this notebook section).
df['bath'].median()

2.0

In [12]:
# Distinct raw size labels (including NaN if present).
df['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', nan, '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [13]:
# Number of rows with a missing size.
df['size'].isnull().sum()

16

In [29]:
# Preview: take the first whitespace-separated token of each size string and
# parse it as a number; errors='coerce' turns anything unparseable into NaN.
(pd.to_numeric(df_copy['size'].str.split().str[0], errors='coerce'))

0        2.0
1        4.0
2        3.0
3        3.0
4        2.0
        ... 
13315    5.0
13316    4.0
13317    2.0
13318    4.0
13319    1.0
Name: size, Length: 13320, dtype: float64

**Note: `1 RK` is treated the same as `1 BHK`, because only the leading number of `size` is kept.**

In [30]:
# Store the leading number of each size string as a new numeric BHK column;
# errors='coerce' leaves NaN where the token cannot be parsed.
df_copy['BHK'] = pd.to_numeric(df_copy['size'].str.split().str[0], errors='coerce')

In [32]:
# Confirm the new BHK column and its non-null count.
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
 5   BHK         13304 non-null  float64
dtypes: float64(3), object(3)
memory usage: 624.5+ KB


In [39]:
# NOTE: this rebinds df to the same object as df_copy (no copy is made), so
# from here on a change made through either name is visible through the other.
df = df_copy

In [48]:
# Missing values per column, now including BHK.
df.isnull().sum()

location       0
size          16
total_sqft     0
bath          73
price          0
BHK           16
dtype: int64

In [49]:
# dtypes and non-null counts after adding BHK.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
 5   BHK         13304 non-null  float64
dtypes: float64(3), object(3)
memory usage: 624.5+ KB


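**A few `total_sqft` values are not plain numbers. Where the value is a range (e.g. `2100 - 2850`), we assign the mean of the two endpoints as `total_sqft`; values that are neither a number nor a range become missing.**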

In [50]:
# Independent copy on which the total_sqft conversion is performed.
df_copy1 = df.copy()

In [51]:
def is_float(x):
    """Return True if ``x`` can be converted by the built-in ``float()``.

    Parameters
    ----------
    x : object
        Value to test (typically a raw ``total_sqft`` string).

    Returns
    -------
    bool
        True when ``float(x)`` succeeds, False when it raises a conversion
        error (``TypeError``, ``ValueError`` or ``OverflowError``).
    """
    try:
        float(x)
    # Catch only conversion failures; a bare ``except`` would also swallow
    # KeyboardInterrupt / SystemExit and hide unrelated bugs.
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [52]:
# Rows whose total_sqft is rejected by is_float, i.e. cannot be parsed by float().
df_copy1[~df_copy1['total_sqft'].apply(is_float)]

Unnamed: 0,location,size,total_sqft,bath,price,BHK
30,Yelahanka,4 BHK,2100 - 2850,4.0,186.000,4.0
56,Devanahalli,4 Bedroom,3010 - 3410,,192.000,4.0
81,Hennur Road,4 Bedroom,2957 - 3450,,224.500,4.0
122,Hebbal,4 BHK,3067 - 8156,4.0,477.000,4.0
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2.0
...,...,...,...,...,...,...
12990,Talaghattapura,3 BHK,1804 - 2273,3.0,122.000,3.0
13059,Harlur,2 BHK,1200 - 1470,2.0,72.760,2.0
13240,Devanahalli,1 BHK,1020 - 1130,,52.570,1.0
13265,Hoodi,2 BHK,1133 - 1384,2.0,59.135,2.0


In [53]:
def convert_sqft_to_float(x):
    """Convert a raw ``total_sqft`` entry to a float.

    Parameters
    ----------
    x : object
        A plain number (``'2100'``, ``2100``) or a string range of the form
        ``'low - high'``.

    Returns
    -------
    float or None
        ``float(x)`` for a plain number, the mean of the two endpoints for a
        range, and ``None`` for anything else (e.g. a value with a unit
        suffix, ``None``, or a range whose endpoints are not numeric).
    """
    # Plain numbers first, so values such as '1e-5' or '-100' are not
    # mistaken for a two-part range by the '-' split below.
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        pass

    # Only strings can describe a range; other unparseable inputs are missing.
    if not isinstance(x, str):
        return None

    token = x.split('-')
    if len(token) == 2:
        try:
            return (float(token[0]) + float(token[1])) / 2
        except ValueError:
            # Two parts, but at least one is not a number.
            return None
    return None

In [54]:
# Sanity check: a plain number is returned as a float.
convert_sqft_to_float('2100')

2100.0

In [55]:
# Sanity check: a range is converted to the mean of its endpoints.
convert_sqft_to_float('2100 - 2900')

2500.0

In [56]:
# Convert every total_sqft entry with convert_sqft_to_float; entries it cannot
# convert come back as None and show up as missing values.
df_copy1['total_sqft'] = df_copy1['total_sqft'].apply(convert_sqft_to_float)

In [57]:
# Number of total_sqft entries that could not be converted.
df_copy1['total_sqft'].isnull().sum()

46

In [58]:
# Spot-check 10 random rows. No random_state is passed, so the rows shown
# change on every run.
df_copy1.sample(10)

Unnamed: 0,location,size,total_sqft,bath,price,BHK
8198,Lakshminarayana Pura,3 BHK,3050.0,2.0,90.0,3.0
5195,Varthur Road,4 Bedroom,1300.0,3.0,75.0,4.0
2155,Ramagondanahalli,2 BHK,1235.0,2.0,46.8,2.0
7639,Sarjapur Road,4 BHK,1864.0,3.0,105.0,4.0
8853,HAL Layout,3 BHK,1675.0,3.0,65.0,3.0
7491,Kenchenahalli,4 Bedroom,1200.0,3.0,125.0,4.0
13205,Sarjapur Road,3 Bedroom,3500.0,3.0,275.0,3.0
11124,Sonnenahalli,2 BHK,1268.0,2.0,56.0,2.0
7814,Kenchenahalli,3 BHK,1410.0,2.0,73.0,3.0
10815,Basavangudi,3 BHK,2350.0,3.0,300.0,3.0


In [59]:
# NOTE: df now refers to the same object as df_copy1 (no copy is made), so
# later in-place changes to df also change df_copy1.
df = df_copy1

In [60]:
# Preview the frame after the total_sqft conversion.
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,BHK
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2.0
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4.0
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3.0
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3.0
4,Kothanur,2 BHK,1200.0,2.0,51.0,2.0


In [61]:
# total_sqft should now be float64; check the remaining non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13274 non-null  float64
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
 5   BHK         13304 non-null  float64
dtypes: float64(4), object(2)
memory usage: 624.5+ KB


In [62]:
# Rows whose total_sqft is missing after the conversion.
df[df['total_sqft'].isnull()]

Unnamed: 0,location,size,total_sqft,bath,price,BHK
410,Kengeri,1 BHK,,1.0,18.5,1.0
648,Arekere,9 Bedroom,,9.0,265.0,9.0
775,Basavanagara,1 BHK,,2.0,93.0,1.0
872,Singapura Village,2 BHK,,2.0,45.0,2.0
1019,Marathi Layout,1 Bedroom,,1.0,110.0,1.0
1086,Narasapura,2 Bedroom,,2.0,29.5,2.0
1400,Chamrajpet,9 BHK,,9.0,296.0,9.0
1712,Singena Agrahara,3 Bedroom,,3.0,95.0,3.0
1743,Hosa Road,3 BHK,,3.0,115.0,3.0
1821,Sarjapur,3 Bedroom,,3.0,76.0,3.0


In [63]:
# Drop every row that still has a missing value in any column.
# This mutates the frame in place, so it also affects df_copy1 (same object
# as df), and the original row labels are kept (the index is not reset).
df.dropna(inplace=True)

In [64]:
# Confirm that no missing values remain and see the final row count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13201 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13201 non-null  object 
 1   size        13201 non-null  object 
 2   total_sqft  13201 non-null  float64
 3   bath        13201 non-null  float64
 4   price       13201 non-null  float64
 5   BHK         13201 non-null  float64
dtypes: float64(4), object(2)
memory usage: 721.9+ KB


In [65]:
# Write the cleaned frame to CSV; index=False omits the row labels from the file.
df.to_csv('../datasets/df_data_cleaned.csv',index=False)