In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [3]:
df = pd.read_csv('../datasets/main_df.csv')
df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [4]:
df.shape

(13320, 5)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [6]:
df_copy = df.copy()

In [7]:
df_copy[df_copy['location'].isnull()]

Unnamed: 0,location,size,total_sqft,bath,price
568,,3 BHK,1600,3.0,86.0


In [8]:
df['location'].mode()

0    Whitefield
Name: location, dtype: object

**There is a NaN value in location column - We would replace it with the mode of the location (Whitefield).**

In [9]:
df_copy['location'].fillna(df_copy['location'].mode()[0],inplace=True)
df['location'].fillna(df['location'].mode()[0],inplace=True)

In [10]:
df.isnull().sum()

location       0
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [12]:
df['bath'].unique()

array([ 2.,  5.,  3.,  4.,  6.,  1.,  9., nan,  8.,  7., 11., 10., 14.,
       27., 12., 16., 40., 15., 13., 18.])

**We'll assign random values within the range of 1 to 5 to fill in the NaN entries for the 'bathrooms' column, considering that the typical number of bathrooms falls within this range.**

In [13]:
random_values_bath = np.random.randint(1, 6, size=df['bath'].isnull().sum())
random_values_bath

array([2, 2, 4, 3, 2, 4, 5, 3, 5, 2, 2, 3, 2, 4, 5, 2, 1, 1, 2, 2, 5, 1,
       4, 3, 4, 5, 1, 1, 2, 5, 1, 2, 5, 5, 3, 4, 4, 2, 2, 1, 2, 4, 3, 3,
       3, 1, 3, 3, 3, 5, 2, 4, 3, 1, 2, 3, 1, 4, 1, 4, 1, 4, 2, 1, 4, 5,
       4, 3, 3, 5, 4, 3, 5])

In [14]:
df_copy.loc[df['bath'].isnull(), 'bath'] = random_values_bath

In [15]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [16]:
df.loc[df['bath'].isnull(), 'bath'] = random_values_bath

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [18]:
df['bath'].median()

2.0

In [19]:
df['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', nan, '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [20]:
df['size'].isnull().sum()

16

In [21]:
df_copy['BHK'] = df_copy['size'].str.split().str[0]

In [22]:
df_copy1 = df_copy.copy()

In [23]:
df_copy1

Unnamed: 0,location,size,total_sqft,bath,price,BHK
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.00,4
2,Uttarahalli,3 BHK,1440,2.0,62.00,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.00,3
4,Kothanur,2 BHK,1200,2.0,51.00,2
...,...,...,...,...,...,...
13315,Whitefield,5 Bedroom,3453,4.0,231.00,5
13316,Richards Town,4 BHK,3600,5.0,400.00,4
13317,Raja Rajeshwari Nagar,2 BHK,1141,2.0,60.00,2
13318,Padmanabhanagar,4 BHK,4689,4.0,488.00,4


In [24]:
RK = df_copy1[df_copy1['size'].str.split().str[1] == 'RK'].index

In [25]:
df_copy1.loc[RK,'BHK'] = 0

In [26]:
df_copy1[df_copy1['BHK'] == 0]

Unnamed: 0,location,size,total_sqft,bath,price,BHK
24,Thanisandra,1 RK,510,1.0,25.25,0
782,Thanisandra,1 RK,445,1.0,28.0,0
1363,Thanisandra,1 RK,510,1.0,25.25,0
2486,Bhoganhalli,1 RK,296,1.0,22.89,0
2557,Anekal,1 RK,351,1.0,16.0,0
2788,Rachenahalli,1 RK,440,1.0,28.0,0
4876,Electronic City,1 RK,435,1.0,19.5,0
5079,Whitefield,1 RK,905,1.0,52.0,0
5285,Rachenahalli,1 RK,385 - 440,1.0,19.8,0
6586,Electronics City Phase 1,1 RK,360,1.0,16.9,0


In [27]:
df_copy1.isnull().sum()

location       0
size          16
total_sqft     0
bath           0
price          0
BHK           16
dtype: int64

In [28]:
df['BHK'] = df['size'].str.split().str[0]

In [29]:
RK = df[df['size'].str.split().str[1] == 'RK'].index

In [30]:
df.loc[RK,'BHK'] = 0

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
 5   BHK         13304 non-null  object 
dtypes: float64(2), object(4)
memory usage: 624.5+ KB


In [32]:
random_values_size = np.random.randint(1, 6, size=df['size'].isnull().sum())
random_values_size

array([5, 2, 1, 1, 4, 5, 1, 5, 2, 4, 4, 3, 4, 3, 3, 4])

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
 5   BHK         13304 non-null  object 
dtypes: float64(2), object(4)
memory usage: 624.5+ KB


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
 5   BHK         13304 non-null  object 
dtypes: float64(2), object(4)
memory usage: 624.5+ KB


In [35]:
df_copy2 = df.copy()

In [36]:
random_values_BHK = df_copy2['BHK'].isnull().sum()
random_values_BHK

16

In [37]:
df_copy3 = df.copy()

In [38]:
df_copy3.fillna(df['BHK'].median())['BHK'].isnull().sum()

0

In [39]:
df.fillna(df['BHK'].median(),inplace=True)

In [40]:
df['BHK'].isnull().sum()

0

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
 5   BHK         13320 non-null  object 
dtypes: float64(2), object(4)
memory usage: 624.5+ KB


In [42]:
df.drop(columns=['size'],inplace=True)

In [43]:
df.head()

Unnamed: 0,location,total_sqft,bath,price,BHK
0,Electronic City Phase II,1056,2.0,39.07,2
1,Chikka Tirupathi,2600,5.0,120.0,4
2,Uttarahalli,1440,2.0,62.0,3
3,Lingadheeranahalli,1521,3.0,95.0,3
4,Kothanur,1200,2.0,51.0,2


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   total_sqft  13320 non-null  object 
 2   bath        13320 non-null  float64
 3   price       13320 non-null  float64
 4   BHK         13320 non-null  object 
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [45]:
df.sample(5)

Unnamed: 0,location,total_sqft,bath,price,BHK
1478,Binny Pete,2465.0,5.0,234.0,3
7790,Kaikondrahalli,1253.0,2.0,81.5,2
13311,Ramamurthy Nagar,1500.0,9.0,250.0,7
7494,Hosa Road,1369.1,2.0,98.0,2
8875,Electronic City Phase II,1000.0,2.0,28.88,2


**There are a few non-float values in total_sqft.We would assign the mean of the range as the total_sqft**

In [46]:
df_copy5 = df.copy()

In [47]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [48]:
df_copy5[~df_copy5['total_sqft'].apply(is_float)]['total_sqft'].str.split('-')

30       [2100 ,  2850]
56       [3010 ,  3410]
81       [2957 ,  3450]
122      [3067 ,  8156]
137      [1042 ,  1105]
              ...      
12990    [1804 ,  2273]
13059    [1200 ,  1470]
13240    [1020 ,  1130]
13265    [1133 ,  1384]
13299    [2830 ,  2882]
Name: total_sqft, Length: 247, dtype: object

In [49]:
def convert_sqft_to_float(x):
    token = x.split('-')
    if(len(token) == 2):
        return (float(token[0])+float(token[1]))/2
    try:
        return float(x)
    except:
        return None
     

In [50]:
convert_sqft_to_float('2100')

2100.0

In [51]:
convert_sqft_to_float('2100 - 2900')

2500.0

In [52]:
df_copy5['total_sqft'] = df_copy5['total_sqft'].apply(convert_sqft_to_float)

In [62]:
df_copy5['total_sqft'].isnull().sum()

46

In [54]:
df['total_sqft'] = df['total_sqft'].apply(convert_sqft_to_float)

In [55]:
df.head()

Unnamed: 0,location,total_sqft,bath,price,BHK
0,Electronic City Phase II,1056.0,2.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,120.0,4
2,Uttarahalli,1440.0,2.0,62.0,3
3,Lingadheeranahalli,1521.0,3.0,95.0,3
4,Kothanur,1200.0,2.0,51.0,2


In [56]:
df.to_csv('../datasets/df_data_cleaned.csv',index=False)

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   total_sqft  13274 non-null  float64
 2   bath        13320 non-null  float64
 3   price       13320 non-null  float64
 4   BHK         13320 non-null  object 
dtypes: float64(3), object(2)
memory usage: 520.4+ KB


In [58]:
df[df['total_sqft'].isnull()]

Unnamed: 0,location,total_sqft,bath,price,BHK
410,Kengeri,,1.0,18.5,1
648,Arekere,,9.0,265.0,9
775,Basavanagara,,2.0,93.0,1
872,Singapura Village,,2.0,45.0,2
1019,Marathi Layout,,1.0,110.0,1
1086,Narasapura,,2.0,29.5,2
1400,Chamrajpet,,9.0,296.0,9
1712,Singena Agrahara,,3.0,95.0,3
1743,Hosa Road,,3.0,115.0,3
1821,Sarjapur,,3.0,76.0,3


In [59]:
df2 = pd.read_csv('../datasets/main_df.csv')


In [61]:
df2['total_sqft'].isnull().sum()

0

In [65]:
df_copy5.dropna(inplace=True)

In [67]:
df.dropna(inplace=True)

In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13274 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13274 non-null  object 
 1   total_sqft  13274 non-null  float64
 2   bath        13274 non-null  float64
 3   price       13274 non-null  float64
 4   BHK         13274 non-null  object 
dtypes: float64(3), object(2)
memory usage: 622.2+ KB


In [69]:
df.to_csv('../datasets/df_data_cleaned.csv',index=False)