In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
df = pd.read_csv('../datasets/main_df.csv')
df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [4]:
df.shape

(13320, 5)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [20]:
df_copy = df.copy()

In [21]:
df_copy[df_copy['location'].isnull()]

Unnamed: 0,location,size,total_sqft,bath,price
568,,3 BHK,1600,3.0,86.0


In [22]:
df['location'].mode()

0    Whitefield
Name: location, dtype: object

**There is a NaN value in location column - We would replace it with the mode of the location (Whitefield).**

In [28]:
df_copy['location'].fillna(df_copy['location'].mode()[0],inplace=True)
df['location'].fillna(df['location'].mode()[0],inplace=True)

In [29]:
df.isnull().sum()

location       0
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [33]:
df['bath'].unique()

array([ 2.,  5.,  3.,  4.,  6.,  1.,  9., nan,  8.,  7., 11., 10., 14.,
       27., 12., 16., 40., 15., 13., 18.])

**We'll assign random values within the range of 1 to 5 to fill in the NaN entries for the 'bathrooms' column, considering that the typical number of bathrooms falls within this range.**

In [55]:
random_values_bath = np.random.randint(1, 6, size=df['bath'].isnull().sum())
random_values_bath

array([4, 1, 5, 2, 5, 4, 4, 4, 3, 2, 5, 1, 4, 3, 3, 2, 1, 4, 1, 5, 3, 4,
       5, 3, 2, 3, 3, 5, 2, 4, 1, 1, 4, 3, 4, 5, 2, 2, 4, 2, 1, 5, 2, 2,
       4, 3, 2, 4, 2, 1, 5, 5, 2, 4, 1, 3, 1, 4, 5, 2, 2, 4, 3, 3, 3, 1,
       3, 1, 2, 5, 1, 2, 4])

In [56]:
df_copy.loc[df['bath'].isnull(), 'bath'] = random_values_bath

In [57]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [58]:
df.loc[df['bath'].isnull(), 'bath'] = random_values_bath

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [61]:
df['bath'].median()

2.0

In [62]:
df['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', nan, '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [63]:
df['size'].isnull().sum()

16

In [85]:
df_copy['BHK'] = df_copy['size'].str.split().str[0]

In [89]:
df_copy1 = df_copy.copy()

In [93]:
df_copy1

Unnamed: 0,location,size,total_sqft,bath,price,BHK
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.00,4
2,Uttarahalli,3 BHK,1440,2.0,62.00,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.00,3
4,Kothanur,2 BHK,1200,2.0,51.00,2
...,...,...,...,...,...,...
13315,Whitefield,5 Bedroom,3453,4.0,231.00,5
13316,Richards Town,4 BHK,3600,5.0,400.00,4
13317,Raja Rajeshwari Nagar,2 BHK,1141,2.0,60.00,2
13318,Padmanabhanagar,4 BHK,4689,4.0,488.00,4


In [100]:
RK = df_copy1[df_copy1['size'].str.split().str[1] == 'RK'].index

In [101]:
df_copy1.loc[RK,'BHK'] = 0

In [104]:
df_copy1[df_copy1['BHK'] == 0]

Unnamed: 0,location,size,total_sqft,bath,price,BHK
24,Thanisandra,1 RK,510,1.0,25.25,0
782,Thanisandra,1 RK,445,1.0,28.0,0
1363,Thanisandra,1 RK,510,1.0,25.25,0
2486,Bhoganhalli,1 RK,296,1.0,22.89,0
2557,Anekal,1 RK,351,1.0,16.0,0
2788,Rachenahalli,1 RK,440,1.0,28.0,0
4876,Electronic City,1 RK,435,1.0,19.5,0
5079,Whitefield,1 RK,905,1.0,52.0,0
5285,Rachenahalli,1 RK,385 - 440,1.0,19.8,0
6586,Electronics City Phase 1,1 RK,360,1.0,16.9,0


In [105]:
df_copy1.isnull().sum()

location       0
size          16
total_sqft     0
bath           0
price          0
BHK           16
dtype: int64

In [106]:
df['BHK'] = df['size'].str.split().str[0]

In [107]:
RK = df[df['size'].str.split().str[1] == 'RK'].index

In [108]:
df.loc[RK,'BHK'] = 0

In [127]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
 5   BHK         13304 non-null  object 
dtypes: float64(2), object(4)
memory usage: 624.5+ KB


In [129]:
random_values_size = np.random.randint(1, 6, size=df['size'].isnull().sum())
random_values_size

array([4, 3, 2, 3, 4, 5, 2, 3, 5, 1, 1, 5, 1, 3, 3, 1])

In [131]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
 5   BHK         13304 non-null  object 
dtypes: float64(2), object(4)
memory usage: 624.5+ KB


In [135]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
 5   BHK         13304 non-null  object 
dtypes: float64(2), object(4)
memory usage: 624.5+ KB


In [136]:
df_copy2 = df.copy()

In [138]:
random_values_BHK = df_copy2['BHK'].isnull().sum()
random_values_BHK

16

In [145]:
df_copy3 = df.copy()

In [149]:
df_copy3.fillna(df['BHK'].median())['BHK'].isnull().sum()

0

In [150]:
df.fillna(df['BHK'].median(),inplace=True)

In [151]:
df['BHK'].isnull().sum()

0

In [152]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
 5   BHK         13320 non-null  object 
dtypes: float64(2), object(4)
memory usage: 624.5+ KB


In [153]:
df.drop(columns=['size'],inplace=True)

In [154]:
df.head()

Unnamed: 0,location,total_sqft,bath,price,BHK
0,Electronic City Phase II,1056,2.0,39.07,2
1,Chikka Tirupathi,2600,5.0,120.0,4
2,Uttarahalli,1440,2.0,62.0,3
3,Lingadheeranahalli,1521,3.0,95.0,3
4,Kothanur,1200,2.0,51.0,2


In [155]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   total_sqft  13320 non-null  object 
 2   bath        13320 non-null  float64
 3   price       13320 non-null  float64
 4   BHK         13320 non-null  object 
dtypes: float64(2), object(3)
memory usage: 520.4+ KB
