In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from scipy.cluster.vq import whiten

In [2]:
def print_nan_percentage(df: pd.DataFrame, onlyNaNs=False):
    nan_percentage = df.isna().mean() * 100
    for feature, percentage in nan_percentage.items():
        if onlyNaNs and percentage == 0:
            continue
        print(f"{feature}: {percentage:.2f}% NaN values")

In [3]:
df = pd.read_csv("Bengaluru_House_Data.csv")
df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


In [4]:
print_nan_percentage(df, True)

location: 0.01% NaN values
size: 0.12% NaN values
society: 41.31% NaN values
bath: 0.55% NaN values
balcony: 4.57% NaN values


In [5]:
df = df.dropna(subset=["location"])

In [25]:
df["size"].mode()

0    2 BHK
Name: size, dtype: object

In [29]:
df["size"] = df["size"].fillna(df["size"].mode()[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["size"] = df["size"].fillna(df["size"].mode()[0])


In [47]:
df = df.drop(columns=["society"])

In [51]:
df["bath"] = df["bath"].fillna(df["bath"].median())

In [58]:
df["balcony"] = df["balcony"].fillna(df["balcony"].median())

In [60]:
print_nan_percentage(df, True)

In [65]:
data_numeric = df._get_numeric_data()
data_numeric

Unnamed: 0,bath,balcony,price
0,2.0,1.0,39.07
1,5.0,3.0,120.00
2,2.0,3.0,62.00
3,3.0,1.0,95.00
4,2.0,1.0,51.00
...,...,...,...
13315,4.0,0.0,231.00
13316,5.0,2.0,400.00
13317,2.0,1.0,60.00
13318,4.0,1.0,488.00


In [69]:
data_whitened = whiten(data_numeric)
data_normal = pd.DataFrame(normalize(data_whitened), columns=data_numeric.columns)
data_normal

Unnamed: 0,bath,balcony,price
0,0.761256,0.634531,0.133641
1,0.698943,0.699109,0.150747
2,0.369344,0.923579,0.102894
3,0.848258,0.471367,0.241394
4,0.756515,0.630579,0.173362
...,...,...,...
13314,0.887588,0.000000,0.460638
13315,0.714010,0.476120,0.513323
13316,0.752186,0.626971,0.202789
13317,0.648788,0.270392,0.711310


In [79]:
print(set(data_normal.columns))
print(set(df.columns).difference(set(data_normal.columns)))

{'balcony', 'price', 'bath'}
{'availability', 'area_type', 'total_sqft', 'size', 'location'}
