In [1]:
import pandas as pd

df = pd.read_csv('/content/Bengaluru_House_Data (2).csv')
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [2]:
missing_society = df['society'].isnull().sum()
percentage_missing_society = (missing_society / len(df)) * 100

print(f"Number of missing values in 'society' column: {missing_society}")
print(f"Percentage of missing values in 'society' column: {percentage_missing_society:.2f}%")

print("\nUnique values and their counts for 'society' column:")
print(df['society'].value_counts(dropna=False).head(20))

Number of missing values in 'society' column: 5502
Percentage of missing values in 'society' column: 41.31%

Unique values and their counts for 'society' column:
society
NaN        5502
GrrvaGr      80
PrarePa      76
Prtates      59
Sryalan      59
GMown E      56
Bhmesy       51
Prtanha      51
Prityel      50
Prarkun      49
PrityTr      49
Dhalsh       47
IBityin      45
SNity S      40
Soresea      39
Adeatlm      37
Soitya       36
Rosha I      35
Bhe 2ko      35
SunceEs      34
Name: count, dtype: int64


In [3]:
df['society'].fillna('Unknown', inplace=True)

print("Unique values and their counts for 'society' column after imputation:")
print(df['society'].value_counts(dropna=False).head(20))

Unique values and their counts for 'society' column after imputation:
society
Unknown    5502
GrrvaGr      80
PrarePa      76
Prtates      59
Sryalan      59
GMown E      56
Bhmesy       51
Prtanha      51
Prityel      50
Prarkun      49
PrityTr      49
Dhalsh       47
IBityin      45
SNity S      40
Soresea      39
Adeatlm      37
Soitya       36
Rosha I      35
Bhe 2ko      35
SunceEs      34
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['society'].fillna('Unknown', inplace=True)


**Reasoning**:
The previous code generated a `FutureWarning` because `inplace=True` with chained assignment is deprecated. To resolve this, I will directly assign the result of the `fillna` operation back to the 'society' column, which is the recommended way to perform this operation.



In [4]:
df['society'] = df['society'].fillna('Unknown')

print("Unique values and their counts for 'society' column after imputation:")
print(df['society'].value_counts(dropna=False).head(20))

Unique values and their counts for 'society' column after imputation:
society
Unknown    5502
GrrvaGr      80
PrarePa      76
Prtates      59
Sryalan      59
GMown E      56
Bhmesy       51
Prtanha      51
Prityel      50
Prarkun      49
PrityTr      49
Dhalsh       47
IBityin      45
SNity S      40
Soresea      39
Adeatlm      37
Soitya       36
Rosha I      35
Bhe 2ko      35
SunceEs      34
Name: count, dtype: int64


In [5]:
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = df[(df['price'] < lower_bound) | (df['price'] > upper_bound)]

print(f"Number of outliers in 'price' column: {len(outliers)}")
print("\nHead of DataFrame containing outliers in 'price' column:")
print(outliers[['price']].head())

Number of outliers in 'price' column: 1276

Head of DataFrame containing outliers in 'price' column:
    price
7   600.0
9   370.0
11  295.0
18  290.0
22  380.0


In [6]:
price_outlier_indices = outliers.index

print(f"Number of rows identified as price outliers: {len(price_outlier_indices)}")
print("First 5 indices of price outliers:")
print(price_outlier_indices[:5])

Number of rows identified as price outliers: 1276
First 5 indices of price outliers:
Index([7, 9, 11, 18, 22], dtype='int64')


**Reasoning**:
Now that the outliers' indices have been identified, I will drop these rows from the main DataFrame to clean the dataset. This action is irreversible, and it's justified because these values are extreme and can negatively impact model training.



In [7]:
df_cleaned = df.drop(price_outlier_indices)

print(f"Original DataFrame shape: {df.shape}")
print(f"Cleaned DataFrame shape after dropping price outliers: {df_cleaned.shape}")

Original DataFrame shape: (13320, 9)
Cleaned DataFrame shape after dropping price outliers: (12044, 9)


In [8]:
print("Preview of the cleaned dataset (first 5 rows):")
print(df_cleaned.head())

print(f"\nTotal number of rows in the cleaned dataset: {len(df_cleaned)}")

Preview of the cleaned dataset (first 5 rows):
              area_type   availability                  location       size  \
0  Super built-up  Area         19-Dec  Electronic City Phase II      2 BHK   
1            Plot  Area  Ready To Move          Chikka Tirupathi  4 Bedroom   
2        Built-up  Area  Ready To Move               Uttarahalli      3 BHK   
3  Super built-up  Area  Ready To Move        Lingadheeranahalli      3 BHK   
4  Super built-up  Area  Ready To Move                  Kothanur      2 BHK   

   society total_sqft  bath  balcony   price  
0   Coomee       1056   2.0      1.0   39.07  
1  Theanmp       2600   5.0      3.0  120.00  
2  Unknown       1440   2.0      3.0   62.00  
3  Soiewre       1521   3.0      1.0   95.00  
4  Unknown       1200   2.0      1.0   51.00  

Total number of rows in the cleaned dataset: 12044
