In [2]:
import pandas as pd
import numpy as np

# ===========================
# Load Dataset
# ===========================
df = pd.read_csv("../Datasets/housing.csv")

print("Before Cleaning — Missing Values:")
print(df.isnull().sum())

Before Cleaning — Missing Values:
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64


In [3]:
# ===========================
# Imputation
# ===========================

# Only total_bedrooms has missing
median_bedrooms = df["total_bedrooms"].median()

print(f"Median value for 'total_bedrooms' = {median_bedrooms}")

df["total_bedrooms"].fillna(median_bedrooms, inplace=True)

Median value for 'total_bedrooms' = 435.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["total_bedrooms"].fillna(median_bedrooms, inplace=True)


In [4]:
# ===========================
# Validation
# ===========================
print("\nAfter Cleaning — Missing Values:")
print(df.isnull().sum())

# Sanity check
assert df.isnull().sum().sum() == 0, "There are still missing values!"

print("\nAll missing values handled successfully.")


After Cleaning — Missing Values:
longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

All missing values handled successfully.


In [5]:
df.to_csv("housing_cleaned.csv", index=False)