# 02 - Data Cleaning
This notebook handles missing values and prepares data for feature engineering.

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
%matplotlib inline

In [10]:
# Load training data
train_df = pd.read_csv("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/train.csv")
print(f"Shape before cleaning: {train_df.shape}")

Shape before cleaning: (1460, 81)


In [11]:
# Check total missing values again
missing_values = train_df.isnull().sum()
missing_values = missing_values[missing_values > 0].sort_values(ascending=False)
missing_values

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
BsmtExposure      38
BsmtFinType2      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
dtype: int64

In [12]:
# Drop columns with too many missing values (threshold = 80%)
threshold = 0.8 * train_df.shape[0]
cols_to_drop = missing_values[missing_values > threshold].index
train_df.drop(columns=cols_to_drop, inplace=True)
print(f"Dropped columns: {list(cols_to_drop)}")
print(f"Shape after dropping: {train_df.shape}")

Dropped columns: ['PoolQC', 'MiscFeature', 'Alley', 'Fence']
Shape after dropping: (1460, 77)


In [13]:
# Fill missing numerical columns with median
num_cols = train_df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    if train_df[col].isnull().sum() > 0:
        train_df[col].fillna(train_df[col].median())

In [14]:
# Fill missing categorical columns with mode
cat_cols = train_df.select_dtypes(include=['object']).columns
for col in cat_cols:
    if train_df[col].isnull().sum() > 0:
        train_df[col].fillna(train_df[col].mode()[0])

In [15]:
# Final check for any remaining missing values
train_df.isnull().sum().sum()

np.int64(2422)

In [16]:
# Save cleaned data to use in next step
train_df.to_csv("/Users/alaminbinshafiq/Desktop/Projects/housing-price-prediction/data/train_cleaned.csv", index=False)
print("Cleaned dataset saved.")

Cleaned dataset saved.
