In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Read the Melbourne Housing CSV into the working DataFrame `df`
df = pd.read_csv(filepath_or_buffer="melb_data.csv")

# Preview the first 5 rows as this cell's output
df.head(n=5)

In [None]:
# Structural overview: dimensions first, then the per-column report from df.info()
print("Shape of dataset:", df.shape)
print("\nData types and missing values:")
df.info()

# Summary statistics over every column (numeric and non-numeric alike),
# transposed so that each original column is shown as one row
df.describe(include="all").transpose()

In [None]:
# Check missing values: per-column NaN counts before any cleaning.
# Printed explicitly because only the last expression of a cell is auto-displayed.
print(df.isnull().sum())

# Option 1: Drop columns with too many missing values
# errors='ignore' keeps this cell re-runnable once the columns are already gone.
df = df.drop(columns=['BuildingArea', 'YearBuilt'], errors='ignore')

# Option 2: Fill missing numerical with median, categorical with mode
for col in df.columns:
    if not df[col].isnull().any():
        continue  # nothing to impute in this column

    # Dtype-agnostic check instead of comparing against "object": any
    # non-numeric column (object, category, dedicated string dtype) takes the
    # mode branch rather than reaching median(), which raises on text data.
    if pd.api.types.is_numeric_dtype(df[col]):
        fill_value = df[col].median()
    else:
        modes = df[col].mode()
        if modes.empty:
            # Column is entirely missing: there is no mode to fill with, so
            # leave it untouched instead of failing on modes[0].
            continue
        fill_value = modes.iloc[0]

    df[col] = df[col].fillna(fill_value)

df.isnull().sum().sum()   # should be 0

In [None]:
# Encode categorical columns
# Every object column gets its own fitted LabelEncoder, stored in
# `label_encoders` under the column name, so the integer codes can later be
# mapped back with label_encoders[col].inverse_transform(...). Re-fitting one
# shared encoder would retain only the classes of the last column processed.
label_encoders = {}
le = LabelEncoder()  # keeps `le` defined even when there is nothing to encode
for col in df.select_dtypes(include="object").columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

df.head()

In [None]:
# Box plots of the numeric columns, one figure per column
num_cols = df.select_dtypes(include=np.number).columns

# Only the first 5 numeric columns are drawn, to keep the preview short
for col in num_cols[:5]:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f"Boxplot of {col}")
    plt.show()

In [None]:
# Standardise the columns listed in `num_cols` with a freshly fitted StandardScaler.
# NOTE(review): if the label-encoding cell ran before `num_cols` was computed,
# `num_cols` also contains the encoded categorical columns, so their integer
# codes are scaled as well — confirm this is intended.
scaler = StandardScaler()
scaled_df = pd.DataFrame(
    scaler.fit_transform(df[num_cols]),
    columns=num_cols,
    # Carry over df's row labels: fit_transform returns a bare array, and a
    # default RangeIndex here would misalign rows in a later axis=1 concat
    # whenever df's index is not 0..n-1 (e.g. after dropping or filtering rows).
    index=df.index,
)

print("First 5 rows after scaling:")
scaled_df.head()

In [None]:
# Combine categorical + scaled numerical
# The scaled columns come first; whatever columns of df were not in `num_cols`
# are appended to their right, matched on the row index.
# NOTE(review): if every column of df is numeric by this point (e.g. after
# label encoding), the dropped frame has no columns and final_df is just
# scaled_df — verify against the intended pipeline.
unscaled_part = df.drop(columns=num_cols)
final_df = pd.concat([scaled_df, unscaled_part], axis="columns")

print("Final dataset shape:", final_df.shape)
final_df.head()