In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw Melbourne housing CSV from the working directory into `df`.
df = pd.read_csv(filepath_or_buffer="melb_data.csv")

# Preview the first five rows as the cell's rich output.
df.head(n=5)

In [None]:
# Dataset shape as (rows, columns)
print("Dataset Shape:", df.shape)

# Data types & non-null values
print("\nDataset Info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
df.info()

# Summary statistics for every column (numeric and non-numeric), transposed
# so that each original column becomes one row of the displayed table.
df.describe(include="all").transpose()

In [None]:
# Remove exact duplicate rows. A new frame is bound instead of mutating in
# place, so re-running the cell never acts on a half-modified object.
df = df.drop_duplicates()

# Keep only rows with realistic values (Price > 0 and Rooms > 0).
# NOTE: a comparison against NaN evaluates to False, so rows whose Price or
# Rooms is missing are dropped here as well and never reach later imputation.
valid_rows = (df["Price"] > 0) & (df["Rooms"] > 0)

# .copy() makes the result an independent frame; without it, later column
# assignments on `df` can raise SettingWithCopyWarning.
df = df.loc[valid_rows].copy()

print("Dataset shape after removing inappropriate data:", df.shape)

In [None]:
# Check missing values
print("Missing values before treatment:\n", df.isnull().sum())

# Build one {column: fill value} mapping and apply it with a single
# DataFrame.fillna call. The previous `df[col].fillna(..., inplace=True)`
# pattern is chained assignment on a temporary Series: it is deprecated in
# pandas 2.x and does not modify `df` at all under Copy-on-Write.

# Numeric columns -> median. Columns that are entirely NaN have a NaN median
# and are skipped, because there is nothing meaningful to fill them with.
numeric_cols = df.select_dtypes(include=[np.number]).columns
fill_values = df[numeric_cols].median().dropna().to_dict()

# Categorical (object) columns -> most frequent value. mode() returns an
# empty Series for an all-NaN column, so guard before taking the first entry.
for col in df.select_dtypes(include=[object]).columns:
    modes = df[col].mode()
    if not modes.empty:
        fill_values[col] = modes.iloc[0]

df = df.fillna(value=fill_values)

# Total count of cells still missing (non-zero only if some column was all NaN
# or has a dtype not covered above).
print("\nMissing values after treatment:\n", df.isnull().sum().sum())

In [None]:
from sklearn.preprocessing import LabelEncoder

# `label_enc` is kept for backward compatibility: after the loop it refers to
# the encoder fitted on the last categorical column (or an unfitted encoder if
# there are no categorical columns).
label_enc = LabelEncoder()

# One fitted encoder per column, keyed by column name. Refitting a single
# shared encoder would discard every mapping except the last, making it
# impossible to decode the integer codes, e.g.
#   label_encoders["Suburb"].inverse_transform(df["Suburb"])
label_encoders = {}

# Work on an independent frame so the column assignments below cannot raise
# SettingWithCopyWarning when `df` was produced by an earlier row filter.
df = df.copy()

# Encode all categorical (object-dtype) columns as integer codes.
# NOTE: once encoded, these columns are no longer object dtype, so re-running
# this cell alone finds nothing to encode and resets `label_encoders` to {}.
for col in df.select_dtypes(include=[object]).columns:
    label_enc = LabelEncoder()
    df[col] = label_enc.fit_transform(df[col])
    label_encoders[col] = label_enc

df.head()

In [None]:
# Persist the processed frame as CSV in the working directory; the row index
# is omitted so that only the data columns are written.
df.to_csv(path_or_buf="melb_data_cleaned.csv", index=False)

# Confirm the write to the reader.
print("Cleaned dataset saved as melb_data_cleaned.csv")