In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the source CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer="melb_data.csv")

# Preview the top five records
data.head(n=5)

In [None]:
# Dataset information
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
print("Dataset Info:")
data.info()

# Shape of dataset
print("\nShape of dataset:", data.shape)

# Statistical summary (one row per column, covering all dtypes)
data.describe(include="all").transpose()

In [None]:
# Per-column tally of null entries
missing_per_column = data.isnull().sum()
print("Missing Values Count:\n", missing_per_column)

# Heatmap of the null mask to show where the gaps are
null_mask = data.isnull()
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(null_mask, cbar=False, cmap="viridis", ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()

In [None]:
# Fill numeric missing values with median.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on the intermediate `data[col]` object is chained
# assignment, which pandas 2.x warns about and which does not update `data`
# when Copy-on-Write is enabled.
for col in data.select_dtypes(include=[np.number]).columns:
    data[col] = data[col].fillna(data[col].median())

# Fill categorical missing values with mode
for col in data.select_dtypes(include=[object]).columns:
    modes = data[col].mode()
    # mode() is empty when every value in the column is missing; there is
    # nothing to impute from in that case, so the column is left unchanged
    # instead of failing on the first-element lookup.
    if not modes.empty:
        data[col] = data[col].fillna(modes.iloc[0])

print("Missing values after treatment:\n", data.isnull().sum().sum())

In [None]:
# Encode categorical columns using Label Encoding
from sklearn.preprocessing import LabelEncoder

# One fitted encoder is kept per column so the integer codes can be mapped
# back to the original categories later, e.g.
# label_encoders[col].inverse_transform(data[col]).
# Refitting a single shared encoder would retain only the last column's classes.
label_encoders = {}
for col in data.select_dtypes(include=[object]).columns:
    label_enc = LabelEncoder()
    data[col] = label_enc.fit_transform(data[col])
    label_encoders[col] = label_enc

data.head()

In [None]:
# Function to detect outliers using IQR
def detect_outliers(df: pd.DataFrame, col: str, factor: float = 1.5) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``col`` value lies outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where
    ``Q1``/``Q3`` are the 25th/75th percentiles of ``df[col]`` and
    ``IQR = Q3 - Q1``. Values exactly on a fence are not reported.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect; it is not modified.
    col : str
        Name of the numeric column to test.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences. The default
        reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    pd.DataFrame
        The subset of rows (original index preserved) whose ``col`` value is
        strictly below the lower fence or strictly above the upper fence.
    """
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - factor * IQR
    upper = Q3 + factor * IQR
    # Missing values compare False against both fences, so they are never flagged.
    is_outlier = (df[col] < lower) | (df[col] > upper)
    return df[is_outlier]

# Apply the IQR check to the Price column and report how many rows it flags
outliers_price = detect_outliers(df=data, col="Price")
print("Outliers in Price:", outliers_price.shape[0])

In [None]:
# Price: box-and-whisker view
fig_price, ax_price = plt.subplots(figsize=(8, 5))
sns.boxplot(x=data["Price"], ax=ax_price)
ax_price.set_title("Boxplot of House Prices")
plt.show()

# Rooms: 20-bin histogram with a KDE curve overlaid
fig_rooms, ax_rooms = plt.subplots(figsize=(8, 5))
sns.histplot(data["Rooms"], bins=20, kde=True, ax=ax_rooms)
ax_rooms.set_title("Distribution of Rooms")
plt.show()

In [None]:
# Pairwise column correlations, drawn as an un-annotated heatmap
corr_matrix = data.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=False, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Persist the processed frame to disk (row index omitted)
cleaned_path = "melb_data_cleaned.csv"
data.to_csv(cleaned_path, index=False)
print(f"Cleaned dataset saved as {cleaned_path}")