Cell 1 - Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import os

# Make sure the output folders exist; exist_ok=True keeps re-runs of this cell harmless.
for output_dir in ("visuals", "insights"):
    os.makedirs(output_dir, exist_ok=True)


Cell 2 - Load dataset

In [5]:
# Location of the housing CSV. The HOUSING_CSV environment variable overrides it
# so the notebook can run on other machines; when the variable is unset the
# original local path is used, so existing setups behave exactly as before.
DATA_PATH = os.environ.get(
    "HOUSING_CSV", "D:/ai-ml-learning-journey/Day_6ML/Housing.csv"
)

# Fail early with an actionable message (same exception type read_csv would raise).
if not os.path.isfile(DATA_PATH):
    raise FileNotFoundError(
        f"Housing CSV not found at {DATA_PATH!r}; "
        "set the HOUSING_CSV environment variable to the file's location."
    )

dataset = pd.read_csv(DATA_PATH)
print("Review Data :")
# Last expression of the cell: rendered as a rich table preview.
dataset.head()

Review Data :


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


Cell 3 - Cleaning data

In [10]:
# Clean by chaining and rebinding instead of mutating with inplace=True:
# the cell stays a single expression that is safe to re-run, and no frame
# object is modified behind the back of an earlier cell's display.
dataset = (
    dataset
    .dropna()            # drop rows that contain missing values
    .drop_duplicates()   # remove duplicates if any
)

Cell 4 - Basic Summary

In [11]:
# Summary statistics of `dataset`, shown as the cell's rich output.
print("Dataset Summary:")
summary_stats = dataset.describe()
summary_stats

Dataset Summary:


Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


Cell 5 - Define X (features) and y (target)

In [19]:
# Columns used as model inputs, and the column to predict.
feature_columns = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
target_column = 'price'

X = dataset[feature_columns]
y = dataset[target_column]


Cell 6 - Train-test split

In [20]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\n Data split completed successfully!")
# Report the shape of each feature partition.
for split_label, split_frame in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_label} set size: {split_frame.shape}")


 Data split completed successfully!
Training set size: (436, 5)
Testing set size: (109, 5)


Cell 7 - Visualization

In [24]:
# Area vs Price: scatter plot with points coloured by bedroom count, saved to disk.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=dataset, x="area", y="price", hue="bedrooms", palette="coolwarm", ax=ax)
ax.set_title("Area vs Price (colored by Bedrooms)", fontsize=13, weight='bold')
ax.set_xlabel("Area (sqft)")
ax.set_ylabel("Price")
fig.tight_layout()
fig.savefig("visuals/area_vs_price.png")
plt.close(fig)  # figure is only written to file, not shown inline

# Bedrooms distribution: count of rows per bedroom value, saved to disk.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(data=dataset, x="bedrooms", hue="bedrooms", palette="viridis", legend=False, ax=ax)
ax.set_title("Bedroom Distribution", fontsize=13, weight='bold')
ax.set_xlabel("No. of Bedrooms")
ax.set_ylabel("Count")
fig.tight_layout()
fig.savefig("visuals/bedrooms_distribution.png")
plt.close(fig)
