# Housing Prices Prediction - Regression Problem

# Import Libraries

In [1]:
import numpy as np
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.preprocessing import LabelEncoder,StandardScaler,OrdinalEncoder

# Loading Data

In [None]:
# Use the ggplot look for every figure drawn later in the notebook.
plt.style.use("ggplot")

# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
data = pd.read_csv('Housing.csv')
data

# EDA -> Exploratory Data Analysis

In [None]:
# Print each column's name, non-null count and dtype, plus memory usage.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

In [None]:
# Number of distinct values per column, smallest first.
data.nunique().sort_values()

In [None]:
# Count of rows that exactly repeat an earlier row.
data.duplicated().sum()

# Data Preprocessing

## Delete Outliers

In [None]:
# Box plot of the price column to make extreme values visible.
sns.boxplot(data=data, x="price")
plt.show()

In [None]:
# Box plot of the area column to make extreme values visible.
sns.boxplot(data=data, x="area")
plt.show()

In [None]:
# Histogram of the area column.
sns.histplot(data=data, x="area")
plt.show()

In [None]:
# Histogram of the price column.
sns.histplot(data=data, x="price")
plt.show()

In [11]:
# Keep only rows whose area lies inside the IQR fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
Q1 = data['area'].quantile(0.25)
Q3 = data['area'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
# between() is inclusive on both ends, i.e. the same as >= lower and <= upper.
# .copy() makes the result an independent frame, so the column assignments
# made further down do not trigger pandas' SettingWithCopyWarning.
data = data[data['area'].between(lower, upper)].copy()

In [12]:
# Keep only rows whose price lies inside the IQR fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
Q1 = data['price'].quantile(0.25)
Q3 = data['price'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
# between() is inclusive on both ends, i.e. the same as >= lower and <= upper.
# .copy() makes the result an independent frame, so the column assignments
# made further down do not trigger pandas' SettingWithCopyWarning.
data = data[data['price'].between(lower, upper)].copy()

## Encoding Categorical columns

In [None]:
# Categorical columns to replace with integer codes.
# A plain list of names is enough here; there is no need to slice out a
# sub-DataFrame just to iterate over its column labels.
cols = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
]
encoder = LabelEncoder()
for col in cols:
    # fit_transform re-fits the encoder on every column, so the codes are
    # derived from that column's own sorted set of values.
    data[col] = encoder.fit_transform(data[col])
data

In [None]:
# Explicit category order: position in this list becomes the encoded value
# (unfurnished -> 0, semi-furnished -> 1, furnished -> 2).
furnishing_order = ['unfurnished', 'semi-furnished', 'furnished']
encoder = OrdinalEncoder(categories=[furnishing_order])

# The encoder expects 2-D input, hence the double brackets.
data['furnishingstatus'] = encoder.fit_transform(data[['furnishingstatus']])
data

# Feature Engineering

In [15]:
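# Keep disabled: `price` is the prediction target, so a feature computed from
# it (price / area) would leak the target into the model inputs.
# if 'area' in data.columns and 'price' in data.columns:
#     data['price_per_area'] = data['price'] / data['area']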

In [16]:
# Derive room-based features only when both source columns are present.
if {'bedrooms', 'bathrooms'}.issubset(data.columns):
    bedrooms = data['bedrooms']
    bathrooms = data['bathrooms']
    # The small epsilon guards against division by zero.
    data['room_ratio'] = bedrooms / (bathrooms + 1e-6)
    data['total_rooms'] = bedrooms + bathrooms

In [None]:
# Columns whose values are added together, row by row, into `num_features`.
features = [
    "mainroad",
    "guestroom",
    "basement",
    "hotwaterheating",
    "airconditioning",
    "prefarea",
]
data['num_features'] = data[features].sum(axis='columns')
data

# Data Visualization

In [None]:
# Distribution of area after outlier removal.
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE
# overlay on a density scale is the documented replacement.
sns.histplot(x=data['area'], kde=True, stat="density")
plt.show()

In [None]:
# Distribution of price after outlier removal.
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE
# overlay on a density scale is the documented replacement.
sns.histplot(x=data['price'], kde=True, stat="density")
plt.show()

In [None]:
# Bar chart of price for each value of the mainroad column.
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(data=data, x="mainroad", y="price", ax=ax)
ax.set_title(" Price by Main Road")
plt.show()

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
correlations = data.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(correlations, annot=True)
plt.show()

In [None]:
# Scatter plot: area on the x-axis, price on the y-axis.
plt.scatter(data['area'], data['price'])
plt.show()

# Model Building and Evaluation

## Import Libraries

In [23]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import  r2_score

## Split the data (Features, Target)

In [24]:
# Target is the price column; every other column is a model input.
y = data['price']
X = data.drop(columns='price')

## Split the data (Train, Validation_test, Test)

In [25]:
# First split: 80% for training, 20% held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Second split: divide the held-out 20% evenly into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    shuffle=True,
)

## Features Scaling

In [26]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to all three splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

## Linear Regression Model

### Model Training

In [27]:
# Fit ordinary least squares on the scaled training split (fit returns the
# estimator itself, so construction and fitting can be chained).
model = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for each split, reused by the evaluation cells below.
y_predict_train = model.predict(X_train_scaled)
y_predict_val = model.predict(X_val_scaled)
y_predict_test = model.predict(X_test_scaled)

### Model Evaluation

In [None]:
# Root mean squared error of the linear model on each split.
rmse_train = np.sqrt(mean_squared_error(y_train, y_predict_train))
rmse_val = np.sqrt(mean_squared_error(y_val, y_predict_val))
rmse_test = np.sqrt(mean_squared_error(y_test, y_predict_test))

# Report in train / validation / test order, matching the Lasso, Ridge and
# decision-tree evaluation cells so the outputs can be compared line by line.
print("train:", rmse_train)
print("validation:", rmse_val)
print("test:", rmse_test)

In [None]:
# Coefficient of determination of the linear model on each split.
train_r2_linear = r2_score(y_true=y_train, y_pred=y_predict_train)
val_r2_linear = r2_score(y_true=y_val, y_pred=y_predict_val)
test_r2_linear = r2_score(y_true=y_test, y_pred=y_predict_test)

print(f"Training R²: {train_r2_linear:.2f}")
print(f"Validation R²: {val_r2_linear:.2f}")
print(f"Test R²: {test_r2_linear:.2f}")

## Lasso Regression (L1 Regularization)

### Model Training

In [None]:
# Fit an L1-regularised linear model on the scaled training split.
lasso = Lasso(alpha=0.01, random_state=42).fit(X_train_scaled, y_train)

# Predictions for each split, reused by the evaluation cells below.
ly_predict_train = lasso.predict(X_train_scaled)
ly_predict_val = lasso.predict(X_val_scaled)
ly_predict_test = lasso.predict(X_test_scaled)

### Model Evaluation

In [None]:
# Root mean squared error of the Lasso model on each split.
rmse_lasso_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=ly_predict_train))
rmse_lasso_val = np.sqrt(mean_squared_error(y_true=y_val, y_pred=ly_predict_val))
rmse_lasso_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=ly_predict_test))

print("train:", rmse_lasso_train)
print("validation:", rmse_lasso_val)
print("test:", rmse_lasso_test)

In [None]:
# Coefficient of determination of the Lasso model on each split.
train_r2_lasso = r2_score(y_true=y_train, y_pred=ly_predict_train)
val_r2_lasso = r2_score(y_true=y_val, y_pred=ly_predict_val)
test_r2_lasso = r2_score(y_true=y_test, y_pred=ly_predict_test)

print(f"Lasso (L1) Training R²: {train_r2_lasso:.2f}")
print(f"Lasso (L1) Validation R²: {val_r2_lasso:.2f}")
print(f"Lasso (L1) Test R²: {test_r2_lasso:.2f}")

## Ridge Regression (L2 Regularization)

### Model Training

In [33]:
# Fit an L2-regularised linear model on the scaled training split.
ridge = Ridge(alpha=0.01, random_state=42).fit(X_train_scaled, y_train)

# Predictions for each split, reused by the evaluation cells below.
ry_predict_train = ridge.predict(X_train_scaled)
ry_predict_val = ridge.predict(X_val_scaled)
ry_predict_test = ridge.predict(X_test_scaled)


### Model Evaluation

In [None]:
# Root mean squared error of the Ridge model on each split.
rmse_ridge_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=ry_predict_train))
rmse_ridge_val = np.sqrt(mean_squared_error(y_true=y_val, y_pred=ry_predict_val))
rmse_ridge_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=ry_predict_test))

print("train:", rmse_ridge_train)
print("validation:", rmse_ridge_val)
print("test:", rmse_ridge_test)

In [None]:
# Coefficient of determination of the Ridge model on each split.
train_r2_ridge = r2_score(y_true=y_train, y_pred=ry_predict_train)
val_r2_ridge = r2_score(y_true=y_val, y_pred=ry_predict_val)
test_r2_ridge = r2_score(y_true=y_test, y_pred=ry_predict_test)

print(f"Ridge Training R²: {train_r2_ridge:.2f}")
print(f"Ridge Validation R²: {val_r2_ridge:.2f}")
print(f"Ridge Test R²: {test_r2_ridge:.2f}")

## Decision Tree Regressor

### Model Training

In [36]:
# Fit a depth-limited regression tree on the scaled training split.
tree = DecisionTreeRegressor(max_depth=3, random_state=42).fit(X_train_scaled, y_train)

# Predictions for each split, reused by the evaluation cells below.
y_train_tree_pred = tree.predict(X_train_scaled)
y_val_tree_pred = tree.predict(X_val_scaled)
y_test_tree_pred = tree.predict(X_test_scaled)

### Model Evaluation

In [None]:
# Root mean squared error of the decision tree on each split.
rmse_tree_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_train_tree_pred))
rmse_tree_val = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_val_tree_pred))
rmse_tree_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_test_tree_pred))

print("train:", rmse_tree_train)
print("validation:", rmse_tree_val)
print("test:", rmse_tree_test)

In [None]:
# Coefficient of determination of the decision tree on each split.
r2_train_tree = r2_score(y_true=y_train, y_pred=y_train_tree_pred)
r2_val_tree = r2_score(y_true=y_val, y_pred=y_val_tree_pred)
r2_test_tree = r2_score(y_true=y_test, y_pred=y_test_tree_pred)

print(f"Tree Training R²: {r2_train_tree:.2f}")
print(f"Tree validation R²: {r2_val_tree:.2f}")
print(f"Tree Test R²: {r2_test_tree:.2f}")