## Description: Build and evaluate a regression model to predict a continuous variable (e.g., house prices).

In [113]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm

In [96]:
# Load the housing dataset from the CSV sitting next to the notebook,
# then preview the first five rows.
Housing = pd.read_csv('Housing.csv')
Housing.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [97]:
# DataFrame.info() prints its report and returns None, so it is called as a
# plain statement; putting it inside a tuple would display a stray `None`.
Housing.info()

# The cell's displayed value is just the (rows, columns) shape.
Housing.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


((545, 13), None)

In [98]:
# Columns that were loaded with object dtype and hold categorical values
categorical_cols = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
    'furnishingstatus',
]

# Cast every listed column to the category dtype with one astype mapping
Housing = Housing.astype(dict.fromkeys(categorical_cols, 'category'))

# Confirm the dtypes after the cast
Housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   price             545 non-null    int64   
 1   area              545 non-null    int64   
 2   bedrooms          545 non-null    int64   
 3   bathrooms         545 non-null    int64   
 4   stories           545 non-null    int64   
 5   mainroad          545 non-null    category
 6   guestroom         545 non-null    category
 7   basement          545 non-null    category
 8   hotwaterheating   545 non-null    category
 9   airconditioning   545 non-null    category
 10  parking           545 non-null    int64   
 11  prefarea          545 non-null    category
 12  furnishingstatus  545 non-null    category
dtypes: category(7), int64(6)
memory usage: 30.3 KB


In [99]:
# Count missing values per column (isna is the same method as isnull)
Housing.isna().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [100]:
# Expand each categorical column into indicator columns. drop_first=True
# omits the first level of every column, leaving it as the implicit baseline.
Housing_encoded = pd.get_dummies(
    data=Housing,
    columns=categorical_cols,
    drop_first=True,
)
Housing_encoded.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,True,False,False,False,True,True,False,False
1,12250000,8960,4,4,4,3,True,False,False,False,True,False,False,False
2,12250000,9960,3,2,2,2,True,False,True,False,False,True,True,False
3,12215000,7500,4,2,2,3,True,False,True,False,True,True,False,False
4,11410000,7420,4,1,2,2,True,True,True,False,True,False,False,False


In [101]:
# Target is the price column; every other encoded column is a feature
y = Housing_encoded['price']
X = Housing_encoded.drop(columns='price')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

X_train.shape, X_test.shape

((436, 13), (109, 13))

In [102]:
# Create the linear regression and fit it on the training split in one step
# (fit returns the estimator itself), then display the fitted estimator.
model = LinearRegression().fit(X_train, y_train)
model

In [103]:
# Generate predictions for the held-out test rows with the fitted model
y_pred = model.predict(X=X_test)

In [107]:
# Test-set error: RMSE is the square root of the mean squared error
test_mse = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(test_mse)

# Test-set coefficient of determination
R2 = r2_score(y_test, y_pred)

R2, RMSE

(0.6529242642153184, 1324506.9600914388)

In [108]:
# Cast the boolean dummy columns to int64 so the design matrix is purely
# numeric (presumably needed by the statsmodels fit that follows - confirm).
bool_cols = X_train.select_dtypes(include=['bool']).columns
int_casts = dict.fromkeys(bool_cols, 'int64')

# Apply the identical cast to both splits so train and test keep matching
# dtypes. astype returns new frames, so the frames produced by
# train_test_split are not modified in place and re-running this cell is safe
# (on a second run bool_cols is empty and the cast is a no-op copy).
X_train = X_train.astype(int_casts)
X_test = X_test.astype(int_casts)

X_train.dtypes

area                               int64
bedrooms                           int64
bathrooms                          int64
stories                            int64
parking                            int64
mainroad_yes                       int64
guestroom_yes                      int64
basement_yes                       int64
hotwaterheating_yes                int64
airconditioning_yes                int64
prefarea_yes                       int64
furnishingstatus_semi-furnished    int64
furnishingstatus_unfurnished       int64
dtype: object

In [109]:
# statsmodels does not add an intercept on its own, so prepend a constant column
X_train_sm = sm.add_constant(X_train)

# Ordinary least squares: price (endog) regressed on the features (exog)
model_sm = sm.OLS(endog=y_train, exog=X_train_sm)
results = model_sm.fit()


In [110]:
# Display the fitted OLS report: fit statistics (R-squared, F-statistic, AIC/BIC),
# the coefficient table with standard errors, t-values and confidence intervals,
# and the residual diagnostics.
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.686
Model:,OLS,Adj. R-squared:,0.676
Method:,Least Squares,F-statistic:,70.9
Date:,"Thu, 12 Jun 2025",Prob (F-statistic):,2.1600000000000001e-97
Time:,23:16:27,Log-Likelihood:,-6635.2
No. Observations:,436,AIC:,13300.0
Df Residuals:,422,BIC:,13360.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.6e+05,2.74e+05,0.949,0.343,-2.79e+05,7.99e+05
area,235.9688,24.903,9.476,0.000,187.020,284.918
bedrooms,7.678e+04,7.54e+04,1.018,0.309,-7.14e+04,2.25e+05
bathrooms,1.094e+06,1.14e+05,9.564,0.000,8.7e+05,1.32e+06
stories,4.075e+05,6.88e+04,5.919,0.000,2.72e+05,5.43e+05
parking,2.248e+05,6.18e+04,3.639,0.000,1.03e+05,3.46e+05
mainroad_yes,3.679e+05,1.49e+05,2.464,0.014,7.44e+04,6.61e+05
guestroom_yes,2.316e+05,1.41e+05,1.648,0.100,-4.47e+04,5.08e+05
basement_yes,3.903e+05,1.18e+05,3.320,0.001,1.59e+05,6.21e+05

0,1,2,3
Omnibus:,74.015,Durbin-Watson:,1.89
Prob(Omnibus):,0.0,Jarque-Bera (JB):,191.938
Skew:,0.833,Prob(JB):,2.0999999999999997e-42
Kurtosis:,5.791,Cond. No.,34800.0


#### Comparing different models

In [114]:
# Candidate regressors, compared on the same train/test split.
# The tree-based models get a fixed seed so their scores are repeatable.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42)
}

# Test-set metrics keyed by model name. This uses its own name so the
# statsmodels fit stored in `results` is not overwritten by a dict.
model_scores = {}

# Train and evaluate each model. The loop uses `estimator` / `test_pred`
# rather than `model` / `y_pred` so the linear-regression objects created in
# the earlier cells stay intact if those cells are revisited.
for name, estimator in models.items():
    # Fit on the training split
    estimator.fit(X_train, y_train)

    # Predict on the held-out test split
    test_pred = estimator.predict(X_test)

    # R² and RMSE on the test split
    r2 = r2_score(y_test, test_pred)
    rmse = np.sqrt(mean_squared_error(y_test, test_pred))

    # Record the metrics for later comparison
    model_scores[name] = {'R²': r2, 'RMSE': rmse}

    print(f"\n{name}:")
    print(f"Test R²: {r2:.4f}")
    print(f"Test RMSE: {rmse:.2f}")


Linear Regression:
Test R²: 0.6529
Test RMSE: 1324506.96

Decision Tree:
Test R²: 0.4771
Test RMSE: 1625669.90

Random Forest:
Test R²: 0.6119
Test RMSE: 1400565.97
