<a href="https://colab.research.google.com/github/Arun-Varghese2312/California-Housing-/blob/main/California_Housing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [63]:
# regression
# California Housing Project Data Set

In [64]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load Dataset
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [65]:
# Convert to DataFrame
df = pd.DataFrame(housing.data, columns=housing.feature_names)

In [66]:
# Add target column
df['MedHouseVal'] = housing.target

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [68]:
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [69]:
# Missing Values
df.isnull().sum()

Unnamed: 0,0
MedInc,0
HouseAge,0
AveRooms,0
AveBedrms,0
Population,0
AveOccup,0
Latitude,0
Longitude,0
MedHouseVal,0


In [70]:
# Feature Scaling (Standardization)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = df.drop('MedHouseVal', axis=1)
y = df['MedHouseVal']
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
X_scaled.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,2.344766,0.982143,0.628559,-0.153758,-0.974429,-0.049597,1.052548,-1.327835
1,2.332238,-0.607019,0.327041,-0.263336,0.861439,-0.092512,1.043185,-1.322844
2,1.782699,1.856182,1.15562,-0.049016,-0.820777,-0.025843,1.038503,-1.332827
3,0.932968,1.856182,0.156966,-0.049833,-0.766028,-0.050329,1.038503,-1.337818
4,-0.012881,1.856182,0.344711,-0.032906,-0.759847,-0.085616,1.038503,-1.337818


In [71]:
# Split dataset into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [72]:
# Explanation of processing steps
# Loaded and Converted dataset into pandas DataFrame
# Checked for missing values
# Separated features and target variable
# Applied StandardScaler to scale features
# Split dataset into training and testing sets

In [73]:
# Regression Algorithm Implementation

# Import regression models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

In [74]:
# Implement Linear Regression

lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

In [75]:
# Linear regression predicts the housde price based on features and target

In [76]:
# Decision Tree Regressor

dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test) # splits data based on feature values

In [77]:
# Random forest regressor

rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test) # Random forest uses multiple decision trees and improves accuracy

In [78]:
# Gradient Boosting Regressor

gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, y_train)
gb_pred = dt.predict(X_test) # Gradient boosting improves predictions by correcting past errors

In [79]:
# Support Vector Regressor

svr = SVR()
svr.fit(X_train, y_train)
svr_pred = svr.predict(X_test) # SVR find better boundaries to predict contineous values

In [80]:
# Model Evaluation and Comparison
# Import Evaluation matrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [81]:
# EValuate Linear Regression

lr_mse = mean_squared_error(y_test, lr_pred)
lr_mae = mean_absolute_error(y_test, lr_pred)
lr_r2 = r2_score(y_test, lr_pred)

In [82]:
# Evaluate Decision Tree

dt_mse = mean_squared_error(y_test, dt_pred)
dt_mae = mean_absolute_error(y_test, dt_pred)
dt_r2 = r2_score(y_test, dt_pred)

In [83]:
# Evaluate Random Forest

rf_mse = mean_absolute_error(y_test, rf_pred)
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)

In [84]:
# Evaluate Gradient Boosting

gb_mse = mean_absolute_error(y_test, gb_pred)
gb_mae = mean_absolute_error(y_test, gb_pred)
gb_r2 = r2_score(y_test, gb_pred)

In [85]:
# Evaluate SVR

svr_mse = mean_absolute_error(y_test, svr_pred)
svr_mae = mean_absolute_error(y_test, svr_pred)
svr_r2 = r2_score(y_test, svr_pred)

In [86]:
# Compare result of all models and identify the best and worst

# Create dictionary of all model results

results = {
    "Linear Regression": {
        "MSE": lr_mse,
        "MAE": lr_mae,
        "R2": lr_r2
    },
    "Decision Tree": {
        "MSE": dt_mse,
        "MAE": dt_mae,
        "R2": dt_r2
    },
    "Random Forest": {
        "MSE": rf_mse,
        "MAE": rf_r2
    },
    "Gradient Boosting": {
        "MSE": gb_mse,
        "MAE": gb_mae,
        "R2": gb_r2
    },
    "Support Vector Regressor": {
        "MSE": svr_mse,
        "MAE": svr,
        "R2": svr_r2
    },

}


In [87]:
# Convert to dataframe for comparison

results_df = pd.DataFrame(results).T
results_df

Unnamed: 0,MSE,MAE,R2
Linear Regression,0.555892,0.5332,0.575788
Decision Tree,0.494272,0.453784,0.622811
Random Forest,0.327613,0.805024,
Gradient Boosting,0.453784,0.453784,0.622811
Support Vector Regressor,0.397763,SVR(),0.728941


In [89]:
# Find best model

best_model = results_df["R2"].idxmax()
best_model

'Support Vector Regressor'

In [90]:
# # Find Worst Model

worst_model = results_df["R2"].idxmin()
worst_model

'Linear Regression'

In [None]:
# Best Model is SVR and Worst model is Linear Regression