# California Housing 

# Modelling 

## Here's an outline of the steps we will follow:

__1. Scaling the Data:__

> Scaling the data is beneficial, especially when using machine learning algorithms that are sensitive to the scale of the features. You can use techniques like Standardization or Min-Max scaling to scale your numerical features.


__2. Train-Test Split:__

> Splitting the data into training and testing sets helps evaluate the performance of your machine learning models on unseen data. Typically, you allocate a certain percentage of the data for testing (e.g., 20% or 30%) and the rest for training. You can use the train_test_split function from scikit-learn to perform the split.

__3. Modelling with a variety of classical ML models__

# Libraries 

In [48]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd 

# Data scaling, splitting, and tuning  

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# ML models 

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb


# ML evaluation metrics 

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [None]:
# Load the processed housing table from disk into a DataFrame.
processed_csv_path = "GDSC-California-Housing/Data/Processed_data/processed_data.csv"
df = pd.read_csv(processed_csv_path)

In [34]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_encoded
0,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,1
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,1
2,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,1
3,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,1
4,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,1


In [33]:
# Remove the "ocean_proximity" column from df in place.
df.drop(columns=["ocean_proximity"], inplace=True)

## Splitting our data into target and features 

In [42]:
# Target: the column the models will learn to predict.
y = df["median_house_value"]
# Features: every remaining column of df.
X = df.drop(columns=["median_house_value"])

In [43]:
# Standardize every feature column (zero mean, unit variance).
# Fitting and transforming are done as two explicit steps on the same data.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

## Splitting our data into train and test splits 

In [44]:
# Split the raw features first so the held-out rows never influence the
# scaling statistics (fitting the scaler on all rows leaks test information).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Previously the split was taken from the unscaled X and the scaled copy was
# never used, so every model trained on unscaled features. Fit the scaler on
# the training rows only, then apply the same transform to both splits.
# Results are re-wrapped as DataFrames to keep column names and row indices.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)

# Linear regression 

In [45]:
# Tune and fit an ordinary least-squares baseline with 5-fold cross-validation.
model_lr = LinearRegression()

# Only `fit_intercept` is searched: the `normalize` option was deprecated in
# scikit-learn 1.0 and removed in 1.2, so listing it makes GridSearchCV fail
# with an invalid-parameter error on current versions. Feature scaling, when
# wanted, belongs in a separate preprocessing step such as StandardScaler.
param_grid_lr = {'fit_intercept': [True, False]}
grid_search_lr = GridSearchCV(model_lr, param_grid_lr, cv=5)
grid_search_lr.fit(X_train, y_train)

# Best configuration, refitted on the full training split by GridSearchCV.
best_model_lr = grid_search_lr.best_estimator_

In [46]:
# Predictions from the tuned linear model on both splits.
y_train_pred_lr = best_model_lr.predict(X=X_train)
y_test_pred_lr = best_model_lr.predict(X=X_test)

# Error metrics on the held-out test split.
mse_lr = mean_squared_error(y_true=y_test, y_pred=y_test_pred_lr)
rmse_lr = np.sqrt(mse_lr)
mae_lr = mean_absolute_error(y_true=y_test, y_pred=y_test_pred_lr)
r2_lr = r2_score(y_true=y_test, y_pred=y_test_pred_lr)

# Report each metric on its own line, label first.
print("Linear Regression Metrics:")
for metric_label, metric_value in (
    ("Mean Squared Error (MSE):", mse_lr),
    ("Root Mean Squared Error (RMSE):", rmse_lr),
    ("Mean Absolute Error (MAE):", mae_lr),
    ("R-squared (R2):", r2_lr),
):
    print(metric_label, metric_value)


Linear Regression Metrics:
Mean Squared Error (MSE): 5862414935.141154
Root Mean Squared Error (RMSE): 76566.40866033325
Mean Absolute Error (MAE): 56186.57299769971
R-squared (R2): 0.5526270780736795


# Support vector regressor 

> Note: this grid search takes a long time to run.

In [None]:
# Tune a support vector regressor with 5-fold cross-validation.
model_svm = SVR()

# One sub-grid per kernel: `gamma` is only used by the RBF kernel, so crossing
# it with the linear kernel just repeats identical (and slow) fits.
param_grid_svm = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]

# n_jobs=-1 evaluates candidates on all available cores; the selected model
# is unaffected, only the wall-clock time changes.
grid_search_svm = GridSearchCV(model_svm, param_grid_svm, cv=5, n_jobs=-1)
grid_search_svm.fit(X_train, y_train)

# Best configuration, refitted on the full training split by GridSearchCV.
best_model_svm = grid_search_svm.best_estimator_

In [None]:
# Predictions from the tuned SVR on both splits.
y_train_pred_svm = best_model_svm.predict(X=X_train)
y_test_pred_svm = best_model_svm.predict(X=X_test)

# Error metrics on the held-out test split.
mse_svm = mean_squared_error(y_true=y_test, y_pred=y_test_pred_svm)
rmse_svm = np.sqrt(mse_svm)
mae_svm = mean_absolute_error(y_true=y_test, y_pred=y_test_pred_svm)
r2_svm = r2_score(y_true=y_test, y_pred=y_test_pred_svm)

# Report each metric on its own line, label first.
print("SVM Metrics:")
for metric_label, metric_value in (
    ("Mean Squared Error (MSE):", mse_svm),
    ("Root Mean Squared Error (RMSE):", rmse_svm),
    ("Mean Absolute Error (MAE):", mae_svm),
    ("R-squared (R2):", r2_svm),
):
    print(metric_label, metric_value)

# Random Forest regressor 

In [None]:
# Base random forest for the grid search. The seed is pinned (same value as
# the train/test split) so bootstrap sampling and feature selection, and
# therefore the reported metrics, are reproducible between runs.
model_rf = RandomForestRegressor(random_state=42)

In [None]:
# Candidate hyper-parameters for the random forest grid search.
param_grid_rf = dict(
    n_estimators=[100, 200, 300],  # number of trees
    max_depth=[3, 5, 10],          # maximum depth of each tree
)

In [None]:
# 5-fold cross-validated search over the random forest grid.
grid_search_rf = GridSearchCV(estimator=model_rf, param_grid=param_grid_rf, cv=5)
grid_search_rf.fit(X_train, y_train)

# Keep the best configuration found by the search.
best_model_rf = grid_search_rf.best_estimator_

In [None]:
# Predict the target for the held-out test features with the tuned forest.
y_test_pred_rf = best_model_rf.predict(X=X_test)

In [None]:
# Error metrics for the random forest on the held-out test split.
mse_rf = mean_squared_error(y_true=y_test, y_pred=y_test_pred_rf)
mae_rf = mean_absolute_error(y_true=y_test, y_pred=y_test_pred_rf)
r2_rf = r2_score(y_true=y_test, y_pred=y_test_pred_rf)
# RMSE is derived from the MSE computed above.
rmse_rf = np.sqrt(mse_rf)


In [None]:
# Report each random forest metric on its own line, label first.
print("Random Forest Metrics:")
for metric_label, metric_value in (
    ("Mean Squared Error (MSE):", mse_rf),
    ("Root Mean Squared Error (RMSE):", rmse_rf),
    ("Mean Absolute Error (MAE):", mae_rf),
    ("R-squared (R2):", r2_rf),
):
    print(metric_label, metric_value)
# Trailing blank line after the report.
print()

# Gradient boosting regressor

In [None]:
# Base gradient boosting model for the grid search. The seed is pinned (same
# value as the train/test split) so the reported metrics are reproducible
# between runs.
model_gb = GradientBoostingRegressor(random_state=42)

In [None]:
# Candidate hyper-parameters for the gradient boosting grid search.
param_grid_gb = dict(
    n_estimators=[100, 200, 300],      # number of boosting stages
    learning_rate=[0.1, 0.01, 0.001],  # shrinkage applied to each stage
)

In [None]:
# 5-fold cross-validated search over the gradient boosting grid.
# `fit` returns the fitted search object, so construction and fitting are chained.
grid_search_gb = GridSearchCV(
    estimator=model_gb,
    param_grid=param_grid_gb,
    cv=5,
).fit(X_train, y_train)

# Keep the best configuration and predict on the held-out test features.
best_model_gb = grid_search_gb.best_estimator_
y_test_pred_gb = best_model_gb.predict(X=X_test)

In [None]:
# Error metrics for gradient boosting on the held-out test split.
mse_gb = mean_squared_error(y_true=y_test, y_pred=y_test_pred_gb)
mae_gb = mean_absolute_error(y_true=y_test, y_pred=y_test_pred_gb)
r2_gb = r2_score(y_true=y_test, y_pred=y_test_pred_gb)
# RMSE is derived from the MSE computed above.
rmse_gb = np.sqrt(mse_gb)

In [None]:
# Report each gradient boosting metric on its own line, label first.
print("Gradient Boosting Metrics:")
for metric_label, metric_value in (
    ("Mean Squared Error (MSE):", mse_gb),
    ("Root Mean Squared Error (RMSE):", rmse_gb),
    ("Mean Absolute Error (MAE):", mae_gb),
    ("R-squared (R2):", r2_gb),
):
    print(metric_label, metric_value)