In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [4]:
california = fetch_california_housing()
df = pd.DataFrame(california.data, columns=california.feature_names)
df['Target'] = california.target  

# Display first few rows
df.head()


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [6]:
# Check for missing values
print(df.isnull().sum())


MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Target        0
dtype: int64


In [14]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.drop(columns=['Target']))  
y = df['Target']

# Splitting data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Print shapes of datasets
print("\nShapes of datasets:")
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)


Shapes of datasets:
X_train: (16512, 8)
X_test: (4128, 8)
y_train: (16512,)
y_test: (4128,)


## Preprocessing Steps & Justification

### Loading the Dataset  
- The dataset is loaded using `fetch_california_housing()` from `sklearn.datasets`.  
- It is converted into a Pandas DataFrame for easier analysis and manipulation.  

### Checking for Missing Values  
- Missing values can cause issues in training models.  
- We checked for missing values (`df.isnull().sum()`) and found none in this dataset.

### Feature Scaling (Standardization)  
- The dataset has features with different scales (e.g., `MedInc` vs. `HouseAge`).  
- Standardization (`StandardScaler()`) is applied to ensure all features have a mean of **0** and a standard deviation of **1**.
- This is necessary for models like **Linear Regression, SVR, and Gradient Boosting**, which are sensitive to feature scales.

### Splitting Data into Training & Testing Sets  
- **80% training, 20% testing** helps evaluate model performance.  
- `random_state=42` ensures that the split remains the same every time for consistency.



## Regression Algorithm Implementation

In [27]:
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "Support Vector Regressor": SVR()
}

# Train & evaluate models
results = {}

for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on test set
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Store results
    results[name] = {"MSE": mse, "MAE": mae, "R²": r2}

# Convert results to DataFrame
results_df = pd.DataFrame(results).T
results_df


Unnamed: 0,MSE,MAE,R²
Linear Regression,0.555892,0.5332,0.575788
Decision Tree,0.499243,0.45422,0.619017
Random Forest,0.255498,0.327613,0.805024
Gradient Boosting,0.293999,0.37165,0.775643
Support Vector Regressor,0.355198,0.397763,0.728941


## Implementing Regression Models
We implemented **5 regression models**:
️1. **Linear Regression**: Assumes a linear relationship between features and the target. 
️2. **Decision Tree Regressor**: Splits data into regions and makes predictions based on leaf nodes.  
️3. **Random Forest Regressor**: Uses multiple decision trees for better accuracy and reduced overfitting.  
4..**Gradient Boosting Regressor**: Sequentially improves weak models to create a strong model.  
️5. **Support Vector Regressor (SVR)**: Uses support vectors to minimize prediction errors.

###  Why are these models suitable?
- **Linear Regression** is simple and interpretable.
- **Tree-based models** handle non-linearity well.
- **SVR** is useful for datasets with complex relationships.


In [31]:
results_df.sort_values(by="R²", ascending=False)



Unnamed: 0,MSE,MAE,R²
Random Forest,0.255498,0.327613,0.805024
Gradient Boosting,0.293999,0.37165,0.775643
Support Vector Regressor,0.355198,0.397763,0.728941
Decision Tree,0.499243,0.45422,0.619017
Linear Regression,0.555892,0.5332,0.575788


In [33]:
results_df.sort_values(by="R²", ascending=False)

Unnamed: 0,MSE,MAE,R²
Random Forest,0.255498,0.327613,0.805024
Gradient Boosting,0.293999,0.37165,0.775643
Support Vector Regressor,0.355198,0.397763,0.728941
Decision Tree,0.499243,0.45422,0.619017
Linear Regression,0.555892,0.5332,0.575788


##  Model Evaluation & Comparison
###  Performance Metrics:
- **Mean Squared Error (MSE)**: Measures average squared differences between actual and predicted values. **Lower is better.**
- **Mean Absolute Error (MAE)**: Measures the average magnitude of errors. **Lower is better.**
- **R² Score**: Measures how well the model explains variance in data. **Higher is better.**

### Best Performing Model:
- The model with the **highest R² score** is the best at predicting house prices.

### Worst Performing Model:
- The model with the **highest MSE and lowest R² score** performs the worst.
