In [2]:
#1

In [10]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [12]:
# Load the dataset
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['Target'] = data.target
print(df)

       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
...       ...       ...       ...        ...         ...       ...       ...   
20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   

       Longitude  Target  
0        -12

In [13]:
# Checking for missing values
print("Missing values:")
print(df.isnull().sum())

Missing values:
MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Target        0
dtype: int64


In [14]:
# Feature Scaling
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)


In [15]:
print(df_scaled.head())

     MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  2.344766  0.982143  0.628559  -0.153758   -0.974429 -0.049597  1.052548   
1  2.332238 -0.607019  0.327041  -0.263336    0.861439 -0.092512  1.043185   
2  1.782699  1.856182  1.155620  -0.049016   -0.820777 -0.025843  1.038503   
3  0.932968  1.856182  0.156966  -0.049833   -0.766028 -0.050329  1.038503   
4 -0.012881  1.856182  0.344711  -0.032906   -0.759847 -0.085616  1.038503   

   Longitude    Target  
0  -1.327835  2.129631  
1  -1.322844  1.314156  
2  -1.332827  1.258693  
3  -1.337818  1.165100  
4  -1.337818  1.172900  


In [17]:
# Splitting data into training and testing sets
X = df_scaled.drop(columns=['Target'])
y = df_scaled['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# Define models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Gradient Boosting Regressor": GradientBoostingRegressor(),
    "Support Vector Regressor": SVR()
}
print(models)

{'Linear Regression': LinearRegression(), 'Decision Tree Regressor': DecisionTreeRegressor(), 'Random Forest Regressor': RandomForestRegressor(), 'Gradient Boosting Regressor': GradientBoostingRegressor(), 'Support Vector Regressor': SVR()}


In [26]:
# Training and evaluating models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'MSE': mse, 'MAE': mae, 'R2 Score': r2}
    print(results)

{'Linear Regression': {'MSE': 0.41747698052494986, 'MAE': 0.46207392160821764, 'R2 Score': 0.575787706032451}}
{'Linear Regression': {'MSE': 0.41747698052494986, 'MAE': 0.46207392160821764, 'R2 Score': 0.575787706032451}, 'Decision Tree Regressor': {'MSE': 0.37587369355211436, 'MAE': 0.3952482583321125, 'R2 Score': 0.6180621945111806}}
{'Linear Regression': {'MSE': 0.41747698052494986, 'MAE': 0.46207392160821764, 'R2 Score': 0.575787706032451}, 'Decision Tree Regressor': {'MSE': 0.37587369355211436, 'MAE': 0.3952482583321125, 'R2 Score': 0.6180621945111806}, 'Random Forest Regressor': {'MSE': 0.19153462897515125, 'MAE': 0.2831715360628276, 'R2 Score': 0.8053752706805437}}
{'Linear Regression': {'MSE': 0.41747698052494986, 'MAE': 0.46207392160821764, 'R2 Score': 0.575787706032451}, 'Decision Tree Regressor': {'MSE': 0.37587369355211436, 'MAE': 0.3952482583321125, 'R2 Score': 0.6180621945111806}, 'Random Forest Regressor': {'MSE': 0.19153462897515125, 'MAE': 0.2831715360628276, 'R2 Score

In [29]:
# Convert results into a DataFrame
results_df = pd.DataFrame(results).T
print(results_df)

                                  MSE       MAE  R2 Score
Linear Regression            0.417477  0.462074  0.575788
Decision Tree Regressor      0.375874  0.395248  0.618062
Random Forest Regressor      0.191535  0.283172  0.805375
Gradient Boosting Regressor  0.220795  0.322074  0.775643
Support Vector Regressor     0.264398  0.343314  0.731336


In [30]:
# Identifying best and worst performing models
best_model = results_df['R2 Score'].idxmax()
worst_model = results_df['R2 Score'].idxmin()

In [28]:
print(f"Best performing model: {best_model}")
print(f"Worst performing model: {worst_model}")

Best performing model: Random Forest Regressor
Worst performing model: Linear Regression
