In [None]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn imports
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Notebook-wide configuration: seed NumPy's global RNG so random draws are
# repeatable, then apply the seaborn-flavoured Matplotlib style sheet.
np.random.seed(42)
plt.style.use('seaborn-v0_8')

In [None]:
# Fetch the California Housing data and wrap features / target in pandas objects
california = fetch_california_housing()
X = pd.DataFrame(data=california.data, columns=california.feature_names)
y = pd.Series(data=california.target, name='MedHouseVal')

# Features and target side by side, used for exploratory analysis
df = X.join(y)

# Size of the table and a peek at its leading rows
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
display(df.head(5))

# Null count for every column
print("\nMissing values per column:")
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Descriptive statistics for every column
print("Summary Statistics:")
display(df.describe())

# Histogram with a KDE overlay for each column, laid out on a 3x3 grid.
# Each Axes is passed to seaborn explicitly instead of relying on the
# pyplot "current axes".
plt.figure(figsize=(15, 10))
for position, col in enumerate(df.columns, start=1):
    ax = plt.subplot(3, 3, position)
    sns.histplot(df[col], kde=True, bins=30, color='steelblue', ax=ax)
    ax.set_title(f'{col}', fontsize=12)
plt.tight_layout()
plt.show()

# Pairwise correlations between all columns, drawn as an annotated heatmap
correlation_matrix = df.corr()
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
)
heatmap_ax.set_title('Feature Correlation Matrix', fontsize=14)
plt.show()

In [None]:
# Split the data (80% train, 20% test); the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features (used by the scale-sensitive models, e.g. SVR).
# The scaler is fitted on the training rows only and then applied to the
# test rows, so no test-set statistics enter the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrames. Besides the column names, carry over the row
# labels of the split: y_train / y_test keep the original (shuffled) labels,
# and a fresh 0..n-1 index on the scaled frames would silently mis-pair rows
# in any label-aligned pandas operation between features and target.
X_train_scaled = pd.DataFrame(
    X_train_scaled, columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    X_test_scaled, columns=X_test.columns, index=X_test.index
)

print("Preprocessing complete. Data is split and scaled.")

In [None]:
# Candidate regressors, keyed by the label used in every results table
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'SVR': SVR()
}

# Test-set predictions of each fitted model, keyed like `models`
base_predictions = {}

# Fit each model and predict on the held-out rows. Linear Regression and SVR
# are given the standardized features; the tree-based models are given the
# features on their original scale.
for name, model in models.items():
    wants_scaled = name in ['Linear Regression', 'SVR']
    fit_features, eval_features = (
        (X_train_scaled, X_test_scaled) if wants_scaled else (X_train, X_test)
    )
    base_predictions[name] = model.fit(fit_features, y_train).predict(eval_features)

print("All base models trained and predictions generated.")

In [None]:
# Score every base model's test-set predictions with MSE, MAE and R²
results = {}
for name, pred in base_predictions.items():
    results[name] = {
        'MSE': mean_squared_error(y_test, pred),
        'MAE': mean_absolute_error(y_test, pred),
        'R²': r2_score(y_test, pred),
    }

# One row per model, one column per metric, best R² first
results_df = pd.DataFrame.from_dict(results, orient='index').round(4)
results_df = results_df.sort_values('R²', ascending=False)
print("Base Model Performance:")
display(results_df)

# The frame is sorted by descending R², so the first row is the best model
# and the last row is the worst.
best_base = results_df.index[0]
worst_base = results_df.index[-1]
print(f"\nBest base model: {best_base} (R² = {results_df.loc[best_base, 'R²']:.4f})")
print(f"Worst base model: {worst_base} (R² = {results_df.loc[worst_base, 'R²']:.4f})")

In [None]:
# Perform 5-fold cross-validation on the training set and keep the mean R²
# per model.
cv_results = {}
for name, model in models.items():
    if name in ['Linear Regression', 'SVR']:
        # Scale-sensitive models: put the scaler inside a pipeline so it is
        # refitted on the training part of every fold. Cross-validating on
        # features that were standardized with statistics from the whole
        # training set would let each validation fold influence its own
        # preprocessing.
        estimator = make_pipeline(StandardScaler(), model)
    else:
        # Tree-based models: use the features on their original scale
        estimator = model
    scores = cross_val_score(estimator, X_train, y_train, cv=5, scoring='r2')
    cv_results[name] = scores.mean()

# Display CV scores, best model first
cv_df = pd.DataFrame(list(cv_results.items()), columns=['Model', 'CV R²']).sort_values('CV R²', ascending=False)
print("5-Fold Cross-Validation R² Scores:")
display(cv_df.round(4))

In [None]:
# Hyperparameter grids, keyed by model label. Only the models listed here are
# tuned; Linear Regression has no grid and is added separately below.
param_grids = {
    'Decision Tree': {
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10]
    },
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None]
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5]
    },
    'SVR': {
        'C': [1, 10],
        'gamma': ['scale', 'auto'],
        'epsilon': [0.1, 0.2]
    }
}

# Best estimator found for each model, keyed by model label
tuned_models = {}
print("Starting hyperparameter tuning...\n")

# Iterate over the grids themselves so the set of tuned models cannot drift
# out of sync with `param_grids`.
for name, param_grid in param_grids.items():
    print(f"Tuning {name}...")
    grid = GridSearchCV(
        models[name],
        param_grid,
        cv=3,
        scoring='r2',
        n_jobs=-1,
        verbose=0
    )
    # SVR is searched on the standardized features, the tree-based models on
    # the features in their original scale.
    # NOTE(review): X_train_scaled was standardized before this search, so the
    # 3 CV folds share scaling statistics; SVR's CV score may be slightly
    # optimistic. Searching over a scaler+SVR pipeline would avoid that, but
    # would change what tuned_models['SVR'] expects as input.
    search_features = X_train_scaled if name == 'SVR' else X_train
    grid.fit(search_features, y_train)
    tuned_models[name] = grid.best_estimator_
    print(f"  Best CV R²: {grid.best_score_:.4f}")
    print(f"  Best params: {grid.best_params_}\n")

# Add Linear Regression (no tuning needed), fitted on the standardized features
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
tuned_models['Linear Regression'] = lr

In [None]:
# Generate test-set predictions with the tuned models. Linear Regression and
# SVR receive the standardized test features; the tree-based models receive
# the features on their original scale.
tuned_predictions = {}
for name, model in tuned_models.items():
    if name in ['Linear Regression', 'SVR']:
        pred = model.predict(X_test_scaled)
    else:
        pred = model.predict(X_test)
    tuned_predictions[name] = pred

# Evaluate tuned models with MSE, MAE and R² on the held-out test set
tuned_results = {}
for name, pred in tuned_predictions.items():
    tuned_results[name] = {
        'MSE': mean_squared_error(y_test, pred),
        'MAE': mean_absolute_error(y_test, pred),
        'R²': r2_score(y_test, pred),
    }

# One row per model, one column per metric, best R² first
final_df = pd.DataFrame.from_dict(tuned_results, orient='index').round(4)
final_df = final_df.sort_values('R²', ascending=False)
print("Final Model Performance After Tuning:")
display(final_df)

# The frame is sorted by descending R², so the first row is the best model
best_model_name = final_df.index[0]
print(f"\n🏆 Best Model: {best_model_name}")
print(f"   R²: {final_df.loc[best_model_name, 'R²']:.4f}")
print(f"   MSE: {final_df.loc[best_model_name, 'MSE']:.4f}")
print(f"   MAE: {final_df.loc[best_model_name, 'MAE']:.4f}")