<a href="https://colab.research.google.com/github/Chigo56/okorieassignment.github.io/blob/main/Untitled9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q scikit-learn pandas numpy matplotlib seaborn gradio joblib shap


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import gradio as gr
import shap

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:
# --- Data: California Housing, features and target in a single frame ---
TARGET = "MedHouseVal"
data = fetch_california_housing(as_frame=True)
df = data.frame.copy()  # work on a copy of the frame held by the returned object

# Separate the target column from the feature columns.
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Preview the first rows.
print(df.head())


   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Every column of the training frame is treated as a numeric feature.
numeric_features = list(X_train.columns)

# Numeric branch: median imputation followed by standardisation.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Route the numeric columns through the numeric branch.
preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, numeric_features)]
)


In [None]:
# Ridge regression stacked on top of the shared preprocessing.
ridge_steps = [
    ("preprocessor", preprocessor),
    ("regressor", Ridge()),
]
pipe = Pipeline(steps=ridge_steps)

# Candidate regularisation strengths for the "regressor" step.
param_grid = {"regressor__alpha": [0.01, 0.1, 1.0, 10.0, 50.0, 100.0]}

# 5-fold cross-validated search scored by (negated) RMSE, using all available cores.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Keep the pipeline refitted with the winning alpha and report that alpha.
best_model = grid.best_estimator_
print("Best params:", grid.best_params_)


Best params: {'regressor__alpha': 0.01}


In [None]:
# NOTE(review): scikit-learn is already installed by the first cell and has been
# imported (and used to fit the model) before this point. An upgrade here presumably
# only takes effect after a runtime restart — confirm whether this cell is still needed.
!pip install --upgrade scikit-learn




In [None]:
# Score the tuned pipeline on the held-out split.
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE derived by hand from the MSE
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

report_lines = (
    f"Test MAE: {mae:.4f} (100k units)",
    f"Test RMSE: {rmse:.4f} (100k units)",
    f"Test R2: {r2:.4f}",
)
print("\n".join(report_lines))


Test MAE: 0.5332 (100k units)
Test RMSE: 0.7456 (100k units)
Test R2: 0.5758


In [None]:
# The Ridge step only ever sees the output of the "preprocessor" step, so the SHAP
# background data must be expressed in that same transformed space.
# (X_train_transformed was previously referenced without being defined anywhere,
# which raised NameError on a fresh kernel.)
X_train_transformed = best_model.named_steps["preprocessor"].transform(X_train)

explainer = shap.Explainer(best_model.named_steps["regressor"], X_train_transformed)


In [None]:
# Per-feature min / max / median over the full feature frame, stored as plain floats.
feature_stats = {
    name: {
        "min": float(X[name].min()),
        "max": float(X[name].max()),
        "median": float(X[name].median()),
    }
    for name in numeric_features
}


In [None]:
def predict_and_explain(
    MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup, Latitude, Longitude
):
    """Predict the median house value for one set of feature values and explain it.

    The eight arguments are the raw (untransformed) feature values, in the same
    order as the columns the model was trained on. The function relies on the
    notebook-level objects ``best_model``, ``explainer``, ``numeric_features``,
    ``rmse``, ``mae`` and ``r2`` created in earlier cells.

    Returns:
        tuple: ``(combined_text, fig)`` where ``combined_text`` is a three-line
        summary (prediction, SHAP baseline, test metrics) and ``fig`` is a
        matplotlib ``Figure`` with a horizontal bar chart of the SHAP
        contributions, largest absolute contribution first.
    """
    input_dict = {
        "MedInc": MedInc,
        "HouseAge": HouseAge,
        "AveRooms": AveRooms,
        "AveBedrms": AveBedrms,
        "Population": Population,
        "AveOccup": AveOccup,
        "Latitude": Latitude,
        "Longitude": Longitude,
    }
    df_input = pd.DataFrame([input_dict])

    # Predict and clip at zero: a linear model can extrapolate below zero,
    # which makes no sense for a price.
    pred = max(float(best_model.predict(df_input)[0]), 0.0)
    pred_dollars = pred * 100000

    pred_text = f"Predicted median house value: ${pred_dollars:,.0f} (dataset units: {pred:.3f})"

    # SHAP values are computed in the transformed space the regressor operates on.
    x_trans = best_model.named_steps["preprocessor"].transform(df_input)
    shap_row = np.asarray(explainer.shap_values(x_trans), dtype=float)
    if shap_row.ndim == 2:
        shap_row = shap_row[0]  # single input row -> 1-D vector of contributions
    # Coerce to a plain float so the format spec below also works if the
    # explainer exposes the baseline as a 0-d / length-1 array.
    expected_value = float(np.ravel(explainer.expected_value)[0])

    # Sort by absolute contribution, largest first.
    order = np.argsort(np.abs(shap_row))[::-1]
    sorted_feats = [numeric_features[i] for i in order]
    sorted_vals = shap_row[order]

    # Bar plot. The Figure is created directly instead of via plt.subplots() so it
    # is not registered in pyplot's global figure list; otherwise every request
    # would leave one more open figure behind.
    fig = plt.Figure(figsize=(7, 4))
    ax = fig.subplots()
    y_pos = np.arange(len(sorted_feats))
    ax.barh(y_pos, sorted_vals, align='center')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(sorted_feats)
    ax.invert_yaxis()
    ax.set_xlabel("SHAP contribution (in target units, 100k)")
    ax.set_title("Feature contributions to this prediction (SHAP)")
    fig.tight_layout()

    baseline_text = f"Model baseline (expected): {expected_value:.3f} (×100k units)"
    perf_text = f"Test RMSE: {rmse:.4f} ×100k, MAE: {mae:.4f} ×100k, R²: {r2:.3f}"

    combined_text = f"{pred_text}\n{baseline_text}\n{perf_text}"

    return combined_text, fig


In [None]:
# (feature name, slider step, human-readable title) for every slider-driven feature.
# The order must match the positional parameters of predict_and_explain.
SLIDER_SPECS = [
    ("MedInc", 0.01, "Median income"),
    ("HouseAge", 1, "House age"),
    ("AveRooms", 0.01, "Average rooms"),
    ("AveBedrms", 0.01, "Average bedrooms"),
    ("Population", 1, "Population"),
    ("AveOccup", 0.01, "Average occupants"),
]


def _feature_slider(name, step, title):
    """Build a slider spanning the feature's observed min..max, defaulting to its median."""
    stats = feature_stats[name]
    return gr.Slider(
        stats["min"],
        stats["max"],
        value=stats["median"],
        step=step,
        label=f"{title} — {name}",
    )


def _coordinate_box(name):
    """Build a free-form numeric input for a coordinate, defaulting to its median."""
    return gr.Number(value=feature_stats[name]["median"], label=name, precision=5)


inputs = [_feature_slider(*spec) for spec in SLIDER_SPECS] + [
    _coordinate_box("Latitude"),
    _coordinate_box("Longitude"),
]

output_text = gr.Textbox(label="Prediction & model info")
output_plot = gr.Plot(label="SHAP contributions")


In [22]:
# Wire the prediction function to the input widgets and the two outputs.
demo = gr.Interface(
    fn=predict_and_explain,
    inputs=inputs,
    outputs=[output_text, output_plot],
    title="🏠 California Housing Price Predictor",
    description="Sliders with defaults. Prediction is always ≥ 0. SHAP chart shows feature contributions.",
)

# share=True requests a temporary public URL in addition to the local one.
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2723903d5362e6809e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


