In [28]:
# minimal_california_compare.py
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [29]:
# 1) load dataset
data = fetch_california_housing(as_frame=True)
X = data.data
y = data.target

In [30]:
# 2) split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [31]:
# 3) define models
models = {
    "LinearRegression": make_pipeline(StandardScaler(), LinearRegression()),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=100, random_state=42)
}

In [32]:
# 4) train, predict, evaluate
results = {}
for name, mdl in models.items():
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))  # root MSE
    r2 = r2_score(y_test, preds)
    results[name] = {"rmse": rmse, "r2": r2}
    print(f"{name:15s} RMSE = {rmse:.4f}    R2 = {r2:.4f}")

LinearRegression RMSE = 0.7456    R2 = 0.5758
RandomForest    RMSE = 0.5053    R2 = 0.8051
GradientBoosting RMSE = 0.5422    R2 = 0.7756


In [34]:
# 5) report best by RMSE
best = min(results.items(), key=lambda kv: kv[1]["rmse"])
print("\nBest model (by RMSE):", best[0], "-> RMSE =", f"{best[1]['rmse']:.4f}", " R2 =", f"{best[1]['r2']:.4f}")


Best model (by RMSE): RandomForest -> RMSE = 0.5053  R2 = 0.8051


In [35]:
from sklearn.metrics import mean_squared_error, mean_absolute_error # Import mean_absolute_error

# 4) train, predict, evaluate
results = {}
for name, mdl in models.items():
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds)) # Calculate RMSE by taking square root
    mae = mean_absolute_error(y_test, preds)
    results[name] = {"rmse": rmse, "mae": mae}
    print(f"{name:15s} RMSE = {rmse:.4f}    MAE = {mae:.4f}")

# 5) best model by RMSE
best = min(results.items(), key=lambda kv: kv[1]["rmse"])
print("\nBest model (by RMSE):", best[0], "-> RMSE =", f"{best[1]['rmse']:.4f}", " MAE =", f"{best[1]['mae']:.4f}")

LinearRegression RMSE = 0.7456    MAE = 0.5332
RandomForest    RMSE = 0.5053    MAE = 0.3275
GradientBoosting RMSE = 0.5422    MAE = 0.3716

Best model (by RMSE): RandomForest -> RMSE = 0.5053  MAE = 0.3275


In [36]:
# helper: compute AIC & BIC
def compute_aic_bic(y_true, y_pred, k):
    n = len(y_true)
    rss = np.sum((y_true - y_pred)**2)

    aic = n * np.log(rss / n) + 2 * k
    bic = n * np.log(rss / n) + k * np.log(n)
    return aic, bic

# 4) train, predict, evaluate
results = {}
for name, mdl in models.items():
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae  = mean_absolute_error(y_test, preds)

    # number of parameters (approx.): number of input features
    k = X_train.shape[1]

    aic, bic = compute_aic_bic(y_test, preds, k)

    results[name] = {"rmse": rmse, "mae": mae, "aic": aic, "bic": bic}

    print(f"\n{name}")
    print(f"RMSE = {rmse:.4f}")
    print(f"MAE  = {mae:.4f}")
    print(f"AIC  = {aic:.2f}")
    print(f"BIC  = {bic:.2f}")

# 5) best model by AIC (or choose RMSE)
best_aic = min(results.items(), key=lambda kv: kv[1]["aic"])
print("\nBest model by AIC:", best_aic[0])

best_rmse = min(results.items(), key=lambda kv: kv[1]["rmse"])
print("Best model by RMSE:", best_rmse[0])


LinearRegression
RMSE = 0.7456
MAE  = 0.5332
AIC  = -2407.89
BIC  = -2357.28

RandomForest
RMSE = 0.5053
MAE  = 0.3275
AIC  = -5618.92
BIC  = -5568.31

GradientBoosting
RMSE = 0.5422
MAE  = 0.3716
AIC  = -5037.43
BIC  = -4986.83

Best model by AIC: RandomForest
Best model by RMSE: RandomForest


In [39]:
import pickle
import os
import pandas as pd

# Load the models
loaded_models = {}
model_dir = 'trained_models'
for filename in os.listdir(model_dir):
    if filename.endswith('.pkl'):
        model_name = filename.replace('_model.pkl', '')
        filepath = os.path.join(model_dir, filename)
        with open(filepath, 'rb') as file:
            loaded_models[model_name] = pickle.load(file)
print("Models loaded successfully:", list(loaded_models.keys()))

Models loaded successfully: ['LinearRegression', 'GradientBoosting', 'RandomForest']


Now, let's prepare a sample input for prediction. We'll use the first row of your `X_test` dataset as an example. Remember that the input features must be in the same format (column order and scaling) as the data used for training.

In [40]:
# Take the first sample from X_test for prediction
sample_input = X_test.iloc[[0]]
print("Sample input features:")
display(sample_input)

Sample input features:


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
20046,1.6812,25.0,4.192201,1.022284,1392.0,3.877437,36.06,-119.01


Now, let's make predictions using each of the loaded models.

In [41]:
predictions = {}
for name, model in loaded_models.items():
    # Some models like LinearRegression (with StandardScaler in pipeline) expect 2D array-like input
    # while others might work with a Series directly if it matches feature names.
    # Using .iloc[[0]] ensures it's always a DataFrame for consistent input.
    prediction = model.predict(sample_input)
    predictions[name] = prediction[0] # Assuming single prediction, take the first element

print("Predictions for the sample input:")
for name, pred_value in predictions.items():
    print(f"{name:15s} predicted price: {pred_value:.4f}")

# You can also get the actual value for comparison
actual_value = y_test.iloc[0]
print(f"\nActual price (from y_test): {actual_value:.4f}")

Predictions for the sample input:
LinearRegression predicted price: 0.7191
GradientBoosting predicted price: 0.5052
RandomForest    predicted price: 0.5095

Actual price (from y_test): 0.4770


In [37]:
import pickle
import os

# Create a directory to store the models if it doesn't exist
model_dir = 'trained_models'
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# Save each model to a pickle file
for name, mdl in models.items():
    filename = os.path.join(model_dir, f'{name}_model.pkl')
    with open(filename, 'wb') as file:
        pickle.dump(mdl, file)
    print(f'Model "{name}" saved to {filename}')

Model "LinearRegression" saved to trained_models/LinearRegression_model.pkl
Model "RandomForest" saved to trained_models/RandomForest_model.pkl
Model "GradientBoosting" saved to trained_models/GradientBoosting_model.pkl


You can verify the created files by listing the contents of the `trained_models` directory:

In [38]:
import os
print(os.listdir('trained_models'))

['LinearRegression_model.pkl', 'GradientBoosting_model.pkl', 'RandomForest_model.pkl']


In [59]:
# app.py
import streamlit as st
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

st.set_page_config(page_title="California Housing — Model compare", layout="wide")

st.title("California Housing — Compare 3 Regression Models")
st.markdown(
    "Loads `fetch_california_housing` (no Kaggle auth required). Train and compare "
    "Linear Regression, Random Forest, and Gradient Boosting. Metrics: RMSE, MAE, AIC, BIC."
)

# Sidebar controls
st.sidebar.header("Settings")
test_size = st.sidebar.slider("Test set fraction", 0.1, 0.4, 0.2, step=0.05)
random_state = st.sidebar.number_input("Random seed", value=42, step=1)
n_estimators = st.sidebar.slider("n_estimators (for tree models)", 10, 500, 100, step=10)

st.sidebar.markdown("Choose which models to train:")
train_lr = st.sidebar.checkbox("Linear Regression", value=True)
train_rf = st.sidebar.checkbox("Random Forest", value=True)
train_gb = st.sidebar.checkbox("Gradient Boosting", value=True)

if not any([train_lr, train_rf, train_gb]):
    st.sidebar.error("Select at least one model to train.")

# Load data
@st.cache_data
def load_data():
    data = fetch_california_housing(as_frame=True)
    X = data.data
    y = data.target
    return X, y, data.target_names if hasattr(data, "target_names") else None

X, y, _ = load_data()
st.sidebar.write(f"Dataset samples: {X.shape[0]}   Features: {X.shape[1]}")

# Show sample of dataset
if st.checkbox("Show data sample"):
    st.dataframe(pd.concat([X, y.rename("target")], axis=1).sample(10, random_state=0))

# Train / evaluate
def compute_aic_bic(y_true, y_pred, k):
    """
    RSS-based AIC/BIC approximation (common quick approach).
    For Gaussian errors: AIC ≈ n*ln(RSS/n) + 2k ; BIC ≈ n*ln(RSS/n) + k*ln(n)
    Note: For LinearRegression k = p + 1 (p = features). For tree-based models this k is an approximation.
    """
    n = len(y_true)
    rss = np.sum((y_true - y_pred) ** 2)
    # protect against zero rss (numerical)
    rss = max(rss, 1e-12)
    aic = n * np.log(rss / n) + 2 * k
    bic = n * np.log(rss / n) + k * np.log(n)
    return float(aic), float(bic)

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=float(test_size), random_state=int(random_state)
)

models_to_run = {}
if train_lr:
    # pipeline to standardize features for linear model
    models_to_run["LinearRegression"] = make_pipeline(StandardScaler(), LinearRegression())
if train_rf:
    models_to_run["RandomForest"] = RandomForestRegressor(
        n_estimators=int(n_estimators), random_state=int(random_state), n_jobs=-1
    )
if train_gb:
    models_to_run["GradientBoosting"] = GradientBoostingRegressor(
        n_estimators=int(n_estimators), random_state=int(random_state)
    )

results = []
progress = st.progress(0)
total = len(models_to_run)
i = 0

for name, mdl in models_to_run.items():
    i += 1
    st.write(f"### Training: {name}")
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)

    # parameter count k:
    # - For LinearRegression we use p + 1 (features + intercept)
    # - For tree models we use p (number of features) as a simple approximation
    p = X_train.shape[1]
    if name == "LinearRegression":
        k = p + 1
    else:
        k = p  # approximate for tree models; see notes in UI

    aic, bic = compute_aic_bic(y_test.values if hasattr(y_test, "values") else y_test, preds, k)

    results.append({"model": name, "rmse": rmse, "mae": mae, "aic": aic, "bic": bic})
    st.write(f"RMSE: {rmse:.4f}    MAE: {mae:.4f}")
    st.write(f"AIC: {aic:.2f}    BIC: {bic:.2f}")
    progress.progress(int(i / total * 100))

# Results table
if results:
    df_res = pd.DataFrame(results).set_index("model")
    st.markdown("## Summary metrics (lower is better)")
    st.dataframe(df_res.style.format({"rmse": "{:.4f}", "mae": "{:.4f}", "aic": "{:.2f}", "bic": "{:.2f}"}))

    # Best models by metrics
    best_rmse = df_res["rmse"].idxmin()
    best_mae = df_res["mae"].idxmin()
    best_aic = df_res["aic"].idxmin()
    best_bic = df_res["bic"].idxmin()

    st.markdown("### Best models")
    st.write(f"- Best (RMSE): **{best_rmse}**")
    st.write(f"- Best (MAE) : **{best_mae}**")
    st.write(f"- Best (AIC) : **{best_aic}**")
    st.write(f"- Best (BIC) : **{best_bic}**")

    # plot metric comparisons
    st.markdown("### Metric comparison")
    st.line_chart(df_res[["rmse", "mae"]])

    st.markdown("### AIC vs BIC (bar)")
    st.bar_chart(df_res[["aic", "bic"]])

    # Option to show predictions for chosen model
    st.markdown("### Inspect predictions")
    chosen = st.selectbox("Choose model to inspect predictions", df_res.index.tolist())
    # find model instance to predict again
    chosen_model = models_to_run[chosen]
    preds = chosen_model.predict(X_test)
    sample_df = pd.DataFrame({"y_true": y_test, "y_pred": preds})
    st.dataframe(sample_df.reset_index(drop=True).head(20))
    st.download_button(
        "Download predictions (csv)", sample_df.to_csv(index=False).encode("utf-8"), file_name="predictions.csv"
    )

# Notes
st.markdown(
    """
**Notes on AIC/BIC used above**
- For Linear Regression the AIC/BIC formula and parameter count (p + 1) are appropriate.
- For tree-based models (RandomForest / GradientBoosting) there is no simple likelihood-based AIC/BIC. The implementation above uses an RSS-based approximation and a simple parameter-count heuristic (k = p). Use these values for rough model-comparison only.
"""
)

2025-12-08 04:45:33.128 No runtime found, using MemoryCacheStorageManager


DeltaGenerator()

In [65]:
!streamlit run app.py --server.headless true

Usage: streamlit run [OPTIONS] [TARGET] [ARGS]...
Try 'streamlit run --help' for help.

Error: Invalid value: File does not exist: app.py


In [61]:
!pip install streamlit



In [64]:
streamlit run app.py

SyntaxError: invalid syntax (ipython-input-3737097518.py, line 1)