In [14]:
# Quietly (-q) install the notebook's dependencies with the %pip magic.
# NOTE(review): fastapi and uvicorn are not imported by any cell visible here;
# presumably they are for serving the saved model elsewhere - confirm they are needed.
%pip install -q ipykernel scikit-learn pandas numpy joblib matplotlib fastapi uvicorn


Note: you may need to restart the kernel to use updated packages.


In [15]:
# Dependencies are installed once, by the %pip cell at the top of the notebook;
# this cell only imports them and sets up the output directory.
import pandas as pd, numpy as np, joblib, matplotlib.pyplot as plt
from pathlib import Path
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# Directory that holds saved model artifacts; create it up front (no error if it
# already exists) so later cells can write into it.
MODELS_DIR = Path("models")
MODELS_DIR.mkdir(parents=True, exist_ok=True)


Note: you may need to restart the kernel to use updated packages.


In [16]:
# Load the California housing data as pandas objects:
# X is the feature DataFrame, y the target Series.
X, y = fetch_california_housing(return_X_y=True, as_frame=True)

# Quick inspection of the features: first rows, then summary statistics.
display(X.head(5), X.describe())

# Column dtypes and non-null counts (printed to stdout).
X.info()

# Last expression of the cell -> rendered as its output.
y.head(5)


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
Name: MedHouseVal, dtype: float64

In [17]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: standardise every feature column. All columns of X are listed,
# so remainder="drop" only matters if extra columns show up later.
numeric = list(X.columns)
pre = ColumnTransformer(
    transformers=[("num", StandardScaler(), numeric)],
    remainder="drop",
)

In [18]:
# LINEAR REGRESSION BASELINE
# Scale the features, then fit ordinary least squares on the training split.
lin = Pipeline(steps=[("pre", pre), ("lin", LinearRegression())]).fit(X_train, y_train)

# Evaluate on the held-out test split.
lin_preds = lin.predict(X_test)
lin_rmse = ((y_test - lin_preds) ** 2).mean() ** 0.5  # RMSE: root of the mean squared residual
lin_rmse


np.float64(0.7455813830127764)

In [19]:
# RANDOM FOREST + GRID SEARCH
# Same preprocessing as the baseline, followed by a seeded random forest.
rf = Pipeline(
    steps=[
        ("pre", pre),
        ("rf", RandomForestRegressor(random_state=42, n_jobs=-1)),
    ]
)

# 2 x 2 grid; the "rf__" prefix addresses parameters of the pipeline's "rf" step.
param_grid = dict(
    rf__n_estimators=[100, 300],
    rf__max_depth=[None, 10],
)

rf_cv = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",  # negated MSE; RMSE is computed by hand below
    cv=3,
    n_jobs=-1,
)
rf_cv.fit(X_train, y_train)

# Score the best pipeline found by the search on the held-out test split.
rf_best = rf_cv.best_estimator_
rf_preds = rf_best.predict(X_test)
rf_rmse = ((y_test - rf_preds) ** 2).mean() ** 0.5  # RMSE

# Baseline vs. tuned forest, plus the winning hyper-parameters.
(lin_rmse, rf_rmse, rf_cv.best_params_)


(np.float64(0.7455813830127764),
 np.float64(0.5034852619333121),
 {'rf__max_depth': None, 'rf__n_estimators': 300})

In [20]:
# Make sure the output directory exists before writing into it.
MODELS_DIR = Path("models")
MODELS_DIR.mkdir(exist_ok=True, parents=True)

# Persist the tuned pipeline (preprocessing + forest) as a single artifact.
joblib.dump(rf_best, MODELS_DIR / "model.joblib")

['models/model.joblib']

In [22]:
import os

# Confirm the artifact was written. List via the MODELS_DIR constant rather than a
# second hard-coded "models" path, and sort because os.listdir returns entries in
# arbitrary order.
print(sorted(os.listdir(MODELS_DIR)))


['model.joblib']


In [21]:
# Round-trip check: reload the persisted pipeline from disk.
# joblib files are pickle-based, so only load artifacts you trust (this one was
# written by this notebook).
model = joblib.load(MODELS_DIR / "model.joblib")

# Test prediction on one sample. The double brackets keep the first test row as a
# one-row DataFrame instead of collapsing it to a Series.
sample = X_test.iloc[[0]]
(prediction,) = model.predict(sample)

print("Prediction:", prediction)
print("True value:", y_test.iloc[0])


Prediction: 0.49566666666666614
True value: 0.477
