In [1]:
# 📊 Linear Regression – California Housing

# 📁 1. Data Loading
import os

import joblib
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Read the preprocessed training set from disk (24 features, per the original note)
data_path = "../data/train/housing_train_processed.csv"
df = pd.read_csv(data_path)

# Pull out the regression target, then keep every remaining column as a feature
y = df["median_house_value"]
X = df.drop(columns=["median_house_value"])

# 🤖 2. Model Fitting
# Fit on the full training set; this fitted instance is the object that gets
# persisted in the model-saving step at the end of the cell.
lin_reg = LinearRegression().fit(X, y)

# 🧪 3. Cross-Validation
# 5-fold CV scored with "neg_mean_squared_error": the scorer returns MSE with
# its sign flipped, so each entry of `scores` is <= 0.
scores = cross_val_score(
    estimator=lin_reg,
    X=X,
    y=y,
    cv=5,
    scoring="neg_mean_squared_error",
)

# Undo the sign flip to recover MSE, then take the square root for per-fold RMSE.
rmse_scores = (-scores) ** 0.5

print("Cross-validation RMSE scores:", rmse_scores)
print("Average RMSE:", rmse_scores.mean())

# 🛠️ 4. Hyperparameter Tuning
# Placeholder section: no search is run for the LinearRegression model fitted
# above, so this step only reports that fact. The section is kept because the
# notebook layout requires it (per the original note).
print("No hyperparameters to tune for LinearRegression.")

# 💾 5. Model Saving
# Persist the fitted estimator with joblib. The target directory is created
# first so a fresh checkout (where ../models may not exist yet) does not fail
# on the very last step of the cell.
model_path = "../models/linear_regression_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(lin_reg, model_path)
print("✅ Model saved to models/linear_regression_model.pkl")

Cross-validation RMSE scores: [68864.86412522 68174.85986952 67996.75813918 68657.57383387
 67968.91105469]
Average RMSE: 68332.59340449455
No hyperparameters to tune for LinearRegression.
✅ Model saved to models/linear_regression_model.pkl
