In [1]:
# 🌳 Decision Tree – California Housing

# 📁 1. Data Loading
from pathlib import Path

import joblib
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Read the preprocessed training CSV, then split it into the target column (y)
# and every remaining column as features (X).
df = pd.read_csv("../data/train/housing_train_processed.csv")
y = df["median_house_value"]
X = df.drop(columns=["median_house_value"])

# 🤖 2. Model Fitting
# Fixed random_state keeps the fitted tree reproducible between runs.
# fit() returns the estimator itself, so tree_reg is the model trained on all of X, y.
tree_reg = DecisionTreeRegressor(random_state=42).fit(X, y)

# 🧪 3. Cross-Validation
# 5-fold CV. The scorer yields *negated* MSE (scikit-learn scorers follow a
# higher-is-better convention), so the sign is flipped before taking the root.
scores = cross_val_score(
    estimator=tree_reg,
    X=X,
    y=y,
    cv=5,
    scoring="neg_mean_squared_error",
)
rmse_scores = (-scores) ** 0.5

print("Cross-validation RMSE scores:", rmse_scores)
print("Average RMSE:", rmse_scores.mean())

# 🔍 4. Hyperparameter Tuning
# Candidate settings: 4 depths x 3 split thresholds = 12 combinations,
# each scored with 5-fold CV on negated MSE.
param_grid = {
    "max_depth": [5, 10, 20, None],
    "min_samples_split": [2, 5, 10],
}
grid_search = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    return_train_score=True,  # also keep training-fold scores in cv_results_
)

# fit() returns the search object; best_estimator_ is the best-scoring
# configuration (refit is left at its default here).
best_tree = grid_search.fit(X, y).best_estimator_

print("Best parameters:", grid_search.best_params_)

# 💾 5. Model Saving
# Persist the tuned tree. joblib.dump does not create missing directories, so
# make sure ../models exists first; otherwise a fresh checkout fails with
# FileNotFoundError only after the whole grid search has already run.
model_path = Path("../models/decision_tree_model.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_tree, model_path)
print("✅ Model saved to models/decision_tree_model.pkl")

Cross-validation RMSE scores: [67952.75771091 67807.23997178 68905.54664003 68492.09764736
 69894.31839668]
Average RMSE: 68610.3920733527
Best parameters: {'max_depth': 10, 'min_samples_split': 10}
✅ Model saved to models/decision_tree_model.pkl
