# Decision Tree and Random Forest Regression

This notebook demonstrates Decision Tree and Random Forest regression using Scikit-Learn Pipelines to prevent data leakage.

In [28]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

### 1️⃣ Load & Split Data

In [29]:
df = pd.read_csv("./data/dataset_processed.csv")
# Assuming the structure is consistent with other notebooks
target_column = 'median_house_value'
X = df.drop(columns=[target_column])
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training set:", X_train.shape, "Testing set:", X_test.shape)

Training set: (16512, 12) Testing set: (4128, 12)


### 2️⃣ Preprocessing Pipeline
Using OneHotEncoder for categorical variables and Imputation for missing values.

In [30]:
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Tree models technically don't need scaling, but imputation is required.
# We will skip scaling for Trees to keep it raw, but OneHot is crucial.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ])

### 3️⃣ Decision Tree Regressor

In [31]:
dt_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', DecisionTreeRegressor(random_state=42))])

dt_pipeline.fit(X_train, y_train)
y_pred_dt = dt_pipeline.predict(X_test)

print("--- Decision Tree (Default) ---")
print("MSE:", mean_squared_error(y_test, y_pred_dt))
print("MAE:", mean_absolute_error(y_test, y_pred_dt))
print("R2:", r2_score(y_test, y_pred_dt))

--- Decision Tree (Default) ---
MSE: 0.3678142093817326
MAE: 0.3857863777119708
R2: 0.6262517053766988


### 4️⃣ Tuned Decision Tree

In [32]:
dt_tuned_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('regressor', DecisionTreeRegressor(max_depth=20, min_samples_leaf=10, random_state=42))])

dt_tuned_pipeline.fit(X_train, y_train)
y_pred_tuned = dt_tuned_pipeline.predict(X_test)

print("--- Decision Tree (Tuned) ---")
print("MSE:", mean_squared_error(y_test, y_pred_tuned))
print("MAE:", mean_absolute_error(y_test, y_pred_tuned))
print("R2:", r2_score(y_test, y_pred_tuned))

--- Decision Tree (Tuned) ---
MSE: 0.2776691150991211
MAE: 0.34355424631422
R2: 0.7178511444343025


### 5️⃣ Random Forest Regressor

In [33]:
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', RandomForestRegressor(n_estimators=100, max_depth=30, min_samples_leaf=5, random_state=42, n_jobs=-1))])

rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)

print("--- Random Forest ---")
print("MAE:", mean_absolute_error(y_test, y_pred_rf))
print("R2:", r2_score(y_test, y_pred_rf))

--- Random Forest ---
MAE: 0.284503275663723
R2: 0.8015792793156713


### 6️⃣ Cross-Validation (Random Forest)

In [34]:
cv_r2 = cross_val_score(rf_pipeline, X_train, y_train, cv=5, scoring='r2')
print("CV R2 Mean:", cv_r2.mean())

CV R2 Mean: 0.8089284235623875


### 7️⃣ Feature Importance
Extracting feature importance from the pipeline.

In [35]:
# Update the preprocessor pipeline to handle only numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols)
    ])

# Fit the preprocessor pipeline on the training data
preprocessor.fit(X_train)

# Get importances
importances = rf_pipeline.named_steps['regressor'].feature_importances_

feature_importances = pd.Series(importances, index=num_cols)
feature_importances = feature_importances.sort_values(ascending=False)
feature_importances.head(10)

median_income         0.556659
longitude             0.157195
latitude              0.145720
housing_median_age    0.061812
population            0.029209
total_bedrooms        0.021513
total_rooms           0.015891
households            0.012001
dtype: float64