# Decision Tree and Random Forest Regression

This notebook demonstrates Decision Tree and Random Forest regression using Scikit-Learn Pipelines to prevent data leakage.

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

### 1️⃣ Load & Split Data

In [None]:
# Read the dataset from its location relative to this notebook
# (adjust the path, or use an absolute one, if the notebook lives elsewhere).
# The column layout is assumed to match the other notebooks that use this file.
df = pd.read_csv("../Linear Regression/data/dataset.csv")
target_column = 'median_house_value'

# Separate the regression target from the feature matrix.
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Training set:", X_train.shape, "Testing set:", X_test.shape)

### 2️⃣ Preprocessing Pipeline
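Categorical features are one-hot encoded with `OneHotEncoder`, and missing values are imputed: the median for numeric columns and a constant `'missing'` category for categorical columns.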

In [None]:
# Select feature columns by dtype *family* instead of by exact dtype.
# ColumnTransformer discards every column that is not assigned to a transformer
# (its default is remainder='drop'), so an exact allow-list such as
# ['int64', 'float64'] would silently lose e.g. int32/float32 columns, and
# ['object', 'category'] would miss columns stored with pandas' string dtype.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category', 'string']).columns.tolist()

# Make any remaining omission visible rather than silent (e.g. bool or datetime columns).
ignored_cols = [col for col in X.columns if col not in num_cols and col not in cat_cols]
if ignored_cols:
    print("Columns not used by the preprocessor:", ignored_cols)

# Tree models technically don't need scaling, but imputation is required.
# Numeric branch: fill missing values with the column median, no scaling.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

# Categorical branch: missing values become their own 'missing' category, then
# one-hot encode. handle_unknown='ignore' encodes categories that were not seen
# during fit as all zeros instead of raising at predict time.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Output column order follows the transformer order: numeric columns first,
# then the one-hot encoded categorical columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ])

### 3️⃣ Decision Tree Regressor

In [None]:
# Baseline model: the shared preprocessing followed by a decision tree with
# default hyper-parameters (only the seed is fixed, for reproducibility).
dt_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', DecisionTreeRegressor(random_state=42)),
    ]
)

# Fit on the training split and predict the held-out split in one chain
# (Pipeline.fit returns the pipeline itself).
y_pred_dt = dt_pipeline.fit(X_train, y_train).predict(X_test)

# Report the hold-out error metrics.
dt_mse = mean_squared_error(y_test, y_pred_dt)
dt_mae = mean_absolute_error(y_test, y_pred_dt)
dt_r2 = r2_score(y_test, y_pred_dt)

print("--- Decision Tree (Default) ---")
print("MSE:", dt_mse)
print("MAE:", dt_mae)
print("R2:", dt_r2)

### 4️⃣ Tuned Decision Tree

In [None]:
# Constrained tree: depth capped at 20 and at least 10 training samples per leaf.
tuned_tree = DecisionTreeRegressor(max_depth=20, min_samples_leaf=10, random_state=42)
dt_tuned_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', tuned_tree),
])

dt_tuned_pipeline.fit(X_train, y_train)
y_pred_tuned = dt_tuned_pipeline.predict(X_test)

# Same hold-out metrics as the default tree, printed in the same order.
tuned_scores = {
    "MSE:": mean_squared_error(y_test, y_pred_tuned),
    "MAE:": mean_absolute_error(y_test, y_pred_tuned),
    "R2:": r2_score(y_test, y_pred_tuned),
}
print("--- Decision Tree (Tuned) ---")
for score_label, score_value in tuned_scores.items():
    print(score_label, score_value)

### 5️⃣ Random Forest Regressor

In [None]:
# Random forest on top of the shared preprocessing: 100 trees, depth capped at 30,
# at least 5 samples per leaf, fixed seed, and all available CPU cores (n_jobs=-1).
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', RandomForestRegressor(n_estimators=100, max_depth=30, min_samples_leaf=5, random_state=42, n_jobs=-1))])

rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)

# Report the same three hold-out metrics as the decision-tree cells so the
# models can be compared line by line (MSE was previously missing here).
print("--- Random Forest ---")
print("MSE:", mean_squared_error(y_test, y_pred_rf))
print("MAE:", mean_absolute_error(y_test, y_pred_rf))
print("R2:", r2_score(y_test, y_pred_rf))

### 6️⃣ Cross-Validation (Random Forest)

In [None]:
# 5-fold cross-validated R2 of the whole Random Forest pipeline, computed on the
# training split only so the held-out test split stays untouched.
cv_r2 = cross_val_score(
    estimator=rf_pipeline,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=5,
)
print("CV R2 Mean:", np.mean(cv_r2))

### 7️⃣ Feature Importance
Extract the feature importances from the fitted Random Forest pipeline and label them with the feature names produced by the preprocessing step.

In [None]:
# Take the fitted preprocessing step from the Random Forest pipeline itself, so the
# feature names are guaranteed to come from the transformer that produced the
# forest's training matrix (instead of relying on the global `preprocessor`
# having been fitted as a side effect of an earlier cell).
fitted_preprocessor = rf_pipeline.named_steps['preprocessor']

if cat_cols:
    # Expanded one-hot column names, e.g. "<column>_<category>".
    ohe = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']
    cat_feature_names = ohe.get_feature_names_out(cat_cols)
else:
    # No categorical columns: the 'cat' branch had nothing to fit, so there is
    # no fitted encoder to ask for names.
    cat_feature_names = []

# Same order as the ColumnTransformer output: numeric columns, then one-hot columns.
all_feature_names = num_cols + list(cat_feature_names)

# Get importances
importances = rf_pipeline.named_steps['regressor'].feature_importances_

# Guard against mislabelled importances (e.g. if a transformer dropped a column).
assert len(all_feature_names) == len(importances), (
    f"{len(all_feature_names)} feature names for {len(importances)} importances"
)

feature_importances = pd.Series(importances, index=all_feature_names)
feature_importances = feature_importances.sort_values(ascending=False)
feature_importances.head(10)