### CONFIG

In [1]:
# --- Reproducibility ---
RANDOM_STATE = 42  # shared seed: train/test split, forest, and CV shuffling

# --- Data ---
DATA_PATH = "../data/Social_Network_Ads.csv"  # relative path to the input CSV
TARGET_COL = "Purchased"  # name of the label column

# --- Experiment settings ---
TEST_SIZE = 0.25  # fraction of rows held out as the test set
N_ESTIMATORS = 100  # number of trees in the baseline Random Forest

### Importing the libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

### Importing the dataset

In [3]:
# Load the raw CSV, then print its dimensions followed by the per-column dtypes.
dataset = pd.read_csv(DATA_PATH)
print(dataset.shape, dataset.dtypes, sep="\n")

(400, 3)
Age                int64
EstimatedSalary    int64
Purchased          int64
dtype: object


In [4]:
# Separate the label from the predictors; every non-target column becomes a feature.
y = dataset[TARGET_COL]
X = dataset.drop(columns=[TARGET_COL])

In [5]:
# Relative class frequencies, to check how balanced the target is.
class_proportions = y.value_counts(normalize=True)
print(class_proportions)

Purchased
0    0.6425
1    0.3575
Name: proportion, dtype: float64


### Train/Test split

In [6]:
# Hold out a seeded test set, stratified on the target.
split_options = {
    "test_size": TEST_SIZE,
    "random_state": RANDOM_STATE,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

### Column typing

In [7]:
# Every column of X_train is routed through the "num" branch of the preprocessor.
numeric_features = list(X_train.columns)

### Preprocessing skeleton

In [8]:
# Tree-based models split on feature thresholds, so they depend on the ordering of
# values rather than their magnitude; the features are passed through unscaled.
numeric_passthrough = ("num", "passthrough", numeric_features)
preprocessor = ColumnTransformer(
    transformers=[numeric_passthrough],
    remainder="drop",
)

### Training the baseline Random Forest model on the Training set

In [9]:
# Untuned forest: only the tree count, the seed, and parallelism are set explicitly.
rf_baseline_model = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
pipeline_rf_baseline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", rf_baseline_model),
    ]
)
# Last expression of the cell: fit and display the fitted pipeline.
pipeline_rf_baseline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### Predicting the Test set results

In [10]:
# Hard class labels, plus the probability column at index 1, which the
# evaluation cell uses as the positive-class score.
y_pred = pipeline_rf_baseline.predict(X_test)
class_probabilities = pipeline_rf_baseline.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

### Baseline Evaluation

- The dataset shows moderate class imbalance (~64% class 0 vs ~36% class 1)
- Accuracy can look good under imbalance simply by favoring the majority class, so ROC-AUC is used as the primary evaluation metric instead of accuracy

In [11]:
# Accuracy is computed from the hard labels; ROC-AUC and PR-AUC from the probability scores.
rf_baseline_accuracy = accuracy_score(y_test, y_pred)
rf_baseline_pr_auc = average_precision_score(y_test, y_proba)
rf_baseline_roc_auc = roc_auc_score(y_test, y_proba)

print(
    f"Random Forest Baseline Test ROC-AUC: {rf_baseline_roc_auc:.4f}",
    f"Random Forest Baseline Test PR-AUC: {rf_baseline_pr_auc:.4f}",
    f"Random Forest Baseline Test Accuracy: {rf_baseline_accuracy:.4f}",
    sep="\n",
)

Random Forest Baseline Test ROC-AUC: 0.9238
Random Forest Baseline Test PR-AUC: 0.8429
Random Forest Baseline Test Accuracy: 0.9100


**Summary: Baseline (Metric Choice + Results)**
- Primary metric: ROC-AUC
    - Used for model comparison because it measures overall ranking quality and is stable under class imbalance
- Secondary metric: PR-AUC
    - Reported to evaluate performance on the minority (positive) class, which is more business-relevant for imbalanced datasets

Baseline results:
- Decision Tree baseline Test ROC-AUC ~ 0.8837 (reference value from a separate Decision Tree experiment; not computed in this notebook)
- Random Forest baseline Test ROC-AUC ~ 0.9238
- Random Forest baseline Test PR-AUC ~ 0.8429

This shows:
- Random Forest significantly outperforms a single Decision Tree even without tuning
- The improvement comes from variance reduction through ensembling:
    - Multiple trees trained on bootstrapped samples
    - Feature randomness at each split
    - Averaging stabilizes predictions and reduces overfitting
- The strong PR-AUC (~0.84) indicates that Random Forest not only ranks samples better overall (high ROC-AUC) but is also effective at identifying the minority class
- Random Forest provides strong generalization out of the box, whereas a single Decision Tree is highly sensitive to overfitting

### Random Forest Depth and Leaf-size Control

In [12]:
# Search over per-tree complexity (depth and minimum leaf size) inside the forest.
# Ensembling already reduces variance, so this is fine-grained control.
param_grid = dict(
    model__max_depth=[3, 4, 5, 8, None],
    model__min_samples_leaf=[1, 5, 10, 20],
)

# Seeded, shuffled, stratified 5-fold splitter.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

grid = GridSearchCV(
    pipeline_rf_baseline,
    param_grid,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)

# Last expression of the cell: run the search and display the fitted search object.
grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__max_depth': [3, 4, ...], 'model__min_samples_leaf': [1, 5, ...]}"
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,4
,min_samples_split,2
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
# Tabulate mean train vs. validation ROC-AUC for each (max_depth, min_samples_leaf) pair.
cv_results = pd.DataFrame(grid.cv_results_)
summary_columns = [
    "param_model__max_depth",
    "param_model__min_samples_leaf",
    "mean_train_score",
    "mean_test_score",
]
display(cv_results[summary_columns])

Unnamed: 0,param_model__max_depth,param_model__min_samples_leaf,mean_train_score,mean_test_score
0,3.0,1,0.982593,0.951971
1,3.0,5,0.97943,0.954173
2,3.0,10,0.974842,0.958051
3,3.0,20,0.962212,0.947585
4,4.0,1,0.991338,0.956615
5,4.0,5,0.985536,0.960523
6,4.0,10,0.978164,0.958918
7,4.0,20,0.961204,0.94697
8,5.0,1,0.994773,0.953665
9,5.0,5,0.986792,0.957348


In [14]:
# Keep the search's best pipeline and its best cross-validated ROC-AUC under
# dedicated names before `grid` is reused.
tuned_rf_cv_roc_auc = grid.best_score_
tuned_rf_pipeline = grid.best_estimator_

print("Best params:", grid.best_params_)
print("Best RF CV ROC-AUC:", tuned_rf_cv_roc_auc)

Best params: {'model__max_depth': 4, 'model__min_samples_leaf': 5}
Best RF CV ROC-AUC: 0.9605226644700329


**Summary: Random Forest Depth and Leaf-size Control**
- Random Forest already generalizes well, but structural tuning still provides gains
- Best configuration found:

        max_depth = 4  
        min_samples_leaf = 5

with CV ROC-AUC ~ 0.9605

Observations:
- Shallow trees (max_depth = 3) underfit slightly
- Moderate depth (max_depth = 4–5) gives the best generalization
- Very deep trees (max_depth ≥ 8 or None) show:
    - Training ROC-AUC ~ 1.0
    - Drop in validation ROC-AUC
      -> mild overfitting even inside Random Forest
- Increasing min_samples_leaf:
    - Adds smoothing
    - Reduces variance
    - Helps deeper trees generalize better
    - But very large values (20) cause underfitting

### Evaluation of Tuned Random Forest (max_depth + min_samples_leaf) on Test Set

In [15]:
# Score the depth/leaf-tuned pipeline on the held-out test set.
# These probabilities get their own name so the baseline `y_proba` is not
# overwritten: re-running the baseline evaluation cell after this one would
# otherwise silently report the tuned model's scores as the baseline's.
y_proba_rf_tuned = tuned_rf_pipeline.predict_proba(X_test)[:, 1]
rf_tuned_roc_auc = roc_auc_score(y_test, y_proba_rf_tuned)
rf_tuned_pr_auc = average_precision_score(y_test, y_proba_rf_tuned)

print(f"Tuned RF Test ROC-AUC: {rf_tuned_roc_auc:.4f}")
print(f"Tuned RF Test PR-AUC: {rf_tuned_pr_auc:.4f}")

Tuned RF Test ROC-AUC: 0.9531
Tuned RF Test PR-AUC: 0.9125


**Summary: Random Forest Baseline vs Tuned (Depth + Leaf Control)**
- Baseline Random Forest:
    - Test ROC-AUC ~ 0.9238
    - Test PR-AUC ~ 0.8429
- Tuned Random Forest:
    - Test ROC-AUC ~ 0.9531
    - Test PR-AUC ~ 0.9125

This shows:
- Structural tuning significantly improves Random Forest performance
- ROC-AUC improvement:
    - 0.9238 -> 0.9531: better overall ranking and discrimination
- PR-AUC improvement:
    - 0.8429 -> 0.9125: much stronger detection of the minority (positive) class
- Unlike Decision Trees, Random Forest already starts strong, but tuning:
    - Refines generalization,
    - Improves minority-class usefulness,
    - Produces a more production-ready model

### Random Forest No. of Trees Control

In [19]:
# Use the best structural RF model (depth + leaf size) as the base for the
# next tuning step (n_estimators).
# This is an alias, not a copy: both names refer to the same pipeline object.
base_tuned_rf_pipeline = tuned_rf_pipeline

In [20]:
# Vary only the number of trees, starting from the structurally tuned pipeline.
param_grid = dict(model__n_estimators=[50, 100, 200, 400])

grid = GridSearchCV(
    base_tuned_rf_pipeline,
    param_grid,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)

# Last expression of the cell: run the search and display the fitted search object.
grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__n_estimators': [50, 100, ...]}"
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,4
,min_samples_split,2
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [21]:
# Mean train vs. validation ROC-AUC for each candidate number of trees.
cv_results = pd.DataFrame(grid.cv_results_)
n_estimators_columns = [
    "param_model__n_estimators",
    "mean_train_score",
    "mean_test_score",
]
display(cv_results[n_estimators_columns])

Unnamed: 0,param_model__n_estimators,mean_train_score,mean_test_score
0,50,0.985392,0.959531
1,100,0.985544,0.960523
2,200,0.985679,0.958649
3,400,0.985619,0.957438


In [22]:
# Capture the winner of the n_estimators search and its cross-validated ROC-AUC.
best_rf_cv_roc_auc = grid.best_score_
best_rf_pipeline = grid.best_estimator_
best_n_estimators = grid.best_params_["model__n_estimators"]

print("Best n_estimators:", best_n_estimators)
print("Best RF CV ROC-AUC:", best_rf_cv_roc_auc)

Best n_estimators: 100
Best RF CV ROC-AUC: 0.9605226644700329


**Summary: Number of Trees Tuning (n_estimators)**
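- From 50 → 100 trees:
    - CV ROC-AUC increases marginally, from ~0.9595 to ~0.9605
- Beyond 100 trees:
    - CV ROC-AUC plateaus and declines slightly (~0.9586 at 200 trees, ~0.9574 at 400 trees)
    - 200 and 400 trees do not provide additional benefit
- All of these differences are about 0.003 or smaller, so they may be within fold-to-fold noise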

Observations:
- Training ROC-AUC remains almost constant (~0.985) across all values:
    - Indicates no overfitting due to increasing number of trees
    - The model is already well-regularized structurally
- The best trade-off between performance and compute is: `n_estimators = 100`

Interpretation:
- Increasing `n_estimators` reduces variance by averaging more trees
- Gains diminish quickly after a moderate number of trees
- More trees increase training and inference cost without improving generalization

In [23]:
# Evaluate the final pipeline on the held-out test set using the probability
# column at index 1 as the score.
final_class_probabilities = best_rf_pipeline.predict_proba(X_test)
y_proba_rf_final = final_class_probabilities[:, 1]
rf_final_pr_auc = average_precision_score(y_test, y_proba_rf_final)
rf_final_roc_auc = roc_auc_score(y_test, y_proba_rf_final)

print(
    f"Final RF Test ROC-AUC: {rf_final_roc_auc:.4f}",
    f"Final RF Test PR-AUC: {rf_final_pr_auc:.4f}",
    sep="\n",
)

Final RF Test ROC-AUC: 0.9531
Final RF Test PR-AUC: 0.9125


### Baseline vs Tuned

In [24]:
# Side-by-side test-set metrics: baseline forest vs. the final tuned forest.
print(
    f"Baseline RF Test ROC-AUC    : {rf_baseline_roc_auc:.4f}",
    f"Baseline RF Test PR-AUC     : {rf_baseline_pr_auc:.4f}",
    f"Tuned RF Test ROC-AUC       : {rf_final_roc_auc:.4f}",
    f"Tuned RF Test PR-AUC        : {rf_final_pr_auc:.4f}",
    sep="\n",
)

Baseline RF Test ROC-AUC    : 0.9238
Baseline RF Test PR-AUC     : 0.8429
Tuned RF Test ROC-AUC       : 0.9531
Tuned RF Test PR-AUC        : 0.9125


**Summary: Tuned Random Forest vs Tuned Decision Tree**
- Tuned Decision Tree (reference value from a separate Decision Tree experiment; not computed in this notebook):
    - Test ROC-AUC ~ 0.9171
- Tuned Random Forest:
    - Test ROC-AUC ~ 0.9531
    - Test PR-AUC ~ 0.9125

This shows:
- Random Forest significantly outperforms a single tuned Decision Tree
- The improvement comes from ensembling:
    - Random Forest averages many regularized trees
    - This further reduces variance and stabilizes predictions
    - Structural tuning makes Decision Trees usable
    - Ensembling those tuned trees makes the model substantially stronger