### CONFIG

In [1]:
# Notebook-wide settings; the cells below read these instead of literals.
DATA_PATH = "../data/Social_Network_Ads.csv"  # CSV loaded into `dataset`
TARGET_COL = "Purchased"                      # label column separated from the features
TEST_SIZE = 0.25                              # hold-out fraction for train_test_split
RANDOM_STATE = 42                             # seed shared by the split, the CV folds and the trees

### Importing the libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

### Importing the dataset

In [3]:
# Load the CSV named in the config cell, then report its shape and column dtypes.
dataset = pd.read_csv(DATA_PATH)
for overview in (dataset.shape, dataset.dtypes):
    print(overview)

(400, 3)
Age                int64
EstimatedSalary    int64
Purchased          int64
dtype: object


In [4]:
# Separate the label series from the remaining feature columns.
y = dataset[TARGET_COL]
X = dataset.drop(columns=[TARGET_COL])

In [5]:
# Relative frequency of each target class.
class_share = y.value_counts(normalize=True)
print(class_share)

Purchased
0    0.6425
1    0.3575
Name: proportion, dtype: float64


### Train/Test split

In [6]:
# Stratified hold-out split so both partitions keep the class proportions of y.
split_options = dict(test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

### Column typing

In [7]:
# Every feature column is routed through the numeric branch of the preprocessor.
numeric_features = list(X_train.columns)

### Preprocessing skeleton

In [8]:
# No scaler here: tree splits depend only on the ordering of feature values,
# not their magnitude, so the numeric columns are passed through unchanged.
numeric_branch = ("num", "passthrough", numeric_features)
preprocessor = ColumnTransformer(transformers=[numeric_branch], remainder="drop")

### Training the baseline Decision Tree model on the Training set

In [9]:
# Baseline: preprocessing followed by a tree with no structural constraints
# (only the seed is set), fitted on the training partition.
baseline_steps = [
    ("preprocess", preprocessor),
    ("model", DecisionTreeClassifier(random_state=RANDOM_STATE)),
]
pipeline_dt_baseline = Pipeline(steps=baseline_steps)
pipeline_dt_baseline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


### Predicting the Test set results

In [10]:
# Positive-class probabilities (column 1) feed ROC-AUC; hard labels feed accuracy.
baseline_class_proba = pipeline_dt_baseline.predict_proba(X_test)
y_proba = baseline_class_proba[:, 1]
y_pred = pipeline_dt_baseline.predict(X_test)

### Baseline Evaluation

- The dataset shows class imbalance (~64% vs ~36%)
- Therefore, ROC-AUC will be used as the primary evaluation metric instead of accuracy

In [11]:
# Score the baseline on the held-out set: accuracy from labels, ROC-AUC from probabilities.
dt_baseline_accuracy = accuracy_score(y_test, y_pred)
dt_baseline_roc_auc = roc_auc_score(y_test, y_proba)

print(
    f"Decision Tree Baseline Test ROC-AUC: {dt_baseline_roc_auc:.4f}",
    f"Decision Tree Baseline Test Accuracy: {dt_baseline_accuracy:.4f}",
    sep="\n",
)

Decision Tree Baseline Test ROC-AUC: 0.8837
Decision Tree Baseline Test Accuracy: 0.8900


**Summary: Baseline**
- Logistic Regression achieves a higher baseline ROC-AUC (\~0.91) than the Decision Tree (\~0.88), indicating better probability ranking and class separation
- Decision Tree shows higher accuracy (~0.89 vs ~0.84), but this is misleading due to class imbalance and its tendency to favor the majority class
- On imbalanced data, ROC-AUC is more reliable than accuracy, making Logistic Regression the stronger baseline model compared to an unconstrained Decision Tree on this dataset

### Decision Tree Depth Control (max_depth)

In [12]:
# 5-fold stratified CV, shuffled with the notebook seed; reused by the later searches.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Sweep tree depth only; None leaves the depth unconstrained.
param_grid = {"model__max_depth": [2, 3, 4, 5, 8, 12, None]}

grid = GridSearchCV(
    pipeline_dt_baseline,
    param_grid,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
    return_train_score=True,  # keep train scores to compare against validation scores
)

grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__max_depth': [2, 3, ...]}"
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,4
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [13]:
# Mean train vs. validation ROC-AUC for each depth candidate.
depth_report_columns = ["param_model__max_depth", "mean_train_score", "mean_test_score"]
cv_results = pd.DataFrame(grid.cv_results_)
display(cv_results[depth_report_columns])


Unnamed: 0,param_model__max_depth,mean_train_score,mean_test_score
0,2.0,0.948198,0.92596
1,3.0,0.968361,0.936558
2,4.0,0.98387,0.943054
3,5.0,0.991873,0.935561
4,8.0,0.999137,0.893713
5,12.0,0.999917,0.885739
6,,0.999917,0.885739


In [14]:
# Keep the refitted best pipeline and its mean CV score from the depth search.
best_dt_depth_cv_roc_auc = grid.best_score_
best_dt_depth_pipeline = grid.best_estimator_

best_depth = grid.best_params_["model__max_depth"]
print("Best depth:", best_depth)
print("Best CV ROC-AUC:", best_dt_depth_cv_roc_auc)

Best depth: 4
Best CV ROC-AUC: 0.9430540220013904


**Summary: Decision Tree Depth Control**
- At very small depths (e.g., max_depth=2), both train and validation ROC-AUC are lower, indicating underfitting
- Increasing depth to max_depth = 3/4 improves validation ROC-AUC and achieves the best performance (~ 0.94), showing the optimal bias–variance trade-off
- Beyond this point (max_depth ≥ 5):
    - Training ROC-AUC continues to increase and approaches 1.0
    - Validation ROC-AUC starts to drop: only slightly at max_depth = 5 (~0.936), then sharply from max_depth = 8 onward (~0.89)
    - This is clear evidence of **overfitting**

**Conclusion**

Decision Trees are highly sensitive to `max_depth`, and structural constraints are mandatory to prevent overfitting and ensure good generalization

### Decision Tree Leaf-Size Control (min_samples_leaf)

In [15]:
# Hold max_depth at the value selected by the depth search above rather than a
# hard-coded literal, so this cell cannot drift out of sync with that search
# when the data, seed or depth grid changes.
selected_max_depth = best_dt_depth_pipeline.named_steps["model"].max_depth

pipeline_dt_depth_fixed = Pipeline([
    ("preprocess", preprocessor),
    ("model", DecisionTreeClassifier(
        random_state=RANDOM_STATE,
        max_depth=selected_max_depth
    ))
])

In [16]:
# Sweep the minimum leaf size on the depth-fixed pipeline, using the same folds and metric.
param_grid = {"model__min_samples_leaf": [1, 5, 10, 20, 50]}

grid = GridSearchCV(
    pipeline_dt_depth_fixed,
    param_grid,
    scoring="roc_auc",
    n_jobs=-1,
    cv=cv,
    return_train_score=True,
)

grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__min_samples_leaf': [1, 5, ...]}"
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,4
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [17]:
# Mean train vs. validation ROC-AUC for each leaf-size candidate.
leaf_report_columns = ["param_model__min_samples_leaf", "mean_train_score", "mean_test_score"]
cv_results = pd.DataFrame(grid.cv_results_)
display(cv_results[leaf_report_columns])

Unnamed: 0,param_model__min_samples_leaf,mean_train_score,mean_test_score
0,1,0.98387,0.943054
1,5,0.982677,0.931225
2,10,0.980628,0.942219
3,20,0.969694,0.940348
4,50,0.923049,0.911883


In [19]:
# Keep the refitted best pipeline and its mean CV score from the leaf-size search.
best_dt_min_leaf_cv_roc_auc = grid.best_score_
best_dt_min_leaf_pipeline = grid.best_estimator_

best_min_samples_leaf = grid.best_params_["model__min_samples_leaf"]
print("Best min_samples_leaf:", best_min_samples_leaf)
print("Best CV ROC-AUC:", best_dt_min_leaf_cv_roc_auc)

Best min_samples_leaf: 1
Best CV ROC-AUC: 0.9430540220013904


**Summary: Decision Tree Leaf-Size Control**
- With very small leaf sizes (min_samples_leaf = 1):
    - Training ROC-AUC is very high (~0.984)
    - Validation ROC-AUC is also the highest (~0.943)
    - This indicates the tree is still generalizing well when depth is already controlled
- As min_samples_leaf increases:
    - Training ROC-AUC gradually decreases, meaning the model is being regularized
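    - Validation ROC-AUC does not fall monotonically (~0.931 at 5, ~0.942 at 10, ~0.940 at 20), but it never exceeds the value obtained with min_samples_leaf = 1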
- At large leaf sizes (min_samples_leaf = 50):
    - Both train and validation ROC-AUC drop significantly
    - This is clear underfitting due to excessive smoothing

**Conclusion**
- Once `max_depth` is properly constrained, additional leaf-size regularization is not required for this dataset
- `min_samples_leaf = 1` already provides the best bias–variance trade-off, and increasing `min_samples_leaf` over-regularizes the tree and hurts generalization
- This shows that for this dataset:
    - `max_depth` is the dominant regularization parameter
    - `min_samples_leaf` is a fine-tuning knob that should be used after depth is fixed

### Final Hyperparameter Optimization (max_depth, min_samples_leaf)

In [20]:
# Joint search over depth and leaf size (3 x 4 = 12 candidates) on the baseline
# pipeline, with the same folds and metric as the single-parameter sweeps.
candidate_depths = [3, 4, 5]
candidate_leaf_sizes = [1, 5, 10, 20]
param_grid = {
    "model__max_depth": candidate_depths,
    "model__min_samples_leaf": candidate_leaf_sizes,
}

grid = GridSearchCV(
    pipeline_dt_baseline,
    param_grid,
    scoring="roc_auc",
    n_jobs=-1,
    cv=cv,
    return_train_score=True,
)

grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__max_depth': [3, 4, ...], 'model__min_samples_leaf': [1, 5, ...]}"
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,10
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [21]:
# Mean train vs. validation ROC-AUC for each (depth, leaf size) combination.
joint_report_columns = [
    "param_model__max_depth",
    "param_model__min_samples_leaf",
    "mean_train_score",
    "mean_test_score",
]
cv_results = pd.DataFrame(grid.cv_results_)
display(cv_results[joint_report_columns])

Unnamed: 0,param_model__max_depth,param_model__min_samples_leaf,mean_train_score,mean_test_score
0,3,1,0.968361,0.936558
1,3,5,0.970152,0.932282
2,3,10,0.973646,0.940585
3,3,20,0.967033,0.939737
4,4,1,0.98387,0.943054
5,4,5,0.982677,0.931225
6,4,10,0.980628,0.942219
7,4,20,0.969694,0.940348
8,5,1,0.991873,0.935561
9,5,5,0.986807,0.932075


In [22]:
# Keep the refitted best pipeline and its mean CV score from the joint search.
best_dt_cv_roc_auc = grid.best_score_
best_dt_pipeline = grid.best_estimator_

best_joint_params = grid.best_params_
print("Best params:", best_joint_params)
print("Best tuned DT CV ROC-AUC:", best_dt_cv_roc_auc)

Best params: {'model__max_depth': 5, 'model__min_samples_leaf': 10}
Best tuned DT CV ROC-AUC: 0.9442948864001497


**Summary: Joint Hyperparameter Tuning (max_depth + min_samples_leaf)**
- Individual tuning identified:
    - `max_depth` as the dominant regularization parameter
    - `min_samples_leaf` as a secondary smoothing parameter
- Joint tuning shows that these parameters interact:
    - A deeper tree can still generalize well when combined with stronger leaf-level regularization
- The best configuration was:

        max_depth = 5
        min_samples_leaf = 10

 with CV ROC-AUC ≈ 0.9443, slightly improving over single-parameter tuning (≈ 0.9431); the gain is only ~0.001 and may be within cross-validation noise
- This suggests:
    - Decision Tree structural parameters must be tuned jointly for optimal performance
    - Model capacity (depth) and smoothing (leaf size) trade off against each other to reach the best bias–variance balance

### Evaluation of Tuned Decision Tree on Test Set

In [23]:
# Score the tuned pipeline on the held-out set using positive-class probabilities (column 1).
tuned_class_proba = best_dt_pipeline.predict_proba(X_test)
y_proba_dt = tuned_class_proba[:, 1]
dt_tuned_roc_auc = roc_auc_score(y_test, y_proba_dt)
print(f"Tuned Decision Tree Test ROC-AUC: {dt_tuned_roc_auc:.4f}")

Tuned Decision Tree Test ROC-AUC: 0.9171


### Baseline vs Regularization

In [24]:
# Side-by-side recap; the middle figure is a cross-validation score, the other two are test scores.
print(
    f"Baseline DT Test ROC-AUC        : {dt_baseline_roc_auc:.4f}",
    f"Best DT CV ROC-AUC (GridSearch) : {best_dt_cv_roc_auc:.4f}",
    f"Tuned DT Test ROC-AUC           : {dt_tuned_roc_auc:.4f}",
    sep="\n",
)

Baseline DT Test ROC-AUC        : 0.8837
Best DT CV ROC-AUC (GridSearch) : 0.9443
Tuned DT Test ROC-AUC           : 0.9171


**Summary: Baseline vs Regularization**
- Baseline Decision Tree achieves a Test ROC-AUC ~ 0.88
- Regularization tuning improves cross-validation ROC-AUC from ~ 0.89 (unconstrained tree, `max_depth=None`) to ~ 0.94, and improves Test ROC-AUC from ~ 0.88 to ~ 0.92
- This indicates
    - The unconstrained Decision Tree was overfitting
    - Structural regularization (max_depth, min_samples_leaf) is essential for generalization
    - The tuned tree generalizes better than the baseline
    - Unlike Logistic Regression, regularization is not just stabilizing, it is actually improving real-world performance for Decision Trees

**Summary: Regularized Logistic Regression vs Tuned Decision Tree**
- Baseline comparison:
    - Logistic Regression baseline ROC-AUC ~ 0.91
    - Decision Tree baseline ROC-AUC ~ 0.88
    - Logistic Regression performs better without tuning
- After tuning:
    - Tuned Logistic Regression Test ROC-AUC ~ 0.910
    - Tuned Decision Tree Test ROC-AUC ~ 0.917
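    - Properly tuned Decision Tree now slightly outperforms Logistic Regression (a difference of ~ 0.007 on a 100-row test set, so the two models should be regarded as roughly comparable)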

**Interpretation**
- Logistic Regression:
    - Strong out-of-the-box model due to natural regularization and stability
    - Best suited when the relationship between features and target is approximately linear
    - Provides consistent performance with minimal tuning
- Decision Trees:
    - Unstable when unconstrained because they easily overfit
    - Require explicit structural regularization (max_depth, min_samples_leaf)
    - Once properly tuned, they can:
        - Capture non-linear patterns,
        - Model feature interactions,
        - Outperform linear models when the problem is not linearly separable