### CONFIG

In [1]:
# Seed passed to every stochastic step (split, model, CV splitter) for repeatable runs
RANDOM_STATE: int = 42

# Fraction of rows held out as the test set
TEST_SIZE: float = 0.25

# Name of the label column in the input CSV
TARGET_COL: str = "Purchased"

# Relative filesystem locations of the input data and the persisted models
DATA_PATH: str = "../data/Social_Network_Ads.csv"
MODEL_PATH: str = "../models"

### Importing the libraries

In [2]:
import joblib
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

### Importing the dataset

In [3]:
# Read the raw CSV, then report its dimensions followed by the per-column dtypes
dataset = pd.read_csv(DATA_PATH)
print(dataset.shape, dataset.dtypes, sep="\n")

(400, 3)
Age                int64
EstimatedSalary    int64
Purchased          int64
dtype: object


In [4]:
# Separate the label column from the predictor columns
y = dataset[TARGET_COL]
X = dataset.drop(columns=[TARGET_COL])

In [5]:
# Relative frequency of each target class, used to gauge class imbalance
class_proportions = y.value_counts(normalize=True)
print(class_proportions)

Purchased
0    0.6425
1    0.3575
Name: proportion, dtype: float64


### Train/Test split

In [6]:
# Hold out a seeded test partition, stratified on the target
split_options = {
    "test_size": TEST_SIZE,
    "random_state": RANDOM_STATE,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

### Column typing

In [7]:
# Every predictor column is routed through the preprocessor's numeric group
numeric_features = list(X_train.columns)

### Preprocessing skeleton

In [8]:
# No scaling step: tree-based boosting splits on feature ordering, not magnitude,
# so the numeric columns are forwarded unchanged and any other column is dropped
numeric_passthrough = ("num", "passthrough", numeric_features)
preprocessor = ColumnTransformer(
    transformers=[numeric_passthrough],
    remainder="drop",
)

### Training the baseline Gradient Boosting model on the Training set

In [9]:
# Baseline: Gradient Boosting with library-default hyperparameters (only the seed is set),
# chained after the shared preprocessing step
gb_baseline_model = GradientBoostingClassifier(random_state=RANDOM_STATE)
pipeline_gb_baseline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", gb_baseline_model),
    ]
)
pipeline_gb_baseline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


### Predicting the Test set results

In [10]:
# Hard class labels feed accuracy; column 1 of the probability matrix feeds the ranking metrics
y_pred = pipeline_gb_baseline.predict(X_test)
baseline_class_proba = pipeline_gb_baseline.predict_proba(X_test)
y_proba = baseline_class_proba[:, 1]

### Baseline Evaluation

- The dataset shows class imbalance (~64% vs ~36%)
- Therefore, ROC-AUC will be used as the primary evaluation metric instead of accuracy

In [11]:
# Ranking metrics are computed from probabilities, accuracy from hard labels
gb_baseline_roc_auc = roc_auc_score(y_true=y_test, y_score=y_proba)
gb_baseline_pr_auc = average_precision_score(y_true=y_test, y_score=y_proba)
gb_baseline_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report all three baseline test metrics in a single write
print(
    f"Gradient Boosting Baseline Test ROC-AUC: {gb_baseline_roc_auc:.4f}",
    f"Gradient Boosting Baseline Test PR-AUC: {gb_baseline_pr_auc:.4f}",
    f"Gradient Boosting Baseline Test Accuracy: {gb_baseline_accuracy:.4f}",
    sep="\n",
)

Gradient Boosting Baseline Test ROC-AUC: 0.9510
Gradient Boosting Baseline Test PR-AUC: 0.9047
Gradient Boosting Baseline Test Accuracy: 0.9100


**Summary: Baseline (Metric Choice + Results)**
- Primary metric: ROC-AUC
    - Used for model comparison because it measures overall ranking quality and is stable under class imbalance
- Secondary metric: PR-AUC
    - Reported to evaluate performance on the minority (positive) class, which is more business-relevant for imbalanced datasets

Baseline results:
- Gradient Boosting baseline achieves:
    - Test ROC-AUC ~ 0.951
    - Test PR-AUC ~ 0.905
- This is higher than the previously reported baselines (values quoted, not recomputed in this notebook):
    - Decision Tree baseline ROC-AUC ~ 0.884
    - Random Forest baseline ROC-AUC ~ 0.924
    - Random Forest baseline PR-AUC ~ 0.843

This shows:
- On this test split, Gradient Boosting outperforms both the single Decision Tree baseline and the Random Forest baseline even without tuning
- The gain comes from sequential learning:
    - Each tree corrects the mistakes of previous trees
    - This reduces bias more effectively than bagging-based methods
- Strong PR-AUC indicates:
    - Better minority-class ranking
    - More confident positive predictions

### Gradient Boosting Tree Depth Control

In [12]:
# Stage 1 of tuning: choose the depth of the individual trees.
# Boosting adds trees sequentially, so depth sets how strong each learner is;
# it is searched first, before learning rate and number of estimators.
param_grid = {"model__max_depth": [1, 2, 3, 4]}

# Shuffled, stratified 5-fold splitter with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Score by ROC-AUC and keep train scores so over-fitting is visible in the results table
depth_search_options = {
    "scoring": "roc_auc",
    "cv": cv,
    "n_jobs": -1,
    "return_train_score": True,
}
grid = GridSearchCV(pipeline_gb_baseline, param_grid, **depth_search_options)

grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__max_depth': [1, 2, ...]}"
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,1
,min_impurity_decrease,0.0


In [13]:
# Tabulate mean train vs. validation ROC-AUC for each candidate depth
depth_columns = ["param_model__max_depth", "mean_train_score", "mean_test_score"]
cv_results = pd.DataFrame(grid.cv_results_)
display(cv_results[depth_columns])

Unnamed: 0,param_model__max_depth,mean_train_score,mean_test_score
0,1,0.969584,0.954954
1,2,0.994009,0.948587
2,3,0.999341,0.951602
3,4,0.999917,0.949938


In [14]:
# Keep the best pipeline from the depth search together with its CV score
tuned_gb_pipeline, tuned_gb_cv_roc_auc = grid.best_estimator_, grid.best_score_

print("Best params:", grid.best_params_)
print("Best GB CV ROC-AUC:", tuned_gb_cv_roc_auc)

Best params: {'model__max_depth': 1}
Best GB CV ROC-AUC: 0.9549536720589351


**Summary: Gradient Boosting Tree Depth**
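- Tree depth strongly affects how closely Gradient Boosting fits the training data, but changes cross-validated ROC-AUC only slightly (~0.949–0.955 across depths 1–4)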
- The best cross-validated performance is achieved with max_depth = 1 (decision stumps):
    - CV ROC-AUC ~ 0.955
- As tree depth increases:
    - Training ROC-AUC rapidly approaches 1.0
    - Validation ROC-AUC stagnates or declines
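- The widening gap between training and validation scores indicates overfitting: deeper trees fit the training data (including its noise) ever more closely, which is characteristic of boosting with strong learners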

Hence:
- Tree depth should be fixed to a small value (here, max_depth = 1) before tuning learning rate and number of estimators
- This reinforces the core boosting principle: `Many weak learners > few strong learners`

### Evaluation of Tuned Gradient Boosting (max_depth) on Test Set

In [15]:
# Score the depth-tuned pipeline on the held-out test set.
# The probabilities get their own name so the baseline's `y_proba` is not overwritten:
# re-running the baseline evaluation cell afterwards still scores the baseline model.
y_proba_gb_tuned = tuned_gb_pipeline.predict_proba(X_test)[:, 1]
gb_tuned_roc_auc = roc_auc_score(y_test, y_proba_gb_tuned)
gb_tuned_pr_auc = average_precision_score(y_test, y_proba_gb_tuned)

print(f"Tuned GB Test ROC-AUC: {gb_tuned_roc_auc:.4f}")
print(f"Tuned GB Test PR-AUC: {gb_tuned_pr_auc:.4f}")

Tuned GB Test ROC-AUC: 0.9423
Tuned GB Test PR-AUC: 0.8972


**Summary: Gradient Boosting Baseline vs Tuned (Tree Depth Control)**
- Baseline Gradient Boosting:
    - Test ROC-AUC ~ 0.9510
    - Test PR-AUC ~ 0.9047
- Tuned Gradient Boosting:
    - Test ROC-AUC ~ 0.9423
    - Test PR-AUC ~ 0.8972

This shows:
- Cross-validation selected very weak learners (decision stumps) as the most robust configuration
- However, the default (slightly deeper) baseline achieves marginally better performance on this specific test split
- The performance gap is small and within expected variance for Gradient Boosting models

**NOTE:**

Although the baseline Gradient Boosting model achieves slightly higher test performance on this split, cross-validation consistently favors very shallow trees. This reflects the bias–variance trade-off in boosting: weaker learners generalize more reliably, while stronger learners may show marginal gains on specific splits. In practice, the CV-selected depth is preferred for robustness.

### Gradient Boosting No. of Trees and Learning Rate Control

In [16]:
# Use the best structural GB model as the base for further tuning
# Use the best structural GB model (from the max_depth search) as the base for further tuning.
# Plain assignment, not a copy: both names refer to the same pipeline object.
base_tuned_gb_pipeline = tuned_gb_pipeline

In [17]:
# Stage 2 of tuning: with tree depth fixed, search how the boosting itself is applied.
# learning_rate sets the step size and n_estimators the number of sequential corrections;
# the two trade off against each other, so they are searched jointly.
param_grid = {
    "model__learning_rate": [0.05, 0.1, 0.2],
    "model__n_estimators": [100, 200, 300],
}

# Same scoring and CV splitter as the depth search
joint_search_options = {
    "scoring": "roc_auc",
    "cv": cv,
    "n_jobs": -1,
    "return_train_score": True,
}
grid = GridSearchCV(base_tuned_gb_pipeline, param_grid, **joint_search_options)

grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__learning_rate': [0.05, 0.1, ...], 'model__n_estimators': [100, 200, ...]}"
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,loss,'log_loss'
,learning_rate,0.2
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,1
,min_impurity_decrease,0.0


In [18]:
# Tabulate mean train vs. validation ROC-AUC for each (n_estimators, learning_rate) pair
lr_trees_columns = [
    "param_model__n_estimators",
    "param_model__learning_rate",
    "mean_train_score",
    "mean_test_score",
]
cv_results = pd.DataFrame(grid.cv_results_)
display(cv_results[lr_trees_columns])

Unnamed: 0,param_model__n_estimators,param_model__learning_rate,mean_train_score,mean_test_score
0,100,0.05,0.963572,0.949865
1,200,0.05,0.969485,0.955437
2,300,0.05,0.972231,0.955454
3,100,0.1,0.969584,0.954954
4,200,0.1,0.973972,0.954737
5,300,0.1,0.977357,0.953072
6,100,0.2,0.974306,0.955714
7,200,0.2,0.980307,0.95221
8,300,0.2,0.985135,0.950779


In [19]:
# Keep the best pipeline from the joint search together with its CV score
best_gb_pipeline, best_gb_cv_roc_auc = grid.best_estimator_, grid.best_score_

best_gb_params = grid.best_params_
print("Best n_estimators:", best_gb_params["model__n_estimators"])
print("Best learning_rate:", best_gb_params["model__learning_rate"])
print("Best GB CV ROC-AUC:", best_gb_cv_roc_auc)

Best n_estimators: 100
Best learning_rate: 0.2
Best GB CV ROC-AUC: 0.9557135846609531


**Summary: Gradient Boosting No. of Trees and Learning Rate Control**
- After fixing tree depth (max_depth = 1), learning rate and number of estimators were tuned jointly
- Cross-validation shows a clear trade-off between `learning_rate` and `n_estimators`:
    - Lower learning rates require more trees to reach similar performance
    - Higher learning rates converge faster but risk overfitting if too many trees are used
- The best configuration was:
    - learning_rate = 0.2
    - n_estimators = 100
    - CV ROC-AUC ~ 0.956

Interpretation:
- Training ROC-AUC increases monotonically with higher learning rates and more trees, indicating increasing model capacity
- Validation ROC-AUC peaks at moderate settings and declines for more aggressive configurations, showing overfitting beyond the optimum
- The optimal model favors:
    - A moderately high learning rate
    - A limited number of boosting iterations
    - Reinforcing that boosting benefits from controlled, incremental corrections rather than excessive complexity

In [20]:
# Score the fully tuned pipeline on the held-out test set using column 1 of predict_proba
final_class_proba = best_gb_pipeline.predict_proba(X_test)
y_proba_gb_final = final_class_proba[:, 1]
gb_final_roc_auc = roc_auc_score(y_true=y_test, y_score=y_proba_gb_final)
gb_final_pr_auc = average_precision_score(y_true=y_test, y_score=y_proba_gb_final)

print(
    f"Final GB Test ROC-AUC: {gb_final_roc_auc:.4f}",
    f"Final GB Test PR-AUC: {gb_final_pr_auc:.4f}",
    sep="\n",
)

Final GB Test ROC-AUC: 0.9460
Final GB Test PR-AUC: 0.9018


### Baseline vs Tuned

In [21]:
# Side-by-side test metrics: untuned baseline vs. fully tuned model
print(
    f"Baseline GB Test ROC-AUC    : {gb_baseline_roc_auc:.4f}",
    f"Baseline GB Test PR-AUC     : {gb_baseline_pr_auc:.4f}",
    f"Tuned GB Test ROC-AUC       : {gb_final_roc_auc:.4f}",
    f"Tuned GB Test PR-AUC        : {gb_final_pr_auc:.4f}",
    sep="\n",
)

Baseline GB Test ROC-AUC    : 0.9510
Baseline GB Test PR-AUC     : 0.9047
Tuned GB Test ROC-AUC       : 0.9460
Tuned GB Test PR-AUC        : 0.9018


**Summary: Gradient Boosting Baseline vs Tuned (Tree Depth, No. of Trees, Learning Rate Control)**
- Baseline Gradient Boosting:
    - Test ROC-AUC ~ 0.9510
    - Test PR-AUC ~ 0.9047
- Tuned Gradient Boosting:
    - Test ROC-AUC ~ 0.9460
    - Test PR-AUC ~ 0.9018

This shows:
- Hyperparameter tuning improved cross-validation performance, but did not improve test-set performance
- The baseline Gradient Boosting configuration was already close to optimal for this dataset
- The tuned model is more conservative and generalizes more reliably across folds, even if it slightly underperforms on this specific test split
- The performance difference is small and within expected variance for boosting models

Interpretation:
- Cross-validation optimizes expected generalization, not performance on a single test split
- The test set is used only to confirm that performance remains competitive
- A slight drop in test performance does not invalidate the tuning process
- The tuned configuration is preferred in practice due to:
    - Better robustness
    - Lower risk of overfitting
    - Alignment with boosting theory (weak learners + controlled updates)

Hence, in Gradient Boosting, tuning primarily stabilizes generalization rather than increasing peak performance. When baseline performance is already strong, tuning reduces variance and improves reliability rather than producing large gains.

### Compare Gradient Boosting with Random Forest

In [22]:
# Loading Random Forest Pipeline
# Load the previously saved Random Forest pipeline from the models directory.
# NOTE(review): joblib files are pickle-based; only load artifacts from a trusted source.
rf_pipeline_file = os.path.join(MODEL_PATH, "best_rf_pipeline.pkl")
best_rf_pipeline = joblib.load(rf_pipeline_file)

In [23]:
# Evaluate Random Forest on test set
# Score the loaded Random Forest on the same held-out test set used for Gradient Boosting
rf_class_proba = best_rf_pipeline.predict_proba(X_test)
y_proba_rf = rf_class_proba[:, 1]
rf_test_roc_auc = roc_auc_score(y_true=y_test, y_score=y_proba_rf)
rf_test_pr_auc = average_precision_score(y_true=y_test, y_score=y_proba_rf)

In [24]:
# Test-set comparison of the loaded Random Forest and the tuned Gradient Boosting model
print(
    "\nFinal Model Comparison",
    f"Random Forest Test ROC-AUC      : {rf_test_roc_auc:.4f}",
    f"Random Forest Test PR-AUC       : {rf_test_pr_auc:.4f}",
    f"Gradient Boosting Test ROC-AUC  : {gb_final_roc_auc:.4f}",
    f"Gradient Boosting Test PR-AUC   : {gb_final_pr_auc:.4f}",
    sep="\n",
)


Final Model Comparison
Random Forest Test ROC-AUC      : 0.9531
Random Forest Test PR-AUC       : 0.9125
Gradient Boosting Test ROC-AUC  : 0.9460
Gradient Boosting Test PR-AUC   : 0.9018


**Summary: Gradient Boosting vs Random Forest**
- Random Forest (tuned):
    - Test ROC-AUC ~ 0.953
    - Test PR-AUC ~ 0.913
- Gradient Boosting (tuned):
    - Test ROC-AUC ~ 0.946
    - Test PR-AUC ~ 0.902

Interpretation:
- Random Forest slightly outperforms Gradient Boosting on this test split
- Random Forest:
    - Reduces variance through ensembling
    - Is robust and less sensitive to hyperparameters
- Gradient Boosting:
    - Performs competitively
    - Requires careful regularization and tuning
    - Is more sensitive to data noise and hyperparameters

**Key takeaway:** 

Random Forest is easier to use and provides stable performance with minimal tuning, while Gradient Boosting can achieve strong results but needs careful tuning to avoid overfitting

### Final Gradient Boosting Summary

In [25]:
# Recap of baseline vs. tuned Gradient Boosting test metrics for the conclusion
print(
    f"Baseline GB Test ROC-AUC : {gb_baseline_roc_auc:.4f}",
    f"Baseline GB Test PR-AUC  : {gb_baseline_pr_auc:.4f}",
    f"Tuned GB Test ROC-AUC    : {gb_final_roc_auc:.4f}",
    f"Tuned GB Test PR-AUC     : {gb_final_pr_auc:.4f}",
    sep="\n",
)

Baseline GB Test ROC-AUC : 0.9510
Baseline GB Test PR-AUC  : 0.9047
Tuned GB Test ROC-AUC    : 0.9460
Tuned GB Test PR-AUC     : 0.9018


**Conclusion**:
- Gradient Boosting achieves strong performance even without tuning
- Tree depth control confirms that very weak learners (depth = 1) generalize best
- Learning rate and number of estimators tuning stabilizes performance rather than increasing peak accuracy
- The tuned model performs competitively with Random Forest but requires more careful tuning
- Compared to Random Forest:
    - Gradient Boosting focuses on bias reduction
    - Random Forest focuses on variance reduction

Hence, Gradient Boosting is a powerful model for capturing complex patterns, but it is more sensitive to hyperparameters. When tuned carefully, it delivers strong performance, though Random Forest remains the more robust default choice.

### Save Final Gradient Boosting Model

In [26]:
# Create the output directory if it is missing, so the dump does not fail on a fresh checkout
os.makedirs(MODEL_PATH, exist_ok=True)

# Persist the final tuned pipeline (preprocessing + model).
# Kept as the last expression so the cell still displays the written file path(s).
joblib.dump(
    best_gb_pipeline,
    os.path.join(MODEL_PATH, "best_gb_pipeline.pkl")
)

['../models\\best_gb_pipeline.pkl']