## Import Libraries

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_curve, auc
import matplotlib.pyplot as plt
import pickle

## Load Datasets

In [3]:
X_train = pd.read_csv('preprocessed_data/X_train_preprocessed_scaled.csv')
X_test = pd.read_csv('preprocessed_data/X_test_preprocessed_scaled.csv')
y_train = pd.read_csv('preprocessed_data/y_train.csv')['SUCCESS'].values  
y_test = pd.read_csv('preprocessed_data/y_test.csv')['SUCCESS'].values

## Ensure Feature Consistency

In [4]:
common_features = X_train.columns.intersection(X_test.columns)
X_train = X_train[common_features]
X_test = X_test[common_features]

## Handle Missing Values

In [29]:
X_test = X_test.drop('BIRTHDATE', axis=1)

In [30]:
X_train.fillna(X_train.mean(), inplace=True)
X_test.fillna(X_train.mean(), inplace=True)

## Check Class Distribution Before Resampling

In [31]:
print("Original class distribution in training data:")
print(pd.Series(y_train).value_counts())

Original class distribution in training data:
1    73120
0    21349
Name: count, dtype: int64


## Apply Class Balancing (SMOTE + Random UnderSampling)

In [32]:
over = SMOTE(sampling_strategy=0.6, random_state=42)  # Oversample minority class to 60% of majority class
under = RandomUnderSampler(sampling_strategy=0.8, random_state=42)  # Undersample majority to 80% of its original size
pipeline = Pipeline(steps=[('oversample', over), ('undersample', under)])

## Apply Resampling

In [33]:
X_train_res, y_train_res = pipeline.fit_resample(X_train, y_train)

## Compute `scale_pos_weight` for XGBoost (Balances Class Weights)

In [42]:
class_counts = np.bincount(y_train_res)
scale_pos_weight = class_counts[0] / class_counts[1] if class_counts[1] > 0 else 1.0

## Check class distribution after resampling

In [34]:
print("\nResampled class distribution:")
print(pd.Series(y_train_res).value_counts())


Resampled class distribution:
1    54840
0    43872
Name: count, dtype: int64


## Initialize XGBoost Classifier

In [43]:
xgb_clf = XGBClassifier(
    random_state=42,
    n_estimators=400,       # Number of trees
    learning_rate=0.05,     # Helps prevent overfitting
    max_depth=6,            # Controls tree complexity
    scale_pos_weight=scale_pos_weight,  # Adjusts class imbalance
    eval_metric='logloss',
    use_label_encoder=False,
    n_jobs=-1
)

## Initialize Random Forest Classifier

In [44]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(
    random_state=42,
    n_estimators=400,
    max_depth=6,
    class_weight='balanced',  # Automatically handles imbalance
    n_jobs=-1
)


## Perform Cross-Validation

In [46]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(rf_clf, X_train_res, y_train_res, cv=skf, scoring='accuracy', n_jobs=-1)

print("\nCross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", np.mean(cv_scores))


Cross-Validation Scores: [1. 1. 1. 1. 1.]
Mean CV Accuracy: 1.0
