# 1. Data Presentation


### Import essential libraries:

- for data manipulation and visualization: Numpy, Pandas, Seaborn, matplotlib

- for preprocessing and handling data imbalance: sklearn.preprocessing, sklearn.impute, imblearn.over_sampling

- for text processing: sklearn.feature_extraction.text

- for model building and evaluation: sklearn.model_selection, sklearn.metrics

- Machine learning models: sklearn.linear_model, sklearn.ensemble, sklearn.svm, sklearn.neighbors, sklearn.naive_bayes

- Gradient boosting models: xgboost, lightgbm

- for building pipelines and transformations: sklearn.pipeline, sklearn.compose

- for statistical tests: scipy.stats

- Miscellaneous: warnings

In [None]:
# Core packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Data preprocessing
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Data splitting and evaluation
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_validate, RandomizedSearchCV, cross_val_score
)
from scipy.stats import chi2_contingency, f_oneway
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score,
    balanced_accuracy_score, matthews_corrcoef,
    classification_report, confusion_matrix,
    RocCurveDisplay, PrecisionRecallDisplay, make_scorer, 
    ConfusionMatrixDisplay, classification_report,
    confusion_matrix,
    roc_curve,
)

# Models 
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    StackingClassifier
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFE, SelectFromModel

# class imbalance & optimization
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy.stats import randint, uniform, chi2_contingency

import warnings
warnings.filterwarnings('ignore')

# Plot settings
plt.style.use('fivethirtyeight')
%matplotlib inline
pd.set_option('display.max_columns', 32)

### Loading the dataset

In [None]:
df = pd.read_csv("bank_marketing/bank.csv", sep=";")

### First 5 rows of the dataset

In [None]:
df.head()

### Last 5 rows of the dataset

In [None]:
df.tail()

### Checking how many columns (features + target) the dataset contains

In [None]:
len(df.columns)

### Checking what columns the dataset has

In [None]:
df.columns

### Checking the type of the columns

In [None]:
# Split the columns by dtype: object-typed columns are treated as categorical,
# every remaining column as numerical.
cat_cols = list(df.select_dtypes(include=['object']).columns)
num_cols = [name for name in df.columns if name not in cat_cols]

print('\nCategorical columns:', cat_cols)
print('Numerical columns:', num_cols)

### Checking the shape (rows, columns) in DataFrame

In [None]:
df.shape

### Checking the size (number of cells) in DataFrame

In [None]:
df.size

### An overview of the data types, missing values, percentage of missing values, and the number of unique values for each column in the DataFrame

In [None]:
# Profile the non-object columns: dtype, missing-value count and share,
# and number of distinct values, sorted by missing-value count.
num_cols = df.select_dtypes(exclude='object').columns

numeric_part = df[num_cols]
missing_per_col = numeric_part.isnull().sum()

data_info_num = pd.DataFrame({
    'Data Type': numeric_part.dtypes,
    'Missing Values': missing_per_col,
    'Percentage Missing': (missing_per_col / len(df)) * 100,
    'Unique Values': numeric_part.nunique(),
}).sort_values(by='Missing Values', ascending=False)

# Last expression of the cell: rendered as a styled table.
data_info_num.style.format({'Percentage Missing': '{:.2f}%'})

In [None]:
# Profile the object-typed columns: dtype, how many entries equal the literal
# string 'unknown' (count and share of all rows), and number of distinct values.
cat_cols = df.select_dtypes(include='object').columns

total_rows = len(df)
unknown_counts = [df[c].eq('unknown').sum() for c in cat_cols]

cat_info = pd.DataFrame({
    'Data Type': df[cat_cols].dtypes,
    'Unknown Count': unknown_counts,
    'Unknown (%)': [(cnt / total_rows) * 100 for cnt in unknown_counts],
    'Unique Values': [df[c].nunique() for c in cat_cols],
}).sort_values(by='Unknown Count', ascending=False)

# Last expression of the cell: rendered as a styled table.
cat_info.style.format({'Unknown (%)': '{:.2f}%'})

### Cross tabulation between each categorical variable and the goal attribute

In [None]:
# Print one contingency table per categorical column: category levels as rows,
# the target `y` as columns.
for predictor in cat_cols:
    table = pd.crosstab(index=df[predictor], columns=df['y'])
    print(table, '\n')

### Numerical Feature Distributions

To better understand the data distribution of each **numerical variable**, histograms were plotted with **Kernel Density Estimation (KDE)**.  
This visualization helps identify skewness, outliers, and potential normalization needs before model training.

The code below dynamically arranges the plots in a grid based on the total number of numerical features, ensuring a clear and organized layout.

In [None]:
# Lay the histograms out on a grid with `colx` columns and as many rows as
# are needed to fit every numerical feature.
colx = 3
numero_features = len(num_cols)
n_rows = math.ceil(numero_features / colx)

plt.figure(figsize=(16, n_rows * 3))

# One histogram with a KDE overlay per numerical column (subplot index is 1-based).
for i, col in enumerate(num_cols, 1):
    plt.subplot(n_rows, colx, i)
    sns.histplot(df[col], kde=True, bins=20, color='steelblue')
    plt.title(f"Distribuição de {col}")

# Spacing only needs to be adjusted once, after all subplots exist; doing it
# inside the loop recomputed the layout for every feature.
plt.tight_layout()
plt.show()

### Check correlation between numerical variables

In [None]:
# Pearson correlation between the int64/float64 columns, drawn as an
# annotated heatmap.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
corr = df[num_cols].corr()

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", ax=corr_ax)
corr_ax.set_title("Correlation between numerical variables")
plt.show()

### Check correlation between categorical variables

In [None]:
def cramers_v(x, y):
    """Compute the bias-corrected Cramér's V between two categorical variables.

    Parameters
    ----------
    x, y : array-like
        Categorical observations of equal length; they are cross-tabulated
        with ``pd.crosstab``.

    Returns
    -------
    float
        Association strength (0 = no association, 1 = perfect association).
        ``0.0`` is returned when the statistic is undefined — empty input, a
        single observation, or a variable with only one observed category —
        instead of producing NaN or dividing by zero.
    """
    # Named `contingency` so the imported sklearn `confusion_matrix` is not shadowed.
    contingency = pd.crosstab(x, y)
    n = contingency.to_numpy().sum()
    r, k = contingency.shape

    # With fewer than two rows/columns (or n <= 1) the corrections below divide by zero.
    if n <= 1 or r < 2 or k < 2:
        return 0.0

    chi2 = chi2_contingency(contingency, correction=False)[0]
    phi2 = chi2 / n

    # Bias correction of phi² and of the table dimensions.
    phi2corr = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denom = min(kcorr - 1, rcorr - 1)
    if denom <= 0:
        return 0.0
    return float(np.sqrt(phi2corr / denom))

# Pairwise Cramér's V between all object-typed columns, shown as a heatmap.
cat_cols = df.select_dtypes(include='object').columns

cramers_results = pd.DataFrame(index=cat_cols, columns=cat_cols, dtype=float)

for c1 in cat_cols:
    for c2 in cat_cols:
        # The diagonal is set to 1 directly; every other cell is computed.
        cramers_results.loc[c1, c2] = 1.0 if c1 == c2 else cramers_v(df[c1], df[c2])

assoc_fig, assoc_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cramers_results, annot=True, cmap='Blues', fmt=".2f", square=True, ax=assoc_ax)
assoc_ax.set_title("Cramér’s V Association between Categorical Variables", fontsize=14, pad=12)
plt.tight_layout()
plt.show()

--------------------
## Descriptive statistics
--------------------

In [None]:
df.describe(include = 'number').T

In [None]:
df.describe(include = 'object').T 

### Class balance plot

In [None]:
# Bar chart of the share (in percent) of each target class.
fig = plt.figure()
class_share = df['y'].value_counts(normalize=True).sort_index() * 100
class_share.plot(kind='bar')
plt.title('Target distribution (percentage)')
plt.xlabel('y')
plt.ylabel('Percent')
plt.show()

-------------------
# Data cleaning, feature selection
-------------------

### Dropping duplicated rows

In [None]:
# Remove fully duplicated rows, then show the remaining number of cells
# (rows x columns).
df = df.drop_duplicates()
df.size

### Dropping columns with high correlation

In [None]:
# Column sets tried in earlier experiments (kept for reference):
#   ['pdays', 'cons.price.idx', 'cons.conf.idx', 'nr.employed']
#   ['pdays', 'nr.employed', 'euribor3m']
#   ['nr.employed', 'euribor3m']
# Currently dropped:
redundant_cols = ['cons.price.idx', 'cons.conf.idx', 'nr.employed']
df = df.drop(columns=redundant_cols)
df.columns

### Handling 'unknown' values and Missing Values

For categorical variables, missing or `'unknown'` entries were handled to ensure data quality and model interpretability:

- The variables **`education`** and **`default`** were **kept and modeled**, as the `'unknown'` category may contain predictive meaning (e.g., clients with unavailable credit history).  
  - Missing values in these columns were filled with `'unknown'` to maintain consistency.  
- For **`job`**, **`marital`**, **`housing`**, and **`loan`**, rows containing `'unknown'` values were **removed**, since their frequency was **very low (<3%)**, minimizing data loss while improving dataset reliability.  
- The numeric variable **`pdays`** had values of `999` — indicating no previous contact — replaced with `-1` to make this condition explicit for modeling.  

Finally, a summary table was created to display for each categorical feature:
- Data type  
- Number and percentage of `'unknown'` values  
- Total unique categories  

This helps evaluate the distribution of missing or undefined information across categorical variables.

In [None]:
# Drop every row in which one of the low-frequency 'unknown' columns is
# 'unknown'. The filter is applied once and the result is copied, so the
# column assignments below write to an independent DataFrame rather than to a
# slice of the original (chained assignment).
drop_if_unknown = ['housing', 'loan', 'job', 'marital']
known_rows = df[drop_if_unknown].ne('unknown').all(axis=1)
df = df[known_rows].copy()

# 'education' and 'default' keep 'unknown' as its own category; missing
# entries are folded into it.
df['education'] = df['education'].fillna('unknown')
df['default'] = df['default'].fillna('unknown')

# Replace the 999 sentinel in `pdays` with -1.
df['pdays'] = df['pdays'].replace(999, -1)

# Re-profile the object-typed columns after cleaning.
cat_cols = df.select_dtypes(include='object').columns
cat_info = pd.DataFrame({
    'Data Type': df[cat_cols].dtypes,
    'Unknown Count': [(df[c] == 'unknown').sum() for c in cat_cols],
    'Unknown (%)': [(df[c].eq('unknown').sum() / len(df)) * 100 for c in cat_cols],
    'Unique Values': [df[c].nunique() for c in cat_cols]
})

cat_info = cat_info.sort_values(by='Unknown Count', ascending=False)
cat_info.style.format({'Unknown (%)': '{:.2f}%'})


### Categorical variable encoding

The categorical variables are converted into numeric form:
- **`education`** and **`month`** are ordinal, so they are mapped to ordered numeric codes.  
- Other categorical variables are nominal and are transformed using **one-hot encoding** (`pd.get_dummies()`), creating binary columns.  
- The target **`y`** is encoded with `LabelEncoder` (`no` = 0, `yes` = 1).

This ensures all variables are numeric and ready for model training.

In [None]:
# Ordered levels for the two ordinal columns.
# NOTE(review): 'unknown' is listed last in `edu_order`, so it receives the
# highest education code — confirm this ranking is intended.
edu_order = ['illiterate', 'basic.4y', 'basic.6y', 'basic.9y',
             'high.school', 'professional.course', 'university.degree', 'unknown']
month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

# Replace each ordinal column by the integer position of its level
# (values not present in the level list become -1).
for ordinal_col, levels in (('education', edu_order), ('month', month_order)):
    df[ordinal_col] = pd.Categorical(df[ordinal_col], categories=levels, ordered=True).codes

# dfML: copy of df with the nominal columns one-hot encoded (first level dropped).
nominal_cols = ['job', 'marital', 'default', 'housing', 'loan',
                'contact', 'day_of_week', 'poutcome']
dfML = pd.get_dummies(df.copy(), columns=nominal_cols, drop_first=True)

# Encode the target as integers in both frames.
le = LabelEncoder()
df['y'] = le.fit_transform(df['y'])
dfML['y'] = le.fit_transform(dfML['y'])
df.columns

### Scale numeric variables

In [None]:
# Numeric columns to standardise (earlier experiments used other subsets,
# e.g. with/without 'pdays' or with the price/confidence indices).
num_cols_scaled = ['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'euribor3m']

scaler = StandardScaler()
df[num_cols_scaled] = scaler.fit_transform(df[num_cols_scaled])

# dfML was copied from df before this cell, so it still holds the raw values.
# It is the frame the models are trained on, so apply the same fitted scaler
# to it as well; otherwise the scaling never reaches the models.
dfML[num_cols_scaled] = scaler.transform(dfML[num_cols_scaled])

# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling. Consider fitting it on the
# training split only (e.g. inside a Pipeline).

### Columns dropping based on the set description

The column **`duration`** will be removed from the modeling dataset to avoid data leakage,  
but will be **saved separately** for later analysis and evaluation at the end of the project.

In [None]:
# Keep the call durations for later analysis, then remove the column from
# BOTH modelling frames. dfML is the frame the models are trained on; dropping
# `duration` from df alone would leave the leaking feature in every model.
duration = df['duration'].copy()
df = df.drop(columns=['duration'])
dfML = dfML.drop(columns=['duration'])
df.columns

In [None]:
# Correlation matrix of all numeric columns of df, drawn without annotations.
corr = df.corr(numeric_only=True)

matrix_fig, matrix_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, cmap='coolwarm', annot=False, ax=matrix_ax)
matrix_ax.set_title("Matriz de Correlação")
plt.show()

### Correlation between Features and Target (y)

In [None]:
# Same data as df, with the target moved to the last column.
df_temp = df.drop(columns=['y']).assign(y=df['y'])

# Correlation of every numeric column with the target, strongest positive first.
corr_with_target = df_temp.corr(numeric_only=True)['y'].sort_values(ascending=False)

target_fig, target_ax = plt.subplots(figsize=(6, 10))
sns.heatmap(corr_with_target.to_frame(), annot=True, cmap="coolwarm", cbar=False, ax=target_ax)
target_ax.set_title("Correlation between Features and Target (y)")
plt.show()

###

### Train/Test Split and Cross-Validation

Experiment with different train/test splits (e.g., 70/30, 80/20, 90/10) to assess model robustness.
For a more reliable evaluation, use Stratified K-Fold Cross-Validation, which averages results across multiple partitions.

In [None]:
# Stratified 80/20 split of the label-encoded frame `df`; the resulting
# train/test frames (features + target) are used by the statistical tests.
X = df.drop(columns=['y'])
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
train_df = X_train.assign(y=y_train)
test_df = X_test.assign(y=y_test)

# Same split settings for the one-hot encoded frame `dfML`. After this cell
# X, y, X_train, X_test, y_train and y_test all refer to the dfML split.
X = dfML.drop(columns=['y'])
y = dfML['y']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
train_dfML = X_train.assign(y=y_train)
test_dfML = X_test.assign(y=y_test)

### Applying SMOTE to Balance the Training Data

Use the Synthetic Minority Over-sampling Technique (SMOTE) to generate synthetic samples of the minority class, ensuring a balanced training dataset and improving model fairness.

In [None]:
# Oversample the minority class of the dfML training split with SMOTE.
# Only the training data is resampled.
y_train = train_dfML['y']
X_train = train_dfML.drop(columns=['y']).copy()

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("\nOriginal train shape:", X_train.shape)
print("Resampled train shape:", X_train_res.shape)

In [None]:
# Recursive feature elimination down to 10 features with a logistic
# regression as the ranking estimator.
# It is fitted on the training split only: fitting on the full X, y would let
# the held-out test rows influence which features are selected.
# max_iter=1000 matches the other logistic-regression models in this notebook.
model = LogisticRegression(max_iter=1000)
rfe = RFE(model, n_features_to_select=10)
rfe.fit(X_train, y_train)
print("Features selected:", X_train.columns[rfe.support_])

### Interpretation of Feature Selection Methods

Three feature selection techniques were applied:
- **Chi-Square Test** for categorical variables,
- **ANOVA Test** for continuous variables, and
- **LASSO Regularization** for multivariate feature importance.

These methods confirmed which features have a statistically significant relationship with the target variable (`y`).
The goal was not only to reduce dimensionality, but also to validate the relevance and predictive strength of each feature.

A reduced dataset (**dfFS**) was then created containing only the significant variables.
This dataset was compared with the full model (**dfML**) to evaluate whether model simplicity
could be achieved without compromising predictive performance.

In [None]:
def FunctionChisq(inpData, TargetVariable, CategoricalVariablesList):
    """Chi-square independence test of each categorical predictor against the target.

    For every predictor a contingency table (target x predictor) is built and
    tested with ``chi2_contingency``; one result line per predictor is printed.

    Parameters
    ----------
    inpData : pandas.DataFrame
        Frame holding the target and predictor columns.
    TargetVariable : str
        Name of the target column.
    CategoricalVariablesList : list of str
        Names of the predictors to test.

    Returns
    -------
    list of str
        The predictors whose p-value is not below 0.05.
    """
    not_significant = []

    for predictor in CategoricalVariablesList:
        contingency = pd.crosstab(index=inpData[TargetVariable],
                                  columns=inpData[predictor])
        p_value = chi2_contingency(contingency)[1]

        if p_value < 0.05:
            print(f"{predictor} IS correlated with {TargetVariable} | P-Value: {p_value:.4f}")
            continue

        print(f"{predictor} is NOT correlated with {TargetVariable} | P-Value: {p_value:.4f}")
        not_significant.append(predictor)

    return not_significant

In [None]:
# Categorical predictors tested against the target on the training split.
categorical_vars = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'day_of_week', 'poutcome',
]

# Predictors for which the chi-square test found no significant association.
insignificant_vars = FunctionChisq(train_df, 'y', categorical_vars)

### Continuous vs Categorical using ANOVA test

- **Assumption (H₀):** There is **no relationship** between the continuous predictor and the target variable.  
  In other words, the mean of the numeric variable is **the same across both classes** of the target (`y`).

- **Alternative (H₁):** There **is a relationship**, meaning that the mean values differ significantly between the groups of `y`.

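The ANOVA test yields a **p-value**: the probability of observing differences between the group means at least as large as those in the data, assuming the null hypothesis (H₀) is true.  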
If the **p-value < 0.05**, we reject H₀ and conclude that the variable is **significantly correlated** with the target.  
If the **p-value ≥ 0.05**, we fail to reject H₀, meaning there is **no significant difference** between the groups.

In [None]:
def FunctionAnova(inpData, TargetVariable, ContinuousPredictorList):
    """One-way ANOVA of each continuous predictor across the target classes.

    The values of each predictor are grouped by the target and passed to
    ``f_oneway``; one result line per predictor is printed. A predictor whose
    test raises an exception is reported and skipped (best effort).

    Parameters
    ----------
    inpData : pandas.DataFrame
        Frame holding the target and predictor columns.
    TargetVariable : str
        Name of the target column used for grouping.
    ContinuousPredictorList : list of str
        Names of the predictors to test.

    Returns
    -------
    list of str
        The predictors whose p-value is not below 0.05.
    """
    not_significant = []
    print('##### ANOVA Results #####\n')

    for predictor in ContinuousPredictorList:
        try:
            # One list of values per target class.
            samples_by_class = inpData.groupby(TargetVariable)[predictor].apply(list)
            p_value = f_oneway(*samples_by_class)[1]

            if p_value < 0.05:
                print(f"{predictor} IS correlated with {TargetVariable} | P-Value: {p_value:.4f}")
            else:
                print(f"{predictor} is NOT correlated with {TargetVariable} | P-Value: {p_value:.4f}")
                not_significant.append(predictor)
        except Exception as e:
            print(f"⚠️ Could not test {predictor}: {e}")

    return not_significant

In [None]:
# Continuous predictors tested with ANOVA on the training split.
# Earlier experiments used other subsets (with 'cons.price.idx' and
# 'cons.conf.idx' instead of 'euribor3m', and with or without 'pdays').
continuous_vars = ['age', 'campaign', 'pdays', 'previous', 'euribor3m', 'emp.var.rate']

# Predictors for which ANOVA found no significant difference between classes.
insignificant_continuous_vars = FunctionAnova(train_df, 'y', continuous_vars)

In [None]:


def lasso_regularization(df):
    """Select features with an L1-penalised logistic regression.

    The last column of ``df`` is taken as the target and all preceding columns
    as features. A stratified 80/20 split is made internally and only the 80%
    part is used: a ``StandardScaler`` is fitted on it and the L1 model is
    fitted on the scaled features. The number and names of the features whose
    coefficient is exactly zero are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Features followed by the target in the last column.

    Returns
    -------
    pandas.Index
        Names of the columns whose coefficient is non-zero.
    """
    features = df.iloc[:, :-1].copy()
    target = df.iloc[:, -1].copy()

    # Only the training part of this internal split is used below.
    fit_X, _, fit_y, _ = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )

    scaler = StandardScaler()
    scaler.fit(fit_X)

    l1_model = LogisticRegression(C=0.5, penalty='l1', solver='liblinear', random_state=10)
    sel_ = SelectFromModel(l1_model)
    sel_.fit(scaler.transform(fit_X), fit_y)

    coefficients = sel_.estimator_.coef_
    print("Number of features which coefficient was shrank to zero: ", np.sum(coefficients == 0))

    removed_feats = fit_X.columns[(coefficients == 0).ravel().tolist()]
    print('Removed features by Lasso: ', removed_feats)

    return fit_X.columns[(coefficients != 0).ravel().tolist()]

In [None]:
# Features of the one-hot encoded training frame kept by the L1 model.
selected_features = lasso_regularization(df=train_dfML)
print("Selected features:", selected_features)

In [None]:
# Predictors that passed the chi-square / ANOVA tests.
selected_categorical = [var for var in categorical_vars if var not in insignificant_vars]
selected_continuous = [var for var in continuous_vars if var not in insignificant_continuous_vars]

# Order-preserving union of the three selections. dict.fromkeys removes
# duplicates without the run-to-run reordering a set would introduce.
selected_all = list(dict.fromkeys(
    [*selected_categorical, *selected_continuous, *selected_features]
))

print(f"Total features selected: {len(selected_all)}")
print("Selected variables:\n", selected_all)

# Map each selected name onto dfML columns: either the column itself or one
# of its one-hot dummies, which get_dummies names '<variable>_<level>'.
# Plain substring matching would also pull in unrelated columns
# (e.g. 'age' is a substring of 'job_management').
selected_names = set(selected_all)
dummy_prefixes = tuple(f"{name}_" for name in selected_all)

dfFS_cols = [
    c for c in dfML.columns
    if c != 'y' and (c in selected_names or c.startswith(dummy_prefixes))
]
# Target goes last, the position lasso_regularization expects.
dfFS_cols.append('y')

dfFS = dfML[dfFS_cols].copy()

print(f"\ndfFS created successfully with {len(dfFS.columns)-1} features.")
print("Shape:", dfFS.shape)

-------------------
# Modeling and tuning
-------------------

### Model Evaluation Function

This function evaluates a classifier’s performance across multiple **classification thresholds** (`0.3`, `0.5`, and `0.7`), providing a detailed analysis of how sensitivity and precision change with decision boundaries.

For each model:
- The classifier is trained using the training set.  
- Predicted probabilities (`predict_proba`) are used to compute **ROC-AUC** and generate the **ROC Curve**, visualizing the trade-off between the True Positive Rate and False Positive Rate.  
- For each threshold value:
  - Predicted classes are derived from probabilities (`y_proba >= threshold`).
  - **Precision**, **Recall**, **F1-score**, **Balanced Accuracy**, and **Matthews Correlation Coefficient (MCC)** are computed.  
  - A **confusion matrix** is plotted to visualize prediction outcomes (TP, FP, FN, TN).  

This unified evaluation approach enables a consistent and comprehensive comparison across multiple classification models, ensuring fair assessment of both overall performance (AUC) and class-level behavior under different thresholds.

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test, thresholds=(0.3, 0.5, 0.7)):
    """Fit ``model`` and report its test-set performance at several thresholds.

    The model is fitted on the training data, the positive-class probabilities
    of the test set are taken from ``predict_proba``, and the ROC curve with
    its AUC is plotted. For every threshold the probabilities are turned into
    class predictions (``proba >= threshold``); the classification report,
    balanced accuracy, MCC and AUC are printed and a confusion matrix is
    plotted.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict_proba``. It is fitted in place.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Test features and labels.
    thresholds : iterable of float, optional
        Decision thresholds to evaluate. The default is an immutable tuple so
        that it cannot be altered between calls.

    Returns
    -------
    None
        Results are printed and plotted only.
    """
    model.fit(X_train, y_train)

    # Probability of the positive class (second column of predict_proba).
    y_proba = model.predict_proba(X_test)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, y_proba)
    auc_value = roc_auc_score(y_test, y_proba)
    plt.plot(fpr, tpr, label=f"{model.__class__.__name__} (AUC={auc_value:.3f})")
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.show()

    for thr in thresholds:
        print(f"\n--- Threshold = {thr} ---")
        y_pred = (y_proba >= thr).astype(int)

        print(classification_report(y_test, y_pred))
        print("Balanced Accuracy:", balanced_accuracy_score(y_test, y_pred))
        print("Matthews Corr. Coefficient (MCC):", matthews_corrcoef(y_test, y_pred))
        # AUC does not depend on the threshold; it is repeated for convenience.
        print("AUC:", auc_value)

        cm = confusion_matrix(y_test, y_pred)
        disp = ConfusionMatrixDisplay(cm)
        disp.plot(cmap='Blues')
        plt.title(f'Confusion Matrix (Threshold = {thr})')
        plt.show()

-------------------
# Model Training and Evaluation
-------------------

Train and evaluate multiple machine learning models — **Random Forest**, **Logistic Regression**, **Decision Tree**, **K-Nearest Neighbours (KNN)**, **Naïve Bayes**, **XGBoost**, **LightGBM**, and **Multi-Layer Perceptron (MLP Neural Network)** — first on the original (imbalanced) training set and then on the SMOTE-balanced training set, always scoring against the same untouched test set.

Each model is assessed based on **cross-validation ROC-AUC performance** and **test-set evaluation metrics**, including:

- **Precision**
- **Recall**
- **F1-Score**
- **Balanced Accuracy**
- **ROC-AUC**
- **Matthews Correlation Coefficient (MCC)**

The results are compared to identify which classifier provides the best trade-off between **sensitivity and specificity**, as well as overall **robustness** on the minority class.

Ensemble and deep learning models such as **XGBoost**, **LightGBM**, and **MLP** are further validated using **5-fold Stratified Cross-Validation** to ensure **consistency** and **generalisation** across folds.

In [None]:
# Baseline run: every model is trained on the original (non-resampled)
# training split and evaluated on the test split.

# Random Forest is only cross-validated here (ROC-AUC, 5 stratified folds).
print("\n--- RANDOM FOREST ---")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf_model = RandomForestClassifier(random_state=42)
scores = cross_val_score(rf_model, X_train, y_train, cv=cv, scoring='roc_auc')
print("ROC-AUC médio (5-fold CV):", round(scores.mean(), 4))
print("Desvio padrão:", round(scores.std(), 4))

# Model definitions.
lr_model = LogisticRegression(max_iter=1000)
dt_model = DecisionTreeClassifier(
    criterion='gini', max_depth=None, min_samples_split=2, random_state=42
)
knn_model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
nb_model = GaussianNB()
xgb_model = XGBClassifier(
    n_estimators=500, learning_rate=0.05, max_depth=6,
    subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
    tree_method="hist", eval_metric="logloss",
    n_jobs=-1, random_state=42,
)
lgbm_model = LGBMClassifier(
    n_estimators=700, learning_rate=0.05, num_leaves=31,
    subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
    random_state=42, n_jobs=-1,
)
mlp_model = MLPClassifier(
    hidden_layer_sizes=(128, 64), activation="relu", solver="adam",
    learning_rate_init=1e-3, alpha=1e-4, batch_size=256,
    max_iter=100, early_stopping=True, n_iter_no_change=10,
    random_state=42,
)

# Fit and evaluate each model in turn, printing its banner first.
for banner, estimator in [
    ("\n--- LOGISTIC REGRESSION ---", lr_model),
    ("\n--- DECISION TREE ---", dt_model),
    ("\n--- K-NEAREST NEIGHBOURS (KNN) ---", knn_model),
    ("\n--- NAIVE BAYES ---", nb_model),
    ("\n--- XGBOOST ---", xgb_model),
    ("\n--- LIGHTGBM ---", lgbm_model),
    ("\n--- MLP (Neural Network) ---", mlp_model),
]:
    print(banner)
    evaluate_model(estimator, X_train, y_train, X_test, y_test)

# 5-fold stratified cross-validation of the boosting models and the MLP.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for name, mdl in (("XGB", xgb_model), ("LGBM", lgbm_model), ("MLP", mlp_model)):
    scores = cross_val_score(mdl, X_train, y_train, cv=cv, scoring="roc_auc", n_jobs=-1)
    print(f"{name} | ROC-AUC CV: {scores.mean():.4f} ± {scores.std():.4f}")


In [None]:
# SMOTE run: every model is trained on the resampled training data
# (X_train_res, y_train_res) and evaluated on the untouched test split.

print("\n--- LOGISTIC REGRESSION ---")
evaluate_model(LogisticRegression(max_iter=1000), X_train_res, y_train_res, X_test, y_test)

print("\n--- DECISION TREE ---")
dt_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    random_state=42,
)
evaluate_model(dt_model, X_train_res, y_train_res, X_test, y_test)

print("\n--- K-NEAREST NEIGHBOURS (KNN) ---")
knn_model = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2
)
evaluate_model(knn_model, X_train_res, y_train_res, X_test, y_test)

print("\n--- NAIVE BAYES ---")
nb_model = GaussianNB()
evaluate_model(nb_model, X_train_res, y_train_res, X_test, y_test)

print("\n--- XGBOOST ---")
xgb_model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    tree_method="hist",
    eval_metric="logloss",
    n_jobs=-1,
    random_state=42
)
evaluate_model(xgb_model, X_train_res, y_train_res, X_test, y_test)

print("\n--- LIGHTGBM ---")
lgbm_model = LGBMClassifier(
    n_estimators=700,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1
)
evaluate_model(lgbm_model, X_train_res, y_train_res, X_test, y_test)

print("\n--- MLP (Neural Network) ---")
mlp_model = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    activation="relu",
    solver="adam",
    learning_rate_init=1e-3,
    alpha=1e-4,
    batch_size=256,
    max_iter=100,
    early_stopping=True,
    n_iter_no_change=10,
    random_state=42
)
evaluate_model(mlp_model, X_train_res, y_train_res, X_test, y_test)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate with SMOTE INSIDE each fold. Running the CV directly on
# X_train_res would put synthetic points, interpolated from rows of the other
# folds, into the validation folds and inflate the ROC-AUC. With the imblearn
# pipeline only the training part of each fold is resampled and the
# validation part stays original data.
for name, mdl in [
    ("XGB", xgb_model),
    ("LGBM", lgbm_model),
    ("MLP", mlp_model)
]:
    cv_pipeline = ImbPipeline(steps=[
        ("smote", SMOTE(random_state=42)),
        ("model", mdl),
    ])
    scores = cross_val_score(cv_pipeline, X_train, y_train, cv=cv, scoring="roc_auc", n_jobs=-1)
    print(f"{name} | ROC-AUC CV: {scores.mean():.4f} ± {scores.std():.4f}")