In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import optuna

import matplotlib.pyplot as plt

# To split the data
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, log_loss, matthews_corrcoef


In [None]:
# Column dtypes applied while parsing both CSV files.
dtype = {
    'State': 'category',
    'Area code': 'category',
    'Churn': int
}

train_df = pd.read_csv('churn-bigml-80.csv', dtype=dtype)
test_df = pd.read_csv('churn-bigml-20.csv', dtype=dtype)
train_len, test_len = len(train_df), len(test_df)

# Stack both files so the preprocessing below is applied once to every row.
# ignore_index=True avoids duplicate index labels in the combined frame.
full_df = pd.concat([train_df, test_df], ignore_index=True)

# Turn the 'Yes'/'No' plan flags into booleans.
full_df['International plan'] = (full_df['International plan'] == 'Yes')
full_df['Voice mail plan'] = (full_df['Voice mail plan'] == 'Yes')

# Recover the original partitions by position: the first `train_len` rows came
# from the 80% file and the remaining rows from the 20% file.
# (The previous `full_df[:test_len]` returned the first training rows instead.)
train_df = full_df.iloc[:train_len]
test_df = full_df.iloc[train_len:]
assert len(test_df) == test_len, "test partition size changed during preprocessing"

full_df.head()

In [None]:
# Target is the `Churn` column; features are every column except the last one
# (assumes `Churn` is the last column of the frame -- TODO confirm).
y = full_df['Churn']
X = full_df.iloc[:, :-1]

# Relative class frequencies of the target.
y.value_counts(normalize=True)

In [None]:
# One-hot encode the two categorical columns; all other columns pass through.
categorical_columns = ['State', 'Area code']
X_encoded = pd.get_dummies(X, columns=categorical_columns)
X_encoded

In [None]:
# Stratified 80/20 split so both partitions keep the same churn proportion.
split_parts = train_test_split(X_encoded, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Shapes of the four resulting pieces.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Churn proportions in the train and test targets.
for target in (y_train, y_test):
    print(target.value_counts(normalize=True))

In [None]:
# Hold out part of the training data for early stopping. Monitoring early
# stopping on (X_test, y_test) would leak the test set into model selection
# and bias the metrics reported on it afterwards.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train)

# Negatives-to-positives ratio of the rows actually used for fitting
# (Churn is loaded as an int column, so sum() counts the positive rows).
scale_pos_weight = (y_fit.size - y_fit.sum()) / y_fit.sum()

clf = xgb.XGBClassifier(
    seed=42,
    early_stopping_rounds=10,
    eval_metric="auc",
    scale_pos_weight=scale_pos_weight,
)

# Early stopping watches AUC on the validation split only.
clf.fit(
    X_fit,
    y_fit,
    verbose=True,
    eval_set=[(X_val, y_val)],
)

In [None]:
def calculate_metrics(confusion_matrix):
    """Summarise a binary confusion matrix as a one-row DataFrame of metrics.

    The metrics are computed from the matrix that is passed in, so the result
    no longer depends on the notebook globals ``y_test`` / ``predictions``.

    Parameters
    ----------
    confusion_matrix : array-like of shape (2, 2)
        Counts laid out as ``[[TN, FP], [FN, TP]]`` (rows = true label,
        columns = predicted label, label order ``[0, 1]``), which is the
        layout produced by ``sklearn.metrics.confusion_matrix``.
        Note: the parameter name shadows the imported sklearn function inside
        this body only; it is kept for backward compatibility.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``Accuracy``, ``MCC``, ``Sensitivity``,
        ``Specificity``, ``Precision`` and ``F1-Score``. A ratio whose
        denominator is zero is reported as ``0.0``.

    Raises
    ------
    ValueError
        If the input is not a 2x2 matrix.
    """
    matrix = np.asarray(confusion_matrix)
    if matrix.shape != (2, 2):
        raise ValueError(
            f"expected a 2x2 confusion matrix, got shape {matrix.shape}")

    # Plain Python floats avoid integer overflow in the MCC product below.
    tn, fp, fn, tp = (float(value) for value in matrix.ravel())

    def _ratio(numerator, denominator):
        # Undefined ratios (empty class / no positive predictions) become 0.0.
        return numerator / denominator if denominator else 0.0

    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    mcc_denominator = ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
    mcc = _ratio(tp * tn - fp * fn, mcc_denominator)

    metrics_dict = {
        'Accuracy': [accuracy],
        'MCC': [mcc],
        'Sensitivity': [_ratio(tp, tp + fn)],   # recall of class 1
        'Specificity': [_ratio(tn, tn + fp)],   # recall of class 0
        'Precision': [_ratio(tp, tp + fp)],
        'F1-Score': [_ratio(2 * tp, 2 * tp + fp + fn)]
    }
    return pd.DataFrame(metrics_dict)

# Predict on the test split and build the raw and row-normalised confusion matrices.
predictions = clf.predict(X_test)
cm = confusion_matrix(y_test, predictions, labels=clf.classes_)
cmn = cm / cm.sum(axis=1, keepdims=True)  # each row divided by its own total

# Plot the count matrix first, then the normalised one.
for matrix in (cm, cmn):
    disp = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=clf.classes_)
    disp.plot()
plt.show()

calculate_metrics(cm)

In [None]:
import optuna
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

def objective(trial):
    """Optuna objective: validation AUC-ROC of an XGBoost classifier.

    Each trial is fitted on one part of the training data and scored on a
    validation split carved out of ``X_train`` / ``y_train``. The test split
    is deliberately not touched here: selecting hyperparameters (or early
    stopping) on it would leak test information into the final model.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        AUC-ROC on the validation split (to be maximised).
    """
    # Define the search space for hyperparameters
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
    }

    # Fixed random_state: every trial sees the same fit/validation partition,
    # so trial scores are comparable with each other.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train,
        y_train,
        test_size=0.2,
        random_state=42,
        stratify=y_train)

    # Negatives-to-positives ratio of the rows used for fitting
    scale_pos_weight = (y_fit.size - y_fit.sum()) / y_fit.sum()

    # Instantiate the classifier
    model = xgb.XGBClassifier(
        seed=42,
        early_stopping_rounds=10,
        eval_metric="auc",
        scale_pos_weight=scale_pos_weight,
        **params
    )

    # Train the classifier; early stopping monitors the validation split
    model.fit(
        X_fit,
        y_fit,
        verbose=False,
        eval_set=[(X_val, y_val)],
    )

    # Probability of the positive class on the validation split
    val_pred_prob = model.predict_proba(X_val)[:, 1]

    # AUC-ROC on the validation split is the optimization target
    return roc_auc_score(y_val, val_pred_prob)

# Create the Optuna study and optimize the objective function.
# The sampler is seeded so the search is reproducible across kernel restarts.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Get the best hyperparameters and the best AUC-ROC score
best_params = study.best_params
best_auc = study.best_value

print("Best Hyperparameters:", best_params)
print("Best AUC-ROC Score:", best_auc)

In [None]:
# Hold out part of the training data for early stopping so that
# (X_test, y_test) is used only for the final evaluation below.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train)

# Recomputed here so this cell does not depend on a value left over from an
# earlier cell: negatives-to-positives ratio of the rows used for fitting.
scale_pos_weight = (y_fit.size - y_fit.sum()) / y_fit.sum()

# Instantiate the classifier with the tuned hyperparameters
clf = xgb.XGBClassifier(
    seed=42,
    early_stopping_rounds=10,
    eval_metric="auc",
    scale_pos_weight=scale_pos_weight,
    **best_params
)

# Train the classifier; early stopping monitors the validation split
clf.fit(
    X_fit,
    y_fit,
    verbose=False,
    eval_set=[(X_val, y_val)],
)

# Evaluate on the untouched test split
predictions = clf.predict(X_test)
cm = confusion_matrix(y_test, predictions, labels=clf.classes_)
cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]  # row-normalised

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()

disp = ConfusionMatrixDisplay(confusion_matrix=cmn, display_labels=clf.classes_)
disp.plot()
plt.show()

calculate_metrics(cm)