In [1]:
import itertools
from joblib import load
from tqdm import tqdm
import random
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import recall_score, f1_score, make_scorer, precision_score, average_precision_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.model_selection import cross_val_score
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc, confusion_matrix, ConfusionMatrixDisplay
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.model_selection import cross_val_predict, StratifiedKFold, GridSearchCV

ModuleNotFoundError: No module named 'imblearn'

In [75]:
# Load data
data = pd.read_csv("AlarmGrundlag_ModelParametre_Merged1.1.csv", delimiter=";")
data = data.drop(columns=["Customer_Refnr", "RUN_DATE", "CASE_CLOSE_DATE", "SCENARIO_NAME", "ALERT_ID", "CASE_ID", "Customer_Risk_Profile_Current"])
data['CASE_STATUS_CODE'] = data['CASE_STATUS_CODE'].replace({'C': 0, 'R': 1})
data.dropna(subset=['Customer_Risk_Profile_BeforeAlert'], inplace=True)
data = pd.get_dummies(data, columns=['Customer_Risk_Profile_BeforeAlert'], prefix='RiskGroup')
# Replace infinities with NaN for easier handling
data.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop rows with any NaNs that might have been infinities initially
data.dropna(inplace=True)

In [76]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11600 entries, 0 to 11601
Data columns (total 85 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Customer_Age                               11600 non-null  int64  
 1   CASE_STATUS_CODE                           11600 non-null  int64  
 2   ALERT_Triggered_In_HolidayPeriod           11600 non-null  int64  
 3   Customer_can_Overdraft                     11600 non-null  int64  
 4   Customer_has_Loan                          11600 non-null  int64  
 5   Customer_has_Depot                         11600 non-null  int64  
 6   Customer_has_InstallmentPension            11600 non-null  int64  
 7   Customer_has_SelfPension                   11600 non-null  int64  
 8   Customer_has_BusinessAccount               11600 non-null  int64  
 9   Customer_Gender                            11600 non-null  int64  
 10  Customer_Large_Deposits    

In [77]:
data[['Express_Ratio_SumDKK', 'Express_Ratio_Count', 'MobilePay_Count_DebitCreditRatio', 'MobilePay_Sum_DebitCreditRatio']].describe()

Unnamed: 0,Express_Ratio_SumDKK,Express_Ratio_Count,MobilePay_Count_DebitCreditRatio,MobilePay_Sum_DebitCreditRatio
count,11600.0,11600.0,11600.0,11600.0
mean,0.04385,1.154982,0.889737,1.810119
std,0.107714,4.12999,7.953707,49.077408
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.155172,0.152138
75%,0.024511,0.366664,0.6621,0.867546
max,0.999187,82.022472,434.0,5104.0


In [78]:
y = data['CASE_STATUS_CODE']
X = data.drop('CASE_STATUS_CODE', axis=1)

In [79]:
columns_to_log = ['Express_Ratio_SumDKK', 'Express_Ratio_Count', 'MobilePay_Count_DebitCreditRatio', 'MobilePay_Sum_DebitCreditRatio']
for column in columns_to_log:
    data[column] = np.log1p(data[column])

In [80]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# # Scale the data
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_val_scaled = scaler.transform(X_val)  # Use the same scaler instance
# X_test_scaled = scaler.transform(X_test)  # Use the same scaler instance

# # Apply PCA
# pca = PCA(n_components=0.95)
# X_train_pca = pca.fit_transform(X_train_scaled)
# X_val_pca = pca.transform(X_val_scaled)  # Transform validation data using the same PCA model
# X_test_pca = pca.transform(X_test_scaled)  # Transform test data using the same PCA model

# print(f'Percentage - X_train_pca: {len(X_train)/len(X)}, X_val_pca: {len(X_val)/len(X)}, X_test_pca: {len(X_test)/len(X)}')
# print(f"Shapes - X_train_pca: {X_train.shape}, X_val_pca: {X_val.shape}, X_test_pca: {X_test.shape}")
# print(f"Shapes - y_train: {y_train.shape}, y_val: {y_val.shape}, y_test: {y_test.shape}")

In [81]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UndefinedMetricWarning)

# Framework using predefined validation set

In [54]:
# Validation Split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Define columns to log-transform
columns_to_log = ['Express_Ratio_SumDKK', 'Express_Ratio_Count', 'MobilePay_Count_DebitCreditRatio', 'MobilePay_Sum_DebitCreditRatio']

# Function to apply log transformation
def log_transform(x):
    return np.log1p(x)

# Create a transformer that applies log transformation
log_transformer = FunctionTransformer(np.log1p)

# Define the column transformer & PCA
preprocessor = ColumnTransformer(
    transformers=[
        ('num', make_pipeline(FunctionTransformer(np.log1p), PCA(n_components=0.95, random_state=42)), columns_to_log)
    ],
    remainder='passthrough'
)

# Assuming X_train, X_val, y_train, y_val are already defined
# Define the parameter grids
param_grid_lr = {
    'preprocessor__num__pca__n_components': [0.75, 0.85, 0.95],  # Explained variance ratios
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__solver': ['liblinear', 'saga']
}
param_grid_brf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'class_weight': [{0: 1, 1: v} for v in [1, 2, 3, 4, 5, 10, 20]]
}
param_grid_rf_smote = {
    'randomforestclassifier__n_estimators': [100, 200, 300],
    'randomforestclassifier__max_depth': [None, 10, 20, 30],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4],
    'randomforestclassifier__max_features': ['sqrt', 'log2']
}
param_grid_rf_adasyn = {
    'randomforestclassifier__n_estimators': [100, 200, 300],
    'randomforestclassifier__max_depth': [None, 10, 20, 30],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4],
    'randomforestclassifier__max_features': ['sqrt', 'log2']
}
param_grid_xgb = {
    'max_depth': [3, 4, 5, 6],
    'min_child_weight': [1, 5, 10], 
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0], 
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2, 0.3] 
}

# Define model configurations using pipelines that include log transformation and PCA
models = [
    #(BalancedRandomForestClassifier(random_state=42), param_grid_brf, 'Balanced Random Forest'),
    #(make_imb_pipeline(SMOTE(random_state=42), RandomForestClassifier(random_state=42, class_weight='balanced')), param_grid_rf_smote, 'Random Forest with SMOTE'),
    #(make_imb_pipeline(ADASYN(random_state=42), RandomForestClassifier(random_state=42, class_weight='balanced')), param_grid_rf_adasyn, 'Random Forest with ADASYN'),
    #(Pipeline([('preprocessor', preprocessor), ('classifier', LogisticRegression(random_state=42, max_iter=1000))]), param_grid_lr, 'Logistic Regression'),
    (XGBClassifier(random_state=42, objective='binary:logistic'), param_grid_xgb, 'XGBoost')
]

# Initialize best results for each model
best_results = {name: {'score': 0, 'params': {}, 'model': None, 'recall': 0, 'precision': 0, 'threshold': 0, 'predictions': None} for _, _, name in models}

# Loop through each model and parameter grid
for model, param_grid, model_name in models:
    # Create all possible combinations of the current model's parameter grid
    all_combinations = list(itertools.product(*param_grid.values()))
    
    # Randomly sample 100 combinations from the list of all combinations, if there are enough
    selected_combinations = random.sample(all_combinations, min(100, len(all_combinations)))
    
    # Evaluate each combination
    for combo in tqdm(selected_combinations, desc=f"Evaluating {model_name}"):
        params = dict(zip(param_grid.keys(), combo))
        model.set_params(**params)
        
        # Fit the model on training data
        model.fit(X_train, y_train)
        
        # Predict and evaluate on validation data
        probabilities = model.predict_proba(X_val)[:, 1]
        
        # Evaluate different thresholds
        for threshold in np.linspace(0.1, 0.9, 41):
            predictions = (probabilities >= threshold).astype(int)
            recall = recall_score(y_val, predictions, pos_label=1)
            auc_prc = average_precision_score(y_val, probabilities)
            
            # Update best results if current model's AUC-PRC is better and recall is above 90%
            if auc_prc > best_results[model_name]['score']: # and recall >= 0.90
                best_results[model_name]['score'] = auc_prc
                best_results[model_name]['recall'] = recall
                best_results[model_name]['precision'] = precision_score(y_val, predictions, pos_label=1)
                best_results[model_name]['threshold'] = threshold
                best_results[model_name]['params'] = params
                best_results[model_name]['model'] = model
                best_results[model_name]['predictions'] = predictions

# Output the best results for each model configuration
for model_name, details in best_results.items():
    print('----------------------------------------------------------------------')
    print(f"Model: {model_name}")
    print(f"Best auc_prc score: {details['score']}")
    print(f"Best recall achieved: {details['recall']}")
    print(f"Best precision achieved: {details['precision']}")
    print(f"Optimal threshold for predictions: {details['threshold']}")
    print(f"Best parameters found: {details['params']}")
    print('----------------------------------------------------------------------')

Evaluating XGBoost:  17%|█▋        | 17/100 [00:06<00:32,  2.52it/s]


KeyboardInterrupt: 

# Framework using Cross Validation

In [85]:
import xgboost as xgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import recall_score, precision_score, average_precision_score
import itertools
import random
from tqdm import tqdm
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Define your data (assuming X_train_df and y_train_df are pandas DataFrames)
X_train_values = X_train.values
y_train_values = y_train.values.ravel()

# Define the number of folds for cross-validation
k = 3
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# Define the parameter grid for XGBoost
param_grid_xgb = {
    'classifier__max_depth': [10, 50, 75, 100],
    'classifier__min_child_weight': [0.001, 0.05, 0.1, 0.5, 1, 5, 10], 
    'classifier__subsample': [0.6, 0.8, 1.0],
    'classifier__colsample_bytree': [0.2, 0.4, 0.6, 0.8, 1.0], 
    'classifier__n_estimators': [50, 100, 200, 300, 400, 500],
    'classifier__learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5]
}

# Initialize best results for XGBoost
best_results_xgb = {'score': 0, 'params': {}, 'recall': 0, 'precision': 0, 'threshold': 0, 'predictions': None}

# Setup the pipeline with SMOTE and XGBoost classifier
smote = SMOTE(random_state=42)
xgb_clf = xgb.XGBClassifier(objective='binary:logistic', eval_metric='aucpr', seed=42)
pipeline = ImbPipeline([
    ('smote', smote),
    ('classifier', xgb_clf)
])

# Loop through each fold for cross-validation
for train_index, val_index in skf.split(X_train, y_train):
    X_train_cv, X_val_cv = X_train_values[train_index], X_train_values[val_index]
    y_train_cv, y_val_cv = y_train_values[train_index], y_train_values[val_index]
    
    # Create all possible combinations of the XGBoost parameter grid
    all_combinations = list(itertools.product(*[param_grid_xgb[key] for key in sorted(param_grid_xgb.keys())]))
    
    # Randomly sample 100 combinations from the list of all combinations, if there are enough
    selected_combinations = random.sample(all_combinations, min(100, len(all_combinations)))
    
    # Evaluate each combination
    for combo in tqdm(selected_combinations, desc="Evaluating XGBoost on fold"):
        params = dict(zip(sorted(param_grid_xgb.keys()), combo))
        pipeline.set_params(**params)
        
        # Fit the pipeline on training data
        pipeline.fit(X_train_cv, y_train_cv)
        
        # Predict probabilities on validation data
        probabilities = pipeline.predict_proba(X_val_cv)[:, 1]
        
        # Evaluate different thresholds
        for threshold in np.linspace(0.1, 0.9, 41):
            predictions = (probabilities >= threshold).astype(int)
            recall = recall_score(y_val_cv, predictions)
            auc_prc = average_precision_score(y_val_cv, probabilities)
            
            # Update best results if current model's AUC-PRC is better and recall is above 90%
            if auc_prc > best_results_xgb['score']:
                best_results_xgb['score'] = auc_prc
                best_results_xgb['recall'] = recall
                best_results_xgb['precision'] = precision_score(y_val_cv, predictions)
                best_results_xgb['threshold'] = threshold
                best_results_xgb['params'] = params
                best_results_xgb['predictions'] = predictions
                best_results_xgb['model'] = pipeline

# Output the best results for XGBoost
print('----------------------------------------------------------------------')
print("Best results for XGBoost:")
print(f"Best auc_prc score: {best_results_xgb['score']}")
print(f"Best recall achieved: {best_results_xgb['recall']}")
print(f"Best precision achieved: {best_results_xgb['precision']}")
print(f"Optimal threshold for predictions: {best_results_xgb['threshold']}")
print(f"Best parameters found: {best_results_xgb['params']}")
print('----------------------------------------------------------------------')

Evaluating XGBoost on fold: 100%|██████████| 100/100 [08:51<00:00,  5.31s/it]
Evaluating XGBoost on fold: 100%|██████████| 100/100 [09:04<00:00,  5.45s/it]
Evaluating XGBoost on fold: 100%|██████████| 100/100 [09:34<00:00,  5.74s/it]

----------------------------------------------------------------------
Best results for XGBoost:
Best auc_prc score: 0.8351337546601176
Best recall achieved: 0.7714285714285715
Best precision achieved: 0.6878980891719745
Optimal threshold for predictions: 0.1
Best parameters found: {'classifier__colsample_bytree': 0.4, 'classifier__learning_rate': 0.2, 'classifier__max_depth': 50, 'classifier__min_child_weight': 0.001, 'classifier__n_estimators': 200, 'classifier__subsample': 1.0}
----------------------------------------------------------------------





In [86]:
from sklearn.metrics import classification_report

classifier__colsample_bytree = best_results_xgb['params']['classifier__colsample_bytree']
classifier__learning_rate = best_results_xgb['params']['classifier__learning_rate']
classifier__max_depth = best_results_xgb['params']['classifier__max_depth']
classifier__min_child_weight = best_results_xgb['params']['classifier__min_child_weight']
classifier__n_estimators = best_results_xgb['params']['classifier__n_estimators']
classifier__subsample = best_results_xgb['params']['classifier__subsample']

# Train the best model on the entire training dataset
best_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='aucpr', classifier__colsample_bytree=classifier__colsample_bytree, 
                               classifier__learning_rate=classifier__learning_rate, classifier__max_depth=classifier__max_depth, classifier__min_child_weight=classifier__min_child_weight,
                               classifier__n_estimators=classifier__n_estimators, classifier__subsample=classifier__subsample)
best_model.fit(X_train, y_train)

# Make predictions on the test dataset
y_pred = best_model.predict(X_test)

# Generate a classification report
report = classification_report(y_test, y_pred)
print("Classification Report for the Best Model:")
print(report)

Parameters: { "classifier__colsample_bytree", "classifier__learning_rate", "classifier__max_depth", "classifier__min_child_weight", "classifier__n_estimators", "classifier__subsample" } are not used.



Classification Report for the Best Model:
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      1900
           1       0.86      0.67      0.76       420

    accuracy                           0.92      2320
   macro avg       0.90      0.83      0.85      2320
weighted avg       0.92      0.92      0.92      2320



In [70]:
# Assuming best_results is populated and contains the best model, threshold, parameters, and predictions
for model_name, details in best_results.items():
    if details['model'] is not None and details['predictions'] is not None:
        # Use the saved predictions from the validation set evaluation
        validation_predictions = details['predictions']
        
        # Generate and print the classification report for the saved validation set predictions
        print(f"Classification Report for {model_name} on Validation Data:")
        print(classification_report(y_val, validation_predictions))
        print("\n")  # Add a newline for better readability between reports
        print(details['score'])
    else:
        print(f"No model or predictions available for {model_name}")

Classification Report for XGBoost on Validation Data:
              precision    recall  f1-score   support

           0       1.00      0.95      0.97      1882
           1       0.82      1.00      0.90       438

    accuracy                           0.96      2320
   macro avg       0.91      0.97      0.94      2320
weighted avg       0.97      0.96      0.96      2320



0.9999320790747918
