In [1]:
%load_ext autoreload
%autoreload 2

In [2]:

import warnings
warnings.filterwarnings("ignore")

In [3]:
import sys
sys.path.append('../')

# Custom functions
from src.data_split import create_masks_data_split_per_months
from src.paths import PREPROCESSED_DATA_DIR

In [4]:
import numpy as np
import pandas as pd


In [5]:
df = pd.read_parquet(PREPROCESSED_DATA_DIR / 'modeling_dataset.parquet')

In [6]:
train_masks, val_masks, test_masks = create_masks_data_split_per_months(df,
                                                                        n_splits=3,
                                                                        offset_trainval=5,
                                                                        offset_test=1)

In [7]:
cat_vars = ['errors','use_chip','foreign_transaction', 'morning', 'afternoon','day_week']
high_cat_vars = ['mcc']
num_vars = ['amount','amount_log','recency','frequency','monetary']

In [8]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing._target_encoder import TargetEncoder
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

import xgboost as xgb


class TopCategoriesEncoder(BaseEstimator, TransformerMixin):
  def __init__(self, n=5):
    self.n = n
    self.top_categories = []

  def fit(self, X, y=None):
    # Calculate the proportion of the positive class for each category
    
    proportions = self.group_mean(X, y)

    # Sort the categories by the proportion of the positive class
    sorted_categories = dict(sorted(proportions.items(), key=lambda item: item[1], reverse=True))

    # Store the top n categories
    self.top_categories = list(sorted_categories.keys())[:self.n]

    return self
    
  def transform(self, X):
    # For each top category, add a new binary feature to X
    X = X.flatten()
    X_trans = np.zeros((X.shape[0], len(self.top_categories)))
    for i, category in enumerate(self.top_categories):
    
      X_trans[:, i] = (X == category)

    return X_trans

  @staticmethod
  def group_mean(X, y):
    # Get unique categories
    categories = np.unique(X)
    X = X.flatten()

    # Compute mean for each category
    means = {category: np.mean(y[X == category]) for category in categories}
  
    return means
    
class AddIsolationForestAnomalyScore(BaseEstimator, TransformerMixin):
  def __init__(self, **kwargs):
    for key, value in kwargs.items():
      setattr(self, key, value)
    
  def fit(self, X, y=None):
    n_estimators = getattr(self, 'n_estimators', 100)
    
    self.iforest = IsolationForest(n_estimators=n_estimators)
    self.iforest.fit(X)
    return self

  def transform(self, X):
    anomaly_scores = self.iforest.decision_function(X).reshape(-1, 1)
    return np.hstack([X, anomaly_scores])
      
      
class CatNarrow(BaseEstimator, TransformerMixin):
  "Narrows the categories of a categorical variable based on a threshold."

  def __init__(self, threshold: float=0.05):
    self.threshold = threshold
    
  def fit(self, X, y=None):
    X_ = np.array(X).astype(str)
    self.dc_filtered  = []
    
    # For each column, check the frequency of each category and filter based on threshold
    for c in range(X_.shape[1]):
      X_aux = X_[:, c]
      unique, counts = np.unique(X_aux, return_counts=True)
      counts_normalized = counts / sum(counts)
      dc = dict(zip(unique, counts_normalized))
      dc_filtered_ = {k: v for k, v in dc.items() if v >= self.threshold}  # Only keep categories with frequency >= threshold
      dc_filtered_ = np.array(list(dc_filtered_.keys())).reshape(-1,1)
      self.dc_filtered.append(dc_filtered_)
    
    return self
  
  def transform(self, X):
    """Replace categories with frequency < threshold by 'undefined'.
    
    Args:
      X (np.ndarray): The input array containing categories.
      
    Returns:
      X_trans (np.ndarray): The transformed array with replaced categories.
    """
       
    X_ = np.array(X).astype(str)
    list_aux = []
    
    for c in range(X_.shape[1]):  
      X_aux = np.array(X_[:, c]).reshape(-1,1)
      dc_filtered_ = self.dc_filtered[c]
      np.place(X_aux, np.isin(X_aux, dc_filtered_, invert = True), ['undefined']) # Replace categories with frequency < threshold by 'undefined'
      list_aux.append(X_aux)
    
    X_trans = np.concatenate(list_aux, axis = 1)
    return X_trans
  


num_transformer = Pipeline(steps = [
                                    ('imputer', SimpleImputer(strategy='median')),
                                    ('scaler', StandardScaler())
                                    ]
                          )

cat_transformer = Pipeline(steps = [
                                    ('imputer', SimpleImputer(
                                                              strategy='constant',
                                                              fill_value=-9999
                                                              )
                                    ),
                                    ('catnarrow', CatNarrow(threshold = 0.05)),
                                    ('target_enc', TargetEncoder()),
                                    ('scaler', StandardScaler())
                                            ]
                                   )

high_cat_transformer = Pipeline(steps = [
                                    ('imputer', SimpleImputer(
                                                              strategy='constant',
                                                              fill_value=-9999
                                                              )
                                    ),
                                    ('topcat_enc', TopCategoriesEncoder(n=3)),
                                    ('target_enc', TargetEncoder()),
                                    ('scaler', StandardScaler())
                                            ]
                                   )
                                   

processing = ColumnTransformer(transformers=[
                                            ('cat', cat_transformer, cat_vars),
                                            ('high_cat', high_cat_transformer, high_cat_vars),
                                            ('num', num_transformer, num_vars)
                                            ],
                               remainder='drop'
                              )
pipe = Pipeline(steps=[
                      ('processing', processing),
                      ('isolation_forest', AddIsolationForestAnomalyScore(n_estimators=100))
                       ]
              )

In [9]:
trainval_masks = train_masks + val_masks

In [10]:
split = 2


X_train = df.drop(columns=['fraud']).loc[trainval_masks[split]]
y_train = df['fraud'].loc[trainval_masks[split]]


X_test = df.drop(columns=['fraud']).loc[test_masks[split]]
y_test = df['fraud'].loc[test_masks[split]]


In [11]:
pipeline_training = pipe.fit(X_train, y_train)

In [12]:
pipeline_training

In [13]:
X_train_trans = pipeline_training.transform(X_train)
X_test_trans = pipeline_training.transform(X_test)

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [15]:
m0_rf = RandomForestClassifier(n_estimators=500, random_state=123, n_jobs=-1)
m0_rf.fit(X_train_trans, y_train)
pred_m0_rf = m0_rf.predict_proba(X_test_trans)[:, 1]


In [16]:

def ave_savings_score(y_true, score, amount, cf=5):
    #TODO: update description
    """Savings score.
   
    Parameters
    ----------
    y_true : array-like or label indicator matrix
        Ground truth (correct) labels.
    score : array-like or label indicator matrix
        Probabilities returned by a classifier.
    amount : array-like of shape = [n_samples, 4]
        Cost matrix of the classification problem
        Where the columns represents the costs of: false positives, false negatives,
        true positives and true negatives, for each example.
    cf    : float, default=10
        Fixed cost of investigation.
    Returns
    -------
    expected savings : float
        Savings of a using y_pred on y_true with cost-matrix cost-mat
        The best performance is 1.

    """

    ave_savings = np.sum(y_true*score*amount - score*cf)/np.sum(y_true*amount)
  
    return ave_savings


def savings_score(y_true, amount, cf=5):
    #TODO: update description
    """Savings score.
   
    Parameters
    ----------
    y_true : array-like or label indicator matrix
        Ground truth (correct) labels.
    amount : array-like of shape = [n_samples, 4]
        Cost matrix of the classification problem
        Where the columns represents the costs of: false positives, false negatives,
        true positives and true negatives, for each example.
    cf    : float, default=10
        Fixed cost of investigation.
    Returns
    -------
    expected savings : float
        Savings of a using y_pred on y_true with cost-matrix cost-mat
        The best performance is 1.

    """

    savings = np.sum(y_true*amount - len(y_true)*cf)/np.sum(y_true*amount)
  
    return savings


In [17]:
ix_sorted_top_rf = np.argsort(pred_m0_rf)[::-1]
topk = 0.01
y_test_ix = y_test.iloc[ix_sorted_top_rf][:int(topk*len(y_test))]
amount_test_ix = X_test['amount'].iloc[ix_sorted_top_rf][:int(topk*len(y_test))]

print('ROC-AUC: ', roc_auc_score(y_test, pred_m0_rf))
print('PR-AUC: ', average_precision_score(y_test, pred_m0_rf))
print('Expected Savings: ', ave_savings_score(y_test, pred_m0_rf, X_test['amount'], cf=10))
print('Uplift: ', average_precision_score(y_test, pred_m0_rf)/np.mean(y_test))
print('F1-score: ', f1_score(y_test, pred_m0_rf > 0.5))
print('Recall Top-1%: ',100*np.sum(y_test.iloc[ix_sorted_top_rf][:int(topk*len(y_test))])/np.sum(y_test))
print('Precision Top-1%', 100*np.mean(y_test.iloc[ix_sorted_top_rf][:int(topk*len(y_test))]))
print('Savings Top-1%: ', savings_score(y_test.iloc[ix_sorted_top_rf][:int(topk*len(y_test))], 
                                        X_test['amount'].iloc[ix_sorted_top_rf][:int(topk*len(y_test))]))
print('Recall Monetary Top-1%: ',100*np.sum(y_test_ix*amount_test_ix)/np.sum(y_test*X_test['amount']))

ROC-AUC:  0.9992682380810086
PR-AUC:  0.5992219069564511
Expected Savings:  0.309186383531873
Uplift:  558.5474502418314
F1-score:  0.4489795918367347
Recall Top-1%:  100.0
Precision Top-1% 10.749185667752444
Savings Top-1%:  -118.64298408636219
Recall Monetary Top-1%:  100.0


In [18]:

m0_xgb = xgb.XGBClassifier(n_estimators=100,
                           max_depth=5,
                           learning_rate=0.1,
                           random_state=123, 
                           n_jobs=-1
                           
                           )
m0_xgb.fit(X_train_trans, y_train)
pred_m0_xgb = m0_xgb.predict_proba(X_test_trans)[:, 1]


In [19]:
ix_sorted_top = np.argsort(pred_m0_xgb)[::-1]
topk = 0.01
y_test_ix = y_test.iloc[ix_sorted_top][:int(topk*len(y_test))]
amount_test_ix = X_test['amount'].iloc[ix_sorted_top][:int(topk*len(y_test))]

print('ROC-AUC: ', roc_auc_score(y_test, pred_m0_xgb))
print('PR-AUC: ', average_precision_score(y_test, pred_m0_xgb))
print('Expected Savings: ', ave_savings_score(y_test, pred_m0_xgb, X_test['amount']))
print('Uplift: ', average_precision_score(y_test, pred_m0_xgb)/np.mean(y_test))
print('F1-score: ', f1_score(y_test, pred_m0_xgb > 0.5))
print('Recall Top-1%: ', 100*np.sum(y_test.iloc[ix_sorted_top][:int(topk*len(y_test))])/np.sum(y_test))
print('Precision Top-1%', 100*np.mean(y_test.iloc[ix_sorted_top][:int(topk*len(y_test))]))
print('Recall Monetary Top-1%: ',100*np.sum(y_test_ix*amount_test_ix)/np.sum(y_test*X_test['amount']))


ROC-AUC:  0.9993145895772251
PR-AUC:  0.5891145894818625
Expected Savings:  0.39654242127402317
Uplift:  549.126205226124
F1-score:  0.4528301886792453
Recall Top-1%:  100.0
Precision Top-1% 10.749185667752444
Recall Monetary Top-1%:  100.0


In [20]:
import optuna
from sklearn.metrics import average_precision_score
def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        "n_estimators": trial.suggest_int("n_estimators", 2, 50),
        "max_depth": trial.suggest_int("max_depth", 2, 5),
        "learning_rate": trial.suggest_float("learning_rate", 0.10, 1.0),
    }
       
    train_masks, val_masks, test_masks = create_masks_data_split_per_months(df,
                                                                            n_splits=3,
                                                                            offset_trainval=5,
                                                                            offset_test=1)
    scores = []
    
    train_masks = train_masks + val_masks
    for i in range(3):
        
        # split data for training and validation
        X_train = df.drop(columns=['fraud']).loc[train_masks[i]]
        y_train = df['fraud'].loc[train_masks[i]]
        
        X_val = df.drop(columns=['fraud']).loc[test_masks[i]]
        y_val = df['fraud'].loc[test_masks[i]]
        

        pipeline_training = pipe.fit(X_train, y_train)
        
        X_train = pipeline_training.transform(X_train)

        X_val = pipeline_training.transform(X_val)

        
        # train the model
        m0_xgb = xgb.XGBClassifier(**hyperparams,
                                    random_state=123, 
                                    n_jobs=-1
                                    )
        
        
        m0_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
        
        
        # evaluate the model
        y_pred = m0_xgb.predict_proba(X_val)[:, 1]
        ap = ave_savings_score(y_val, y_pred, df.drop(columns=['fraud']).loc[test_masks[i]]['amount'], cf=10)

        scores.append(ap)
   
    # Return the mean score
    return np.array(scores).mean()

In [21]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=25)

[I 2024-03-07 15:49:20,218] A new study created in memory with name: no-name-3d6513d3-e510-420e-89cc-c6876d48bdc9
[I 2024-03-07 15:49:26,548] Trial 0 finished with value: 0.30531264732252267 and parameters: {'n_estimators': 41, 'max_depth': 4, 'learning_rate': 0.6290876285087037}. Best is trial 0 with value: 0.30531264732252267.
[I 2024-03-07 15:49:32,687] Trial 1 finished with value: 0.15307914045180973 and parameters: {'n_estimators': 15, 'max_depth': 4, 'learning_rate': 0.2983548096734421}. Best is trial 0 with value: 0.30531264732252267.
[I 2024-03-07 15:49:39,083] Trial 2 finished with value: 0.2327829668705892 and parameters: {'n_estimators': 36, 'max_depth': 2, 'learning_rate': 0.23973655565305435}. Best is trial 0 with value: 0.30531264732252267.
[I 2024-03-07 15:49:45,394] Trial 3 finished with value: 0.26032143213529524 and parameters: {'n_estimators': 48, 'max_depth': 2, 'learning_rate': 0.5156544695787717}. Best is trial 0 with value: 0.30531264732252267.
[I 2024-03-07 15:4

In [22]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'n_estimators': 36, 'max_depth': 3, 'learning_rate': 0.41027535331453185}


In [23]:
m0_xgb = xgb.XGBClassifier(**best_params,
                            random_state=123, 
                            n_jobs=-1
                            )

m0_xgb.fit(X_train_trans, y_train, verbose=False)
pred_m0_xgb = m0_xgb.predict_proba(X_test_trans)[:, 1]

In [25]:
ix_sorted_top = np.argsort(pred_m0_xgb)[::-1]
topk = 0.01
y_test_ix = y_test.iloc[ix_sorted_top][:int(topk*len(y_test))]
amount_test_ix = X_test['amount'].iloc[ix_sorted_top][:int(topk*len(y_test))]

print('ROC-AUC: ', roc_auc_score(y_test, pred_m0_xgb))
print('PR-AUC: ', average_precision_score(y_test, pred_m0_xgb))
print('Expected Savings: ', ave_savings_score(y_test, pred_m0_xgb, X_test['amount']))
print('Uplift: ', average_precision_score(y_test, pred_m0_xgb)/np.mean(y_test))
print('F1-score: ', f1_score(y_test, c > np.mean(y_train==0)))
print('Recall Top-1%: ', 100*np.sum(y_test.iloc[ix_sorted_top][:int(topk*len(y_test))])/np.sum(y_test))
print('Precision Top-1%', 100*np.mean(y_test.iloc[ix_sorted_top][:int(topk*len(y_test))]))
print('Recall Monetary Top-1%: ',100*np.sum(y_test_ix*amount_test_ix)/np.sum(y_test*X_test['amount']))

ROC-AUC:  0.9992697173840793
PR-AUC:  0.5637293976099368
Expected Savings:  0.36942357258694075
Uplift:  525.4641294085351
F1-score:  0.0
Recall Top-1%:  100.0
Precision Top-1% 10.749185667752444
Recall Monetary Top-1%:  100.0


# Optimal Threshold

In [194]:
from sklearn.metrics import recall_score, precision_score, f1_score

X_val_only = df.drop(columns=['fraud']).loc[val_masks[split]]
y_val_only = np.asarray(df['fraud'].loc[val_masks[split]]).astype(int)

X_train = df.drop(columns=['fraud']).loc[train_masks[split]]
y_train = df['fraud'].loc[train_masks[split]]


pipeline_training = pipe.fit(X_train, y_train)
X_val_trans = pipeline_training.transform(X_val_only)

pred_m0_xgb_val = m0_xgb.predict_proba(X_val_trans)[:, 1]

In [195]:

list_f1_score = []
list_recall = []
list_precision = []
for i in np.linspace(0, 1, 100):
    threshold = i
    pred_labels = (pred_m0_xgb_val > threshold).astype(int)
    list_f1_score.append(f1_score(y_val_only, pred_labels)) 
    list_recall.append(recall_score(y_val_only, pred_labels))
    list_precision.append(precision_score(y_val_only, pred_labels))

my_dict = dict(zip(np.linspace(0, 1, 100), list_f1_score))

# Get the key with the maximum value
max_key = max(my_dict, key=my_dict.get)
index_max_key = list(my_dict.keys()).index(max_key)

optimal_threshold = max_key
print(f'optimal_threshold={optimal_threshold}')  

print('Max F1 score on test set: ', max(my_dict.values()))
print('Recall on test set: ', list_recall[index_max_key])
print('Precision on test set: ', list_precision[index_max_key])
print('Number of predicted positives proportional number of obs: ', np.sum((pred_m0_xgb_val > optimal_threshold).astype(int))/y_val_only.shape[0])

optimal_threshold=0.04040404040404041
Max F1 score on test set:  0.5405405405405406
Recall on test set:  0.7407407407407407
Precision on test set:  0.425531914893617
Number of predicted positives proportional number of obs:  0.001578399435806159


# Optimal Thresholding under Constraint

In [197]:
def fbeta_score_under_constraint(recall: float, precision: float, beta: float) -> float:
    """

    """
    return (1 + beta**2) * (precision * recall) / (beta**2 * precision + recall)

In [208]:
n_top1perc = np.round((len(pred_m0_xgb_val[np.argsort(pred_m0_xgb_val)[::-1]])*0.01),0).astype(int)
ix_sorted_top_val = np.argsort(pred_m0_xgb_val)[::-1]
y_val_only_ = np.asarray(y_val_only).astype(int)
list_f1_score = []
list_precision = []
list_recall = []
list_recall_money = []
for i in range(1,n_top1perc+1):

    pseudo_labels = np.ones_like(pred_m0_xgb_val[ix_sorted_top_val[:i]]).astype(int)
    
    precision_under_constraint = precision_score(y_val_only_[ix_sorted_top_val[:i]], pseudo_labels) 
    list_precision.append(precision_under_constraint)
      
    recall_under_constraint = np.sum(y_val_only_[ix_sorted_top_val[:i]])/np.sum(y_val_only_) 
    list_recall.append(recall_under_constraint)
    
    recall_monetary_under_constraint = np.sum(y_val_only_[ix_sorted_top_val[:i]]* \
                                              X_val_only['amount'].iloc[ix_sorted_top_val[:i]]) / \
                                              np.sum(y_val_only_*X_val_only['amount'])
    list_recall_money.append(recall_monetary_under_constraint)
    
    f1_score_ = fbeta_score_under_constraint(recall_under_constraint, precision_under_constraint, 1)
    list_f1_score.append(f1_score_)
    
    

my_dict = dict(zip(np.sort(pred_m0_xgb_val)[::-1], list_f1_score))

# Get the key with the maximum value
max_key = max(my_dict, key=my_dict.get)
index_max_key = list(my_dict.keys()).index(max_key)

optimal_threshold = max_key
print(f'optimal_threshold={optimal_threshold}')  

proportion_investigations = np.sum((pred_m0_xgb_val > optimal_threshold).astype(int))/y_val_only.shape[0]
number_investigations = np.sum((pred_m0_xgb_val > optimal_threshold).astype(int))

print('Max F1 score on val set: ', max(my_dict.values()))
print('Recall on val set: ', list_recall[index_max_key])
print('Recall Monetary on val set: ', list_recall_money[index_max_key])
print('Precision on val set: ', list_precision[index_max_key])
print(f'Proportion {100*proportion_investigations:.2f}% out of the whole val set and number {number_investigations} of investigations out of {n_top1perc} investigations permitted')

optimal_threshold=0.0351359061896801
Max F1 score on val set:  0.5454545454545454
Recall on val set:  0.7407407407407407
Recall Monetary on val set:  0.8753913337365259
Precision on val set:  0.4166666666666667
Proportion 0.16% out of the whole val set and number 49 of investigations out of 298 investigations permitted


In [211]:
label_pred_test = (pred_m0_xgb > optimal_threshold).astype(int)
n_top1perc = np.round((len(pred_m0_xgb[np.argsort(pred_m0_xgb)[::-1]])*0.01),0).astype(int)
test_amount = X_test['amount']
recall_monetary_test_set = np.sum(label_pred_test*test_amount*y_test)/np.sum(y_test*test_amount)

proportion_investigations = np.sum((pred_m0_xgb > optimal_threshold).astype(int))/y_test.shape[0]
number_investigations = np.sum((pred_m0_xgb > optimal_threshold).astype(int))

print('Max F1 score on test set: ',f1_score(y_test, label_pred_test))
print('Recall on test set: ', recall_score(y_test, label_pred_test))
print('Recall Monetary on test set: ', recall_monetary_test_set)
print('Precision on test set: ', precision_score(y_test, label_pred_test))
print('Number of predicted positives proportional number of obs: ', np.sum(label_pred_test)/y_test.shape[0])
print(f'Proportion {100*proportion_investigations:.2f}% out of the whole test set and number {number_investigations} of investigations out of {n_top1perc} investigations permitted')

Max F1 score on test set:  0.44285714285714284
Recall on test set:  0.9393939393939394
Recall Monetary on test set:  0.9931780560379408
Precision on test set:  0.2897196261682243
Number of predicted positives proportional number of obs:  0.0034785435630689207
Proportion 0.35% out of the whole test set and number 107 of investigations out of 308 investigations permitted


In [83]:
from sklearn.metrics import recall_score, precision_score

list_f1_score = []
list_recall = []
list_precision = []
for i in np.linspace(0, 1, 100):
    threshold = i
    pred_labels = (pred_m0_xgb > threshold).astype(int)
    list_f1_score.append(f1_score(y_test, pred_labels)) 
    list_recall.append(recall_score(y_test, pred_labels))
    list_precision.append(precision_score(y_test, pred_labels))

my_dict = dict(zip(np.linspace(0, 1, 100), list_f1_score))

# Get the key with the maximum value
max_key = max(my_dict, key=my_dict.get)
index_max_key = list(my_dict.keys()).index(max_key)

optimal_threshold = max_key
print(f'optimal_threshold={optimal_threshold}')
print('Max F1 score on test set: ', max(my_dict.values()))
print('Recall on test set: ', list_recall[index_max_key])
print('Precision on test set: ', list_precision[index_max_key])
print('Number of predicted positives proportional number of obs: ', np.sum((pred_m0_xgb > optimal_threshold).astype(int))/y_test.shape[0])

0.19191919191919193
Max F1 score on test set:  0.5822784810126582
Recall on test set:  0.696969696969697
Precision on test set:  0.5
Number of predicted positives proportional number of obs:  0.0014954486345903771
