In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../')

# Custom functions
from src.data_split import create_masks_data_split_per_months
from src.paths import PREPROCESSED_DATA_DIR

In [3]:
import numpy as np
import pandas as pd

In [28]:
df = pd.read_parquet(PREPROCESSED_DATA_DIR / 'modeling_dataset.parquet')

In [29]:
train_masks, val_masks, test_masks = create_masks_data_split_per_months(df,
                                                                        n_splits=3,
                                                                        offset_trainval=5,
                                                                        offset_test=1)

In [30]:
cat_vars = ['errors','use_chip','foreign_transaction', 'morning', 'afternoon','day_week']
high_cat_vars = ['mcc']
num_vars = ['amount','amount_log','recency','frequency','monetary']

In [31]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing._target_encoder import TargetEncoder
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

import xgboost as xgb


class TopCategoriesEncoder(BaseEstimator, TransformerMixin):
  def __init__(self, n=5):
    self.n = n
    self.top_categories = []

  def fit(self, X, y=None):
    # Calculate the proportion of the positive class for each category
    
    proportions = self.group_mean(X, y)

    # Sort the categories by the proportion of the positive class
    sorted_categories = dict(sorted(proportions.items(), key=lambda item: item[1], reverse=True))

    # Store the top n categories
    self.top_categories = list(sorted_categories.keys())[:self.n]

    return self
    
  def transform(self, X):
    # For each top category, add a new binary feature to X
    X = X.flatten()
    X_trans = np.zeros((X.shape[0], len(self.top_categories)))
    for i, category in enumerate(self.top_categories):
    
      X_trans[:, i] = (X == category)

    return X_trans

  @staticmethod
  def group_mean(X, y):
    # Get unique categories
    categories = np.unique(X)
    X = X.flatten()

    # Compute mean for each category
    means = {category: np.mean(y[X == category]) for category in categories}
  
    return means
    
class AddIsolationForestAnomalyScore(BaseEstimator, TransformerMixin):
  def __init__(self, **kwargs):
    for key, value in kwargs.items():
      setattr(self, key, value)
    
  def fit(self, X, y=None):
    n_estimators = getattr(self, 'n_estimators', 100)
    
    self.iforest = IsolationForest(n_estimators=n_estimators)
    self.iforest.fit(X)
    return self

  def transform(self, X):
    anomaly_scores = self.iforest.decision_function(X).reshape(-1, 1)
    return np.hstack([X, anomaly_scores])
      
      
class CatNarrow(BaseEstimator, TransformerMixin):
  "Narrows the categories of a categorical variable based on a threshold."

  def __init__(self, threshold: float=0.05):
    self.threshold = threshold
    
  def fit(self, X, y=None):
    X_ = np.array(X).astype(str)
    self.dc_filtered  = []
    
    # For each column, check the frequency of each category and filter based on threshold
    for c in range(X_.shape[1]):
      X_aux = X_[:, c]
      unique, counts = np.unique(X_aux, return_counts=True)
      counts_normalized = counts / sum(counts)
      dc = dict(zip(unique, counts_normalized))
      dc_filtered_ = {k: v for k, v in dc.items() if v >= self.threshold}  # Only keep categories with frequency >= threshold
      dc_filtered_ = np.array(list(dc_filtered_.keys())).reshape(-1,1)
      self.dc_filtered.append(dc_filtered_)
    
    return self
  
  def transform(self, X):
    """Replace categories with frequency < threshold by 'undefined'.
    
    Args:
      X (np.ndarray): The input array containing categories.
      
    Returns:
      X_trans (np.ndarray): The transformed array with replaced categories.
    """
       
    X_ = np.array(X).astype(str)
    list_aux = []
    
    for c in range(X_.shape[1]):  
      X_aux = np.array(X_[:, c]).reshape(-1,1)
      dc_filtered_ = self.dc_filtered[c]
      np.place(X_aux, np.isin(X_aux, dc_filtered_, invert = True), ['undefined']) # Replace categories with frequency < threshold by 'undefined'
      list_aux.append(X_aux)
    
    X_trans = np.concatenate(list_aux, axis = 1)
    return X_trans
  


num_transformer = Pipeline(steps = [
                                    ('imputer', SimpleImputer(strategy='median')),
                                    ('scaler', StandardScaler())
                                    ]
                          )

cat_transformer = Pipeline(steps = [
                                    ('imputer', SimpleImputer(
                                                              strategy='constant',
                                                              fill_value=-9999
                                                              )
                                    ),
                                    ('catnarrow', CatNarrow(threshold = 0.05)),
                                    ('target_enc', TargetEncoder()),
                                    ('scaler', StandardScaler())
                                            ]
                                   )

high_cat_transformer = Pipeline(steps = [
                                    ('imputer', SimpleImputer(
                                                              strategy='constant',
                                                              fill_value=-9999
                                                              )
                                    ),
                                    ('topcat_enc', TopCategoriesEncoder(n=3)),
                                    ('target_enc', TargetEncoder()),
                                    ('scaler', StandardScaler())
                                            ]
                                   )
                                   

processing = ColumnTransformer(transformers=[
                                            ('cat', cat_transformer, cat_vars),
                                            ('high_cat', high_cat_transformer, high_cat_vars),
                                            ('num', num_transformer, num_vars)
                                            ],
                               remainder='drop'
                              )
pipe = Pipeline(steps=[
                      ('processing', processing),
                      ('isolation_forest', AddIsolationForestAnomalyScore(n_estimators=100))
                       ]
              )

In [32]:
trainval_masks = train_masks + val_masks

In [33]:
split = 2


X_train = df.drop(columns=['fraud']).loc[trainval_masks[split]]
y_train = df['fraud'].loc[trainval_masks[split]]


X_test = df.drop(columns=['fraud']).loc[test_masks[split]]
y_test = df['fraud'].loc[test_masks[split]]


In [34]:
pipeline_training = pipe.fit(X_train, y_train)

In [35]:
pipeline_training

In [36]:
X_train_trans = pipeline_training.transform(X_train)
X_test_trans = pipeline_training.transform(X_test)

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [161]:
m0_rf = RandomForestClassifier(n_estimators=500, random_state=123, n_jobs=-1)
m0_rf.fit(X_train_trans, y_train)
pred_m0_rf = m0_rf.predict_proba(X_test_trans)[:, 1]


In [43]:

def ave_savings_score(y_true, score, amount, cf=5):
    #TODO: update description
    """Savings score.
   
    Parameters
    ----------
    y_true : array-like or label indicator matrix
        Ground truth (correct) labels.
    score : array-like or label indicator matrix
        Probabilities returned by a classifier.
    amount : array-like of shape = [n_samples, 4]
        Cost matrix of the classification problem
        Where the columns represents the costs of: false positives, false negatives,
        true positives and true negatives, for each example.
    cf    : float, default=10
        Fixed cost of investigation.
    Returns
    -------
    expected savings : float
        Savings of a using y_pred on y_true with cost-matrix cost-mat
        The best performance is 1.

    """

    ave_savings = np.sum(y_true*score*amount - score*cf)/np.sum(y_true*amount)
  
    return ave_savings


def savings_score(y_true, amount, cf=5):
    #TODO: update description
    """Savings score.
   
    Parameters
    ----------
    y_true : array-like or label indicator matrix
        Ground truth (correct) labels.
    amount : array-like of shape = [n_samples, 4]
        Cost matrix of the classification problem
        Where the columns represents the costs of: false positives, false negatives,
        true positives and true negatives, for each example.
    cf    : float, default=10
        Fixed cost of investigation.
    Returns
    -------
    expected savings : float
        Savings of a using y_pred on y_true with cost-matrix cost-mat
        The best performance is 1.

    """

    savings = np.sum(y_true*amount - len(y_true)*cf)/np.sum(y_true*amount)
  
    return savings


In [172]:
ix_sorted_top_rf = np.argsort(pred_m0_rf)[::-1]
topk = 0.01
y_test_ix = y_test.iloc[ix_sorted_top_rf][:int(topk*len(y_test))]
amount_test_ix = X_test['amount'].iloc[ix_sorted_top_rf][:int(topk*len(y_test))]

print('ROC-AUC: ', roc_auc_score(y_test, pred_m0_rf))
print('PR-AUC: ', average_precision_score(y_test, pred_m0_rf))
print('Expected Savings: ', ave_savings_score(y_test, pred_m0_rf, X_test['amount'], cf=10))
print('Uplift: ', average_precision_score(y_test, pred_m0_rf)/np.mean(y_test))
print('F1-score: ', f1_score(y_test, pred_m0_rf > 0.5))
print('Recall Top-1%: ',100*np.sum(y_test.iloc[ix_sorted_top_rf][:int(topk*len(y_test))])/np.sum(y_test))
print('Precision Top-1%', 100*np.mean(y_test.iloc[ix_sorted_top_rf][:int(topk*len(y_test))]))
print('Savings Top-1%: ', savings_score(y_test.iloc[ix_sorted_top_rf][:int(topk*len(y_test))], 
                                        X_test['amount'].iloc[ix_sorted_top_rf][:int(topk*len(y_test))]))
print('Recall Monetary Top-1%: ',100*np.sum(y_test_ix*amount_test_ix)/np.sum(y_test*X_test['amount']))

ROC-AUC:  0.9992352003124289
PR-AUC:  0.6166211874093772
Expected Savings:  0.2968086199717678
Uplift:  574.7656886276499
F1-score:  0.4489795918367347
Recall Top-1%:  100.0
Precision Top-1% 10.749185667752444
Savings Top-1%:  -118.64298408636219
Recall Monetary Top-1%:  100.0


In [165]:

m0_xgb = xgb.XGBClassifier(n_estimators=100,
                           max_depth=5,
                           learning_rate=0.1,
                           random_state=123, 
                           n_jobs=-1
                           
                           )
m0_xgb.fit(X_train_trans, y_train)
pred_m0_xgb = m0_xgb.predict_proba(X_test_trans)[:, 1]


In [169]:
ix_sorted_top = np.argsort(pred_m0_xgb)[::-1]
topk = 0.01
y_test_ix = y_test.iloc[ix_sorted_top][:int(topk*len(y_test))]
amount_test_ix = X_test['amount'].iloc[ix_sorted_top][:int(topk*len(y_test))]

print('ROC-AUC: ', roc_auc_score(y_test, pred_m0_xgb))
print('PR-AUC: ', average_precision_score(y_test, pred_m0_xgb))
print('Expected Savings: ', ave_savings_score(y_test, pred_m0_xgb, X_test['amount']))
print('Uplift: ', average_precision_score(y_test, pred_m0_xgb)/np.mean(y_test))
print('F1-score: ', f1_score(y_test, pred_m0_xgb > 0.5))
print('Recall Top-1%: ', 100*np.sum(y_test.iloc[ix_sorted_top][:int(topk*len(y_test))])/np.sum(y_test))
print('Precision Top-1%', 100*np.mean(y_test.iloc[ix_sorted_top][:int(topk*len(y_test))]))
print('Recall Monetary Top-1%: ',100*np.sum(y_test_ix*amount_test_ix)/np.sum(y_test*X_test['amount']))


ROC-AUC:  0.9990423978122094
PR-AUC:  0.504068121971996
Expected Savings:  0.30116774804917806
Uplift:  469.85258884419994
F1-score:  0.46153846153846156
Recall Top-1%:  100.0
Precision Top-1% 10.749185667752444
Recall Monetary Top-1%:  100.0


In [47]:
import optuna
from sklearn.metrics import average_precision_score
def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        "n_estimators": trial.suggest_int("n_estimators", 2, 50),
        "max_depth": trial.suggest_int("max_depth", 2, 5),
        "learning_rate": trial.suggest_float("learning_rate", 0.10, 1.0),
    }
       
    train_masks, val_masks, test_masks = create_masks_data_split_per_months(df,
                                                                            n_splits=3,
                                                                            offset_trainval=5,
                                                                            offset_test=1)
    scores = []
    
    train_masks = train_masks + val_masks
    for i in range(3):
        
        # split data for training and validation
        X_train = df.drop(columns=['fraud']).loc[train_masks[i]]
        y_train = df['fraud'].loc[train_masks[i]]
        
        X_val = df.drop(columns=['fraud']).loc[test_masks[i]]
        y_val = df['fraud'].loc[test_masks[i]]
        

        pipeline_training = pipe.fit(X_train, y_train)
        
        X_train = pipeline_training.transform(X_train)

        X_val = pipeline_training.transform(X_val)

        
        # train the model
        m0_xgb = xgb.XGBClassifier(**hyperparams,
                                    random_state=123, 
                                    n_jobs=-1
                                    )
        
        
        m0_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
        
        
        # evaluate the model
        y_pred = m0_xgb.predict_proba(X_val)[:, 1]
        ap = ave_savings_score(y_val, y_pred, df.drop(columns=['fraud']).loc[test_masks[i]]['amount'], cf=10)

        scores.append(ap)
   
    # Return the mean score
    return np.array(scores).mean()

In [48]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=25)

[I 2024-02-28 14:15:30,282] A new study created in memory with name: no-name-862dd232-1f01-4e3a-b207-bc4fa722ad66
[I 2024-02-28 14:15:36,589] Trial 0 finished with value: -4.530794399567267 and parameters: {'n_estimators': 3, 'max_depth': 5, 'learning_rate': 0.29316220921714}. Best is trial 0 with value: -4.530794399567267.
[I 2024-02-28 14:15:42,986] Trial 1 finished with value: 0.21282264372651669 and parameters: {'n_estimators': 35, 'max_depth': 3, 'learning_rate': 0.14722479952758447}. Best is trial 1 with value: 0.21282264372651669.
[I 2024-02-28 14:15:49,470] Trial 2 finished with value: 0.2902135509347114 and parameters: {'n_estimators': 37, 'max_depth': 2, 'learning_rate': 0.4091621857856269}. Best is trial 2 with value: 0.2902135509347114.
[I 2024-02-28 14:15:55,886] Trial 3 finished with value: 0.35470770909690047 and parameters: {'n_estimators': 25, 'max_depth': 5, 'learning_rate': 0.9486370336971207}. Best is trial 3 with value: 0.35470770909690047.
[I 2024-02-28 14:16:02,3

In [49]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'n_estimators': 25, 'max_depth': 5, 'learning_rate': 0.9486370336971207}


In [62]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_learning_rate,params_max_depth,params_n_estimators,state
0,0,-4.530794,2024-02-28 14:15:30.283114,2024-02-28 14:15:36.588821,0 days 00:00:06.305707,0.293162,5,3,COMPLETE
1,1,0.212823,2024-02-28 14:15:36.589821,2024-02-28 14:15:42.985775,0 days 00:00:06.395954,0.147225,3,35,COMPLETE
2,2,0.290214,2024-02-28 14:15:42.986775,2024-02-28 14:15:49.470265,0 days 00:00:06.483490,0.409162,2,37,COMPLETE
3,3,0.354708,2024-02-28 14:15:49.471265,2024-02-28 14:15:55.886977,0 days 00:00:06.415712,0.948637,5,25,COMPLETE
4,4,0.320467,2024-02-28 14:15:55.887978,2024-02-28 14:16:02.302124,0 days 00:00:06.414146,0.813915,3,36,COMPLETE
5,5,0.330581,2024-02-28 14:16:02.303124,2024-02-28 14:16:08.855286,0 days 00:00:06.552162,0.728034,3,28,COMPLETE
6,6,0.310289,2024-02-28 14:16:08.856286,2024-02-28 14:16:15.407178,0 days 00:00:06.550892,0.52116,4,44,COMPLETE
7,7,0.276446,2024-02-28 14:16:15.408178,2024-02-28 14:16:21.957177,0 days 00:00:06.548999,0.206513,3,48,COMPLETE
8,8,0.277153,2024-02-28 14:16:21.958177,2024-02-28 14:16:28.274881,0 days 00:00:06.316704,0.598622,3,12,COMPLETE
9,9,0.311993,2024-02-28 14:16:28.275882,2024-02-28 14:16:34.571115,0 days 00:00:06.295233,0.542179,5,13,COMPLETE


In [50]:
m0_xgb = xgb.XGBClassifier(**best_params,
                            random_state=123, 
                            n_jobs=-1
                            )

m0_xgb.fit(X_train_trans, y_train, verbose=False)
pred_m0_xgb = m0_xgb.predict_proba(X_test_trans)[:, 1]

In [51]:
ix_sorted_top = np.argsort(pred_m0_xgb)[::-1]
topk = 0.01
y_test_ix = y_test.iloc[ix_sorted_top][:int(topk*len(y_test))]
amount_test_ix = X_test['amount'].iloc[ix_sorted_top][:int(topk*len(y_test))]

print('ROC-AUC: ', roc_auc_score(y_test, pred_m0_xgb))
print('PR-AUC: ', average_precision_score(y_test, pred_m0_xgb))
print('Expected Savings: ', ave_savings_score(y_test, pred_m0_xgb, X_test['amount']))
print('Uplift: ', average_precision_score(y_test, pred_m0_xgb)/np.mean(y_test))
print('F1-score: ', f1_score(y_test, pred_m0_xgb > 0.5))
print('Recall Top-1%: ', 100*np.sum(y_test.iloc[ix_sorted_top][:int(topk*len(y_test))])/np.sum(y_test))
print('Precision Top-1%', 100*np.mean(y_test.iloc[ix_sorted_top][:int(topk*len(y_test))]))
print('Recall Monetary Top-1%: ',100*np.sum(y_test_ix*amount_test_ix)/np.sum(y_test*X_test['amount']))

ROC-AUC:  0.9985285865456399
PR-AUC:  0.5186018585364451
Expected Savings:  0.31324503182158453
Uplift:  483.39979298730464
F1-score:  0.5
Recall Top-1%:  96.96969696969697
Precision Top-1% 10.423452768729643
Recall Monetary Top-1%:  99.66918522580708
