In [10]:
import numpy as np
import pandas as pd
pd.options.display.max_seq_items = 100

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, RandomOverSampler, KMeansSMOTE
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import MiniBatchKMeans
from datetime import timedelta


from ADVO.generator import Generator
from ADVO.oversampler import ADVO, TimeGANOverSampler, CTGANOverSampler
from ADVO.utils import evaluate_models, compute_kde_difference_auc

In [11]:
# Necessary imports for this notebook

import os

import pandas as pd
import numpy as np
import datetime

# For Pandas parallelisation
from pandarallel import pandarallel
# pandarallel.initialize(nb_workers=20)
pandarallel.initialize( use_memory_fs=False, nb_workers=10) # 
# pandarallel.initialize(progress_bar=True)

# For plotting
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

#!curl -O https://raw.githubusercontent.com/Fraud-Detection-Handbook/fraud-detection-handbook/main/shared_functions_basic.ipynb
%run ../worldline_home/shared_functions_basic.ipynb
#%run ../../../worldline_home/worldline_home/shared_functions_basic.ipynb



INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [12]:
categorical = ['TERM_COUNTRY', 'TERM_MCC']


In [13]:
# Load a set of pickle files, put them together in a single dataframe, and order them by time
# It takes as input the folder DIR_INPUT where the files are stored, and the BEGIN_DATE and END_DATE
def read_from_files(DIR_INPUT, BEGIN_DATE, END_DATE):
    
    files = [join(DIR_INPUT, f) for f in listdir(DIR_INPUT) if f>=BEGIN_DATE+'.pkl' and f<=END_DATE+'.pkl']

    frames = []
    for f in files:
        df = pd.read_pickle(f)
        frames.append(df)
        del df
    df_final = pd.concat(frames)
    
    df_final=df_final.sort_values('TX_ID')
    df_final.reset_index(drop=True,inplace=True)
    #  Note: -1 are missing values for real world data 
    #df_final=df_final.replace([-1],0)
    
    return df_final


def clean_categorical(transactions_df):
    
    all_features = transactions_df.columns

    prel_df = transactions_df[transactions_df['TX_FRAUD']==1]
    #print(prel_df.loc[prel_df['TERM_COUNTRY']=='SLV'])
    prel_df_gen = transactions_df[transactions_df['TX_FRAUD']==0] 
    categorical = ['TERM_COUNTRY', 'TERM_MCC']
    
    #print('Expected number is ' + str(prel_df['TERM_COUNTRY'].value_counts()['SLV']))
    
    features_counts = {}
    features_counts_gen = {}
    
    for column in categorical:
        print(column)
        features_counts[column] =  prel_df[column].value_counts()
        features_counts_gen[column] =  prel_df_gen[column].value_counts()
    #print(features_counts['TERM_COUNTRY']['SLV'])
    values_to_keep = {}
    percentages = {}

    counter = 0 
    sum_freq = 0

    #print('Feature counts terminals columns are ' + str(features_counts['TERM_COUNTRY'].keys()))
    #print('Single features are:')
    for feature in features_counts.keys():
        values_to_keep[feature] = []
        percentages[feature] = {}
        frequencies = 0
        #print(features_counts[feature].keys())
        for subkey in features_counts[feature].keys():
            # print('Subkey is '+str(subkey) + ', values are ' + str(features_counts_gen[feature][subkey]) )
            try:
                frequency = features_counts[feature][subkey]/prel_df_gen.shape[0]
            except:
                print('We are looking into ' + str(features_counts[feature]) + ', for subkey ' + str(subkey) )
                
                frequency = features_counts[feature][subkey]/prel_df_gen.shape[0]
            
            values_to_keep[feature].append(subkey)
            if subkey in   features_counts_gen[feature].keys():
                percentages[feature][subkey] =    features_counts[feature][subkey] / (features_counts[feature][subkey] + features_counts_gen[feature][subkey])
            else: 
                percentages[feature][subkey] =    1
            #else:
            #    sum_freq +=  (features_counts[feature][subkey] /  features_counts_gen[feature][subkey]) * frequency
            #    frequencies += frequency
            #    counter = counter + 1
        # if counter >0:
        for subkey in features_counts_gen[feature].keys():
            if subkey not in percentages[feature].keys():
                percentages[feature][subkey] = 0
    #print(percentages)
    #Scale percentages                
    for key in percentages.keys():
        max_val = max(percentages[key].values())
        percentages[key] = {k: v/max_val for k, v in percentages[key].items()}
        
    for column in categorical:
        transactions_df.replace(percentages[column], inplace=True)    
    return transactions_df



def retrieve_and_clean_realData(DIR_INPUT, BEGIN_DATE, END_DATE):
    df = read_from_files(DIR_INPUT, BEGIN_DATE, END_DATE)
    df.rename({'TX_TIME_DAYS': 'TX_DAY', 'TX_TIME_SECONDS': 'TX_TIME'}, inplace=True, axis = 1)
    df.drop(['TX_3D_SECURE', 'TX_LOCAL_AMOUNT',
        'TX_LOCAL_CURRENCY', 'TX_CARD_ENTRY_MODE', 'CARD_AUTHENTICATION','TX_INTL', 'AGE', 'LANGUAGE', 'GENDER', 'BROKER', 'ZIP',
        'INS_CODE', 'CITY', 'COUNTRY', 'PROVINCE_CODE', 'DISTRICT_CODE',
        'CARD_BRAND', 'CARD_EXPIRY', 'CARD_TYPE', 'CREDIT_LIMIT', 'TX_ECOM_IND', 'TX_ID'], axis=1, inplace=True)
    df = clean_categorical(df)
    df.rename({'TERM_MCC': 'X_TERMINAL', 'TERM_COUNTRY': 'Y_TERMINAL'}, inplace=True, axis = 1)
    return df


In [14]:
DIR_INPUT ='../worldline_home/2018/baseline/data_clean/'
#DIR_INPUT='../worldline_home/media/hdd3/worldline_home/2018/baseline/data_clean/' 
#DIR_INPUT = '2018/baseline/data_clean/'

BEGIN_DATE = "2018-04-01"
END_DATE = "2018-04-20"
# To load everything:
# END_DATE = "2018-09-30"

#BEGIN_DATE = "2018-07-25"
#END_DATE = "2018-08-31"

print("Load  files")
%time transactions_df=retrieve_and_clean_realData(DIR_INPUT, BEGIN_DATE, END_DATE)


Load  files
TERM_COUNTRY
TERM_MCC
CPU times: user 36min 3s, sys: 1min 22s, total: 37min 25s
Wall time: 37min 22s


In [25]:
SAMPLE_STRATEGY = 0.18
N_JOBS = 10
N_TREES = 20
N_USERS = 10000
N_TERMINALS = 1000
RANDOM_STATE = 42

RANDOM_GRID_RF = {'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max_features': [1, 'sqrt', 'log2'], 'max_depth': [5, 16, 28, 40, None], 'min_samples_split': [10, 25, 50], 'min_samples_leaf': [4, 8, 32], 'bootstrap': [True, False]}
RANDOM_GRID_RIDGE = {'alpha': [int(x) for x in np.linspace(start = 0.001, stop = 1, num = 100)], 'fit_intercept': [True, False]}
RANDOM_GRID_NN = {'hidden_layer_sizes': [int(x) for x in np.linspace(start = 1, stop = 41, num = 80)], 'alpha': [int(x) for x in np.linspace(start = 0.005, stop = 0.02, num = 100)]}


CANDIDATE_REGRESSORS = [MLPRegressor(max_iter=2000, random_state=RANDOM_STATE), Ridge(random_state=RANDOM_STATE), RandomForestRegressor(random_state=RANDOM_STATE)]
CANDIDATE_GRIDS = [RANDOM_GRID_NN, RANDOM_GRID_RIDGE, RANDOM_GRID_RF]

In [26]:
def fit_predict(X_train,y_train,learner, X_test, predictions_proba, discrete_predictions):
    learner.fit(X_train, y_train)
    y_hat = learner.predict(X_test)
    y_hat_proba = learner.predict_proba(X_test)[:,1]
    predictions_proba.append(y_hat_proba)
    discrete_predictions.append(y_hat)

def run_advo(X_train, y_train, window_counter):
    advo = ADVO(n_jobs=N_JOBS,sampling_strategy=SAMPLE_STRATEGY,random_state=RANDOM_STATE, mimo=False)
    advo.set_transactions(X_train, y_train)
    advo.create_couples()
    regressor_scores = advo.select_best_regressor(candidate_regressors=CANDIDATE_REGRESSORS,parameters_set=CANDIDATE_GRIDS)
    advo.tune_best_regressors()
    advo.fit_regressors()
    advo.transactions_df = advo.insert_synthetic_frauds(advo.transactions_df)
    regressor_scores.to_csv('realresults/regressor_scores_'+str(window_counter)+'.csv', index=False)
    return advo

In [28]:
def make_classification(train_size_days=5, test_size_days=2):

    #transactions_df = Generator().generate(filename='dataset_six_months.csv',nb_days_to_generate=180)
    #transactions_df = pd.read_csv('utils/dataset_six_months.csv', parse_dates=['TX_DATETIME'])

    start_date, end_date = transactions_df['TX_DATETIME'].min(), transactions_df['TX_DATETIME'].max()
    
    window_start, window_end, window_counter  = start_date, start_date + timedelta(days=train_size_days), 0
    while window_end <= end_date:
        print('Window: ', window_counter, ' - ', window_start, ' - ', window_end)

        # Split data into train and test according to the window
        train_mask, test_mask = (transactions_df['TX_DATETIME'] >= window_start) & (transactions_df['TX_DATETIME'] < window_end), (transactions_df['TX_DATETIME'] >= window_end) & (transactions_df['TX_DATETIME'] < window_end + timedelta(days=test_size_days))
        X_train, y_train, X_test, y_test = transactions_df[train_mask].drop(columns=['TX_FRAUD']), transactions_df[train_mask]['TX_FRAUD'], transactions_df[test_mask].drop(columns=['TX_FRAUD']), transactions_df[test_mask]['TX_FRAUD']
        training_variables, predictions_proba, discrete_predictions = ['X_TERMINAL', 'Y_TERMINAL', 'TX_AMOUNT'], [], []

        # Oversample data using ADVO, SMOTE, RandomOverSampler and KMeansSMOTE
        advo = run_advo(X_train, y_train, window_counter)
        kmeans_smote = KMeansSMOTE(n_jobs=N_JOBS, kmeans_estimator=MiniBatchKMeans(n_init=3),sampling_strategy=SAMPLE_STRATEGY, cluster_balance_threshold=0.005, random_state=RANDOM_STATE).fit_resample(X_train[training_variables], y_train)
        smote = SMOTE(k_neighbors=NearestNeighbors(n_jobs=N_JOBS),sampling_strategy=SAMPLE_STRATEGY,random_state=RANDOM_STATE).fit_resample(X_train[training_variables], y_train)
        random = RandomOverSampler(sampling_strategy=SAMPLE_STRATEGY, random_state=RANDOM_STATE).fit_resample(X_train[training_variables], y_train)
        timegan = TimeGANOverSampler(sampling_strategy=SAMPLE_STRATEGY, epochs=100, seq_len=4, n_seq=3, hidden_dim=24, gamma=1, noise_dim = 32, dim = 128, batch_size = 32, log_step = 100, learning_rate = 5e-4,random_state=RANDOM_STATE).fit_resample(X_train[training_variables+['CUSTOMER_ID']], y_train)
        ctgan = CTGANOverSampler(sampling_strategy=SAMPLE_STRATEGY,random_state=RANDOM_STATE).fit_resample(X_train[training_variables], y_train)
    
        names = ['Baseline','Baseline_balanced', 'SMOTE','Random', 'KMeansSMOTE', 'CTGAN','TIMEGAN', 'ADVO']
        Xy = [(X_train[training_variables], y_train), kmeans_smote, smote, random, ctgan, timegan, (advo.transactions_df[advo.useful_features], advo.transactions_df['TX_FRAUD'])]

        fit_predict(X_train[training_variables],y_train, RandomForestClassifier(n_estimators=N_TREES ,n_jobs=N_JOBS, random_state=RANDOM_STATE) , X_test[training_variables], predictions_proba, discrete_predictions)
        for X, y in Xy:
            fit_predict(X,y, BalancedRandomForestClassifier(n_estimators=N_TREES ,n_jobs=N_JOBS, random_state=RANDOM_STATE) , X_test[training_variables], predictions_proba, discrete_predictions)

        # Compute metrics
        _, all_metrics = evaluate_models(predictions_proba, discrete_predictions, X_test['CUSTOMER_ID'], names, y_test, K_needed = [50, 100, 200, 500, 1000, 2000])
        all_metrics.to_csv('realresults/all_metrics_'+str(window_counter)+'.csv', index=False)
        trapzs = compute_kde_difference_auc(Xy, training_variables, names)
        trapzs.to_csv('realresults/trapz_'+str(window_counter)+'.csv', index=False)
        

        window_start, window_end, window_counter  = window_end, window_end + timedelta(days=train_size_days), window_counter + 1
        print('Window ', window_counter, ' done')

In [29]:
np.random.seed(RANDOM_STATE)
    
make_classification(train_size_days=5, test_size_days=1)

Window:  0  -  2018-04-01 01:00:02  -  2018-04-06 01:00:02


Emddeding network training: 100%|████████████████████| 100/100 [00:18<00:00,  5.43it/s]
Supervised network training: 100%|███████████████████| 100/100 [00:13<00:00,  7.68it/s]
Joint networks training: 100%|███████████████████████| 100/100 [03:13<00:00,  1.93s/it]
Synthetic data generation: 100%|███████████████████| 2902/2902 [13:30<00:00,  3.58it/s]


Window  1  done
Window:  1  -  2018-04-06 01:00:02  -  2018-04-11 01:00:02


RuntimeError: No clusters found with sufficient samples of class 1. Try lowering the cluster_balance_threshold or increasing the number of clusters.

In [None]:
synthetic_transactions_df = pd.read_csv('utils/dataset_six_months.csv', parse_dates=['TX_DATETIME'])

In [None]:
synthetic_transactions_df.columns, transactions_df.columns

In [None]:
[column for column in synthetic_transactions_df.columns if column not in transactions_df.columns], [column for column in transactions_df.columns if column not in synthetic_transactions_df.columns]

In [None]:
%time tr=read_from_files(DIR_INPUT, BEGIN_DATE, END_DATE)


In [None]:
fraufs_df = tr[tr['TX_FRAUD'] == 1]
fraufs_df[fraufs_df['TERM_COUNTRY']=='SLV']

In [None]:
fraufs_df = tr[tr['TX_FRAUD'] == 1]['TERM_COUNTRY'].value_counts()['SLV']
fraufs_df#[fraufs_df['TERM_MCC']=='9999']

In [None]:
#7929  VS 9999