In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import scipy
import seaborn as sns
from utils import GraphUtils, ExpUtils
import pandas as pd
import tensorflow as tf
from tensorflow_probability import edward2 as ed
from tensorflow.python import tf2
if not tf2.enabled():
    import tensorflow.compat.v2 as tf
    tf.enable_v2_behavior()
    assert tf2.enabled()
from tqdm import tqdm
import tensorflow_probability as tfp
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_curve,roc_auc_score, classification_report
from matplotlib import pyplot as plt

sns.reset_defaults()
sns.set_context(context='talk',font_scale=0.7)
plt.rcParams['image.cmap'] = 'viridis'

%matplotlib inline

tfd = tfp.distributions



## original data are available here (https://www.kaggle.com/ntnu-testimon/banksim1)

In [None]:
# Load the BankSim transactions from a local copy.
# A forward slash keeps the relative path valid on POSIX systems as well as on Windows,
# and matches the other data paths used in this notebook.
df_bank = pd.read_csv('data/bs140513_032310.csv')

# Data cleaning

remove the zip code information as it is the same across all the customers

clean the strings from category, merchant, age, gender, and customer fields

In [None]:
# Drop the zip-code columns (described above as identical for all customers).
# `errors='ignore'` keeps this cell re-runnable once the columns are already gone.
df_bank = df_bank.drop(columns=['zipMerchant', 'zipcodeOri'], errors='ignore')

# Strip leading/trailing single quotes from the string-valued fields.
for quoted_col in ['category', 'merchant', 'age', 'gender', 'customer']:
    df_bank[quoted_col] = df_bank[quoted_col].str.strip("'")

In [None]:
df_bank.head()

# Extract additional features
- risk of nature of business: for each business type, measure the probability that one of its transactions is fraudulent
- amount context: output the p-value of the amount after all of the customer's transactions are fitted to a lognorm distribution
- frequency: fit all the frequencies across the dataset with a lognorm distribution and then measure the probability of the frequency of a given transaction (frequent or not frequent states only)

# Nature of Business

In [None]:
# Risk of nature of business
# NOTE(review): `nature_of_bus_lbl` is not referenced again in the visible notebook.
nature_of_bus_lbl = LabelEncoder().fit(df_bank['category'])

#calculate nature of business risk
# For every category: (number of its rows with fraud == 1) / (number of its rows).
risk_per_nob = {}
for b in df_bank.category.unique():
    t_df = df_bank[df_bank['category']==b]
    ratio = t_df[t_df['fraud']==1].shape[0] / t_df.shape[0]
    risk_per_nob[b] = ratio

In [None]:
# Fraud ratios as a plain array, in the insertion order of `risk_per_nob`.
# NOTE(review): `nob_risks` is not referenced again in the visible notebook.
nob_risks=np.asarray(list(risk_per_nob.values()))

In [None]:
def risk_to_label(risk):
    """Turn a fraud ratio into a noisy 'low' / 'medium' / 'high' label.

    The tiers are tried in order ('low' for risk <= 0.2, 'medium' for
    risk <= 0.7). A tier whose threshold is met is accepted only when a fresh
    ``np.random.rand()`` draw exceeds 0.1; otherwise the next tier is tried.
    Anything that is not accepted by a tier ends up as 'high'.
    """
    tiers = (('low', 0.2), ('medium', 0.7))
    for label, ceiling in tiers:
        # `and` short-circuits, so a random number is drawn only when the threshold is met.
        if risk <= ceiling and np.random.rand() > 0.1:
            return label
    return 'high'

In [None]:
# Take an explicit copy: new columns are added to `df_nob` below, and assigning into a
# slice of `df_bank` triggers pandas' SettingWithCopyWarning.
df_nob = df_bank[['amount','category']].copy()
# Noisy low/medium/high label derived from the fraud ratio of each row's category.
df_nob['risk_label']=df_nob['category'].apply(lambda x :risk_to_label(risk_per_nob[x]))

In [None]:
# build a classifier for nature of business risk
# One-hot encoder fitted on the category column (reshaped to a single-feature 2-D array).
nob_cat_encoder = OneHotEncoder().fit(np.asarray(df_nob['category']).reshape(-1,1))

# Integer class id for each risk label, and the dense one-hot category matrix.
df_nob['lbl'] =  LabelEncoder().fit_transform(df_nob['risk_label'])
cat_feature = nob_cat_encoder.transform(np.asarray(df_nob['category']).reshape(-1,1)).toarray()

# Feature names: the encoder's category names followed by 'amount' (np.append flattens its input).
feature_names_nob = np.append(nob_cat_encoder.categories_,['amount'])

In [None]:
# Feature matrix: one-hot category columns followed by the raw transaction amount.
data_nob = np.hstack((cat_feature,np.asarray(df_nob['amount']).reshape(-1,1)))

# One numeric feature column per feature name.
feature_columns=[]
for feature_name in feature_names_nob:
    feature_columns.append(tf.feature_column.numeric_column(feature_name,
                                           dtype=tf.float32))
data_nob = pd.DataFrame(columns = feature_names_nob,data=data_nob)
# NOTE(review): training and evaluation use the same rows, so the reported metrics are in-sample.
train_input_fn_nob = ExpUtils.make_input_fn(data_nob, df_nob['lbl'])
eval_input_fn_nob = ExpUtils.make_input_fn(data_nob, df_nob['lbl'], num_epochs=1, shuffle=False)

# Three-class linear classifier for the risk label.
nob_est = tf.estimator.LinearClassifier(feature_columns=feature_columns,n_classes=3)
nob_est.train(train_input_fn_nob)
nob_est.evaluate(eval_input_fn_nob)

In [None]:
# Predict on the (unshuffled, single-epoch) evaluation input and keep the predicted class id.
pred_dicts = list(nob_est.predict(eval_input_fn_nob))
preds = [int(pred['classes'][0]) for pred in pred_dicts]

# Per-class precision/recall against the encoded risk labels.
print(classification_report(df_nob['lbl'],preds))

## Amount Context

In [None]:
# fit the prior distribution
amounts = df_bank['amount']
# NOTE(review): other cells call `scipy.stats.lognorm` after a bare `import scipy`;
# presumably they rely on this import to have loaded the `scipy.stats` submodule.
from scipy.stats import norm,kstest,lognorm
# NOTE(review): `param` is not used by the distribution below, which recomputes
# the mean and standard deviation with TensorFlow.
param = norm.fit(amounts[amounts<5000])
# Truncated normal on [0, 10000] parameterised by the mean/std of the amounts below 5000.
# NOTE(review): the experiments further down use `amount_prior` (singular), not this object.
amounts_prior = tfd.TruncatedNormal(loc=tf.reduce_mean(amounts[amounts<5000]),
                                    scale =tf.math.reduce_std(amounts[amounts<5000]),low=0., high=10000)

In [None]:
# Visual sanity check: density histogram of 10000 draws from the fitted prior.
o = amounts_prior.sample(10000)
plt.hist(o,density=True)

## Frequency

In [None]:
def extract_frequency_features(transaction,cust_df):
    """Summarise one customer's history with the merchant of ``transaction``.

    Parameters
    ----------
    transaction : mapping with a 'merchant' entry.
    cust_df : DataFrame of the customer's transactions; the 'merchant',
        'step' and 'amount' columns are read.

    Returns
    -------
    tuple
        (number of rows with that merchant, array of their 'step' values,
        mean amount, standard deviation of the amount, the filtered rows).
    """
    same_merchant = cust_df[cust_df['merchant'] == transaction['merchant']]
    merchant_amounts = same_merchant['amount']

    return (
        same_merchant.shape[0],
        np.array(same_merchant['step']),
        np.mean(merchant_amounts),
        np.std(merchant_amounts),
        same_merchant,
    )

In [None]:
# This part is only necessary if you want to re-collect and label the frequency data 
# otherwise load the labelled frequency data

# False: read the previously labelled sample; True: draw 200 transactions and label them by hand.
relabel_frequency=False

if not relabel_frequency:
    sample_freq_data = pd.read_csv('data/ferquency_training_data.csv')
else:
    #collect all the frequency data
    n = 200
    sample_freq_data = df_bank.sample(n,)
    freq_labels = np.zeros(n)
    i=0
    for idx, sample in sample_freq_data.iterrows():
        # Show the customer's history with this merchant so a human can judge the frequency.
        cust_df = df_bank[df_bank['customer']==sample['customer']]
        tr_count,mer_time_stamps,avg_amount,std_amount,mer_data = extract_frequency_features(sample,cust_df)
        avg_time = 0
        std_time = 0
        # Time-between-transactions statistics need at least two time stamps.
        if len(mer_time_stamps)>1:
            avg_time = np.mean(np.diff(mer_time_stamps))
            std_time = np.std(np.diff(mer_time_stamps))
        # NOTE(review): there is no comma between '\n' and 'Count:' below, so the two
        # literals are concatenated into a single printed argument.
        print('index:',i,'\n',
          'all transactions', cust_df.shape[0],'\n'
          'Count:',tr_count,'\n',
          #'time_stamps:',mer_time_stamps,'\n',
          'average_time:',avg_time,'\n',
          'std_time:',std_time,'\n',
          'average amount:',avg_amount,'\n',
          'std amount:',std_amount)
        x = input('Enter 0-infrequent, 1- rare, 2- regular:\n')
        # NOTE(review): quitting with 'q' leaves every remaining label at its initial 0.
        if x=='q':
            break
        freq_labels[i]=int(x)
        i = i+1
    sample_freq_data['freq_lbl']=freq_labels.astype(int)
    sample_freq_data['frequency'] = np.asarray(['infrequent','rare','regular'])[freq_labels.astype(int)]
    # NOTE(review): written to a different file name/location than the one read above.
    sample_freq_data.to_csv('ferquency_training_data_temp.csv')

In [None]:
sample_freq_data.head()

In [None]:
# One row per labelled sample, columns:
# [avg time between transactions, std of that time, avg amount, std amount,
#  share of the customer's transactions made with this merchant].
freq_data=np.zeros((sample_freq_data.shape[0],5))
i=0
for idx, sample in sample_freq_data.iterrows():
    cust_df = df_bank[df_bank['customer']==sample['customer']]
    tr_count,mer_time_stamps,avg_amount,std_amount,mer_data = extract_frequency_features(sample,cust_df)
    avg_time = 0
    std_time = 0
    # Differences between consecutive time stamps need at least two of them.
    if len(mer_time_stamps)>1:
        avg_time = np.mean(np.diff(mer_time_stamps))
        std_time = np.std(np.diff(mer_time_stamps))
    freq_data[i,0]=avg_time
    freq_data[i,1]=std_time
    freq_data[i,2]=avg_amount
    freq_data[i,3]=std_amount
    freq_data[i,4]=tr_count/cust_df.shape[0]
    i = i+1

In [None]:
# Probability that a log-normal variable falls below a given point.
def estimate_probability(_param,sample,n=10000):
    """Return P(X < sample) for a log-normal X described by ``_param``.

    The value is read directly from the log-normal CDF, so it is exact and
    deterministic (earlier versions approximated it by drawing ``n`` random
    samples and counting how many fell below ``sample``).

    Parameters
    ----------
    _param : sequence
        Parameters in the order returned by ``scipy.stats.lognorm.fit``:
        shape parameter(s) first, then ``loc`` and ``scale`` as the last two.
    sample : float
        Point at which the probability is evaluated.
    n : int, optional
        Kept for backward compatibility with existing callers; no longer used.

    Returns
    -------
    float
        Probability in [0, 1].
    """
    shape_args = _param[:-2]
    return float(scipy.stats.lognorm.cdf(sample, *shape_args,
                                         loc=_param[-2], scale=_param[-1]))

In [None]:
# gender encoder
# Maps each distinct gender string in the data to an integer code (used by feature_extraction).
gender_encoder = LabelEncoder().fit(df_bank['gender'])

# features :
- age 
- gender
- category
- amount
- average time between transactions with a given merchant (avg_tbt)
- stddev of the time between transactions with a given merchant (std_tbt)
- average transaction amount with a given merchant (avg_amt)
- stddev of the transaction amount with a given merchant (std_amt)
- ratio of number of transactions with a given merchant to the overall transactions number (t_count)

In [None]:
# to speed the feature extraction we will cache some of the calculations

# customer name -> fitted lognorm parameters of that customer's amounts
cache_amount={}
# '<customer>_<merchant>' -> [avg_tbt, std_tbt, avg_amt, std_amt, t_count]
cache_customer={}

In [None]:
def feature_extraction(customer_df,merchant,category,amount):
    """Build the feature vector for one transaction of one customer.

    Parameters
    ----------
    customer_df : DataFrame with all transactions of a single customer; the
        'customer', 'age', 'gender', 'amount', 'merchant' and 'step' columns are read.
    merchant : merchant id of the transaction.
    category : merchant category of the transaction.
    amount : transaction amount.

    Returns
    -------
    list
        [age, gender code, one-hot category..., amount context,
         avg_tbt, std_tbt, avg_amt, std_amt, t_count].

    Uses the module-level ``gender_encoder``, ``nob_cat_encoder``,
    ``cache_amount`` and ``cache_customer`` objects; both caches are filled
    as a side effect.
    """
    customer_name = customer_df['customer'].unique()[0]

    # --- demographics: age code ('U' is mapped to 7) and encoded gender ---
    raw_age = list(customer_df['age'])[0]
    age = 7 if raw_age == 'U' else int(raw_age)
    gender_code = gender_encoder.transform([list(customer_df['gender'])[0]])[0]
    features = [age, gender_code]

    # --- nature of business: one-hot encoded category ---
    category_column = np.asarray([category]).reshape(-1,1)
    features.extend(nob_cat_encoder.transform(category_column).toarray()[0])

    # --- amount context ---
    if customer_df.shape[0] >= 10:
        # Enough history: probability of a smaller amount under the customer's
        # fitted lognorm (the fit is cached per customer).
        param = cache_amount.get(customer_name)
        if param is None:
            param = scipy.stats.lognorm.fit(customer_df['amount'])
            cache_amount[customer_name] = param
        features.append(estimate_probability(param,amount,n=1000))
    else:
        # Short history: share of the customer's total spend.
        features.append(amount/np.sum(customer_df['amount']))

    # --- frequency features for this customer/merchant pair (cached) ---
    key = customer_name+'_'+merchant
    if cache_customer.get(key) is None:
        merchant_rows = customer_df[customer_df['merchant']==merchant]
        time_stamps = np.array(merchant_rows['step'])

        gap_mean = 0
        gap_std = 0
        if len(time_stamps)>1:
            gap_mean = np.mean(np.diff(time_stamps))
            gap_std = np.std(np.diff(time_stamps))

        cache_customer[key] = [
            gap_mean,
            gap_std,
            np.mean(merchant_rows['amount']),
            np.std(merchant_rows['amount']),
            merchant_rows.shape[0]/customer_df.shape[0],
        ]

    avg_tbt, std_tbt, avg_amt, std_amt, t_count = cache_customer[key]
    features.extend([avg_tbt,std_tbt,avg_amt,std_amt,t_count])

    return features

In [None]:
#test feature extraction
# Smoke test on one hard-coded customer / merchant / category / amount.
customer_df = df_bank[df_bank['customer']=='C352968107']

feature_extraction(customer_df,'M348934600','es_transportation',39.68)

In [None]:
# Column names in the order feature_extraction emits its values.
# Note: the 'amount' slot receives the amount-context value computed by feature_extraction
# (a spend ratio or a probability), not the raw transaction amount.
feature_names_all = ['age','gender']
feature_names_all.extend(nob_cat_encoder.categories_[0])
feature_names_all.extend(['amount','avg_tbt','std_tbt','avg_amt','std_amt','t_count'])

## Load the graph and then stress test the models

In [None]:
# Load the network structure from the spreadsheet and draw it.
# Later cells copy it into `conf_df` and read its 'node' and 'parent_node' columns.
network_structure = GraphUtils.load_graph(r"graph_structures/fraud_example_structure.xlsx")
GraphUtils.visualise_network(network_structure)

In [None]:
import time
load_data = True #load all training data only set to false if you require re-running the feature extraction code.
if load_data:
    # Pre-computed features; 'Unnamed: 0' is the index column written by DataFrame.to_csv.
    data = pd.read_csv('data/training_data.csv')
    del data['Unnamed: 0']
else:
    #run the feature extraction on all the transactions
    feature_data=[]

    # Split the transactions by customer once, instead of re-filtering the whole
    # frame for every row (which made this loop quadratic in the number of rows).
    # Each group keeps the original row order and index of df_bank.
    customer_frames = {name: frame for name, frame in df_bank.groupby('customer', sort=False)}

    with tqdm(total=df_bank.shape[0]) as pbar:
        for _, row in df_bank.iterrows():
            customer_df = customer_frames[row['customer']]
            row_features = feature_extraction(customer_df,
                                              row['merchant'],
                                              row['category'],
                                              row['amount'])
            feature_data.append(row_features)
            pbar.update(1)
    # Rows were appended in df_bank order, so the fraud flag lines up by index.
    data = pd.DataFrame(columns=feature_names_all,data=feature_data)
    data['label'] = df_bank['fraud']
    


## Build the nature of business classifier

In [None]:
# Balance the classes: keep every fraud row and add an equally sized random sample of
# legitimate rows. The sample size follows the actual fraud count rather than a
# hard-coded 7200, so the cell stays correct if the data changes.
fraud_rows = data[data['label']==1]
legit_rows = data[data['label']==0].sample(len(fraud_rows))
sub_sampled = pd.concat((fraud_rows, legit_rows))

In [None]:
# Labels of the balanced sample, read from the sample itself so they always line up
# row-for-row with `sub_sampled` (instead of assuming exactly 7200 rows per class).
sub_sampled_labels = sub_sampled['label'].to_numpy(dtype=int)

In [None]:
# One-hot encoder over the category column; referenced by name (string) in the `encoders_` configuration.
# NOTE(review): fitted on the same column as `nob_cat_encoder` above.
nob_onehot_encoder = OneHotEncoder().fit(np.array(df_bank['category']).reshape(-1,1))

In [None]:
# nature of business classifier
# Re-trains `nob_est`, this time on the columns of `data` (where 'amount' is the
# amount-context feature) and with float64 feature columns.
feature_columns_nob=[]
for feature_name in feature_names_nob:
    feature_columns_nob.append(tf.feature_column.numeric_column(feature_name,
                                           dtype=tf.float64))
data_nob = data[feature_names_nob]
# NOTE(review): labels come from `df_nob`; this assumes `data` has one row per df_bank row, in the same order.
train_input_fn_nob = ExpUtils.make_input_fn(data_nob, df_nob['lbl'])
eval_input_fn_nob = ExpUtils.make_input_fn(data_nob, df_nob['lbl'], num_epochs=1, shuffle=False)

nob_est = tf.estimator.LinearClassifier(feature_columns=feature_columns_nob,n_classes=3)
nob_est.train(train_input_fn_nob)
nob_est.evaluate(eval_input_fn_nob)

## Build the frequency classifier

In [None]:
# build the frequency classifier

# Names for the five columns of `freq_data`, in the order they were filled.
feature_names_freq = ['avg_tbt','std_tbt','avg_amt','std_amt','t_count']
feature_columns_freq=[]
for feature_name in feature_names_freq:
    feature_columns_freq.append(tf.feature_column.numeric_column(feature_name,
                                           dtype=tf.float64))
data_frequency = pd.DataFrame(columns = feature_names_freq,data=freq_data)

# NOTE(review): trained and evaluated on the same hand-labelled sample.
train_input_fn_freq = ExpUtils.make_input_fn(data_frequency, sample_freq_data['freq_lbl'])
eval_input_fn_freq = ExpUtils.make_input_fn(data_frequency, sample_freq_data['freq_lbl'], num_epochs=1, shuffle=False)

# Three classes: the labelling cell maps 0/1/2 to infrequent/rare/regular.
freq_est = tf.estimator.LinearClassifier(feature_columns=feature_columns_freq,n_classes=3)
freq_est.train(train_input_fn_freq)
freq_est.evaluate(eval_input_fn_freq)

## Build the decision model

In [None]:
# get the predictions from the freq and nob estimators to build the decision classifier
# The input function is built over the whole `data` frame, without labels.
# NOTE(review): presumably the estimator reads only its own feature columns — confirm in ExpUtils.
input_fn_ = ExpUtils.make_input_fn(data, None, num_epochs=1, shuffle=False)
freq_preds = ExpUtils.get_class_probs(input_fn_,freq_est)

In [None]:
# Class probabilities from the nature-of-business estimator for every row of `data`.
nob_preds = ExpUtils.get_class_probs(input_fn_,nob_est)

In [None]:
# Inputs of the decision classifier: three direct features plus the class
# probabilities produced by the frequency and nature-of-business classifiers.
feature_names_decision=['age','gender','amount']
feature_names_decision.extend(['infrequent','rare','regular','high', 'low', 'medium'])

In [None]:
# Frequency classes follow the labelling cell: 0=infrequent, 1=rare, 2=regular.
data['infrequent'] = freq_preds[:,0]
data['rare'] = freq_preds[:,1]
data['regular'] = freq_preds[:,2]

# Risk classes were encoded with LabelEncoder, which numbers classes in sorted order:
# 0=high, 1=low, 2=medium.
# NOTE(review): assumes get_class_probs returns columns in class-id order — confirm in ExpUtils.
data['high'] = nob_preds[:,0]
data['low'] = nob_preds[:,1]
data['medium'] = nob_preds[:,2]

In [None]:
# Raw category string per row (aligned with df_bank by index); read by joint_prob through the encoder.
data['category'] = df_bank['category']

In [None]:
# Persist freshly extracted features (including the prediction and category columns added above).
# NOTE(review): written to 'training_data.csv' while the loader reads 'data/training_data.csv'.
if not load_data:
    data.to_csv('training_data.csv')

In [None]:
feature_columns_decision=[]
for feature_name in feature_names_decision:
    feature_columns_decision.append(tf.feature_column.numeric_column(feature_name,
                                           dtype=tf.float64))
# `sub_sampled` was taken before the classifier-probability columns were written to
# `data`, so select the same rows from `data` by index. This picks up the current
# 'infrequent' ... 'medium' values and avoids a KeyError when the features were
# extracted from scratch rather than loaded from disk.
data_decision = data.loc[sub_sampled.index, feature_names_decision]

# NOTE(review): trained and evaluated on the same balanced sample.
train_input_fn_dec = ExpUtils.make_input_fn(data_decision, sub_sampled_labels)
eval_input_fn_dec = ExpUtils.make_input_fn(data_decision, sub_sampled_labels, num_epochs=1, shuffle=False)

# Binary fraud / not-fraud decision model.
dec_est = tf.estimator.LinearClassifier(feature_columns=feature_columns_decision,n_classes=2)
dec_est.train(train_input_fn_dec)
dec_est.evaluate(eval_input_fn_dec)

## Define Priors and Feature distributions

In [None]:
# define the priors and 
# NOTE(review): the nob_prior probabilities sum to 1.18 rather than 1 — confirm the intended values.
nob_prior = tfd.OneHotCategorical(probs=[.1,.88,.2]) 
freq_prior = tfd.OneHotCategorical(probs=[.4,.15,.45]) 
dec_prior = tfd.OneHotCategorical(probs=[.98,.02])
# Prior over the amount feature on [0, 1]; referenced by name in the `priors_` configuration.
amount_prior = tfd.TruncatedNormal(loc=0.6,scale =0.27,low=0., high=1.)

In [None]:
# Relative frequency of each category, re-ordered to the explicit (alphabetical) list below.
# NOTE(review): presumably this matches the category order of the one-hot encoder — confirm.
cat_freq = df_bank['category'].value_counts(normalize=True)[['es_barsandrestaurants',
 'es_contents',
 'es_fashion',
 'es_food',
 'es_health',
 'es_home',
 'es_hotelservices',
 'es_hyper',
 'es_leisure',
 'es_otherservices',
 'es_sportsandtoys',
 'es_tech',
 'es_transportation',
 'es_travel',
 'es_wellnessandbeauty']]

In [None]:
#define the distributions
temperature = .7
# A Categorical sample is the *position* of the drawn class, and the simulation uses
# that integer directly as the age / gender code. The probabilities therefore have to
# be ordered by code (sort_index); value_counts(sort=False) alone does not guarantee
# that order.
age_dist = tfd.Categorical(probs=data['age'].value_counts(normalize=True, sort=False).sort_index())
gender_dist = tfd.Categorical(probs=data['gender'].value_counts(normalize=True, sort=False).sort_index())
# Relaxed one-hot samples are mapped back to a category with argmax in one_run_simulation.
category_dist = tfd.RelaxedOneHotCategorical(temperature, probs=cat_freq)
amount_dist = tfd.TruncatedNormal(loc=0.6,scale =0.27,low=0., high=1.)
std_tbt_dist = tfd.Gamma(concentration=0.395,rate = 0.1) # params estimated using scipy
avg_tbt_dist = tfd.Gamma(concentration=0.1277,rate = 0.04)

avg_amt_dist = tfd.Gamma(concentration=2.7,rate = 0.1)
std_amt_dist = tfd.Gamma(concentration=3.5,rate = 0.2)
t_count_dist = tfd.TruncatedNormal(loc=0.6,scale =0.27,low=0., high=1.)

In [None]:
# Prepare the configuration to start the experimentation.
# Work on a copy so the loaded network structure itself is left untouched.
conf_df = network_structure.copy()

In [None]:
# Store the age and gender codes as floats.
data['age']=data['age'].astype(float)
data['gender']=data['gender'].astype(float)

In [None]:
#add the configuration
# One entry per row of `conf_df` (12 rows), matched by position; '' means "not set".
# The strings are variable names that joint_prob resolves with eval().
estimators_ = ['','','','','','','','','','freq_est','nob_est','dec_est']
priors_ = ['','','','amount_prior','','','','','','freq_prior','nob_prior','dec_prior']
encoders_ = ['','','nob_onehot_encoder','','','','','','','','','']
feature_names_ = ['','','','','','','','','','feature_names_freq','feature_names_nob','feature_names_decision']

In [None]:
# Attach the per-node settings as columns; each list must have one entry per row of `conf_df`.
conf_df['estimator'] = estimators_
conf_df['prior'] = priors_
conf_df['encoder'] = encoders_
conf_df['feature_names'] = feature_names_

In [None]:
conf_df

In [None]:
# implement Eq 1
def joint_prob(df,conf_df, head):
    """Recursively evaluate the network node ``head`` on the rows of ``df``.

    Parameters
    ----------
    df : DataFrame holding one column per leaf node (looked up as ``df[node]``).
    conf_df : configuration frame; the 'node', 'parent_node', 'prior',
        'encoder', 'estimator' and 'feature_names' columns are read.
        'parent_node' is a comma-separated string; the other settings are
        variable names resolved with ``eval`` ('' means "not set").
    head : name of the node to evaluate.

    Returns
    -------
    For a leaf: the (optionally one-hot encoded and/or prior-weighted) column.
    For an inner node: the output of ``ExpUtils.conditional_prob`` for the
    node's estimator, combined with the node's prior when one is configured.
    Returns None (after printing a message) when ``head`` is not a node.

    NOTE(review): ``eval`` executes whatever strings the configuration
    contains — only use with a trusted configuration.
    """
    
    if head not in set(conf_df['node']):
        # NOTE(review): a None return makes the recursive caller fail on `probs_.shape`.
        print('head is not a node')
        return
    # get dependencies
    current_node = conf_df[conf_df['node']==head]
    prior_str = current_node['prior'].iloc[0]
    if ''!=prior_str:
        prior_dist = eval(prior_str)
    else:
        prior_dist = None
        
    dep_nodes = current_node.parent_node.str.split(',').tolist()[0]
    
    if len(dep_nodes)==1 and dep_nodes[0]=='':#no dependencies end of recursion
        
        encoded_features = None
        #load the encoder and encode the input data if required
        if ''!= current_node['encoder'].iloc[0]:  
            encoder_ = eval(current_node['encoder'].iloc[0])
            encoded_features = encoder_.transform(np.asarray(df[head]).reshape(-1,1)).toarray() 
            # encode the features and then calculate the priors if they are defined
            if prior_dist is not None:    
                # Repeat each row's prior probability across all encoded columns, then scale the encoding by it.
                tmp = tf.convert_to_tensor(np.tile(prior_dist.prob(encoded_features),(encoded_features.shape[1],1)).T) 
                return tf.convert_to_tensor(encoded_features,dtype=tf.float64)* tmp
            else: 
                return encoded_features
        else:
            if prior_dist is not None:
                return prior_dist.prob(df[head])
            else:
                return tf.cast(df[head],tf.float32)

    # Inner node: evaluate every parent and stack the results column-wise.
    n_probs = None
    for n in dep_nodes:
        probs_ = joint_prob(df,conf_df, head=n)
        if len(probs_.shape) ==1:
            probs_=tf.expand_dims(probs_,1)    
        if n_probs is None:
            n_probs = probs_
        else:
            n_probs = tf.concat([n_probs,probs_],axis=1)
            
                

    # The stacked parent outputs become the estimator's input, named by the node's feature list.
    est = eval(current_node['estimator'].iloc[0])
    feature_names = eval(current_node['feature_names'].iloc[0])
    eval_input_fn = ExpUtils.make_input_fn(pd.DataFrame(columns=feature_names,data=n_probs.numpy()),
                                               None,num_epochs=1, shuffle=False)
    cond = ExpUtils.conditional_prob(eval_input_fn,est)
    
    if prior_dist is not None:
        # Outer product (axes=0) of the conditional output with the mean prior probability.
        prior = prior_dist.prob(cond)
        return tf.tensordot(cond,tf.reduce_mean(prior,axis=0),axes=0)
    return cond

In [None]:
#test the implementation
r1= joint_prob(data,conf_df, head='decision')

In [None]:
plt.boxplot(r1[:,0])

In [None]:
# sample some data and calculate the joint distribution

def one_run_simulation(conf_df,dists,sample_no=10000,head='decision',features=['age',
 'gender','category','amount','avg_tbt','std_tbt','avg_amt','std_amt','t_count']):
    """Draw one synthetic data set and evaluate the network on it.

    For every name in ``features``, ``sample_no`` values are drawn from
    ``dists[name]``. When the node has an encoder configured, the draws are
    turned back into category values (argmax over the sample's columns, looked
    up in the encoder's categories); otherwise they are used as drawn.
    The resulting frame is passed to ``joint_prob`` for ``head``.
    """
    sim_data = pd.DataFrame(columns=features)
    for name in features:
        node_row = conf_df[conf_df['node']==name]
        draws = dists[name].sample(sample_no)
        encoder_name = node_row['encoder'].iloc[0]
        if encoder_name == '':
            sim_data[name] = draws
            continue
        # The encoder is referenced by variable name in the configuration.
        categories_ = eval(encoder_name).categories_[0]
        sim_data[name] = categories_[np.argmax(draws,axis=1)]
    return joint_prob(sim_data,conf_df, head=head)

def repeated_sim(conf_df,dists,head='decision',features=['age',
 'gender','category','amount','avg_tbt','std_tbt','avg_amt','std_amt','t_count'],n_samples=1000, repeats=100):
    """Run ``one_run_simulation`` ``repeats`` times and return the list of its results.

    Each run draws ``n_samples`` rows; a tqdm progress bar tracks the repeats.
    """
    return [
        one_run_simulation(conf_df,dists,sample_no=n_samples,head=head,features=features)
        for _ in tqdm(range(repeats))
    ]

## Establish baseline and implement comparison functions

In [None]:

# Baseline: every feature is sampled from its reference distribution.
dists={'age':age_dist,
       'gender':gender_dist,
       'category':category_dist,
       'amount':amount_dist,
       'avg_tbt':avg_tbt_dist,
       'std_tbt':std_tbt_dist,
       'avg_amt':avg_amt_dist,
       'std_amt':std_amt_dist,
       't_count':t_count_dist}
# 100 simulated data sets of 1000 rows each.
results = repeated_sim(conf_df,dists,n_samples=1000,repeats=100)

In [None]:
def make_cat_dist(x):
    """Build a two-class Categorical from the row-wise argmax of ``x``.

    The class probabilities are the fractions of rows whose largest entry is
    in column 0 and in column 1, respectively.
    """
    winners = np.argmax(x,axis=1)
    class_counts = np.array([np.sum(winners==0), np.sum(winners==1)])
    return tfd.Categorical(probs=class_counts/winners.shape[0])
def kl_divergence(p,q, bins):
    """Histogram-based KL divergence KL(p || q) between two samples.

    Parameters
    ----------
    p, q : sequences of arrays (stacked with ``np.vstack`` and flattened).
    bins : int, str or sequence
        Passed to numpy's histogram machinery. When it is not an explicit
        sequence of edges, the edges are computed ONCE from the combined
        range of ``p`` and ``q`` so both histograms share the same bins
        (binning each sample over its own range would compare unrelated bins).

    Returns
    -------
    (result, p_bins, q_bins)
        ``result`` is the sum of ``p * log(p / q)`` over the bins where both
        normalised histograms are non-zero; ``p_bins`` / ``q_bins`` are the
        normalised histograms.
    """
    p_values = np.vstack(p).ravel()
    q_values = np.vstack(q).ravel()

    if np.ndim(bins) == 0:
        # Bin count (or rule name): derive common edges from both samples together.
        edges = np.histogram_bin_edges(np.concatenate([p_values, q_values]), bins=bins)
    else:
        # Caller supplied explicit edges.
        edges = bins

    def get_norm_bins(values):
        # Normalised histogram over the shared edges.
        bin_values = np.histogram(values, bins=edges)[0]
        return bin_values/np.sum(bin_values)

    p_bins = get_norm_bins(p_values)
    q_bins = get_norm_bins(q_values)
    # Only bins populated in both histograms contribute; masking first avoids log(0) / division by zero.
    both = (q_bins!=0) & (p_bins!=0)
    result = np.sum(p_bins[both]*np.log(p_bins[both]/q_bins[both]))
    return result,p_bins,q_bins
def compare_results(results_1,results_2,bins=30, color=None, label='Experiment'):
    """Plot and quantify the difference between two sets of simulation results.

    Parameters
    ----------
    results_1 : list of result arrays for the baseline.
    results_2 : list of result arrays for the experiment.
    bins : number of histogram bins for the plots and the binned KL divergence.
    color : colour of the experiment's distribution plot.
    label : name of the experiment in the box plot.

    Returns
    -------
    (dist_kl, bin_kl)
        ``dist_kl`` is the KL divergence between the two-class Categoricals
        built by ``make_cat_dist``; ``bin_kl`` is the first element returned
        by ``kl_divergence`` on the raw results.

    Column 1 of the stacked results is plotted as the fraud probability.
    """
    plt_results1 = np.vstack(results_1)
    plt_results2 = np.vstack(results_2)

    # Build the long-format frame in one step; assigning through
    # df['Experiment'].iloc[...] is chained indexing and may not write to the frame.
    df = pd.DataFrame({
        'Fraud Probability': np.append(plt_results1[:,1], plt_results2[:,1]),
        'Experiment': ['Baseline']*plt_results1.shape[0] + [label]*plt_results2.shape[0],
    })
    sns.boxplot(x='Experiment',y='Fraud Probability',data=df)
    plt.show()
    sns.distplot(plt_results1[:,1],bins=bins,kde=True,norm_hist=True)
    sns.distplot(plt_results2[:,1],bins=bins,kde=True,norm_hist=True,color=color)
    plt.ylim([0,10])
    dist_kl = tfp.distributions.kl_divergence(make_cat_dist(plt_results1),make_cat_dist(plt_results2))
    # NOTE(review): the binned KL is computed over all columns of the results, not only column 1.
    bin_kl = kl_divergence(results_1,results_2,bins)

    return dist_kl,bin_kl[0]

## Experiment 8: Change the distribution of age

In [None]:
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)

In [None]:
# Experiment 8: replace only the age distribution (8 age codes); every other
# feature keeps its baseline distribution from `dists`.
age_dist_exp8 = tfd.Categorical(probs=[.1,.2,.3,.04,0.06,0.2,.05,0.05])
dists_8 = {**dists, 'age': age_dist_exp8}
results_exp_8 = repeated_sim(conf_df,dists_8,n_samples=1000,repeats=100)

In [None]:
# Compare 10000 draws from the baseline and the experiment-8 age distributions.
sns.distplot(age_dist.sample(10000),kde=False,norm_hist=True)
sns.distplot(age_dist_exp8.sample(10000),kde=False,norm_hist=True)
plt.xlabel('Age Ranges')
plt.ylabel('Density')
plt.legend(['baseline','Exp_8'])

In [None]:
# Prints the (distribution KL, binned KL) pair returned by compare_results, then labels and saves the figure.
print(compare_results(results,results_exp_8,bins=20,label='Exp_8'))
plt.legend(['baseline','Exp_8'])
plt.xlabel('Fraud Probability')
plt.ylabel('Density')
plt.savefig('exp_fraud_8.png')

## Experiment 9: Change the distribution of the amount

In [None]:
# Experiment 9: replace only the amount distribution (shifted towards small
# amounts); every other feature keeps its baseline distribution from `dists`.
amount_dist_exp_9 = tfd.TruncatedNormal(loc=0.1,scale =0.1,low=0., high=1.)
dists_9 = {**dists, 'amount': amount_dist_exp_9}
results_exp_9 = repeated_sim(conf_df,dists_9,n_samples=1000,repeats=100)

In [None]:
# Compare 1000 draws from the baseline and the experiment-9 amount distributions.
sns.distplot(amount_dist.sample(1000),norm_hist=True)
sns.distplot(amount_dist_exp_9.sample(1000),norm_hist=True,color='r')
plt.xlabel('Normalized Transaction Amount')
plt.ylabel('Density')
plt.legend(['baseline', 'Exp_9'])

In [None]:
# Prints the (distribution KL, binned KL) pair returned by compare_results, then labels and saves the figure.
print(compare_results(results,results_exp_9,bins=20,label='Exp_9',color='r'))
plt.legend(['baseline','Exp_9'])
plt.xlabel('Fraud Probability')
plt.ylabel('Density')
plt.savefig('exp_fraud_9.png')

## Experiment 10: Replace Frequency and NoB with randomly trained classifiers

In [None]:
# experiment 10a train frequency randomly
# Select the frequency labels in a random order, so the training labels no longer match their rows.
# NOTE(review): the result is a Series that keeps the permuted index; this assumes
# make_input_fn consumes labels by position, not by index — confirm in ExpUtils.
random_lbl = sample_freq_data['freq_lbl'][np.random.permutation(sample_freq_data.shape[0])]
train_input_fn_freq = ExpUtils.make_input_fn(data_frequency,random_lbl )
# Evaluation still uses the true labels.
eval_input_fn_freq = ExpUtils.make_input_fn(data_frequency, sample_freq_data['freq_lbl'], num_epochs=1, shuffle=False)

freq_est_rand = tf.estimator.LinearClassifier(feature_columns=feature_columns_freq,n_classes=3)
freq_est_rand.train(train_input_fn_freq)
freq_est_rand.evaluate(eval_input_fn_freq)

In [None]:
# Same network, but the frequency node (row 9, configured with 'freq_est') uses the
# randomly trained estimator.
conf_df_10a = conf_df.copy()
# Single positional write: `.iloc[9]['estimator'] = ...` is chained indexing and can
# silently assign to a temporary copy, leaving the configuration unchanged.
conf_df_10a.iloc[9, conf_df_10a.columns.get_loc('estimator')] = 'freq_est_rand'

results_exp_10a = repeated_sim(conf_df_10a,dists,n_samples=1000)

In [None]:
# Prints the (distribution KL, binned KL) pair returned by compare_results, then labels and saves the figure.
print(compare_results(results,results_exp_10a,bins=30))
plt.legend(['baseline','Exp_10a'])
plt.xlabel('Fraud Probability')
plt.ylabel('Density')
plt.savefig('exp_fraud_10a.png')

In [None]:
# experiment 10b train the nature of business classifier on randomly labelled data
# One random class id per row, drawn with probabilities (.3, .3, .4), independent of the features.
random_lbl = np.argmax(tfd.OneHotCategorical(probs=[.3,.3,.4]).sample(df_nob['lbl'].shape[0]),axis=1)
train_input_fn_nob_rand = ExpUtils.make_input_fn(data_nob, random_lbl)
eval_input_fn_nob_rand = ExpUtils.make_input_fn(data_nob, random_lbl, num_epochs=1, shuffle=False)

nob_est_rand = tf.estimator.LinearClassifier(feature_columns=feature_columns_nob,n_classes=3)
nob_est_rand.train(train_input_fn_nob_rand)
nob_est_rand.evaluate(eval_input_fn_nob_rand)

In [None]:
# Same network, but the nature-of-business node (row 10, configured with 'nob_est')
# uses the randomly trained estimator.
conf_df_10b = conf_df.copy()
# Single positional write: `.iloc[10]['estimator'] = ...` is chained indexing and can
# silently assign to a temporary copy, leaving the configuration unchanged.
conf_df_10b.iloc[10, conf_df_10b.columns.get_loc('estimator')] = 'nob_est_rand'

results_10b = repeated_sim(conf_df_10b,dists,n_samples=1000)

In [None]:
# Prints the (distribution KL, binned KL) pair returned by compare_results, then labels the figure.
print(compare_results(results,results_10b,bins=30))
plt.legend(['baseline','Exp_10b'])
plt.xlabel('Fraud Probability')
plt.ylabel('Density')

In [None]:
# Stack the per-run results of the baseline and of both random-classifier experiments.
plt_results = np.vstack(results)
plt_results10a = np.vstack(results_exp_10a)
plt_results10b = np.vstack(results_10b)

# Long-format frame for the box plot, built in one step: assigning through
# df_plt['Experiment'].iloc[...] is chained indexing and may not write to the frame.
# Row order: baseline, experiment 10b ('Random $m_1$'), experiment 10a ('Random $m_2$').
df_plt = pd.DataFrame({
    'Fraud Probability': np.append(np.append(plt_results[:,1], plt_results10b[:,1]),plt_results10a[:,1]),
    'Experiment': (['Baseline']*plt_results.shape[0]
                   + ['Random $m_1$']*plt_results10b.shape[0]
                   + ['Random $m_2$']*plt_results10a.shape[0]),
})
box_plot = sns.boxplot(x='Experiment',y='Fraud Probability',data=df_plt,palette="Set1")
plt.xticks(rotation=20)
# NOTE(review): `medians` and `vertical_offset` are not used in the visible notebook.
medians = np.array([np.median(plt_results[:,1]),np.median(plt_results10b[:,1]),np.median(plt_results10a[:,1])])
vertical_offset = medians*0.05

In [None]:
# Evaluate the decision node on the real feature table with the baseline network and with each
# of the two modified configurations (10a: random frequency model, 10b: random nature-of-business model).
base_output = joint_prob(data,conf_df, head='decision')
exp_10a_output = joint_prob(data,conf_df_10a, head='decision')
exp_10b_output = joint_prob(data,conf_df_10b, head='decision')

In [None]:
# Column 1 of the first 2000 rows of each output, side by side for the joint density plots.
# NOTE(review): the 'x' and 'y' columns declared here are never filled.
joint_plot = pd.DataFrame(columns=['x','y'])
joint_plot['baseline'] = base_output[:2000,1]
joint_plot['Random_m2'] = exp_10a_output[:2000,1]
joint_plot['Random_m1'] = exp_10b_output[:2000,1]
sns.jointplot(x="baseline", y="Random_m1", data=joint_plot, kind="kde",);
# NOTE(review): both end points are (-0.2, 1.2), so this draws a zero-length line;
# presumably a diagonal reference line ([-0.2,1.2],[-0.2,1.2]) was intended — confirm.
plt.plot([-0.2,-0.2],[1.2,1.2])
plt.show()
sns.jointplot(x="baseline", y="Random_m2", data=joint_plot, kind="kde");