Implementation for "Limitations and Applicability of GANs in Banking Domain"


http://ceur-ws.org/Vol-2692/paper1.pdf

In [None]:
# !nvcc --version
# !python --version
# !pip3 install http://download.pytorch.org/whl/cu92/torch-0.4.1-cp36-cp36m-linux_x86_64.whl
# !pip3 install torchvision

In [None]:
# #To install UMAP
!pip install umap-learn

In [None]:
import numpy as np

import xgboost as xgb
import sklearn.cluster as cluster
from sklearn.preprocessing import normalize 
from sklearn import metrics
from sklearn.metrics import recall_score, precision_score, roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

from torchvision import datasets
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.autograd as autograd
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR

from random import Random
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

import umap
# import umap.plot
import progressbar
from time import sleep

cuda = True if torch.cuda.is_available() else False

Load Dataset

In [None]:
from google.colab import drive

drive.mount('/content/drive')

# Both splits are stored as CSV files on the mounted Google Drive.
train_df = pd.read_csv('/content/drive/MyDrive/CreditCardFraudDataset/train_df.csv')
test_df = pd.read_csv('/content/drive/MyDrive/CreditCardFraudDataset/test_df.csv')

# The shape is reported before the "Unnamed..." columns are removed.
print(train_df.shape)

# Keep only the columns whose name does not start with "Unnamed", then
# report the remaining columns and the number of rows per class.
keep_train = ~train_df.columns.str.contains('^Unnamed')
train_df = train_df.loc[:, keep_train]
print(train_df.columns)
print(train_df.groupby('Class')['Class'].count())

keep_test = ~test_df.columns.str.contains('^Unnamed')
test_df = test_df.loc[:, keep_test]
print(test_df.columns)
print(test_df.groupby('Class')['Class'].count())

**Model Parameters**


In [None]:
#Define Model Parameters
n_clusters = 1  #number of k-means clusters for the fraud rows; >1 enables the conditional GAN, 1 disables it
rand_dim = 30  #dimension of the generator's noise input
base_n_count = 128  #base layer width; the networks use 1x, 2x and 4x this value
nb_steps = 3000 + 1 #number of training-loop iterations (the +1 lets iteration 3000 run and be logged)
batch_size = 64  #batch size of the training data loaders
k_d = 5   #discriminator updates per generator update
log_interval = 100  #number of training iterations between log lines / sampling runs
learning_rate = 2e-4  #Adam learning rate for the generator, discriminator, triplet and siamese optimizers
b1 = 0.5  #Adam beta1
b2 = 0.9  #Adam beta2
LAMBDA = 10.0  #weight of the WGAN-GP gradient penalty term
loss_fn = 'None' #Options:['None', 'Triplet', 'Siamese']
use_network = False #with loss_fn 'Triplet': True trains the Triplet embedding network, False adds the triplet loss on raw features to the generator loss
show = True #print the plots of generated samples and real samples

In [None]:
#k_means clustering to set labels for fraud data in train set
def k_means(train_df, k):
  """Cluster the fraud rows of ``train_df`` and store the cluster id in 'Class'.

  Only rows with ``Class == 1`` are used. Every column whose name does not
  contain 'Class' is treated as a feature. Returns a copy of the fraud rows
  whose 'Class' column holds the KMeans cluster index instead of 1.
  """
  fraud_rows = train_df.loc[train_df['Class'] == 1].copy()
  feature_names = [col for col in fraud_rows.columns if 'Class' not in col]

  # Fixed random_state keeps the cluster assignment reproducible.
  model = cluster.KMeans(n_clusters=k, random_state=0)
  cluster_ids = model.fit_predict(fraud_rows[feature_names])

  fraud_w_classes = fraud_rows.copy()
  fraud_w_classes['Class'] = cluster_ids
  return fraud_w_classes

# Fraud rows of the train set, relabelled with their k-means cluster id.
fraud_w_classes = k_means(train_df, n_clusters)
train = fraud_w_classes.copy().reset_index(drop=True)

# Columns containing 'Class' are labels; everything else is a feature.
label_cols = [name for name in train.columns if 'Class' in name]
data_cols = [name for name in train.columns if name not in label_cols]
train_no_label = train[data_cols]
data_dim = len(data_cols)
total_samples = len(train) * 2  #no of fraud samples to generate to augment the train set

#true for conditional WGAN i.e. WCGAN and false for WGAN
condition = (n_clusters != 1)
label_dim = 1 if condition else 0

**Utility Functions**

In [None]:
#Split the dataset into Train and Test Set

def split_dataset(data, train_fraction=0.7, random_state=None):
  """Shuffle ``data`` per class and split it into stratified train/test frames.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain a binary 'Class' column (0 = genuine, 1 = fraud).
  train_fraction : float, default 0.7
      Share of each class that goes to the train set (truncated to an int count).
  random_state : int or None, default None
      Seed forwarded to ``DataFrame.sample`` so the split can be reproduced.
      ``None`` keeps the original unseeded behaviour.

  Returns
  -------
  (train_df, test_df) : tuple of pandas.DataFrame
      Both with a fresh 0..n-1 index; genuine rows come first, then fraud rows.

  Raises
  ------
  ValueError
      If ``train_fraction`` is outside [0, 1].
  """
  if not 0.0 <= train_fraction <= 1.0:
    raise ValueError('train_fraction must be within [0, 1], got {}'.format(train_fraction))

  n_real = int(np.sum(data['Class']==0))
  n_fraud = int(np.sum(data['Class']==1))

  # Sampling all rows of a class without replacement is a shuffle.
  real_samples = data.loc[ data.Class==0].sample(n_real, replace=False, random_state=random_state).reset_index(drop=True)
  fraud_samples = data.loc[ data.Class==1].sample(n_fraud, replace=False, random_state=random_state).reset_index(drop=True)

  fn_real = int(n_real * train_fraction)
  fn_fraud = int(n_fraud * train_fraction)

  train_df = pd.concat([real_samples[:fn_real],fraud_samples[:fn_fraud]],axis=0,ignore_index=True)
  test_df = pd.concat([real_samples[fn_real:],fraud_samples[fn_fraud:]],axis=0,ignore_index=True)

  print(train_df.groupby('Class')['Class'].count())
  print(test_df.groupby('Class')['Class'].count())

  return train_df, test_df

In [None]:
def BaseMetrics(y_pred,y_true):
    """Return the confusion-matrix counts ``(TP, TN, FP, FN)`` for 0/1 labels.

    Both arguments are compared element-wise against 0 and 1, so they are
    expected to be array-likes supporting ``==`` and ``&`` (e.g. numpy arrays).
    """
    predicted_pos, predicted_neg = (y_pred == 1), (y_pred == 0)
    actual_pos, actual_neg = (y_true == 1), (y_true == 0)

    TP = np.sum(predicted_pos & actual_pos)
    TN = np.sum(predicted_neg & actual_neg)
    FP = np.sum(predicted_pos & actual_neg)
    FN = np.sum(predicted_neg & actual_pos)
    return TP, TN, FP, FN

def SimpleMetrics(y_pred,y_true, show=True):
    """Compute accuracy, recall and precision from 0/1 predictions.

    Parameters
    ----------
    y_pred, y_true : array-like of 0/1
        Predicted and true labels, passed on to ``BaseMetrics``.
    show : bool, default True
        When true, print and display the confusion matrix.

    Returns
    -------
    (ACC, Recall, Precision)
        A ratio whose denominator is zero (no samples, no actual positives,
        or no predicted positives) is reported as 0.0 instead of dividing
        by zero.
    """
    TP, TN, FP, FN = BaseMetrics(y_pred,y_true)

    def _ratio(numerator, denominator):
        # Guard against 0/0, e.g. a classifier that never predicts fraud.
        return numerator / denominator if denominator else 0.0

    ACC = _ratio(TP + TN, TP + TN + FP + FN)
    Recall = _ratio(TP, TP + FN)
    Precision = _ratio(TP, TP + FP)

    # Reporting
    if show:
      from IPython.display import display
      print( 'Confusion Matrix')
      display( pd.DataFrame( [[TN,FP],[FN,TP]], columns=['Pred 0','Pred 1'], index=['True 0', 'True 1'] ) )

    return ACC, Recall, Precision
    
def SimpleAccuracy(y_pred,y_true):
    """Return the fraction of predictions that match the true 0/1 labels."""
    true_pos, true_neg, false_pos, false_neg = BaseMetrics(y_pred,y_true)
    n_correct = true_pos + true_neg
    n_total = true_pos + true_neg + false_pos + false_neg
    return n_correct / n_total

In [None]:
def draw_umap_test(train_df,test_df, y_pred, n_neighbors=50, min_dist=0.3, n_components=2, metric='euclidean', title=''):
  """Plot a UMAP embedding of test-set fraud (by prediction outcome) and train fraud.

  Three figures are drawn from one shared embedding:
    1. true-positive test fraud only,
    2. true-positive and false-negative test fraud,
    3. both of the above plus the train-set fraud rows.

  Parameters
  ----------
  train_df, test_df : pandas.DataFrame
      Frames whose last column is the label and which contain a 'Class' column.
  y_pred : array-like
      Scores for the rows of ``test_df``; rounded to 0/1 before use.
  n_neighbors, min_dist, n_components, metric :
      Accepted for interface compatibility; they are not forwarded to UMAP,
      which runs with its defaults and ``random_state=42``.
  title : str
      Title used for every figure.

  Notes
  -----
  The embedding is reproducible through ``random_state=42``. The global numpy
  seed is deliberately left untouched: reseeding it here made every caller
  (including the GAN training loop) replay the same noise after each plot.
  """
  X_col = test_df.columns[:-1]

  scored = test_df.copy()
  scored['y_pred'] = np.round(y_pred)

  test_fraud_tp = scored[(scored['y_pred']==1)&(scored['Class']==1)]
  test_fraud_fn = scored[(scored['y_pred']==0)&(scored['Class']==1)]
  train_fraud = train_df[(train_df['Class']==1)][X_col].values

  print(test_fraud_tp[X_col].shape, test_fraud_fn[X_col].shape, train_fraud.shape)

  # Group ids: 0 = true positive, 1 = false negative, 2 = train fraud.
  dtrain = np.vstack( [ test_fraud_tp[X_col], test_fraud_fn[X_col], train_fraud ] )
  dlabels = np.hstack( [ np.zeros(len(test_fraud_tp)), np.ones(len(test_fraud_fn)), 2*np.ones(len(train_fraud)) ] )

  u = umap.UMAP(random_state=42).fit_transform(dtrain)
  labels = ['TruePositive_TestFraud', 'FalseNegative_TestFraud','TrainFraud']
  palette = sns.color_palette()

  def _plot_groups(visible_groups):
    """Scatter the embedded points of the requested group ids in a new figure."""
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for g in np.unique(dlabels):
      if int(g) not in visible_groups:
        continue
      i = np.where(dlabels == g)
      ax.scatter(u[:,0][i], u[:,1][i], c=[palette[int(v)] for v in dlabels[i]], label=labels[int(g)])
    ax.legend()
    plt.title(title, fontsize=18)
    plt.show()

  _plot_groups({0})        # true positives only
  _plot_groups({0, 1})     # true positives and false negatives
  _plot_groups({0, 1, 2})  # true positives, false negatives and train fraud

In [None]:
def draw_umap(train_df,g_z, n_neighbors=50, min_dist=0.3, n_components=2, metric='euclidean', title=''):
  """Plot UMAP embeddings comparing generated fraud with the real train data.

  Two figures are drawn:
    1. up to 10000 randomly chosen non-fraud rows, all real fraud rows and
       the generated samples;
    2. real fraud rows and generated samples only.

  Parameters
  ----------
  train_df : pandas.DataFrame
      Train frame whose last column is the label and which has a 'Class' column.
  g_z : array-like, shape (n_generated, n_features)
      Generated fraud samples; must be stackable with the feature columns.
  n_neighbors, min_dist, n_components, metric :
      Accepted for interface compatibility; they are not forwarded to UMAP,
      which runs with its defaults and ``random_state=42``.
  title : str
      Title used for both figures.
  """
  X_col = train_df.columns[:-1]
  y_col = train_df.columns[-1]

  train_non_fraud = train_df.loc[ train_df['Class']==0 ][X_col].values
  # Shuffle row indices, not the 2-D array itself: random.shuffle swaps numpy
  # row views in place, which overwrites rows and leaves duplicates behind.
  row_order = list(range(len(train_non_fraud)))
  Random(4).shuffle(row_order)
  train_non_fraud = train_non_fraud[row_order[:10000]]
  train_fraud = train_df.loc[ train_df['Class']==1 ].copy()

  palette = sns.color_palette()

  def _embed_and_plot(points, groups, names):
    """Fit UMAP on ``points`` and scatter them coloured by integer group id."""
    u = umap.UMAP(random_state=42).fit_transform(points)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for g in np.unique(groups):
      i = np.where(groups == g)
      ax.scatter(u[:,0][i], u[:,1][i], c=[palette[int(v)] for v in groups[i]], label=names[int(g)])
    ax.legend()
    plt.title(title, fontsize=18)
    plt.show()

  # Figure 1: 0 = non-fraud, 1 = real fraud, 2 = generated fraud.
  dtrain = np.vstack( [ train_fraud[X_col], train_non_fraud, g_z ] )
  dlabels = np.hstack( [ train_fraud[y_col], np.zeros(len(train_non_fraud)), 2*np.ones(int(len(g_z))) ] )
  _embed_and_plot(dtrain, dlabels, ['TrainNonFraud','TrainRealFraud', 'TrainGeneratedFraud'])

  # Figure 2: 0 = generated fraud, 1 = real fraud.
  dtrain = np.vstack( [ train_fraud[X_col], g_z ] )
  dlabels = np.hstack( [ train_fraud[y_col], np.zeros(int(len(g_z))) ] )
  _embed_and_plot(dtrain, dlabels, ['TrainGeneratedFraud', 'TrainRealFraud'])

**Classifier**

In [None]:
#To check performance of XGBoost on the generated samples

def runXGB(train_df, test_df, g_z=[]):
  """Train XGBoost on the train set (optionally augmented) and score the test set.

  Parameters
  ----------
  train_df, test_df : pandas.DataFrame
      Frames whose last column is the label; ``test_df`` needs a 'Class' column.
  g_z : array-like, optional
      Generated fraud samples. When non-empty they are appended to the train
      features with label 1.

  Returns
  -------
  (y_pred, Precision, Recall, f1)
      ``y_pred`` holds the raw predicted probabilities for ``test_df``; the
      metrics are computed on the predictions rounded to 0/1.

  Notes
  -----
  Reads the module-level ``show`` flag to decide whether to draw the UMAP plots.
  """
  X_col = test_df.columns[:-1]
  y_col = test_df.columns[-1]
  feature_names = list(X_col)

  if len(g_z) == 0:
    dtrain = xgb.DMatrix(train_df[X_col], train_df[y_col], feature_names=feature_names)
  else:
    augmented = np.vstack( [train_df[X_col], g_z] )
    dlabels = np.hstack( [ train_df[y_col], np.ones(int(len(g_z))) ] )
    dtrain = xgb.DMatrix(augmented, dlabels, feature_names=feature_names)

  dtest = xgb.DMatrix(test_df[X_col], test_df[y_col], feature_names=feature_names)
  y_true = test_df['Class'].values

  xgb_params = {
      'max_depth': 4,
      'objective': 'binary:logistic',
      'random_state': 0,
      'eval_metric': 'auc',
      }

  xgb_test = xgb.train(xgb_params, dtrain, num_boost_round=10)

  # No early stopping is used, so predicting with every boosted round matches
  # the former ``ntree_limit=best_iteration+1`` and also works on XGBoost
  # releases where ``ntree_limit`` has been removed.
  y_pred = xgb_test.predict(dtest)

  if show:
    draw_umap_test(train_df,test_df, y_pred)

  ACC, Recall, Precision = SimpleMetrics( np.round(y_pred), y_true)

  precision, recall, thresholds = metrics.precision_recall_curve(dtest.get_label(), y_pred)
  f1 = metrics.f1_score(dtest.get_label(), np.round(y_pred))
  pr_auc = metrics.auc(recall[:], precision[:])

  print( "[Recall: %f] [Precision: %f] [F1_score: %f] [PR_AUC: %f]" % (Recall, Precision, f1, pr_auc))

  return y_pred, Precision, Recall, f1

In [None]:
#To run XGBoost on the real and generated fraud samples. Ideally the accuracy should be 50%
# i.e not able to distinguish between real and generated fraud samples.

def CheckAccuracy( x, g_z, data_cols, label_cols=[]):
    """Measure how well XGBoost separates real rows ``x`` from generated rows ``g_z``.

    The first half of each set trains the classifier (label 0 = real,
    1 = generated) and the second half tests it. An accuracy near 0.5 means
    the classifier cannot tell the two sets apart.

    Parameters
    ----------
    x, g_z : array-like, shape (n, n_features)
        Real and generated samples. They may have different lengths; every
        label vector is sized from the slice it describes.
    data_cols, label_cols : sequence of str
        Names whose concatenation is used as the XGBoost feature names.

    Returns
    -------
    Accuracy of the classifier on the held-out halves.
    """
    half_real = len(x) // 2
    half_fake = len(g_z) // 2
    real_train, real_test = x[:half_real], x[half_real:]
    fake_train, fake_test = g_z[:half_fake], g_z[half_fake:]

    #train set: label '0' for real and '1' for generated fraud samples
    dtrain = np.vstack( [ real_train, fake_train ] )
    dlabels = np.hstack( [ np.zeros(len(real_train)), np.ones(len(fake_train)) ] )

    #test set: the other half of each set
    dtest = np.vstack( [ real_test, fake_test ] )
    y_true = np.hstack( [ np.zeros(len(real_test)), np.ones(len(fake_test)) ] )

    feature_names = list(data_cols) + list(label_cols)
    dtrain = xgb.DMatrix(dtrain, dlabels, feature_names=feature_names)
    dtest = xgb.DMatrix(dtest, feature_names=feature_names)

    xgb_params = {
        'max_depth': 4,
        'objective': 'binary:logistic',
        'random_state': 0,
        'eval_metric': 'auc',
        }
    xgb_test = xgb.train(xgb_params, dtrain, num_boost_round=10)

    y_pred = np.round(xgb_test.predict(dtest))

    return SimpleAccuracy(y_pred, y_true)

In [None]:
class MLP(nn.Module):
    """Four-layer fully connected binary classifier producing two logits.

    Layer widths come from the module-level ``data_dim`` and ``base_n_count``:
    data_dim -> 4*base -> 2*base -> base -> 2.
    """

    def __init__(self):
        super(MLP,self).__init__()
        self.lin1 = nn.Linear(data_dim,base_n_count*4)
        self.lin2 = nn.Linear(base_n_count*4, base_n_count*2)
        self.lin3 = nn.Linear(base_n_count*2, base_n_count*1)
        self.lin4 = nn.Linear(base_n_count*1, 2)
        self.relu = nn.ReLU()

    def forward(self, inp):
      """Return the raw class logits, shape (batch, 2)."""
      # The first layer applies ReLU followed by tanh, as in the original model.
      h0 = torch.tanh(self.relu(self.lin1(inp)))
      h1 = self.relu(self.lin2(h0))
      h2 = self.relu(self.lin3(h1))
      return self.lin4(h2)

    def predict(self,x):
        """Return the predicted class (0 or 1) per row as a CPU int64 tensor.

        Class 0 is chosen only when its probability is strictly greater than
        that of class 1; ties go to class 1.
        """
        pred = F.softmax(self.forward(x), dim=1)
        class0_wins = (pred[:, 0] > pred[:, 1]).long()
        return (1 - class0_wins).cpu()

In [None]:
#MLP classifier evaluated on the test dataset; trained on train_df alone, or on train_df augmented with generated fraud samples when g_z is given
def runMLP(train_df, test_df, g_z=[], epoch = 2000):
  """Train the MLP classifier and report its best test-set metrics.

  Parameters
  ----------
  train_df, test_df : pandas.DataFrame
      Frames whose last column is the label; both need a 'Class' column.
  g_z : array-like, optional
      Generated fraud samples; when non-empty they are appended to the train
      features with label 1.
  epoch : int, default 2000
      Number of optimisation steps, each on one randomly drawn batch of 32 rows.

  Every 100 steps the classifier is scored on ``test_df``; the metrics of the
  step with the highest F1 are printed at the end. Runs on the GPU when one
  is available and on the CPU otherwise. Returns nothing.
  """
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  X_col = test_df.columns[:-1]
  y_col = test_df.columns[-1]

  if len(g_z) != 0:
    dtrain = np.vstack( [train_df[X_col].values, g_z] )
    dlabels = np.hstack( [ train_df[y_col].values, np.ones(int(len(g_z))) ] )
  else:
    dtrain = train_df[X_col].values
    dlabels = train_df['Class'].values

  dtrain = torch.from_numpy(np.asarray(dtrain)).float().to(device)
  dlabels = torch.from_numpy(np.asarray(dlabels)).long().to(device)
  train_data = TensorDataset(dtrain,dlabels)
  train_loader = DataLoader(train_data,batch_size=32,shuffle=True)

  y_true = test_df['Class'].values
  dtest = torch.from_numpy(test_df[X_col].values).float().to(device)

  classifier = MLP().to(device)
  auxiliary_loss = torch.nn.CrossEntropyLoss().to(device)

  cls_optim = torch.optim.Adam(classifier.parameters(), lr=1e-3, betas=(0.9, 0.999),weight_decay=1e-3)
  cls_schedular=StepLR(cls_optim,step_size=200,gamma=0.9)

  best_f1 = 0.0
  best_pr_auc = 0.0
  best_precision = 0.0
  best_recall = 0.0
  best_epoch = 0
  # Start from "predict nothing as fraud" so the final report still works
  # when no evaluation ever reaches an F1 above zero.
  best_y = np.zeros(len(y_true))

  bar = progressbar.ProgressBar(maxval=epoch, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
  bar.start()
  for it in range(epoch):
      cls_optim.zero_grad()

      # A fresh iterator over the shuffling loader yields one random batch.
      x,l = next(iter(train_loader))

      cls_out=classifier(x)
      c_loss=auxiliary_loss(cls_out, l)
      c_loss.backward()
      cls_optim.step()
      # The scheduler is stepped after the optimizer, as PyTorch requires.
      cls_schedular.step()

      if it%100 == 0:
        with torch.no_grad():
            y_pred=classifier.predict(dtest)
        y_pred=np.array(y_pred)

        ACC, Recall, Precision = SimpleMetrics( np.round(y_pred), y_true, False)
        precision, recall, thresholds = metrics.precision_recall_curve(y_true, y_pred)
        f1 = metrics.f1_score(y_true, np.round(y_pred))
        pr_auc = metrics.auc(recall[:], precision[:])
        if f1 > best_f1:
          best_epoch = it
          best_f1 = f1
          best_pr_auc = pr_auc
          best_precision = Precision
          best_recall = Recall
          best_y = y_pred

        bar.update(it+1)
  bar.finish()
  _,_,_ = SimpleMetrics( np.round(best_y), y_true)
  print( "[Recall: %f] [Precision: %f] [F1_score: %f] [PR_AUC: %f]" % (best_recall,  best_precision, best_f1, best_pr_auc))

**WGAN-GP**

In [None]:
class Generator(nn.Module):
    """Fully connected generator mapping noise (plus optional label) to a data row.

    Layer widths come from module-level settings:
    rand_dim+label_dim -> base -> 2*base -> 4*base -> data_dim.
    """

    def __init__(self):
      super().__init__()
      in_features = rand_dim+label_dim
      self.lin1 = nn.Linear(in_features,base_n_count)
      self.lin2 = nn.Linear(base_n_count, base_n_count*2)
      self.lin3 = nn.Linear(base_n_count*2, base_n_count*4)
      self.lin4 = nn.Linear(base_n_count*4, data_dim)
      self.relu = nn.ReLU()

    def modelG(self,inp):
      """Run the network: three ReLU-activated layers and a linear output."""
      hidden = inp
      for layer in (self.lin1, self.lin2, self.lin3):
        hidden = self.relu(layer(hidden))
      return self.lin4(hidden)

    def forward(self, inp):
      """Return the generated rows for ``inp``; the label is not appended."""
      return self.modelG(inp)

In [None]:
class Discriminator(nn.Module):
    """Fully connected critic mapping a data row (plus optional label) to one score.

    Layer widths come from module-level settings:
    data_dim+label_dim -> 4*base -> 2*base -> base -> 1.
    """

    def __init__(self):
      super().__init__()
      in_features = data_dim+label_dim
      self.lin1 = nn.Linear(in_features,base_n_count*4)
      self.lin2 = nn.Linear(base_n_count*4, base_n_count*2)
      self.lin3 = nn.Linear(base_n_count*2, base_n_count*1)
      self.lin4 = nn.Linear(base_n_count*1, 1)
      self.relu = nn.ReLU()

    def modelD(self,inp):
      """Run the network: three ReLU-activated layers and a linear output."""
      hidden = inp
      for layer in (self.lin1, self.lin2, self.lin3):
        hidden = self.relu(layer(hidden))
      return self.lin4(hidden)

    def forward(self, inp):
      """Return the unbounded critic score for each row of ``inp``."""
      return self.modelD(inp)

***Gradient Penalty***

In [None]:
def calc_gradient_penalty(netD, real_data, fake_data,label):
    """Return the WGAN-GP gradient penalty for one batch.

    Random per-row interpolations between ``real_data`` and ``fake_data`` are
    scored by ``netD``; the penalty is ``LAMBDA`` times the mean squared
    deviation of the per-row gradient L2 norm from 1.

    Parameters
    ----------
    netD : callable
        The critic; receives the interpolated rows (with ``label`` appended
        when the module-level ``condition`` flag is true).
    real_data, fake_data : torch.Tensor, same shape (batch, features)
        Real and generated rows, without gradient history.
    label : torch.Tensor
        Condition column(s) for the batch; only used when ``condition`` is true.

    Returns
    -------
    torch.Tensor
        Scalar penalty, differentiable with respect to the critic parameters.
    """
    # One mixing coefficient per row, sized from the actual batch so a batch
    # smaller than the configured batch_size is handled too.
    alpha = torch.rand(real_data.size(0), 1, dtype=real_data.dtype, device=real_data.device)
    alpha = alpha.expand(real_data.size())

    interpolates = alpha * real_data + ((1 - alpha) * fake_data)

    if condition:
      interpolates = torch.cat([interpolates, label],-1)

    # Make the interpolated points a leaf so gradients are taken w.r.t. them.
    interpolates = interpolates.detach().requires_grad_(True)

    disc_interpolates = netD(interpolates)

    gradients = autograd.grad(outputs=disc_interpolates, inputs=interpolates,
                              grad_outputs=torch.ones_like(disc_interpolates),
                              create_graph=True, retain_graph=True, only_inputs=True)[0]

    gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean() * LAMBDA
    return gradient_penalty

**Discriminator Rejection Sampling**

In [None]:

def sigmoid(F):
    """Element-wise logistic function ``1 / (1 + exp(-F))``."""
    denominator = 1 + np.exp(-F)
    return 1/denominator

def sample():
  """Generate synthetic fraud rows by discriminator rejection sampling and evaluate them.

  Steps:
    1. Burn-in: score generated batches until at least ``2 * data.shape[0]``
       candidates have been seen, tracking the largest critic output.
    2. Sampling: repeatedly generate candidates, turn their critic outputs
       into acceptance probabilities and keep the accepted ones until at
       least ``total_samples`` rows are collected.
    3. Train/evaluate XGBoost on the train set augmented with the accepted
       rows and, when ``show`` is true, plot them with UMAP.

  Reads the module-level ``generator``, ``discriminator``, ``train_loader``,
  ``data``, ``Tensor``, ``condition``, ``n_clusters``, ``rand_dim``,
  ``batch_size``, ``total_samples``, ``train_df``, ``test_df`` and ``show``.
  Returns nothing.
  """
  epsilon = 1e-8
  # Percentile of F used as the acceptance offset gamma. np.percentile takes
  # values in [0, 100]; passing 0.80 selected the 0.8th percentile instead of
  # the 80th, so almost every candidate was accepted.
  gamma_percentile = 80.0
  per_cluster = 1500  # candidates proposed per cluster label in conditional mode

  def _propose(labels, n_rows):
    """Generate ``n_rows`` candidates; return (CPU samples, float64 critic outputs)."""
    noise = torch.from_numpy(np.random.normal(size=[n_rows, rand_dim])).type(Tensor)
    with torch.no_grad():
      if condition:
        candidates = generator.modelG(torch.cat([noise, labels], -1))
        scores = discriminator.modelD(torch.cat([candidates, labels], -1))
      else:
        candidates = generator.modelG(noise)
        scores = discriminator.modelD(candidates)
    return candidates.cpu().float(), scores.view(-1).cpu().double().numpy()

  # --- Burn-in: find the largest critic output -------------------------------
  # The maximum is tracked on the outputs themselves; comparing exp(output)
  # can overflow or underflow and then pick the wrong element.
  max_logit = -np.inf
  processed_samples = 0
  burnin_samples = data.shape[0]*2  #no of samples to decide on the max_logit

  while processed_samples < burnin_samples:
    if condition:
      _, l = next(iter(train_loader))
      l = l.type(Tensor)
      n_rows = l.shape[0]
    else:
      l, n_rows = None, batch_size
    _, logits = _propose(l, n_rows)
    max_logit = max(max_logit, float(logits.max()))
    processed_samples += n_rows

  # --- Sampling ----------------------------------------------------------------
  if condition:
    # Equal number of candidates for every cluster label 0..n_clusters-1.
    cluster_ids = np.repeat(np.arange(n_clusters), per_cluster)
    l = torch.from_numpy(cluster_ids).unsqueeze(-1).type(Tensor)
    n_rows = l.shape[0]
  else:
    l, n_rows = None, batch_size

  gen_samples = []
  counter = 0
  while counter < total_samples:
    candidates, logits = _propose(l, n_rows)

    #update the maximum if a larger critic output is found
    max_logit = max(max_logit, float(logits.max()))

    #calculate F_hat and pass it into sigmoid; gamma is set per batch
    shifted = logits - max_logit
    Fs = shifted - np.log(1 - np.exp(shifted - epsilon))
    gamma = np.percentile(Fs, gamma_percentile)
    acceptance_prob = sigmoid(Fs - gamma)

    # Accept candidate i when a uniform draw does not exceed its probability.
    draws = np.random.uniform(0, 1, size=n_rows)
    accepted = np.flatnonzero(draws <= acceptance_prob)
    if len(accepted):
      gen_samples.append(candidates[torch.from_numpy(accepted)])
      counter += len(accepted)
  gen_samples = torch.cat(gen_samples, 0)

  print('----------------------------After Rejection Sampling---------------------------------------')
  print ("Sampled Synthetic Fraud: %d"%(counter))

  #Check Classifier's Performance after data augmentation
  runXGB(train_df, test_df, gen_samples)

  if show:
    draw_umap(train_df, gen_samples)

**Triplet Loss**

In [None]:
class Triplet(nn.Module):
    """Small embedding network applied with shared weights to three inputs.

    Maps a row of ``data_dim`` features to a 2-dimensional embedding through
    one hidden layer of 10 units.
    """

    def __init__(self):
      super().__init__()
      self.lin1 = nn.Linear(data_dim, 10)
      self.lin2 = nn.Linear(10, 2)
      self.relu = nn.ReLU()

    def modelT(self,inp):
      """Embed ``inp``: linear -> ReLU -> linear."""
      hidden = self.relu(self.lin1(inp))
      return self.lin2(hidden)

    def forward(self, inp1, inp2, inp3):
      """Return the embeddings of the three inputs, in the same order."""
      return tuple(self.modelT(batch) for batch in (inp1, inp2, inp3))

In [None]:
class TripletLoss(nn.Module):
    """Triplet margin loss on squared Euclidean distances (margin 1)."""

    def __init__(self):
        super().__init__()
        self.margin = 1

    def forward(self, anchor, positive, negative):
        """Return mean(relu(d(a, p) - d(a, n) + margin)) over the batch.

        ``d`` is the squared Euclidean distance taken along dimension 1.
        """
        sq_dist_pos = ((anchor - positive) ** 2).sum(dim=1)
        sq_dist_neg = ((anchor - negative) ** 2).sum(dim=1)
        hinge = F.relu(sq_dist_pos - sq_dist_neg + self.margin)
        return hinge.mean()

**Siamese**

In [None]:
class Siamese(nn.Module):
    """Embedding network applied with shared weights to a pair of inputs.

    Maps a row of ``data_dim`` features to a 2-dimensional embedding through
    one hidden layer of ``data_dim`` units.
    """

    def __init__(self):
      super().__init__()
      self.lin1 = nn.Linear(data_dim, data_dim)
      self.lin2 = nn.Linear(data_dim, 2)
      self.relu = nn.ReLU()

    def modelS(self,inp):
      """Embed ``inp``: linear -> ReLU -> linear."""
      hidden = self.relu(self.lin1(inp))
      return self.lin2(hidden)

    def forward(self, inp1, inp2):
      """Return the embeddings of both inputs, in the same order."""
      first = self.modelS(inp1)
      second = self.modelS(inp2)
      return first, second

In [None]:
class ContrastiveLoss(nn.Module):
    """Contrastive loss for embedding pairs (margin 1).

    ``target`` is 1 for pairs that should be close and 0 for pairs that
    should be at least ``margin`` apart.
    """

    def __init__(self):
        super().__init__()
        self.margin = 1
        self.eps = 1e-9

    def forward(self, output1, output2, target):
        """Return the mean contrastive loss over the batch."""
        sq_dist = ((output2 - output1) ** 2).sum(dim=1)  # squared distances
        similar_weight = target.float()
        dissimilar_weight = (1 + -1 * target).float()
        # eps keeps the square root differentiable at zero distance.
        margin_gap = F.relu(self.margin - (sq_dist + self.eps).sqrt())
        per_pair = 0.5 * (similar_weight * sq_dist + dissimilar_weight * margin_gap.pow(2))
        return per_pair.mean()

**Training**

In [None]:
##Classifier performance without data augmentation
# Baseline: XGBoost trained on the original train set only (no generated samples).
# As the last expression of the cell, the returned (y_pred, Precision, Recall, f1)
# tuple is also shown as the cell output.
runXGB(train_df, test_df)

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Balanced class weights for the two classes of the train set.
# Keyword arguments are required: current scikit-learn releases reject
# positional arguments for compute_class_weight.
allc = [0,1]
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array(allc),
    y=train_df[label_cols].values.flatten(),
)

# Never down-weight a class: clamp every weight to at least 1.
class_weights = np.maximum(class_weights, 1)

class_weights

array([  1.        , 289.77325581])

In [None]:
# Initialize generator and discriminator and classifier
generator = Generator()
discriminator = Discriminator()
if loss_fn == 'Triplet':
  loss_triplet = TripletLoss()
  triplet = Triplet()
if loss_fn == 'Siamese':  
  loss_sia = ContrastiveLoss()
  siamese = Siamese()

classifier = MLP()
class_weights=torch.FloatTensor(class_weights).cuda()
auxiliary_loss = torch.nn.CrossEntropyLoss(class_weights)

if cuda:
    generator.cuda()
    discriminator.cuda()
    classifier.cuda()
    auxiliary_loss.cuda()  
    
    if loss_fn == 'Triplet':
      loss_triplet.cuda()
      triplet.cuda()
    if loss_fn == 'Siamese':  
      loss_sia.cuda()
      siamese.cuda()    

# Optimizers
optimizer_G = torch.optim.Adam(generator.parameters(), lr=learning_rate, betas=(b1, b2))
optimizer_D = torch.optim.Adam(discriminator.parameters(), lr=learning_rate, betas=(b1, b2))
if loss_fn == 'Triplet':
  optimizer_T = torch.optim.Adam(triplet.parameters(), lr=learning_rate, betas=(b1, b2))
if loss_fn == 'Siamese':  
  optimizer_S = torch.optim.Adam(list(siamese.parameters())+list(generator.parameters()), lr=learning_rate, betas=(b1, b2))
Tensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor 

cls_optim = torch.optim.Adam(list(classifier.parameters())+list(generator.parameters()), lr=1e-3, betas=(0.9, 0.999),weight_decay=1e-3)
cls_schedular=StepLR(cls_optim,step_size=200,gamma=0.9)

In [None]:
#dataloader for training
# Loader over the (cluster-labelled) fraud rows used to train the GAN.
data = torch.tensor(train[data_cols].values)
label = torch.tensor(train[label_cols].values)
train_data = TensorDataset(data, label)
train_loader = DataLoader(train_data, batch_size, shuffle=True)

# Loader over the non-fraud rows of the train set.
non_fraud_mask = train_df['Class'] == 0
real_data = torch.tensor(train_df.loc[non_fraud_mask][data_cols].values)
real_label = torch.tensor(train_df.loc[non_fraud_mask][label_cols].values)
real_samples = TensorDataset(real_data, real_label)
real_loader = DataLoader(real_samples, batch_size, shuffle=True)

In [None]:
# WGAN-GP training loop. Each iteration performs:
#   (1) k_d discriminator (critic) updates with gradient penalty,
#   (2) an optional triplet / siamese network update,
#   (3) a classifier update that is currently disabled (see the note below),
#   (4) one generator update,
# and every log_interval iterations prints the losses and calls sample().
# NOTE: `epoch` counts loop iterations (one batch each), not passes over the data.
for epoch in range(nb_steps): 
  # (1) Update D network
  for p in discriminator.parameters():  # reset requires_grad
    p.requires_grad = True  # they are set to False below in netG update

  for iter_d in range(k_d):
    # A new iterator is built on every call, so with shuffle=True this draws
    # the first batch of a fresh shuffle of the fraud rows.
    x,l = next(iter(train_loader))
    x = Variable(x.type(Tensor))          
    l = Variable(l.type(Tensor))
    z = Variable(Tensor(np.random.normal(0, 1, (x.shape[0], rand_dim))))

    # detach(): the critic update must not back-propagate into the generator.
    if condition:    
      _x = generator(torch.cat([z,l], -1)).detach()
    else:
      _x = generator(z).detach()

    optimizer_D.zero_grad()

    if condition:
      D_real = torch.mean(discriminator(torch.cat([x,l], -1)))
      D_fake = torch.mean(discriminator(torch.cat([_x,l], -1)))

    else:
      D_real = torch.mean(discriminator(x))
      D_fake = torch.mean(discriminator(_x))

    #WGAN    
    # Critic loss: mean score on fakes minus mean score on reals, plus the penalty.
    gradient_penalty = calc_gradient_penalty(discriminator, x.data, _x.data, l)
    D_cost = D_fake - D_real + gradient_penalty
    Wasserstein_D = D_real - D_fake

    D_cost.backward()
    optimizer_D.step()          

  for p in discriminator.parameters():
      p.requires_grad = False  # to avoid gradient computation

  # (2) train siamese/triplet network  
  # x: batch of fraud rows, y: batch of non-fraud rows, _x: generated rows.
  x,l = next(iter(train_loader))
  x = Variable(x.type(Tensor))          
  l = Variable(l.type(Tensor))
  z = Variable(Tensor(np.random.normal(0, 1, (x.shape[0], rand_dim))))

  y,_ = next(iter(real_loader))
  y = Variable(y.type(Tensor)) 

  # _x is detached here, so the losses in this step produce no gradient
  # for the generator.
  if condition:    
    _x = generator(torch.cat([z,l], -1)).detach()
  else:
    _x = generator(z).detach()  

  if loss_fn=='Triplet' and use_network == True:    
    optimizer_T.zero_grad()      
    #triplet loss
    # anchor = generated, positive = real fraud, negative = non-fraud
    o1,o2,o3 = triplet(_x,x,y)
    T_loss = loss_triplet(o1,o2,o3)
    T_loss.backward()
    optimizer_T.step()

  # The siamese network is only trained after the first 1000 iterations.
  if loss_fn=='Siamese' and epoch > 1000:    
    optimizer_S.zero_grad()      
    #Siamese loss
    # target 1: generated vs. real fraud should be close;
    # target 0: generated vs. non-fraud should be apart.
    o1,o2 = siamese(_x,x)
    S_loss = loss_sia(o1.cpu(),o2.cpu(), torch.ones(x.size()[0]))
    o1, o2 = siamese(_x, y)
    S_loss += loss_sia(o1.cpu(),o2.cpu(), torch.zeros(x.size()[0]))
    S_loss.backward()
    optimizer_S.step()

  # (3) Update classifier network  
  # NOTE: `epoch` starts at 0, so this condition is never true and the
  # classifier update below never runs.
  if epoch < 0:
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor  
    dtrain = np.vstack( [x.cpu(), _x.cpu(), y.cpu()] )
    dlabels = np.hstack( [ np.ones(int(len(x))+int(len(_x))), np.zeros(int(len(y))) ] )
    # dtrain = np.vstack( [_x.cpu(), y.cpu()] )
    # dlabels = np.hstack( [ np.ones(int(len(_x))), np.zeros(int(len(y))) ] )    
    dtrain=(torch.from_numpy(dtrain)).type(FloatTensor)
    dlabels=(torch.from_numpy(dlabels)).type(LongTensor)  

    cls_schedular.step()
    cls_optim.zero_grad()  
    cls_out=classifier(dtrain)
    c_loss=auxiliary_loss(cls_out, dlabels)
    c_loss.backward()
    cls_optim.step()         

  # (4) Update G network  
  generator.zero_grad()      

  # Regenerate from the same z (and l) without detach so that gradients
  # reach the generator.
  if condition:
    _x = generator(torch.cat([z,l],-1))
    D_fake = discriminator(torch.cat([_x,l],-1))
  else:
    _x = generator(z)
    D_fake = discriminator(_x)

  # WGAN
  # Generator loss: negative mean critic score of the generated rows.
  G_cost = -torch.mean( D_fake )

  # Without the embedding network, the triplet loss is applied directly to
  # the feature vectors and added to the generator loss. With
  # use_network == True nothing is added here (that code is commented out below).
  if loss_fn == 'Triplet' and use_network == False:
    T_loss = loss_triplet(_x,x,y)
    G_cost += T_loss

  # #triplet loss
  # if loss_fn == 'Triplet':
  #   if use_network:
  #     o1,o2,o3 = triplet(_x,x,y)
  #     T_loss = loss_triplet(o1,o2,o3)
  #   else:      
  #     T_loss = loss_triplet(_x,x,y)
  #   G_cost += T_loss

  # #siamese loss
  # if loss_fn == 'Siamese':
  #   o1,o2 = siamese(_x,x)
  #   S_loss = loss_sia(o1.cpu(),o2.cpu(), torch.ones(x.size()[0]))
  #   o1, o2 = siamese(_x, y)
  #   S_loss += loss_sia(o1.cpu(),o2.cpu(), torch.zeros(x.size()[0]))
  #   S_loss = S_loss.cuda()
  #   G_cost += 1.0 * S_loss

  G_cost.backward()  
  optimizer_G.step()          

  if epoch % log_interval == 0:
    print("========================================================================================================")
    # D_cost and Wasserstein_D are the values from the last of the k_d critic updates.
    print(
        "[Epoch %d/%d] [D loss: %f] [Wasserstein_D loss: %f] [G loss: %f]" % (epoch, nb_steps, D_cost.item(),  Wasserstein_D.item(), G_cost.item())
    )          
    #check the quality of generated samples
    sample()