In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors
import re
import random

In [3]:
def smote(n,input_path):

    data = input_path
    df1 = pd.read_csv(data)
    
    
    df1 = df1.loc[~(df1['simple_journal'] == 'Refused')]
    df1.loc[df1['simple_journal'] == 'Chargeback', 'simple_journal'] = 1 #fraud
    df1.loc[df1['simple_journal'] == 'Settled', 'simple_journal'] = 0

    df1.loc[df1['cardverificationcodesupplied'].isna(),'cardverificationcodesupplied'] = False
    df1.loc[df1['issuercountrycode'].isna(),'issuercountrycode'] = 'ZZ'
    df1.loc[df1['shoppercountrycode'].isna(),'shoppercountrycode'] = 'ZZ'

    bin_enc = LabelEncoder()
    bin_enc.fit(df1['bin'].unique())
    df1['bin'] = bin_enc.transform(df1.bin)


    card_enc = LabelEncoder()
    card_enc.fit(df1['card_id'])
    df1['card_id'] = card_enc.transform(df1.card_id)

    ip_enc = LabelEncoder()
    ip_enc.fit(df1['ip_id'])
    df1['ip_id'] = ip_enc.transform(df1.ip_id)

    df1['creationdate'] = pd.to_datetime(df1['creationdate'])
    df1['date'] = df1['creationdate'].dt.date

    unique_issuer_cc = df1['issuercountrycode'].unique()
    unique_shopper_cc = df1['shoppercountrycode'].unique()
    both = np.append(unique_issuer_cc, unique_shopper_cc)
    df_countrycodes = pd.DataFrame(both)
    unique_codes = df_countrycodes[0].unique()
    enc = LabelEncoder()
    enc.fit(unique_codes)
    df1['issuercountrycode'] = enc.transform(df1.issuercountrycode)
    df1['shoppercountrycode'] = enc.transform(df1.shoppercountrycode)

    enc1 = LabelEncoder()
    enc1.fit(df1['txvariantcode'])
    df1['txvariantcode'] = enc1.transform(df1.txvariantcode)

    enc2 = LabelEncoder()
    enc2.fit(df1['currencycode'])
    df1['currencycode'] = enc2.transform(df1.currencycode)

    enc3 = LabelEncoder()
    enc3.fit(df1['shopperinteraction'])
    df1['shopperinteraction'] = enc3.transform(df1.shopperinteraction)

    df1['accountcode'] = df1['accountcode'].apply(lambda x: re.sub('Account','',x))
    df1['accountcode_cc'] = 0
    df1.loc[(df1['accountcode'] == 'UK'),'accountcode_cc'] = 'GB'
    df1.loc[(df1['accountcode'] == 'Mexico'),'accountcode_cc'] = 'MX'
    df1.loc[(df1['accountcode'] == 'Sweden'),'accountcode_cc'] = 'SE'
    df1.loc[(df1['accountcode'] == 'APAC'),'accountcode_cc'] = 'APAC'

    enc4 = LabelEncoder()
    enc4.fit(df1['accountcode'])
    df1['accountcode'] = enc4.transform(df1.accountcode)

    enc5 = LabelEncoder()
    enc5.fit(df1['cardverificationcodesupplied'])
    df1['cardverificationcodesupplied'] = enc5.transform(df1.cardverificationcodesupplied)

    df1.loc[df1['mail_id'].str.contains('na',case=False),'mail_id'] = 'email99999'

    enc6 = LabelEncoder()
    enc6.fit(df1['mail_id'])
    df1['mail_id'] = enc6.transform(df1.mail_id)

    df1['bookingdate'] = pd.to_datetime(df1['bookingdate'])

    df1.loc[df1['cvcresponsecode'] > 2,'cvcresponsecode'] = 3

    df1['countries_equal'] = (df1['shoppercountrycode'] == df1['issuercountrycode'])
    df1.loc[df1['countries_equal'] == False,'countries_equal'] = 0
    df1.loc[df1['countries_equal'] == True,'countries_equal'] = 1

    df1['day_of_week'] = df1['creationdate'].dt.dayofweek

    df1['hour'] = df1['creationdate'].dt.hour

    dates= df1["creationdate"]
    dates=pd.DatetimeIndex(dates)
    dates=dates.astype(np.int64) // 10**9
    df1['creationdate']=dates

    df1.drop(columns=['txid','bookingdate','date','accountcode_cc'],inplace=True)

    df_fraud= df1.loc[df1["simple_journal"]==1]
    
    df_notfraud = df1.loc[df1["simple_journal"]==0]
    
    df_fraudmat = df_fraud.values

    nbrs = NearestNeighbors(n_neighbors=5, algorithm='brute').fit(df_fraudmat)
    distances, indices = nbrs.kneighbors(df_fraudmat)

    synthetic=[]
    
    T = len(df_fraudmat)
    N = n
    
    for i in range(T):
        value = random.sample(range(1, 4), N)
        for j in value:
            ind = indices[i][j]
            diff= df_fraudmat[ind]-df_fraudmat[i]
            synthetic.append((df_fraudmat[i]+(random.uniform(0, 1))*diff).astype(int))

    synthetic=pd.DataFrame(synthetic,columns=['issuercountrycode', 'txvariantcode', 'bin', 'amount', 'currencycode',
       'shoppercountrycode', 'shopperinteraction', 'simple_journal',
       'cardverificationcodesupplied', 'cvcresponsecode', 'creationdate',
       'accountcode', 'mail_id', 'ip_id', 'card_id', 'countries_equal',
       'day_of_week', 'hour'])       
    
    DF_Fraud = pd.concat([df_fraud,synthetic])
    
    DF_ALL = pd.concat([DF_Fraud, df_notfraud])
    
    def conv(row):
        currency_dict = {0: 0.86248, 1: 1.5911, 2: 21.2829, 3: 1.6805, 4: 10.635}
        return row['amount'] / (currency_dict[row['currencycode']]*100)

    DF_Fraud['amount_eur'] = DF_Fraud.apply(lambda x: conv(x), axis=1)
    
    DF_ALL['amount_eur'] = DF_ALL.apply(lambda x: conv(x), axis=1)
    
    return DF_Fraud,DF_ALL 

In [4]:
DF_fraud,DF_all=smote(2,"data_for_student_case.csv")

In [5]:
len(DF_fraud),len(DF_all)

(1035, 237726)

In [6]:
df_all_mat = DF_all.values
nbrs = NearestNeighbors(n_neighbors=2, algorithm='auto').fit(df_all_mat)
distances, indices = nbrs.kneighbors(df_all_mat[:1035])

In [7]:
notneeded=[]
for i in indices:
    if i[1]>1035:
        notneeded.append(i[0])

In [19]:
needed = list(set(np.arange(1035)) - set(notneeded))
len(needed)

636

In [25]:
needed_df = pd.DataFrame([DF_fraud.iloc[i] for i in needed])

In [26]:
#removed tomak links.
needed_df

Unnamed: 0,issuercountrycode,txvariantcode,bin,amount,currencycode,shoppercountrycode,shopperinteraction,simple_journal,cardverificationcodesupplied,cvcresponsecode,creationdate,accountcode,mail_id,ip_id,card_id,countries_equal,day_of_week,hour,amount_eur
1,104.0,2.0,2364.0,44900.0,2.0,104.0,1.0,1.0,1.0,0.0,1.435813e+09,1.0,834.0,194109.0,44739.0,1.0,3.0,4.0,21.096749
2,104.0,2.0,1965.0,149900.0,2.0,104.0,1.0,1.0,1.0,0.0,1.435847e+09,1.0,119287.0,189655.0,122802.0,1.0,3.0,14.0,70.432131
4,104.0,6.0,1301.0,89900.0,2.0,104.0,1.0,1.0,1.0,0.0,1.436381e+09,1.0,70647.0,196247.0,208481.0,1.0,2.0,18.0,42.240484
6,104.0,6.0,1301.0,69900.0,2.0,104.0,1.0,1.0,1.0,0.0,1.436409e+09,1.0,24698.0,196216.0,130614.0,1.0,3.0,2.0,32.843269
7,104.0,2.0,1781.0,129900.0,2.0,104.0,1.0,1.0,1.0,0.0,1.436418e+09,1.0,130776.0,187845.0,185092.0,1.0,3.0,4.0,61.034915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681,44.0,3.0,495.0,601290.0,4.0,125.0,1.0,1.0,1.0,1.0,1.444698e+09,2.0,21544.0,33778.0,208180.0,0.0,1.0,0.0,565.387870
682,91.0,2.0,2220.0,159147.0,2.0,110.0,1.0,1.0,1.0,0.0,1.445564e+09,1.0,79371.0,17312.0,170630.0,0.0,3.0,6.0,74.776934
683,96.0,2.0,2284.0,146306.0,2.0,107.0,1.0,1.0,1.0,0.0,1.445419e+09,1.0,110979.0,12808.0,152199.0,0.0,2.0,5.0,68.743451
684,125.0,2.0,2393.0,224643.0,3.0,125.0,1.0,1.0,1.0,0.0,1.445805e+09,1.0,147925.0,80273.0,101692.0,1.0,4.0,15.0,1336.762868
