In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import roc_auc_score, recall_score, average_precision_score, precision_score

In [2]:
import seaborn as sns
from sklearn.metrics import roc_auc_score, recall_score, average_precision_score, precision_score

def generate_date_features(transactions_df:pd.DataFrame) -> pd.DataFrame:
    """Generates features from the creation date of a transaction

    Args:
        transactions_df (pd.DataFrame): The transaction dataframe

    Returns:
        pd.DataFrame: The transaction dataframe with the new features
    """
    transactions_df['time_bin'] = pd.qcut((transactions_df['CREATED_DATE'].dt.hour * 60 + transactions_df['CREATED_DATE'].dt.minute) * 60 + transactions_df['CREATED_DATE'].dt.minute, 20, labels=range(20)).astype('int')
    transactions_df['hour'] = transactions_df['CREATED_DATE'].dt.hour
    transactions_df['minute'  ] = transactions_df['CREATED_DATE'].dt.minute
    transactions_df['week'] = transactions_df['CREATED_DATE'].dt.isocalendar().week
    transactions_df['weekday'] = transactions_df['CREATED_DATE'].dt.weekday
    transactions_df['dayofyear'] = transactions_df['CREATED_DATE'].dt.dayofyear
    transactions_df['dayofyear_hour_bin'] = pd.qcut((transactions_df['dayofyear'] * transactions_df['hour']).rank(method='first'), 20, labels=range(20)).astype('int')
    transactions_df['weekday_hour'] = transactions_df['weekday'] * transactions_df['hour']
    
    return transactions_df

def draw_features(transactions:pd.DataFrame, use_features:list) -> None:
    """Produces a factor plot of every feature used where the percentage for each feature is

    Args:
        transactions (pd.DataFrame): _description_
        use_features (list): _description_
    """
    for feat in use_features:
        agg = transactions.groupby(['is_fraud', feat]).agg({'ID': 'count'})
        df = agg.groupby(level=0).apply(lambda x: x / float(x.sum())).reset_index()
        plt.figure(figsize=(14,10))
        ax = sns.factorplot(x=feat, y='ID', hue='is_fraud', data=df, kind='bar', size=6, aspect=2)
        ax.fig.suptitle(feat.upper()+' by is_fraud - % of transactions')
        ax.set_xticklabels(rotation=20)
        plt.show()

def model_metrics(y_v: list, y_t:list, score:list) -> pd.Series:
    leng = int(y_v.index.nunique()+y_t.index.nunique())
    targ_m = np.mean(y_v[y_v==1].index.nunique() / y_v.index.nunique())
    targ_q = leng * targ_m
    gini = np.round(2*roc_auc_score(y_v, score)-1, 4)
    auc = np.round(roc_auc_score(y_v, score), 4)
    aucpr = np.round(average_precision_score(y_v, score), 4)
    return pd.Series([leng, targ_m, targ_q, gini, auc, aucpr], index=["Length", "Target Mean", "Targets", "Gini", "AUC", "AUCPR"])

def model_metrics_cut(y_v: list, y_t:list, score:list, cutoff) -> pd.Series:
    
    pred = np.sum((score>cutoff).astype(int))
    prec = precision_score(y_v, (score>cutoff).astype(int))
    rec = recall_score(y_v, (score>cutoff).astype(int))
    corr = prec * pred
    incorr = pred-corr
    lift = prec/np.mean(y_v)
    lift = np.round(lift, 4)
    prec = np.round(prec,4)
    rec = np.round(rec, 4)
    extr = np.round(y_t.index.nunique()/y_v.index.nunique()+1)
    pred = np.round(pred*extr)
    corr = np.round(corr*extr)
    incorr = np.round(incorr*extr)
    rev_tp = corr * revenue_tp
    cost_ul_fp = incorr * cost_unlock_fp
    cost_ch_fp = incorr * cost_check_fp
    return pd.Series([cutoff, pred,corr, incorr, prec,rec, lift, rev_tp, cost_ul_fp, cost_ch_fp], 
                     index=['Cutoff','Predicted', 'Correct', 'Incorrect', 'Precision', 'Recall','Lift', 'Revenue_Correct', 'Cost_Unlock_Incorrect', 'Cost_Check_Incorrect'])
    
def calc_profit(mod_metr_cut_df:pd.DataFrame, i:int, j:int)-> float:
    rev_tp = mod_metr_cut_df['Revenue_Correct'][j] - (mod_metr_cut_df['Correct'][j] - mod_metr_cut_df['Correct'][i]) * cost_check_fp
    cost_ul_fp = mod_metr_cut_df['Cost_Unlock_Incorrect'][i]
    cost_ch_fp = mod_metr_cut_df['Cost_Check_Incorrect'][j]-mod_metr_cut_df['Cost_Check_Incorrect'][i]
    profit = (rev_tp - cost_ul_fp - cost_ch_fp) 
    return profit

def maximize_profit(mod_metr_cut_df:pd.DataFrame, cutoff:list):
    max_profit = 0 
    best_lock = 0
    best_check = 0

    for i in tqdm(range(len(cutoff)-1)):
      for j in range(i + 1, len(cutoff)-2):
        profit = calc_profit(mod_metr_cut_df, i, j)
        if (profit) > max_profit:
            max_profit = profit
            best_lock = i
            best_check = j
            score_lock = mod_metr_cut_df['Cutoff'][i]
            score_check = mod_metr_cut_df['Cutoff'][j]
    return max_profit, score_lock, score_check, best_lock, best_check
  
def display_importances_xgb(model, use_features:list)-> None:
    fmap = {'f'+ str(index): x for index, x in enumerate(use_features, start=0)}
    fimp_df = pd.DataFrame(list(model.get_booster().get_fscore().items()))
    fimp_df.columns = ['feature', 'importance']
    fimp_df['importance']=fimp_df['importance']/np.sum(fimp_df['importance'])
    fimp_df['feature'] = fimp_df['feature'].map(fmap)

    plt.figure(figsize=(14, 10))
    sns.barplot(x="importance", y="feature", data=fimp_df.sort_values(by="importance", ascending=False))
    plt.title('Feature Importance')
    plt.tight_layout()
    

In [3]:
# Examine variables and definitions
parse_dates = ['CREATED_DATE']
users_df = pd.read_csv("../data/revolut/users.csv", parse_dates=parse_dates)
fraudsters_df = pd.read_csv("../data/revolut/fraudsters.csv")
transactions_df = pd.read_csv("../data/revolut/transactions.csv", parse_dates=parse_dates)

In [4]:
transactions_df.head()

Unnamed: 0,ID,USER_ID,CREATED_DATE,TYPE,STATE,AMOUNT_GBP,CURRENCY
0,f659b44e-cfdf-48de-bcf3-06f47ef26e9f,fd7f3ff6-0ed6-4a85-a7b5-2f205e0ef72f,2019-04-20 18:04:03.930,CARD_PAYMENT,COMPLETED,13.12,PLN
1,2ae18b8b-b9bc-4c44-96b1-d43efd8d371d,3979518e-95f7-4b6c-81ae-2f828727d81a,2019-05-03 13:09:57.625,TOPUP,REVERTED,0.01,RON
2,0162d352-dd18-40ab-b3ee-cf6584c9a238,75aa5388-9c89-4f72-bc54-67501519585b,2019-04-25 15:37:46.837,TOPUP,COMPLETED,10.0,GBP
3,a4e176f7-49ca-462b-9164-2f0645622148,45598164-6362-4ee4-bd70-ffee3bd1d707,2019-04-28 13:52:15.256,EXCHANGE,COMPLETED,0.11,RON
4,f6f9135f-fb2b-4a58-bb65-dd9713306a71,5a501ce5-f03c-410d-aabc-434b2cad741d,2019-05-13 16:02:12.081,CARD_PAYMENT,COMPLETED,9.79,EUR


In [5]:
users_df.head()

Unnamed: 0,ID,CREATED_DATE,COUNTRY,BIRTH_DATE
0,46f44852-aaa5-4634-aadd-8cc4eefef3c8,2019-04-22 18:30:30.735,BG,1984-10-22
1,f17dd8af-2edb-4415-a950-d90a1b5e3e5b,2019-04-15 02:44:24.940,IE,1984-11-04
2,55e6fcef-f573-4c54-8b27-537adc417e19,2019-04-03 16:10:44.530,PL,1977-09-08
3,dc03019c-9cf1-4081-a70a-6922a44fe393,2019-04-13 14:16:11.928,FR,1992-09-06
4,bcd967e5-c273-45a7-a7f5-e7c9e3b19b7e,2019-04-03 15:46:43.997,IE,1993-10-22


In [10]:
fraudsters_df.head()

Unnamed: 0,USER_ID
0,2c831c76-2d62-41ce-a240-e12f505d389a
1,ce2a1146-831e-49a7-aa5f-a3045a2892af
2,447abe11-f89a-4819-bea2-e7978b1cf560
3,3a186446-c2fb-474b-a8d8-db362643b3d2
4,73fa6100-f6f0-4e22-b247-714f4743c125


In [7]:
# dataset merging
users_df = users_df.rename({'ID':'USER_ID', 'CREATED_DATE':'USER_CREATED_DATE'}, axis='columns')
transactions_df = pd.merge(transactions_df, users_df, on=['USER_ID'], how='left')

In [8]:
# Set the fraud labels
transactions_df['is_fraud'] = transactions_df['USER_ID'].isin(fraudsters_df['USER_ID'].unique()).astype(int)
# Get the first digit of the transaction amount to check distribution under Benford's Law
transactions_df['first_digit'] = transactions_df['AMOUNT_GBP'].astype('str').str[0].astype('int')
# Also get the last digit incase transactions are structured in a way to commit fraud.
transactions_df['last_digit'] = transactions_df['AMOUNT_GBP'].astype('str').str[-1].astype('int')
# Get the number of times each user has topped up their account 
transactions_df['user_cnt_topups'] = transactions_df['USER_ID'].map(transactions_df[transactions_df['TYPE']=='TOPUP'].groupby('USER_ID')['ID'].count())
# Get the mean amount of each topup
transactions_df['user_mean_topups'] = pd.qcut(transactions_df['USER_ID'].map(transactions_df[transactions_df['TYPE']=='TOPUP'].groupby('USER_ID')['AMOUNT_GBP'].mean()), 15, labels=range(15)).astype(int)
transactions_df['user_std_topups'] = pd.qcut(transactions_df['USER_ID'].map(transactions_df[transactions_df['TYPE']=='TOPUP'].groupby('USER_ID')['AMOUNT_GBP'].std().fillna(0)), 15, labels=range(15)).astype(int)

# Get the number of unique states in which the user interacts. 
transactions_df['user_uniq_states'] = transactions_df['USER_ID'].map(transactions_df[transactions_df['TYPE']=='TOPUP'].groupby('USER_ID')['STATE'].nunique())
# Create interger mapping for each unique country
transactions_df['country'] = pd.factorize(transactions_df['COUNTRY'])[0]
tmp = transactions_df.groupby(['USER_ID']).agg({'CREATED_DATE': [np.min, np.max], 'ID': 'count'}).reset_index()
tmp.columns = ['USER_ID', 'MIN_DATE', 'MAX_DATE', 'COUNT']
tmp['mean_time_to_tran_min'] = (np.round((((tmp['MAX_DATE']-tmp['MIN_DATE']).dt.seconds % 3600) // 60) / tmp['COUNT'])).astype('int')
transactions_df['mean_time_to_tran_min'] = transactions_df['USER_ID'].map(tmp[['USER_ID', 'mean_time_to_tran_min']].set_index('USER_ID').squeeze())
transactions = generate_date_features(transactions_df)


In [9]:
txn = pd.melt(transactions_df, id_vars=['USER_ID'], value_vars=['TYPE', 'STATE', 'COUNTRY', 'CURRENCY', 'time_bin'])
txg = pd.melt(transactions, id_vars=['USER_ID'], value_vars=['TYPE', 'STATE', 'COUNTRY', 'CURRENCY', 'time_bin'])
txg['TOKEN'] = txg['variable'].astype(str) + "_" + txg["value"].astype(str)
txg = txg[['USER_ID', 'TOKEN']].groupby(['USER_ID', 'TOKEN']).size().reset_index()
txg.columns = ['USER_ID', 'TOKEN', 'TF']
usg = txg[['USER_ID', 'TOKEN']].groupby(['USER_ID']).size().reset_index()
usg.columns = ['USER_ID', 'IDF']
txg = txg.merge(usg, on='USER_ID')
txg['TF_IDF'] = txg['TF'] / txg['IDF']
txg = txg[['USER_ID', 'TOKEN', 'TF_IDF']]

In [11]:
import networkx as nx
from networkx.algorithms import bipartite

In [12]:
edge_list = list(txg.itertuples(index=False))


In [13]:
g= nx.Graph()
g.add_nodes_from(txg['USER_ID'], bipartite=0)
g.add_nodes_from(txg['TOKEN'], bipartite=1)
g.add_weighted_edges_from(edge_list)

In [18]:
transactions.shape

(1068361, 27)

In [14]:
g.number_of_nodes()

200447

In [15]:
nx.is_bipartite(g)

True

In [16]:
nx.is_connected(g)

True

In [17]:
bottom_nodes, top_nodes = bipartite.sets(g)

In [20]:
biadjacency = nx.algorithms.bipartite.matrix.biadjacency_matrix(g)
# names_row = graph.names_row
# names_col = graph.names_col
# louvain = Louvain(resolution=1.7, modularity='newman')

TypeError: biadjacency_matrix() missing 1 required positional argument: 'row_order'