In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import networkx as nx

In [None]:
df = pd.read_csv('/Users/brandonowens/Downloads/Fraud_Payments 2/data/fraud_payment_data', sep=',', header=0)
df

In [None]:
#Converting time into a Datetime object
df['timestamp'] = pd.to_datetime(df['Time_step'])
#Removing redudant columns
df=df.drop('Time_step',axis=1)
df=df.drop('Sender_lob',axis=1)
df=df.drop('Sender_Sector',axis=1)
#Apparently some transactions amounted to zero dollars. None of them were fraudulent, so I've removed them.
df=df[df.USD_amount>0]
df

In [5]:
#we will take 70% of our dataset as training data, 15% as a validation set and use the remaining 15% as our test set.
cutoff = round(0.7*len(df)) 
df_train = df.head(cutoff)
not_train =  df.tail(len(df)-cutoff)
cutoff2 = round(0.5*len(df_train))
df_val = df.head(cutoff2)
df_test = df.tail(len(not_train)-cutoff2)

In [6]:
# Make sure that the distribution of fraudulent/legitimate transactions are consistent across the three different sets.
print('The distribution of fraud for the train data is:\n', df_train['Label'].value_counts(normalize=True))
print('The distribution of fraud for the validation set is:\n', df_val['Label'].value_counts(normalize=True))
print('The distribution of fraud for the test set is:\n', df_test['Label'].value_counts(normalize=True))

The distribution of fraud for the train data is:
 Label
0    0.97893
1    0.02107
Name: proportion, dtype: float64
The distribution of fraud for the validation set is:
 Label
0    0.979411
1    0.020589
Name: proportion, dtype: float64
The distribution of fraud for the test set is:
 Label
0    0.979412
1    0.020588
Name: proportion, dtype: float64


Looks good.

In [7]:
def network_grapher(dataframe):
    # create graph network from given dataframe
    G = nx.from_pandas_edgelist(
    dataframe,
    source = 'Sender_Account',
    target = 'Bene_Account',
    edge_attr=['USD_amount', 'Label', 'timestamp'],
    create_using=nx.DiGraph())
    return G


def dataframe_feature_engineerer(graph, dataframe):
    # augments a data frame with some engineered features

    # First we generate dataframes to store all the info.
    # Then we merge all of the dataframes together into one.
    # Then we merge the original dataframe with our dataframe consisting of engineered features.



    ## Feature Engineering 


    # determines which accounts are known to have been involved in a fraudulent transaction
    known_fraud_accounts = set(dataframe[dataframe['Label'] == 1]['Sender_Account']).union(set(dataframe[dataframe['Label'] == 1]['Bene_Account']))
    df_known_fraud = pd.DataFrame(known_fraud_accounts, columns=['Account'])
    df_known_fraud['is_known_fraud'] = 1

    # stores number of transactions going into each account
    df_in_degree = pd.DataFrame(graph.in_degree(), columns = ['Account', 'in_degree'])

    # stores number of tranactions going out of each account
    df_out_degree = pd.DataFrame(graph.out_degree(), columns = ['Account', 'out_degree'])

    # "computes a ranking of the nodes in the graph G based on the structure of the incoming links" (from networkx documentation)
    # accounts with high pagerank are highly connected. Could expose fraud rings?
    df_pagerank = pd.DataFrame(nx.pagerank(graph, weight='USD_amount').items(), columns=['Account', 'pagerank'])

    # another attempt to expose fraud rings. Google suggested I try it...
    # computes clustering coefficient (something to do with triangles...)
    # If you look at the augmented dataframe I generate later on, you'll see that the clustering coefficient is zero in the entire training set. maybe this isn't very useful...

    df_clustering = pd.DataFrame(nx.clustering(graph.to_undirected()).items(), columns=['Account', 'clustering_coeff'])



    df_first_num = pd.DataFrame(  ,columns = ['Account', 'first_num'])



    #df_katz_centrality = pd.DataFrame(nx.katz_centrality(graph).items(), columns=['Account', 'katz_centrality'])

    
    
    ## Merging the dataframes for engineered features
    account_features = (
    df_in_degree.merge(df_out_degree, on='Account', how='outer')
    .merge(df_pagerank, on='Account', how='outer')
    .merge(df_clustering, on='Account', how='outer')
    .merge(df_known_fraud, on='Account', how='left')
    .fillna(0) # Fill NaN values, assuming 0 for accounts without a specific feature
    )
     #.merge(df_katz_centrality, on='account', how = 'outer')

    # Rename columns for clarity before merging into transactions
    account_features_sender = account_features.add_prefix('Sender_')
    account_features_benefactor = account_features.add_prefix('Bene_')

    


    ## Merging features back into the main transaction DataFrame
    dataframe = dataframe.merge(account_features_sender, on='Sender_Account', how='left')
    new_dataframe = dataframe.merge(account_features_benefactor, on='Bene_Account', how='left')


    return new_dataframe



In [8]:
new_graph = network_grapher(df_train)

In [9]:
aug_df = dataframe_feature_engineerer(new_graph, df_train)

In [None]:
aug_df

In [11]:
aug_df.loc[aug_df['Sender_clustering_coeff']>1]

Unnamed: 0,Transaction_Id,Sender_Id,Sender_Account,Sender_Country,Bene_Id,Bene_Account,Bene_Country,USD_amount,Label,Transaction_Type,...,Sender_in_degree,Sender_out_degree,Sender_pagerank,Sender_clustering_coeff,Sender_is_known_fraud,Bene_in_degree,Bene_out_degree,Bene_pagerank,Bene_clustering_coeff,Bene_is_known_fraud


In [12]:

features = ['USD_amount','Sender_in_degree',
 'Sender_out_degree',
 'Sender_pagerank',
 'Sender_clustering_coeff',
 'Sender_is_known_fraud',
 'Bene_in_degree',
 'Bene_out_degree',
 'Bene_pagerank',
 'Bene_clustering_coeff',
 'Bene_is_known_fraud']

In [13]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV



In [31]:
fraud_count = aug_df['Label'].sum()
normal_count = len(aug_df['Label']) - fraud_count
scale_pos_weight = normal_count / fraud_count

xgb_search = GridSearchCV(XGBClassifier(random_state = 831, scale_pos_weight = scale_pos_weight ),
                       param_grid= {'learning_rate':[0.01, 0.1, 1,],
                       'n_estimators': [100, 300],
                       'max_depth': [10,30,50]},
                                    scoring = 'f1',
                                    cv = 5)

In [32]:
xgb_search.fit(aug_df[features], aug_df['Label'])

0,1,2
,estimator,"XGBClassifier...tate=831, ...)"
,param_grid,"{'learning_rate': [0.01, 0.1, ...], 'max_depth': [10, 30, ...], 'n_estimators': [100, 300]}"
,scoring,'f1'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [33]:
xgb_search.best_params_

{'learning_rate': 0.01, 'max_depth': 50, 'n_estimators': 300}

In [34]:
xgb_search.best_score_

np.float64(0.25899845568321456)

In [35]:
pd.DataFrame({'feature importance score':xgb_search.best_estimator_.feature_importances_}, index=features).sort_values(by= 'feature importance score', ascending=False)


Unnamed: 0,feature importance score
Bene_is_known_fraud,0.744161
Sender_is_known_fraud,0.255596
Bene_out_degree,5.4e-05
Bene_in_degree,4.1e-05
Sender_out_degree,3.3e-05
USD_amount,3e-05
Bene_pagerank,2.9e-05
Sender_pagerank,2.9e-05
Sender_in_degree,2.7e-05
Sender_clustering_coeff,0.0


In [25]:

print(f"Scale position weight for XGBoost: {scale_pos_weight}")

Scale position weight for XGBoost: 46.461440984793626


Area Under the Precision-Recall Curve as performance metric??