In [782]:
import pandas as pd
import numpy as np
from numpy import random
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix,ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score
import pickle

                                                       Data Cleaning

In [784]:
# Load the raw payment transactions (comma-separated, column names on the first row).
total_data = pd.read_csv('fraud_payment_data')

In [785]:
# Remove the redundant columns in a single call.
unused_columns = ['Time_step', 'Sender_lob', 'Sender_Id', 'Bene_Id', 'Transaction_Id']
total_data = total_data.drop(columns=unused_columns)

# Some transactions amount to zero dollars. Per the original analysis none of them
# were fraudulent, so only strictly positive amounts are kept.
total_data = total_data.loc[total_data['USD_amount'] > 0]

# Per the original analysis, NaNs mark self-transactions: each missing side is
# filled from its counterpart (sender first, then beneficiary).
for target, source in (('Sender_Country', 'Bene_Country'), ('Bene_Country', 'Sender_Country')):
    total_data[target] = total_data[target].fillna(total_data[source])

# A missing sector gets the sentinel value -1.
total_data['Sender_Sector'] = total_data['Sender_Sector'].fillna(-1)

for target, source in (('Sender_Account', 'Bene_Account'), ('Bene_Account', 'Sender_Account')):
    total_data[target] = total_data[target].fillna(total_data[source])

# Re-number the rows 0..n-1 after filtering.
total_data = total_data.reset_index(drop=True)

                                                      Feature Engineering

In [745]:
# One-hot encode the transaction type (first category dropped) as 0/1 integers.
type_dummies = pd.get_dummies(total_data['Transaction_Type'], drop_first=True)
Type_feature = type_dummies.astype(int)

# Every country that appears on either side of a transaction, and every sender sector.
countries = list(set(total_data['Sender_Country']) | set(total_data['Bene_Country']))
sectors = list(set(total_data['Sender_Sector']))

In [747]:
#Used to model the probability distribution of the amount spent/received in a transaction given a historical average.
from scipy.stats import expon
def amount_prob(mean,amount):
    """Exponential CDF of `amount` for a distribution whose mean is `mean`.

    The amount is modelled as exponentially distributed with scale `mean`
    (the account's historical average). The value returned is
    P(X <= amount) = 1 - exp(-amount/mean): it is close to 0 for amounts far
    below the historical average and approaches 1 for amounts far above it.

    NOTE(review): the earlier comment described this as the probability of an
    amount *greater than or equal to* the observed one, which would be the
    survival function (`expon.sf`). The code has always returned the CDF; the
    behaviour is kept and only the description is corrected.

    Parameters
    ----------
    mean : float
        Historical average amount, used as the exponential scale.
    amount : float
        Observed transaction amount.

    Returns
    -------
    float
        P(X <= amount) under Exponential(scale=mean).
    """
    return expon.cdf(x=amount, scale=mean)

In [749]:
def min_path_to_fraud(sender_node,G):
    """Smallest number of hops from `sender_node` to an account with prior fraud.

    Hops are counted along outgoing edges of the transaction graph `G` and the
    search is limited to 5 hops. Every reachable node whose 'Fraud_count'
    attribute is positive is a candidate; the start node itself is part of the
    search result at distance 0, so an account with its own fraud history
    yields 0.

    Returns 100000000 when no such account is within reach, so that a feature
    derived as 1/(1+distance) becomes negligible.
    """
    hop_counts = nx.single_source_shortest_path_length(G, sender_node, cutoff=5)
    return min(
        (hops for account, hops in hop_counts.items() if G.nodes[account]['Fraud_count'] > 0),
        default=100000000,
    )

def fraud_centrality(node,G):
    """Fraction of the accounts within 5 outgoing hops of `node` that have prior fraud.

    An alternative connectedness measure to `min_path_to_fraud`. The reachable
    set includes `node` itself, so the denominator is never zero. The result is
    a fraction in [0, 1], not a percentage.
    """
    neighbourhood = nx.single_source_shortest_path_length(G, node, cutoff=5)
    flagged = sum(1 for account in neighbourhood if G.nodes[account]['Fraud_count'] > 0)
    return flagged / len(neighbourhood)

def repeat_fraud(G,sender_node,bene_node):
    """Return True if any recorded sender -> beneficiary transaction has Label == 1.

    `G[sender_node][bene_node]` is expected to map each parallel edge key to
    its attribute dict; the pair must already be connected in `G`.
    """
    parallel_edges = G[sender_node][bene_node]
    return any(attrs['Label'] == 1 for attrs in parallel_edges.values())

In [None]:
##Initiate graph
G = nx.MultiDiGraph()
##Running [fraud_rate, transaction_count] statistics per country and per sector.
##Each key must own its list: dict.fromkeys(keys, [0,0]) binds ONE shared list to every key,
##which would make all countries (and all sectors) report the same global rate.
country_dict={country: [0,0] for country in countries}
sector_dict={sector: [0,0] for sector in sectors}

##Features we aim to engineer
##bene/sender_prob is the amount_prob value for the account's historical average, Fraud_percentage_bene/sender
##is the share of fraudulent transactions among all transactions (ingoing and outgoing) the account has been
##involved in, Fraud_index_bene/sender is 1/(1+distance) with the distance from min_path_to_fraud, and
##Fraud_centrality comes from fraud_centrality.
bene_columns=['bene_prob','Fraud_percentage_bene','Fraud_index_bene',
              'Fraud_centrality_bene','bene_in_deg','bene_out_deg','fraud_rate_by_country_bene']
sender_columns=['sender_prob','Fraud_percentage_sender','Fraud_index_sender',
                'Fraud_centrality_sender','sender_in_deg','sender_out_deg','fraud_rate_by_country_sender']
##Repeat_Fraud comes from repeat_fraud.
general_columns=['Repeat_Fraud','fraud_rate_by_sector']

##Rows are collected in plain lists and converted to DataFrames only when saving:
##enlarging a DataFrame with .loc[index]=... on every transaction copies the frame each time.
bene_rows=[]
sender_rows=[]
general_rows=[]


def _update_rate(stats,label):
    """Fold one more transaction label into a running [rate, count] pair, in place."""
    stats[0]=(stats[0]*stats[1]+label)/(stats[1]+1)
    stats[1]+=1


def _account_features(graph,account,amount,incoming):
    """Return [prob, fraud_percentage, fraud_index, centrality, in_deg, out_deg] for one account.

    All values describe the graph BEFORE the current transaction is added.
    `incoming=True` compares `amount` with the account's average received
    amount (beneficiary side); `incoming=False` uses the average sent amount
    (sender side). Accounts never seen before get zeros throughout, and the
    prob feature is 0 when there is no history in the relevant direction.
    """
    if not graph.has_node(account):
        return [0,0,0,0,0,0]
    in_deg=graph.in_degree(account)
    out_deg=graph.out_degree(account)
    ##Nodes are only ever created together with an edge, so in_deg+out_deg>0 here.
    fraud_percentage=graph.nodes[account]['Fraud_count']/(in_deg+out_deg)
    centrality=fraud_centrality(account,graph)
    fraud_index=1/(1+min_path_to_fraud(account,graph))
    history_count=in_deg if incoming else out_deg
    if history_count>0:
        history_total=graph.nodes[account]['total_in' if incoming else 'total_out']
        prob=amount_prob(history_total/history_count,amount)
    else:
        prob=0
    return [prob,fraud_percentage,fraud_index,centrality,in_deg,out_deg]


def _feature_frames():
    """Build (features_bene, features_sender, features_general) from the rows collected so far."""
    processed=total_data.index[:len(general_rows)]
    return (pd.DataFrame(bene_rows,columns=bene_columns,index=processed),
            pd.DataFrame(sender_rows,columns=sender_columns,index=processed),
            pd.DataFrame(general_rows,columns=general_columns,index=processed))


for index, row in total_data.iterrows():
    if index%1000==0:
       print(index)
    if index%10000==0:
       ##Periodic checkpoint of the rows processed so far (without the Transaction_Type dummies).
       total=pd.concat(list(_feature_frames()),axis=1)
       total.to_csv('total_features', index=False)
       print('Progress Saved!')

    sender=row['Sender_Account']
    bene=row['Bene_Account']
    amount=row['USD_amount']
    label=row['Label']

    ##First build general features. Rates are read BEFORE this row's label is folded in,
    ##so a transaction never sees its own label.
    sender_country_stats=country_dict[row['Sender_Country']]
    bene_country_stats=country_dict[row['Bene_Country']]
    sector_stats=sector_dict[row['Sender_Sector']]
    fraud_rate_by_country_sender=sender_country_stats[0]
    fraud_rate_by_country_Bene=bene_country_stats[0]
    fraud_rate_by_sector=sector_stats[0]

    ##NOTE(review): when sender and beneficiary share a country the transaction is counted
    ##twice for that country, as in the original update sequence.
    _update_rate(sender_country_stats,label)
    _update_rate(bene_country_stats,label)
    _update_rate(sector_stats,label)

    new= not(G.has_edge(sender,bene))
    repeatfraud=(not new) and repeat_fraud(G,sender,bene)
    general_rows.append([repeatfraud,fraud_rate_by_sector])

    ##Build features related to the sender account (outgoing history)
    check1=G.has_node(sender)
    sender_rows.append(_account_features(G,sender,amount,incoming=False)+[fraud_rate_by_country_sender])

    ##Repeat the same for the beneficiary account (incoming history)
    check2=G.has_node(bene)
    bene_rows.append(_account_features(G,bene,amount,incoming=True)+[fraud_rate_by_country_Bene])

    check3=(sender==bene)#For self-transactions
    ##Add/update edges and nodes in the graph corresponding to the transaction
    if check1:
      G.nodes[sender]['total_out']+=amount
      G.nodes[sender]['Fraud_count']+=label
    else:
      G.add_node(sender, total_out=amount, total_in=0, Fraud_count=label)
    if check2 or check3:
      ##For a self-transaction the node was just created/updated above, so update it rather than re-add it.
      G.nodes[bene]['total_in']+=amount
      G.nodes[bene]['Fraud_count']+=label
    else:
      G.add_node(bene, total_in=amount, total_out=0, Fraud_count=label)

    G.add_edge(sender, bene,Label=label)

features_bene,features_sender,features_general=_feature_frames()
total=pd.concat([features_bene,features_sender,features_general,Type_feature],axis=1)
total.to_csv('total_features', index=False)


In [None]:
##Persist the transaction graph to disk as a .pickle file
with open('Graph_total.pickle', 'wb') as graph_file:
    pickle.dump(G, graph_file)

                                                Training and Calibration of Models

In [751]:
##Optimizing XGBoost: exhaustive grid search scored by F1 with 5-fold cross-validation.
##The grid has 3 x 4 x 6 = 72 combinations, i.e. 360 cross-validation fits.
xgb_search = GridSearchCV(
    estimator=XGBClassifier(random_state=831, scale_pos_weight=49),
    param_grid=dict(
        learning_rate=[0.01, 0.1, 1],
        n_estimators=[100, 300, 500, 700],
        max_depth=[10, 30, 50, 70, 90, 100],
    ),
    scoring='f1',
    cv=5,
)

In [753]:
##Load the engineered features written to 'total_features' (comma-separated, header row first).
final_features = pd.read_csv('total_features')

In [None]:
##Display the engineered feature table loaded from 'total_features'.
final_features

In [757]:
##Split for train, validate and test data. Rows are taken in file order (no shuffling):
##the first 1,000,000 rows train, the next 250,000 validate, and everything after that tests.
TRAIN_END=1000000
VALIDATE_END=1250000

##Labels are matched to features purely by row position, so both tables must have the same length.
assert len(final_features)==len(total_data), (
    "final_features and total_data have different lengths; labels would be misaligned")

train_features=final_features[0:TRAIN_END]
validate_features=final_features[TRAIN_END:VALIDATE_END]
##Open-ended slice: the previous [1250000:-1] silently dropped the last row.
test_features=final_features[VALIDATE_END:]

y_train=total_data['Label'][0:TRAIN_END]
y_validate=total_data['Label'][TRAIN_END:VALIDATE_END]
y_test=total_data['Label'][VALIDATE_END:]

##Scaler for eventual Logistic regression. It is fitted on the training features only and kept
##in `scaler` so validation/test data can be transformed with the same statistics.
scaler=StandardScaler().fit(train_features.values)
X=scaler.transform(train_features.values)

In [None]:
##Correlation matrix of the training features together with the fraud label
train_labels = total_data[:1000000]['Label']
train_with_label = pd.concat([train_features, train_labels], axis=1)
corr_matrix = train_with_label.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(data=corr_matrix, annot=False, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

In [None]:
##Run the grid search on the scaled training features and training labels.
##This fits every parameter combination of xgb_search under cross-validation, so it is expensive.
xgb_search.fit(X,y_train)

In [None]:
##Report the winning hyper-parameters, then their mean cross-validated score.
for result_attr in ('best_params_', 'best_score_'):
    print(getattr(xgb_search, result_attr))

                                                Results and Visualizations

In [None]:
##Model testing: an unpenalised, class-balanced logistic regression and an XGBoost
##classifier with hand-picked hyper-parameters.
log_reg = LogisticRegression(class_weight='balanced', penalty=None)
xgb_params = {
    'learning_rate': 0.01,
    'max_depth': 70,
    'n_estimators': 700,
    'scale_pos_weight': 50,
}
xgb = XGBClassifier(**xgb_params)

## fit both models on the scaled training features
xgb.fit(X, y_train)
log_reg.fit(X, y_train)

In [None]:
##Test
## Scale the test features with the mean/variance of the TRAINING features, i.e. the same
## transformation that produced the X the models were fitted on. Fitting a fresh scaler on
## the test set would feed the models differently-scaled inputs and use test-set statistics.
X_test_scaled = StandardScaler().fit(train_features.values).transform(test_features.values)

## Each model is evaluated at its own decision cutoff.
for model, cutoff in ((log_reg, 0.85), (xgb, 0.5)):
    ## store the predicted probabilities of the positive class
    y_prob = model.predict_proba(X_test_scaled)[:,1]
    ## assign the value based on the cutoff
    y_pred = 1*(y_prob >= cutoff)
    print(classification_report(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title(f'Confusion Matrix for {model.__class__.__name__}')
    plt.show()

In [None]:
##Generate Recall-Lift data for logistic regression (balancing for the class imbalance).
##Variable recall is obtained by varying the cutoff from 0.00 to 0.99.
lift_scores=np.zeros(100)
recall_scores=np.zeros(100)
log_reg = LogisticRegression(class_weight='balanced', penalty=None)
log_reg.fit(X,y_train)
## Scale the test features with the TRAINING statistics (same transformation as X);
## refitting the scaler on the test set would not match what the model was trained on.
X_test_scaled = StandardScaler().fit(train_features.values).transform(test_features.values)
y_prob = log_reg.predict_proba(X_test_scaled)[:,1]
for i in range(100):
    y_pred = 1*(y_prob >= i/100)
    recall = recall_score(y_test, y_pred)
    ## labels=[0,1] keeps the matrix 2x2 even if only one class occurs at this cutoff
    cm = confusion_matrix(y_test, y_pred, labels=[0,1])
    tn, fp, fn, tp = cm.ravel()
    ## predicted-positive rate: share of all test rows flagged as fraud
    ppr=(tp+fp)/(tn+fp+fn+tp)
    ## lift is undefined when nothing is flagged; store NaN explicitly instead of dividing by zero
    lift=recall/ppr if ppr>0 else np.nan
    recall_scores[i]=recall
    lift_scores[i]=lift

In [None]:
##Generate Recall-Lift graph for XGBoost models at different cutoffs (balancing for the class imbalance).
##Variable recall is obtained by varying the weight of the positive class (scale_pos_weight = 4**i).
## Scale the test features with the TRAINING statistics (same transformation as X);
## refitting the scaler on the test set would not match what the models were trained on.
X_test_scaled = StandardScaler().fit(train_features.values).transform(test_features.values)

N_WEIGHTS=20
thresholds=[(j+1)/10 for j in range(5)]

## A fitted model depends only on the class weight, not on the decision threshold, so each of
## the 20 models is trained once and its test probabilities are reused for every threshold.
test_probabilities=[]
for i in range(N_WEIGHTS):
    xgb=XGBClassifier(learning_rate= 0.01, max_depth= 70, n_estimators= 700, scale_pos_weight = 4**i)
    xgb.fit(X,y_train)
    test_probabilities.append(xgb.predict_proba(X_test_scaled)[:,1])
    print(i)

for threshold in thresholds:
    lift_scores2=np.zeros(N_WEIGHTS)
    recall_scores2=np.zeros(N_WEIGHTS)
    for i, y_prob in enumerate(test_probabilities):
        y_pred = 1*(y_prob >= threshold)
        recall = recall_score(y_test, y_pred)
        ## labels=[0,1] keeps the matrix 2x2 even if only one class occurs
        cm = confusion_matrix(y_test, y_pred, labels=[0,1])
        tn, fp, fn, tp = cm.ravel()
        ## predicted-positive rate: share of all test rows flagged as fraud
        ppr=(tp+fp)/(tn+fp+fn+tp)
        ## lift is undefined when nothing is flagged; store NaN explicitly
        lift=recall/ppr if ppr>0 else np.nan
        recall_scores2[i]=recall
        lift_scores2[i]=lift
    plt.plot(recall_scores2, lift_scores2,label=('XGBoost with threshold='+str(threshold)))
## Logistic-regression curve (recall_scores / lift_scores come from the logistic-regression cell)
plt.plot(recall_scores, lift_scores,label='Logistic Regression')
## Lift of 1 corresponds to flagging transactions at random
plt.plot(recall_scores, np.ones(len(recall_scores)),label='Baseline')
plt.legend()
plt.xlabel("Recall")
plt.ylabel("Lift")