In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import networkx as nx #Requires 'pip install networkx'
import community as community_louvain  # Requires 'pip install python-louvain'


In [2]:
# Read the raw payment transactions: comma-separated, column names in the first row.
df = pd.read_csv('fraud_payment_data', header=0, sep=',')
df

Unnamed: 0,Time_step,Transaction_Id,Sender_Id,Sender_Account,Sender_Country,Sender_Sector,Sender_lob,Bene_Id,Bene_Account,Bene_Country,USD_amount,Label,Transaction_Type
0,2022-03-15 10:24:00,EXCHANGE-10115,JPMC-CLIENT-10098,ACCOUNT-10108,USA,35537.0,CCB,,,,558.43,0,WITHDRAWAL
1,2022-03-15 10:24:00,QUICK-PAYMENT-10116,JPMC-CLIENT-10098,ACCOUNT-10109,USA,15287.0,CCB,CLIENT-10100,ACCOUNT-10106,CANADA,622.78,0,QUICK-PAYMENT
2,2022-03-15 10:24:00,DEPOSIT-CASH-9833,,,,,,JPMC-CLIENT-9812,ACCOUNT-9826,USA,802.54,0,DEPOSIT-CASH
3,2022-03-15 10:24:00,PAY-CHECK-9832,JPMC-CLIENT-9812,ACCOUNT-9825,USA,38145.0,CCB,JPMC-CLIENT-9814,ACCOUNT-9824,USA,989.09,0,PAY-CHECK
4,2022-03-15 10:24:00,DEPOSIT-CHECK-9806,,,,,,JPMC-CLIENT-9789,ACCOUNT-9800,USA,786.78,0,DEPOSIT-CHECK
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1498172,2054-09-14 13:17:24,PAY-BILL-2180018,JPMC-CLIENT-2179746,ACCOUNT-2179757,USA,24701.0,CCB,BILL-COMPANY-2179895,ACCOUNT-2179896,USA,159.64,0,MAKE-PAYMENT
1498173,2054-09-15 00:57:24,QUICK-PAYMENT-2181421,JPMC-CLIENT-2181147,ACCOUNT-2181158,USA,4953.0,CCB,CLIENT-2181279,ACCOUNT-2181280,USA,120.07,0,QUICK-PAYMENT
1498174,2054-09-15 01:17:24,PAYMENT-2180021,JPMC-CLIENT-2179746,ACCOUNT-2179757,USA,34784.0,CCB,BILL-COMPANY-2180022,ACCOUNT-2180023,USA,25.32,0,MAKE-PAYMENT
1498175,2054-09-15 12:57:24,QUICK-PAYMENT-2181422,JPMC-CLIENT-2181147,ACCOUNT-2181158,USA,23822.0,CCB,CLIENT-2181279,ACCOUNT-2181280,USA,610.91,0,QUICK-PAYMENT


In [3]:
# One pass over the raw table:
#   * parse 'Time_step' into a proper datetime column called 'timestamp',
#   * drop the redundant 'Time_step' and 'Sender_lob' columns,
#   * keep only transactions with a positive amount (per the original note, the
#     zero-dollar transactions were never fraudulent),
#   * renumber the rows from zero.
df = (
    df.assign(timestamp=pd.to_datetime(df['Time_step']))
    .drop(columns=['Time_step', 'Sender_lob'])
    .loc[lambda frame: frame['USD_amount'] > 0]
    .reset_index(drop=True)
)
df

Unnamed: 0,Transaction_Id,Sender_Id,Sender_Account,Sender_Country,Sender_Sector,Bene_Id,Bene_Account,Bene_Country,USD_amount,Label,Transaction_Type,timestamp
0,EXCHANGE-10115,JPMC-CLIENT-10098,ACCOUNT-10108,USA,35537.0,,,,558.43,0,WITHDRAWAL,2022-03-15 10:24:00
1,QUICK-PAYMENT-10116,JPMC-CLIENT-10098,ACCOUNT-10109,USA,15287.0,CLIENT-10100,ACCOUNT-10106,CANADA,622.78,0,QUICK-PAYMENT,2022-03-15 10:24:00
2,DEPOSIT-CASH-9833,,,,,JPMC-CLIENT-9812,ACCOUNT-9826,USA,802.54,0,DEPOSIT-CASH,2022-03-15 10:24:00
3,PAY-CHECK-9832,JPMC-CLIENT-9812,ACCOUNT-9825,USA,38145.0,JPMC-CLIENT-9814,ACCOUNT-9824,USA,989.09,0,PAY-CHECK,2022-03-15 10:24:00
4,DEPOSIT-CHECK-9806,,,,,JPMC-CLIENT-9789,ACCOUNT-9800,USA,786.78,0,DEPOSIT-CHECK,2022-03-15 10:24:00
...,...,...,...,...,...,...,...,...,...,...,...,...
1498150,PAY-BILL-2180018,JPMC-CLIENT-2179746,ACCOUNT-2179757,USA,24701.0,BILL-COMPANY-2179895,ACCOUNT-2179896,USA,159.64,0,MAKE-PAYMENT,2054-09-14 13:17:24
1498151,QUICK-PAYMENT-2181421,JPMC-CLIENT-2181147,ACCOUNT-2181158,USA,4953.0,CLIENT-2181279,ACCOUNT-2181280,USA,120.07,0,QUICK-PAYMENT,2054-09-15 00:57:24
1498152,PAYMENT-2180021,JPMC-CLIENT-2179746,ACCOUNT-2179757,USA,34784.0,BILL-COMPANY-2180022,ACCOUNT-2180023,USA,25.32,0,MAKE-PAYMENT,2054-09-15 01:17:24
1498153,QUICK-PAYMENT-2181422,JPMC-CLIENT-2181147,ACCOUNT-2181158,USA,23822.0,CLIENT-2181279,ACCOUNT-2181280,USA,610.91,0,QUICK-PAYMENT,2054-09-15 12:57:24


In [4]:
# Missing sender sectors get the sentinel -1. A transaction with an account on
# only one side (the other side is NaN) becomes a self-transfer: the missing
# side is filled with the account from the populated side.
df = df.assign(
    Sender_Sector=df['Sender_Sector'].fillna(-1),
    Sender_Account=df['Sender_Account'].fillna(df['Bene_Account']),
    Bene_Account=df['Bene_Account'].fillna(df['Sender_Account']),
)

In [5]:
# Split by row order: roughly the first 70% of rows are the training set, and the
# remaining ~30% is halved into a validation set (~15%) and a test set (~15%).
cutoff = round(len(df) * 0.7)
df_train, not_train = df.iloc[:cutoff], df.iloc[cutoff:]

cutoff2 = round(len(not_train) * 0.5)
df_val, df_test = not_train.iloc[:cutoff2], not_train.iloc[cutoff2:]

In [17]:
def _hops_to_nearest_fraud(graph, fraud_nodes, max_hops):
    """Map every node within ``max_hops`` of a fraud node to its hop distance.

    A single breadth-first search is started from all fraud nodes at once and
    walks edges *backwards* (via predecessors) on a directed graph, so the value
    stored for a node is the length of the shortest forward path from that node
    to any fraud node. On an undirected graph plain neighbours are used.

    Nodes farther than ``max_hops`` away (or unable to reach fraud at all) are
    absent from the returned dict. Fraud nodes that are not in ``graph`` are
    ignored.
    """
    from collections import deque  # local import keeps this cell self-contained

    walk_backwards = graph.is_directed()
    hops = {node: 0 for node in fraud_nodes if node in graph}
    frontier = deque(hops)
    while frontier:
        current = frontier.popleft()
        if hops[current] >= max_hops:
            continue  # hop limit reached; do not expand any further
        upstream = graph.predecessors(current) if walk_backwards else graph.neighbors(current)
        for neighbour in upstream:
            if neighbour not in hops:
                hops[neighbour] = hops[current] + 1
                frontier.append(neighbour)
    return hops


def _louvain_partition(transaction_graph, max_hops, random_state):
    """Run Louvain community detection on a simple, positively weighted graph.

    python-louvain expects an undirected graph whose weights are non-negative,
    so the transaction multigraph is collapsed first: parallel and opposite
    edges between the same two accounts are merged and their weights summed.
    Each transaction contributes an affinity of ``1 / (1 + proximity)``; a
    transaction with the ``-1`` "no fraud within reach" sentinel is treated as
    lying one hop beyond ``max_hops``.

    Returns the ``{account: community_id}`` dict from ``best_partition``.
    """
    far_affinity = 1.0 / (max_hops + 2.0)

    simple_graph = nx.Graph()
    simple_graph.add_nodes_from(transaction_graph.nodes())
    for u, v, proximity in transaction_graph.edges(data='proximity_to_fraud'):
        affinity = 1.0 / (1.0 + proximity) if proximity >= 0 else far_affinity
        if simple_graph.has_edge(u, v):
            simple_graph[u][v]['weight'] += affinity
        else:
            simple_graph.add_edge(u, v, weight=affinity)

    # Only forward the seed when one was requested, so the call is unchanged otherwise.
    seed_kwargs = {} if random_state is None else {'random_state': random_state}
    return community_louvain.best_partition(simple_graph, weight='weight', **seed_kwargs)


def dataframe_feature_engineerer(graph, dataframe, max_hops=5, random_state=None):
    """Return a copy of ``dataframe`` augmented with graph-derived features.

    Parameters
    ----------
    graph : networkx graph
        Transaction graph whose nodes are accounts. Every edge must carry a
        'Transaction_Id' attribute; it is used to map edge scores back to rows.
    dataframe : pandas.DataFrame
        Transactions with at least the columns 'Transaction_Id',
        'Sender_Account', 'Bene_Account', 'USD_amount', 'Label' and 'timestamp'.
        The caller's frame is not modified.
    max_hops : int, default 5
        Maximum number of hops searched when measuring distance to fraud.
    random_state : int or None, default None
        Optional seed forwarded to the Louvain algorithm for reproducibility.

    Returns
    -------
    pandas.DataFrame
        The input rows plus: 'proximity_to_fraud', 'community_id_sender',
        'community_id_beneficiary', 'same_community',
        'community_fraud_rate_sender', 'community_fraud_rate_beneficiary',
        'Sender_'/'Bene_' prefixed 'in_degree', 'out_degree', 'pagerank' and
        'is_known_fraud', and 'First_Number'.

    Notes
    -----
    NOTE(review): several features ('proximity_to_fraud', the community fraud
    rates, 'is_known_fraud') are computed from 'Label' of the very frame being
    augmented, so on validation/test data they expose the target -- confirm
    this is intended before using them for model evaluation.
    """
    # Work on a private copy: the caller usually passes a slice of a larger frame,
    # and writing new columns into it would mutate (or warn about) the original.
    dataframe = dataframe.copy()

    ## ---- Proximity to fraud ----------------------------------------------

    # Accounts that appear on either side of a transaction labelled as fraud.
    fraud_rows = dataframe.loc[dataframe['Label'] == 1]
    fraudulent_nodes = set(fraud_rows['Sender_Account']).union(fraud_rows['Bene_Account'])

    distance_to_fraud = _hops_to_nearest_fraud(graph, fraudulent_nodes, max_hops)

    # Score each transaction (edge) by the smaller of its two endpoint distances.
    # An endpoint with no fraud within reach counts as -1, and because -1 is below
    # every real distance, min() makes the whole edge -1 as soon as either endpoint
    # is unreachable (same rule as the previous explicit "either is -1" branch).
    # NOTE(review): this discards the distance of the reachable endpoint -- confirm
    # that is the intended definition.
    proximity_scores = {}
    for u, v, data in graph.edges(data=True):
        proximity_scores[data['Transaction_Id']] = min(
            distance_to_fraud.get(u, -1),
            distance_to_fraud.get(v, -1),
        )

    # Rows whose transaction is not an edge of ``graph`` also receive -1.
    dataframe['proximity_to_fraud'] = (
        dataframe['Transaction_Id'].map(proximity_scores).fillna(-1)
    )

    # Rebuild the graph from the frame so 'proximity_to_fraud' is an edge attribute.
    new_graph = nx.from_pandas_edgelist(
        dataframe,
        source='Sender_Account',
        target='Bene_Account',
        edge_attr=['USD_amount', 'Label', 'timestamp', 'Transaction_Id', 'proximity_to_fraud'],
        create_using=nx.MultiDiGraph(),
    )

    ## ---- Louvain communities ---------------------------------------------

    partitions = _louvain_partition(new_graph, max_hops, random_state)

    # Look up the community of each side of every transaction (NaN if unknown).
    dataframe['community_id_sender'] = dataframe['Sender_Account'].map(partitions)
    dataframe['community_id_beneficiary'] = dataframe['Bene_Account'].map(partitions)

    dataframe['same_community'] = (
        dataframe['community_id_sender'] == dataframe['community_id_beneficiary']
    ).astype(int)

    # Fraud rate per community, measured over the transactions *sent* from it,
    # then attached to both the sender's and the beneficiary's community.
    community_fraud_rate = dataframe.groupby('community_id_sender')['Label'].mean().to_dict()
    dataframe['community_fraud_rate_sender'] = dataframe['community_id_sender'].map(community_fraud_rate)
    dataframe['community_fraud_rate_beneficiary'] = dataframe['community_id_beneficiary'].map(community_fraud_rate)

    ## ---- Account-level features ------------------------------------------

    # Flag for accounts known to have taken part in a fraudulent transaction.
    df_known_fraud = pd.DataFrame(list(fraudulent_nodes), columns=['Account']).assign(is_known_fraud=1)

    # Number of transactions going into / out of each account.
    df_in_degree = pd.DataFrame(new_graph.in_degree(), columns=['Account', 'in_degree'])
    df_out_degree = pd.DataFrame(new_graph.out_degree(), columns=['Account', 'out_degree'])

    # PageRank over incoming links, weighted by the transferred amount.
    df_pagerank = pd.DataFrame(
        nx.pagerank(new_graph, weight='USD_amount').items(),
        columns=['Account', 'pagerank'],
    )

    account_features = (
        df_in_degree
        .merge(df_out_degree, on='Account', how='outer')
        .merge(df_pagerank, on='Account', how='outer')
        .merge(df_known_fraud, on='Account', how='left')
        .fillna(0)  # accounts lacking a feature (e.g. never flagged as fraud) get 0
    )

    # Prefixing also renames 'Account' to 'Sender_Account' / 'Bene_Account',
    # which are exactly the join keys used below.
    account_features_sender = account_features.add_prefix('Sender_')
    account_features_benefactor = account_features.add_prefix('Bene_')

    ## ---- Merge everything back onto the transactions ---------------------

    dataframe = dataframe.merge(account_features_sender, on='Sender_Account', how='left')
    new_dataframe = dataframe.merge(account_features_benefactor, on='Bene_Account', how='left')

    # First character of the amount written as text (this is '0' for amounts below 1).
    new_dataframe['First_Number'] = new_dataframe['USD_amount'].astype(str).str[0]

    return new_dataframe



In [None]:
import pickle  # part of the Python standard library -- nothing to pip-install

# Load the pre-built training transaction graph, derive the engineered features
# for the training rows and save the augmented table.
# NOTE(review): pickle.load can execute arbitrary code; only load graph files you created yourself.
with open('Graph_finale_train.pickle', 'rb') as f:
    G_train = pickle.load(f)

aug_df_train = dataframe_feature_engineerer(G_train, df_train)
aug_df_train.to_csv('augmented_train.csv', index=False)

In [None]:
import pickle  # part of the Python standard library -- nothing to pip-install

# Load the pre-built validation transaction graph, derive the engineered features
# for the validation rows and save the augmented table.
# NOTE(review): pickle.load can execute arbitrary code; only load graph files you created yourself.
with open('Graph_finale_val.pickle', 'rb') as f:
    G_val = pickle.load(f)

aug_df_val = dataframe_feature_engineerer(G_val, df_val)
aug_df_val.to_csv('augmented_val.csv', index=False)

In [None]:
import pickle  # part of the Python standard library -- nothing to pip-install

# Load the pre-built test transaction graph, derive the engineered features
# for the test rows and save the augmented table.
# NOTE(review): pickle.load can execute arbitrary code; only load graph files you created yourself.
with open('Graph_finale_test.pickle', 'rb') as f:
    G_test = pickle.load(f)

aug_df_test = dataframe_feature_engineerer(G_test, df_test)
aug_df_test.to_csv('augmented_test.csv', index=False)