In [None]:
import os

import community as community_louvain
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
# Location of the raw transactions file. Set the FRAUD_DATA_PATH environment
# variable to read it from somewhere else; the fallback is the path this
# notebook was originally developed against.
DATA_PATH = os.environ.get(
    'FRAUD_DATA_PATH',
    '/Users/noimotbakare/Dropbox/Fraud_Payments/data/fraud_payment_data',
)
df = pd.read_csv(DATA_PATH, sep=',', header=0)


In [None]:
# Parse Time_step into a datetime column called `timestamp`, then drop the raw
# Time_step column, which is redundant once parsed.
# Sender_lob and Sender_Sector are deliberately kept.
df = (
    df
    .assign(timestamp=pd.to_datetime(df['Time_step']))
    .drop(columns='Time_step')
)

# Zero-dollar transactions are intentionally NOT filtered out.
# Many card-fraud datasets contain zero-amount transactions, which are merchants
# verifying that a card is active and functional. None of them are fraudulent here,
# but they are part of the regular transaction pattern, and removing them would
# eliminate a piece of normal customer behaviour that the model needs to see.

In [None]:
# Features that do not rely on fitted statistics, so they can be engineered
# before the train/validation/test split.

df['USD_amount'] = pd.to_numeric(df['USD_amount'], errors='coerce')
amount_text = df['USD_amount'].astype(str).str.strip()

# First digit: the leading character of the amount's text form.
first_digit = amount_text.str[0]

# Last digit: the final character of whatever follows the first '.' in the
# amount's text form.
last_digit_after_dec = amount_text.str.split('.').str[1].str[-1]

# Keep only rows where both characters are digits. `.eq(True)` turns a missing
# result (e.g. text with no '.', such as scientific notation) into False, so the
# row is dropped instead of the boolean mask raising on NaN.
has_both_digits = (
    first_digit.str.isdigit().eq(True)
    & last_digit_after_dec.str.isdigit().eq(True)
)

# .loc + .assign builds a fresh frame, which avoids SettingWithCopyWarning from
# assigning new columns onto a filtered slice.
df = df.loc[has_both_digits].assign(
    first_digit=first_digit[has_both_digits].astype(int),
    last_digit_after_dec=last_digit_after_dec[has_both_digits].astype(int),
)

In [None]:
# Positional split: the first 70% of rows are training data, the next 15% the
# validation set and the final 15% the test set. The three parts are disjoint.
# NOTE(review): rows are split in file order - confirm the file is sorted by time
# if a chronological split is intended.
cutoff = round(0.7 * len(df))
df_train = df.iloc[:cutoff].copy()
not_train = df.iloc[cutoff:]

# Halve the hold-out rows (not the training rows) into validation and test.
cutoff2 = round(0.5 * len(not_train))
df_val = not_train.iloc[:cutoff2].copy()
df_test = not_train.iloc[cutoff2:].copy()

In [None]:
# Sanity check: the share of fraudulent vs. legitimate transactions should be
# comparable across the three splits.
split_reports = [
    ('The distribution of fraud for the train data is:\n', df_train),
    ('The distribution of fraud for the validation set is:\n', df_val),
    ('The distribution of fraud for the test set is:\n', df_test),
]
for banner, split_frame in split_reports:
    print(banner, split_frame['Label'].value_counts(normalize=True))

In [None]:
# Preview the first rows only instead of echoing the whole training frame.
df_train.head()

I am running the graph on only the training df. 
I use an undirected graph for clustering because most clustering measures (such as nx.clustering) are defined for undirected graphs and treat all connections as bidirectional. The clustering coefficient measures how well connected a node’s neighbors are to each other (triangles).

In [None]:

def calculate_all_graph_maps(df_train, random_state=42):
    """Compute per-account graph features from a transactions frame.

    A directed graph is built with an edge from ``Sender_Account`` to
    ``Bene_Account`` for every row, carrying ``USD_amount`` as an edge attribute.
    An undirected copy of that graph is used for clustering and Louvain.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain ``Sender_Account``, ``Bene_Account`` and ``USD_amount``.
        Pass the training split only, so no information from the other splits
        reaches the features.
    random_state : int or None, default=42
        Seed forwarded to both Louvain runs so that repeated calls return the
        same communities. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    dict
        ``'pagerank'``, ``'in_degree'``, ``'out_degree'`` and ``'clustering'``
        map to pandas Series indexed by account. ``'louvain_unweighted'`` and
        ``'louvain_weighted'`` map to ``{account: community_id}`` dicts.
    """
    # NOTE(review): a DiGraph holds a single edge per (sender, beneficiary) pair,
    # so repeated transactions between the same pair presumably collapse into one
    # edge with a single USD_amount - confirm this is the intended weighting.
    G_dir = nx.from_pandas_edgelist(
        df_train,
        source='Sender_Account',
        target='Bene_Account',
        edge_attr='USD_amount',
        create_using=nx.DiGraph()
    )

    # Clustering and Louvain are run on the undirected version of the graph.
    G_undir = G_dir.to_undirected()

    # --- Directed features ---
    in_degree_map = pd.Series(dict(G_dir.in_degree()), name='in_degree')
    out_degree_map = pd.Series(dict(G_dir.out_degree()), name='out_degree')
    pagerank_map = pd.Series(nx.pagerank(G_dir, weight='USD_amount'), name='pagerank')

    # --- Undirected features ---
    clustering_map = pd.Series(nx.clustering(G_undir), name='clustering_coeff')

    # Unweighted Louvain: no weight attribute is named, so USD_amount is not used.
    louvain_unweighted = community_louvain.best_partition(
        G_undir, random_state=random_state
    )

    # Weighted Louvain: edges are weighted by USD_amount.
    louvain_weighted = community_louvain.best_partition(
        G_undir, weight='USD_amount', random_state=random_state
    )

    return {
        # Directed features
        'pagerank': pagerank_map,
        'in_degree': in_degree_map,
        'out_degree': out_degree_map,

        # Undirected features
        'clustering': clustering_map,
        'louvain_unweighted': louvain_unweighted,
        'louvain_weighted': louvain_weighted,
    }


# Build every graph lookup table once, using the training split only.
GLOBAL_GRAPH_MAPS = calculate_all_graph_maps(df_train) 


Louvain Community
I compute Louvain Community ID for weighted and unweighted. 

Unweighted (reach and connectivity) treats all connections equally, regardless of the number of transactions or amounts involved. Edge values are 0 or 1 ( edge exist or doesn't). Based on the count of edges Q(modularity) is calculated to measure the difference between the ACTUAL number of edges within a community and the EXPECTED number of edges if the connections were random. 

As a result, a single transaction between two nodes has the same influence on the community structure as 1,000 transactions. The unweighted approach focuses purely on reach and answers the question: WHO IS CONNECTED TO WHOM?

Weighted (strength and volume) Louvain is often preferred for transaction data. The weighted approach incorporates an edge attribute, which allows the algorithm to prioritize stronger, more frequent, or higher-value relationships.

Such that stronger influences come from connections with higher weight. For example: two accounts that transfer a $1,000,000 are much more likely to be grouped into the same community than two accounts that transferred a much lower amount ($10). 


Weights are important because unweighted Louvain might group a high-volume, low-value fraud ring with a legitimate large institution just because they share one tiny link with a random neighbor.

Weighted will strongly group accounts that have large or numerous transactions, correctly identifying them as a high-flow module. This allows the algorithm to distinguish a high-value money laundering ring (strong weight) from incidental, low-value connections (weak weight).

Because weighted (volume) and unweighted (reach) communities capture fundamentally different aspects of the network structure, it is good practice to INCLUDE BOTH.

Robustness -  If an attacker tries to hide fraudulent activity by using very low amounts (thus minimizing the Weighted influence), the Unweighted feature might still flag the connection based on the existence of the link.

In [None]:

# GLOBAL_GRAPH_MAPS is built in the cell that defines calculate_all_graph_maps.
# It is reused here rather than recomputed: rebuilding the graph and re-running
# Louvain is expensive and adds nothing.

# Map each sender's community ids back onto the training frame (-1 = no community).
# .assign returns a new frame, so nothing is written onto a slice of `df`.
df_train = df_train.assign(
    louvain_unweighted=df_train['Sender_Account']
    .map(GLOBAL_GRAPH_MAPS['louvain_unweighted'])
    .fillna(-1),
    louvain_weighted=df_train['Sender_Account']
    .map(GLOBAL_GRAPH_MAPS['louvain_weighted'])
    .fillna(-1),
)


In [None]:
# Show how many accounts each lookup table covers. Echoing the tables themselves
# prints one entry per account for every feature.
{feature: len(lookup) for feature, lookup in GLOBAL_GRAPH_MAPS.items()}

In [None]:
# The ten communities with the highest fraud rate (sorted from highest to lowest),
# for each Louvain variant.
for banner, community_col in (
    ("Unweighted Louvain Fraud Rates:", 'louvain_unweighted'),
    ("\nWeighted Louvain Fraud Rates:", 'louvain_weighted'),
):
    fraud_rate_by_community = df_train.groupby(community_col)['Label'].mean()
    print(banner)
    print(fraud_rate_by_community.sort_values(ascending=False).head(10))



Total communities
5669 unweighted 
5705 weighted 
Weighted Louvain slightly splits more communities, meaning transaction volume influences clustering.
Note to self: these values change on every run; add a random state for reproducibility.

Communities with ≥1 fraud
3180 (56%)
3166 (55%)
In both approaches, more than half of all communities contain fraud — fraud is not isolated to just a few clusters, but community detection still helps identify where it is concentrated. This suggests transaction amount influences structure, but fraud is not strictly driven by high-value connections only.

Again, we should keep both features because they capture different fraud mechanisms.

Over 55% of communities have fraud → fraud is not isolated.
This implies fraud is diffused but structured.
So the real value will come from how fraud concentrates (fraud rate) rather than just where it exists.

In [None]:
# How many distinct communities each Louvain variant produced.
n_unweighted = df_train['louvain_unweighted'].nunique()
n_weighted = df_train['louvain_weighted'].nunique()
print("Unique unweighted communities:", n_unweighted)
print("Unique weighted communities:", n_weighted)

# A community counts as containing fraud when the sum of its labels is positive.
fraud_total_unweighted = df_train.groupby('louvain_unweighted')['Label'].sum()
fraud_total_weighted = df_train.groupby('louvain_weighted')['Label'].sum()
print("Communities with at least one fraud (unweighted):", fraud_total_unweighted.gt(0).sum())
print("Communities with at least one fraud (weighted):", fraud_total_weighted.gt(0).sum())


In [None]:
# Preview the first rows only instead of echoing the whole training frame.
df_train.head()

Weighted Louvain looks at communities (groups of accounts that interact frequently). The plot below then tells us how big each community is and what fraction of the transactions in that community are fraudulent.
The X axis represents community size: the lower end holds small communities, while the higher end holds large communities (known as the "whale" group).
The Y axis represents the fraud rate.
The high-risk communities are the points near (0, 1.0): very small communities with a 100% fraud rate, indicative of tight-knit fraud rings or mule-account clusters (a "mule" is an individual recruited by fraudsters to move illegally obtained funds through their own account).
The majority of the communities (the normal user population) are clustered around (0, 0): these are small communities with a zero or near-zero fraud rate. They are typical users with low fraud risk.
Then there is another outlier:
the point at a community size of about 24,000 with a ~1% fraud rate. This is likely the core of the legitimate user base: a lot of activity and very little fraud.
Basically, this tells us that fraud happens in clusters. And features that identify the clusters have an advantage over the individual-level
 features.  

In [None]:
# Weighted community stats.
# community_size counts the rows of df_train in each community (one row per
# transaction), so the x-axis is labelled in transactions rather than accounts.
community_stats_weighted = df_train.groupby('louvain_weighted').agg(
    community_size=('Sender_Account', 'count'),
    fraud_rate=('Label', 'mean')
).reset_index()

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    community_stats_weighted['community_size'],
    community_stats_weighted['fraud_rate'],
    alpha=0.5
)
ax.set_title("Weighted Louvain: Community Size vs Fraud Rate")
ax.set_xlabel("Community Size (# of Transactions)")
ax.set_ylabel("Fraud Rate")
ax.grid(True)
plt.show()


Unweighted Louvain treats every connection between accounts equally: it only cares that a connection exists, not how much or how often money moves. The unweighted method still isolates small, tightly connected fraud communities, but it only tells us that the accounts are closely linked. It does not break communities down as granularly as the weighted version (weight = USD amount). You can see that the largest community in the unweighted partition is about 14,000, whereas in the weighted partition it is about 24,000.


Both plots tell a similar story, but the weighted community structure reflects real financial behavior and gives a sharper segmentation of high-risk groups. We should therefore use both in our model.

In [None]:
# Unweighted community stats.
# community_size counts the rows of df_train in each community (one row per
# transaction), so the x-axis is labelled in transactions rather than accounts.
community_stats_unweighted = df_train.groupby('louvain_unweighted').agg(
    community_size=('Sender_Account', 'count'),
    fraud_rate=('Label', 'mean')
).reset_index()

fig, ax = plt.subplots(figsize=(10, 6))

# Scatter of community size against fraud rate for the unweighted partition
ax.scatter(
    community_stats_unweighted['community_size'],
    community_stats_unweighted['fraud_rate'],
    alpha=0.5
)
ax.set_title("Unweighted Louvain: Community Size vs Fraud Rate")
ax.set_xlabel("Community Size (# of Transactions)")
ax.set_ylabel("Fraud Rate")
ax.grid(True)

# Save the plot next to the notebook
fig.savefig('unweighted_louvain_community_size_vs_fraud_rate.png')

I want to calculate the community fraud rate, the sender community fraud rate and the Beneficiary Community Fraud Rate 
 
 I am going to use Bayesian smoothing to handle communities with very few transactions. The purpose of Bayesian smoothing is to stabilize the fraud rates of small communities by blending them with the global average fraud rate. In essence, this prevents a tiny community with 1 fraud out of 1 transaction from being assigned a perfect rate (100%), whereas a large community with, say, 100,000 transactions of which 1,000 are fraudulent gets a fraud rate of 1%. Bayesian smoothing will pull the extreme rate of the small community towards the global average to give it a more realistic estimate. Benefits: stability, fairness, less overfitting, and better model performance (it generalizes better to unseen data).

In [None]:
#Community Fraud Rate

# Per weighted community: number of rows (transactions), raw fraud rate and the
# number of fraudulent rows. The fraud count is summed directly rather than
# rebuilt as size * rate, which avoids floating-point round-off.
community_stats_weighted = (
    df_train.groupby('louvain_weighted')
    .agg(
        community_size=('Sender_Account', 'count'),
        Raw_Fraud_Rate=('Label', 'mean'),
        fraud_tx_count=('Label', 'sum'),
    )
    .reset_index()
)

#Overall fraud count and total transaction count
global_fraud_count = df_train['Label'].sum()
global_total_tx = len(df_train)

# Global average fraud rate: the prior every community is pulled towards
global_avg_rate = global_fraud_count / global_total_tx

#25th percentile of community sizes as the smoothing strength.
#Communities smaller than this are smoothed heavily towards the global rate.
alpha = community_stats_weighted['community_size'].quantile(0.25)

#Bayesian smoothing: (alpha * prior + fraud count) / (alpha + community size)
community_stats_weighted['Smoothed_Fraud_Rate'] = (
    (alpha * global_avg_rate) + community_stats_weighted['fraud_tx_count']
) / (
    alpha + community_stats_weighted['community_size']
)

print("\n--- Smoothed Weighted Community Fraud Rates (Top 10) ---")
print(community_stats_weighted[['louvain_weighted', 'community_size', 'Raw_Fraud_Rate', 'Smoothed_Fraud_Rate']].sort_values(by='Smoothed_Fraud_Rate', ascending=False).head(10))

In [None]:


class FraudFeatureEngineer(BaseEstimator, TransformerMixin):
    """Learn lookup tables in ``fit`` and apply them as new columns in ``transform``.

    All statistics are learned from the frame passed to ``fit`` only, so
    target-based features do not leak when ``fit`` is called on the training split.

    Parameters
    ----------
    add_transaction_features : bool, default=True
        Learn/apply per-account QUICK-PAYMENT amount statistics, the fraud ratio
        per ``Sender_Country`` and the mean time between a sender's transactions.
    add_sector_encoding : bool, default=True
        Learn/apply the mean ``Label`` per ``Sender_Sector`` (target encoding).
    add_graph_features : bool, default=True
        Learn/apply PageRank, in/out degree, clustering coefficient, Louvain
        community ids and a smoothed fraud rate per sender community.
    random_state : int or None, default=42
        Seed forwarded to both Louvain runs so repeated fits produce the same
        communities. Pass ``None`` for unseeded behaviour.

    Notes
    -----
    Columns read by the code: ``Label``, ``Transaction_Type``, ``USD_amount``,
    ``Sender_Account``, ``Bene_Account``, ``Sender_Country``, ``Sender_Sector``
    and ``timestamp`` (a datetime column), depending on the enabled groups.
    """

    def __init__(self, add_transaction_features=True,
                  add_sector_encoding=True, 
                  add_graph_features=True,
                  random_state=42
                  ):
        # Constructor parameters (stored verbatim)
        self.add_transaction_features = add_transaction_features
        self.add_sector_encoding = add_sector_encoding
        self.add_graph_features = add_graph_features
        self.random_state = random_state

        # Lookup tables learned in fit()
        self.sender_count_map = None
        self.sender_mean_map = None
        self.sender_std_map = None
        self.bene_count_map = None
        self.bene_mean_map = None
        self.bene_std_map = None
        self.country_fraud_ratio_map = None
        self.sender_avg_time_map = None

        # Defaults used in transform() for keys that were not seen in fit()
        self.global_country_ratio_default = 0.0
        self.global_time_diff_default = 0.0

        # Sector target encoding. `sector_target_map_` is the attribute that
        # fit() and transform() use; `sector_target_map` is kept only so that
        # existing code reading it does not break.
        self.sector_target_map = None
        self.sector_target_map_ = None
        self.global_fraud_rate_ = 0.0

        # Graph features
        self.pagerank_map = None
        self.in_degree_map = None
        self.out_degree_map = None
        self.clustering_map = None
        self.louvain_unweighted_map = None
        self.louvain_weighted_map = None

        self.community_fraud_rate_map = None
        self.global_community_fraud_rate_ = 0.0

    # =============
    # FIT
    # =============

    def fit(self, X, y=None):
        """Learn every enabled lookup table from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training transactions. Must contain a ``Label`` column unless ``y``
            is supplied.
        y : array-like, optional
            Used as the ``Label`` column when ``X`` does not already have one.

        Returns
        -------
        self
        """
        # Always work on a private copy so the caller's frame is never modified.
        X = X.copy()
        if y is not None and 'Label' not in X.columns:
            X['Label'] = y

        if self.add_transaction_features:
            self._fit_transaction_features(X)
        if self.add_sector_encoding:
            self._fit_sector_encoding(X)
        if self.add_graph_features:
            self._fit_graph_features(X)

        return self

    def _fit_transaction_features(self, X):
        """Learn QUICK-PAYMENT amount stats, country fraud ratio and sender timing."""
        quick = X[X['Transaction_Type'] == 'QUICK-PAYMENT']

        sender_amounts = quick.groupby('Sender_Account')['USD_amount']
        self.sender_count_map = sender_amounts.count()
        self.sender_mean_map = sender_amounts.mean()
        self.sender_std_map = sender_amounts.std()

        bene_amounts = quick.groupby('Bene_Account')['USD_amount']
        self.bene_count_map = bene_amounts.count()
        self.bene_mean_map = bene_amounts.mean()
        self.bene_std_map = bene_amounts.std()

        # Fraud ratio per sender country. Countries without any fraud are
        # re-indexed in with a count of 0 so their ratio is 0 rather than NaN
        # (a NaN would be replaced by the global default in transform()).
        total_count = X.groupby('Sender_Country')['Label'].count()
        fraud_count = (
            X[X['Label'] == 1]
            .groupby('Sender_Country')['Label']
            .count()
            .reindex(total_count.index, fill_value=0)
        )
        self.country_fraud_ratio_map = fraud_count / total_count
        # Default for countries not present in the training data
        self.global_country_ratio_default = fraud_count.sum() / total_count.sum()

        # Mean number of seconds between consecutive transactions of a sender.
        # diff() and dt.total_seconds() need `timestamp` to be a datetime column.
        X_sorted = X.sort_values(['Sender_Account', 'timestamp'])
        X_sorted['time_diff'] = (
            X_sorted.groupby('Sender_Account')['timestamp'].diff().dt.total_seconds()
        )
        self.sender_avg_time_map = X_sorted.groupby('Sender_Account')['time_diff'].mean()
        # Median (for robustness) is the default for senders without a value
        self.global_time_diff_default = self.sender_avg_time_map.median()

    def _fit_sector_encoding(self, X):
        """Learn the mean Label per Sender_Sector (missing sector is coded as -1)."""
        sector = X['Sender_Sector'].fillna(-1)
        self.sector_target_map_ = X['Label'].groupby(sector).mean()
        self.global_fraud_rate_ = X['Label'].mean()

    def _fit_graph_features(self, X):
        """Learn per-account graph features and the smoothed community fraud rate."""
        G_dir = nx.from_pandas_edgelist(
            X, source='Sender_Account', target='Bene_Account',
            edge_attr='USD_amount', create_using=nx.DiGraph()
        )
        G_undir = G_dir.to_undirected()

        # Directed features
        self.in_degree_map = pd.Series(dict(G_dir.in_degree()), name='in_degree')
        self.out_degree_map = pd.Series(dict(G_dir.out_degree()), name='out_degree')
        self.pagerank_map = pd.Series(nx.pagerank(G_dir, weight='USD_amount'), name='pagerank')

        # Undirected features
        self.clustering_map = pd.Series(nx.clustering(G_undir), name='clustering_coeff')
        self.louvain_unweighted_map = community_louvain.best_partition(
            G_undir, random_state=self.random_state
        )
        self.louvain_weighted_map = community_louvain.best_partition(
            G_undir, weight='USD_amount', random_state=self.random_state
        )

        # Weighted community of each row's sender (-1 when the sender has none)
        sender_community = (
            X['Sender_Account']
            .map(self.louvain_weighted_map)
            .fillna(-1)
            .rename('Sender_Community_ID')
        )
        community_stats = X.groupby(sender_community).agg(
            community_size=('Sender_Account', 'count'),
            fraud_tx_count=('Label', 'sum'),
        )

        # The prior: the overall fraud rate of the fitted data
        self.global_community_fraud_rate_ = X['Label'].mean()

        # Smoothing strength: 25th percentile of the community sizes
        alpha = community_stats['community_size'].quantile(0.25)

        # Bayesian smoothing: (alpha * prior + fraud count) / (alpha + size)
        smoothed = (
            (alpha * self.global_community_fraud_rate_) + community_stats['fraud_tx_count']
        ) / (alpha + community_stats['community_size'])
        self.community_fraud_rate_map = smoothed.rename('Smoothed_Fraud_Rate')

    # =============
    # TRANSFORM
    # =============

    @staticmethod
    def _add_account_feature_pair(df, lookup, feature_name, default_val):
        """Add ``Sender_<feature_name>`` and ``Bene_<feature_name>`` columns from ``lookup``."""
        df[f'Sender_{feature_name}'] = df['Sender_Account'].map(lookup).fillna(default_val)
        df[f'Bene_{feature_name}'] = df['Bene_Account'].map(lookup).fillna(default_val)
        return df

    def transform(self, X):
        """Return a copy of ``X`` with every enabled feature column added.

        Keys that were not seen in ``fit`` receive the default learned from the
        fitted data (or 0 / -1 as noted below). The index of ``X`` is preserved.
        """
        X_transformed = X.copy()

        if self.add_transaction_features:
            # (lookup table, key column, new column, default for unseen keys)
            lookups = [
                (self.sender_count_map, 'Sender_Account', 'Sender_quick_count', 0),
                (self.sender_mean_map, 'Sender_Account', 'Sender_quick_mean', 0),
                (self.sender_std_map, 'Sender_Account', 'Sender_quick_std', 0),
                (self.bene_count_map, 'Bene_Account', 'Bene_quick_count', 0),
                (self.bene_mean_map, 'Bene_Account', 'Bene_quick_mean', 0),
                (self.bene_std_map, 'Bene_Account', 'Bene_quick_std', 0),
                (self.country_fraud_ratio_map, 'Sender_Country', 'Country_Fraud_Ratio',
                 self.global_country_ratio_default),
                (self.sender_avg_time_map, 'Sender_Account', 'Sender_Avg_Time_Diff',
                 self.global_time_diff_default),
            ]
            # Series.map keeps the row order and index of X, unlike a merge,
            # which returns a frame with a fresh integer index.
            for lookup, key_col, new_col, default_val in lookups:
                X_transformed[new_col] = X_transformed[key_col].map(lookup).fillna(default_val)

        if self.add_sector_encoding:
            sector = X_transformed['Sender_Sector'].fillna(-1)
            X_transformed['Sender_Sector'] = sector
            X_transformed['Sender_Sector_target_enc'] = (
                sector.map(self.sector_target_map_).fillna(self.global_fraud_rate_)
            )

        if self.add_graph_features:
            # Structural features: accounts absent from the fitted graph get 0
            X_transformed = self._add_account_feature_pair(X_transformed, self.pagerank_map, 'pagerank', 0)
            X_transformed = self._add_account_feature_pair(X_transformed, self.in_degree_map, 'in_degree', 0)
            X_transformed = self._add_account_feature_pair(X_transformed, self.out_degree_map, 'out_degree', 0)
            X_transformed = self._add_account_feature_pair(X_transformed, self.clustering_map, 'clustering_coeff', 0)

            # Weighted Louvain community ids (-1 for accounts absent from the
            # fitted graph); the sender id also keys the community fraud rate
            X_transformed['Sender_Community_ID_W'] = X_transformed['Sender_Account'].map(self.louvain_weighted_map).fillna(-1)
            X_transformed['Bene_Community_ID_W'] = X_transformed['Bene_Account'].map(self.louvain_weighted_map).fillna(-1)

            # Unweighted Louvain community ids (-1 for accounts absent from the fitted graph)
            X_transformed['Sender_Community_ID_UW'] = X_transformed['Sender_Account'].map(self.louvain_unweighted_map).fillna(-1)
            X_transformed['Bene_Community_ID_UW'] = X_transformed['Bene_Account'].map(self.louvain_unweighted_map).fillna(-1)

            # Smoothed fraud rate of the sender's weighted community; unknown
            # communities fall back to the overall fraud rate of the fitted data
            X_transformed['Community_Fraud_Rate'] = (
                X_transformed['Sender_Community_ID_W']
                .map(self.community_fraud_rate_map)
                .fillna(self.global_community_fraud_rate_)
            )

        return X_transformed
    

In [None]:

# Build the transformer with every feature group switched on.
feature_groups = dict(
    add_transaction_features=True,
    add_sector_encoding=True,
    add_graph_features=True,
)
fe = FraudFeatureEngineer(**feature_groups)

# Learn all lookup tables from the training data only.
fe.fit(df_train)

# Apply the same learned feature engineering to the training set, then the test set.
df_train_transformed, df_test_transformed = (
    fe.transform(split) for split in (df_train, df_test)
)

In [None]:
# Manual recomputation of one training-set feature, used to check the
# transformer's numbers: the mean QUICK-PAYMENT amount per sender.

is_quick_payment = df_train['Transaction_Type'] == 'QUICK-PAYMENT'
quick_payments_train = df_train[is_quick_payment]

# Sender -> mean USD_amount over that sender's QUICK-PAYMENT rows, held as a
# one-column frame so it could be merged on Sender_Account.
sender_mean_map = (
    quick_payments_train
    .groupby('Sender_Account')['USD_amount']
    .mean()
    .to_frame(name='Sender_mean_quick_payments')
)

# The left merge of this table onto df_train (on 'Sender_Account') is
# intentionally left disabled; the table is only inspected.

In [None]:
# Show the first ten rows of the manually computed sender-mean table.
sender_mean_map.head(10)