In [None]:
# import the packages we'll use
## For data handling
import pandas as pd
import numpy as np

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns

## Plot styling: use seaborn's "whitegrid" theme, i.e. grid lines
## drawn on a white background, for the figures in this notebook.
sns.set_style(style="whitegrid")

In [None]:
# Load the fraud-payments file into a pandas DataFrame called `df`.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs on a machine where this exact file exists.
fraud_data_path = '/Users/noimotbakare/Dropbox/Fraud_Payments/data/fraud_payment_data'
df = pd.read_csv(fraud_data_path)

In [None]:
# First look at the data (the author's note: ~1.4M observations, 13 features).
rows_and_columns = df.shape
print(rows_and_columns)  # (rows, columns)
leading_rows = df.head()
print(leading_rows)
# Distinct values found in the Transaction_Type column.
unique_transaction_types = pd.unique(df['Transaction_Type'])

Deciding between Sender Id and Sender Account as an identifier 

Most Sender_IDs map to only one Sender_Account (50,333 IDs).
But a significant number (8,738) map to multiple accounts.

one-to-many relationship — meaning a single Sender_ID can be associated with multiple Sender_Account values.

Each Sender_Account maps to exactly 1 Sender_ID

Sender_Account is the more granular identifier, it uniquely identifies an account.
Sender_Id is a higher-level entity identifier, it identifies the customer, who may own many accounts.

For our analysis we will use Sender and Bene Accounts as unique Identifiers.

In [None]:
# Share of rows in which Sender_Id and Sender_Account are both populated.
id_and_account_filled = df[['Sender_Id', 'Sender_Account']].notnull().all(axis=1)
both_present = id_and_account_filled.mean()
print(f"Percentage of rows where BOTH Sender_ID and Sender_Account are present: {both_present * 100:.2f}%")

# For every Sender_Id, count its distinct Sender_Account values, then tally
# how many ids have 1 account, 2 accounts, and so on.
accounts_per_id = df.groupby('Sender_Id')['Sender_Account'].nunique()
id_to_account = accounts_per_id.value_counts()
print("Unique Sender_Account counts per Sender_ID:")
print(id_to_account.head())

# The reverse direction: distinct Sender_Id values per Sender_Account.
ids_per_account = df.groupby('Sender_Account')['Sender_Id'].nunique()
account_to_id = ids_per_account.value_counts()
print("Unique Sender_ID counts per Sender_Account:")
print(account_to_id.head())



Investigating column Sender Sector 
maybe industry classification

These fraud rates by sector for example 21.4%, 21.0% are 10x the base fraud rate of ~2%. That means some sectors are highly concentrated with fraud.
This suggests Sender_Sector is informative and should be kept. 
I also found that it is categorical in nature.
Has many unique values (high cardinality) and shows clear fraud signal (target separation)

Because Sender_Sector is a categorical feature represented with numbers, not a true continuous numeric variable. NaN likely doesn't mean that it's randomly missing. It likely indicates a specific category such as unknown sector, unclassified customer, or some sort of corporate default. We want the model to use the missing info. I use "-1" to create a clear label for missing/unknown category instead of letting the model guess. 



Creates a distinct category - Allows the model to learn if missing sector data is associated with higher or lower fraud risk
Prevents errors in graph/network or encoding steps.
Keeps the value outside real sector codes: -1 is clearly not a valid sector ID and won't be confused with real values.
Missing sector is often itself a predictive feature: in fraud data, missing information is frequently a red flag.

In [None]:
#print(df['Sender_Sector'].value_counts(dropna=False).head(20))
# Fraud rate (mean of Label) for each Sender_Sector, highest first.
# groupby leaves out rows whose Sender_Sector is NaN.
fraud_rate_by_sector = df.groupby('Sender_Sector')['Label'].mean().sort_values(ascending=False)
#print(fraud_rate_by_sector.head(50))
#print(fraud_rate_by_sector.tail(50))

# All sectors whose fraud rate is > 0.
# `fraud_rate_by_sector` is a Series, not a DataFrame, so it is filtered with
# a boolean mask on its own values; indexing it with ['Label'] raises KeyError.
print(fraud_rate_by_sector[fraud_rate_by_sector > 0])

# Disabled experiment: mutual information between Sender_Sector and Label.
# #Checking for correlation
# from sklearn.feature_selection import mutual_info_classif
# import numpy as np

# # Drop NaN for mutual info calculation
# subset = df[['Sender_Sector', 'Label']].dropna()
# mutual_info = mutual_info_classif(subset[['Sender_Sector']], subset['Label'], discrete_features='auto')
# mutual_info




I will calculate average fraud rate for each Sender_Sector, the fraud likelihood by sector (on the training data).

In [None]:
# Manual target encoding of Sender_Sector (kept to compare against a
# library transform).
# Missing sectors become their own category, labelled -1.
df['Sender_Sector'] = df['Sender_Sector'].fillna(-1)

# Mean Label (fraud rate) per sector, and the overall fraud rate as fallback.
# NOTE(review): the map is computed from every row of df, including each
# row's own Label; if this feature feeds a model, consider fitting the map on
# training rows only -- confirm.
sector_target_map = df.groupby('Sender_Sector')['Label'].mean()
global_fraud_rate = df['Label'].mean()

# Map each row to its sector's fraud rate; any sector without a mapped value
# falls back to the global rate. The result is assigned back explicitly:
# `df[col].fillna(..., inplace=True)` operates on a temporary column object
# and is not guaranteed to update df in recent pandas versions.
df['Sender_Sector_target_enc'] = (
    df['Sender_Sector'].map(sector_target_map).fillna(global_fraud_rate)
)

Sender Lob 

In [None]:
# Frequency of each Sender_lob value; dropna=False keeps NaN as its own row.
sender_lob_counts = df['Sender_lob'].value_counts(dropna=False)
sender_lob_counts


In [None]:
# Mean Label (fraud rate) per Sender_lob, highest first.
fraud_rate_by_lob = df.groupby('Sender_lob')['Label'].mean()
fraud_rate_by_lob.sort_values(ascending=False)


In [None]:
# Per Sender_lob: number of labelled rows and their mean Label (fraud rate).
lob_summary = df.groupby('Sender_lob')['Label'].agg(count='count', fraud_rate='mean')
lob_summary


In [None]:
# Cross-tabulate Sender_lob against "Sender_Sector is missing".
# The target-encoding cell replaces missing sectors with the sentinel -1,
# after which .isna() alone is False on every row. Treat both NaN and -1 as
# missing so the table is meaningful whether or not that cell has run.
sector_missing = (
    df['Sender_Sector'].isna() | df['Sender_Sector'].eq(-1)
).rename('Sender_Sector_missing')
pd.crosstab(df['Sender_lob'], sector_missing)


Benford's Law predicts that in many real-world datasets the first digit of a value is not uniformly distributed but follows a logarithmic distribution, $P(d) = \log_{10}(1 + 1/d)$: '1' is the most frequent (30.1%), followed by '2' (17.6%), and so on, with '9' being the least frequent (4.6%). Conversely, the last digit of a transaction is expected to be uniformly distributed, meaning each digit (0 to 9) has roughly an equal chance of appearing, though this last-digit check is a less common application.
 

A sudden jump in the frequency of particular first digits, like 9 or 5, could suggest fraud, for example an attempt to avoid a reporting threshold: a transaction of 9,999 stays just below a reporting threshold of, say, 10,000. US banks must report all transactions that are 10,000 or more.

In [None]:
# ----------------------------------------------------------------------
# Benford's Law: first-digit distribution of USD_amount by fraud Label
# ----------------------------------------------------------------------
df['USD_amount'] = pd.to_numeric(df['USD_amount'], errors='coerce')
#Filter out values less than 1
#df = df[df['USD_amount'] >= 1]

# Text form of every amount. NaN becomes 'nan' and negative values start
# with '-', so "starts with a digit" selects the non-negative numeric amounts.
amount_text = df['USD_amount'].astype(str).str.strip()
has_numeric_amount = amount_text.str[0].str.isdigit()

# As in the original cell, rows without such an amount are removed from df.
# .copy() makes df an independent frame so the column assignment below does
# not raise SettingWithCopyWarning.
df = df[has_numeric_amount].copy()

# Leading *significant* digit: strip leading zeros and the decimal point so
# that 0.05 gives 5 rather than 0. An amount of exactly 0 has no leading
# digit and is stored as <NA>.
leading_char = amount_text[has_numeric_amount].str.lstrip('0.').str[0]
df['first_digit'] = pd.to_numeric(leading_char, errors='coerce').astype('Int64')

# Count how many times each first digit appears per Label.
digit_counts = (
    df[df['first_digit'].notna()]
      .groupby(['first_digit', 'Label'])
      .size()
      .reset_index(name='count')
)
digit_counts['first_digit'] = digit_counts['first_digit'].astype(int)

# Percentage distribution within each Label. transform('sum') returns the
# group total aligned to every row, so each percentage stays on its own row
# (groupby.apply + reset_index(drop=True) can reorder rows by group).
digit_counts['percentage'] = (
    digit_counts['count'] / digit_counts.groupby('Label')['count'].transform('sum') * 100
)

# ----------------------------------------------------------------------
# Add Benford's Law reference
# ----------------------------------------------------------------------
digits = np.arange(1, 10)
benford_prop = np.log10(1 + 1 / digits) * 100
benford_df = pd.DataFrame({'benford_percentage': benford_prop}, index=digits)

# One row per digit 1..9 (digits that never occur get 0%), one column per Label.
plot_df = (
    digit_counts.pivot(index='first_digit', columns='Label', values='percentage')
    .reindex(digits)
    .fillna(0)
)

# Create the plot
fig, ax = plt.subplots(figsize=(10, 7))

# Plot the observed distributions as bars
plot_df.plot(kind='bar', ax=ax, width=0.8, align='center', alpha=0.7)

# Plot the Benford's Law distribution as a line. A pandas bar plot places the
# bars at positions 0..n-1 (not at the digit values), so the line uses the
# same positions to sit on top of the matching bars.
ax.plot(np.arange(len(benford_df)), benford_df['benford_percentage'],
        marker='o', color='red', linestyle='--', linewidth=2, label='Benford\'s Law')

# ----------------------------------------------------------------------
# plot formatting
# ----------------------------------------------------------------------
ax.set_xlabel('First Digit of USD Amount')
ax.set_ylabel('Percentage of Transactions (%)')
ax.set_title('Benford\'s Law Analysis by % of Transaction')
ax.set_xticklabels(plot_df.index.astype(int), rotation=0)
ax.legend(title='Fraud Label')
ax.grid(axis='y', linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()


In [None]:
# Disabled experiment: USD_amount rounded to a whole number.
# df['last_digit__rounded'] = df['USD_amount'].round(0)
# Show the current contents of df.
df

Most common approach - the last digit of the cents (the second decimal place, e.g., the '4' in $12.34). It should be ≈10% for each digit (0-9). Fraudsters sometimes avoid or favor specific cents values (like .00 or .99), distorting this uniform expectation.

Less common - the digit before the decimal (the units place, e.g., the '2' in $12.34). It should also be ≈10% for each digit (0-9). This is less commonly tested on its own because the digits before the decimal are already constrained by Benford's Law, which predicts a non-uniform, logarithmic distribution for those positions.


I don't think it makes sense to do this for the beneficiary because it's the same amount that goes from sender to beneficiary, so it may be redundant.


In [None]:

# Last digit of the cents (second decimal place) of USD_amount, by fraud Label.
#Lastdigit before the decimal
#df['last_digit_before_dec'] = df['USD_amount'].astype(str).str.split('.').str[0].str[-1]

# Compute the digit numerically. The text form of a float drops trailing
# zeros (12.50 -> '12.5'), so taking the last character of the string reports
# 5 instead of 0 for such amounts.
amounts = pd.to_numeric(df['USD_amount'], errors='coerce')
amounts = amounts.where(np.isfinite(amounts))      # inf / NaN -> NaN
last_cent_digit = amounts.abs().mul(100).round() % 10

# Keep only rows that have a usable amount (as the original cell did);
# .copy() avoids SettingWithCopyWarning on the assignment below.
df = df[last_cent_digit.notna()].copy()
df['last_digit_after_dec'] = last_cent_digit.loc[df.index].astype(int)

# Filter
#df = df[df['USD_amount'] >= 1]

# Count how often each last digit appears per fraud label
# (Label: 1 = fraud, 0 = not fraud).
digit_counts1 = (
    df.groupby(['last_digit_after_dec', 'Label'])
      .size()
      .reset_index(name='count')
)

# Percentage within each Label group. transform('sum') keeps each percentage
# aligned with its own row.
digit_counts1['percentage'] = (
    digit_counts1['count'] / digit_counts1.groupby('Label')['count'].transform('sum') * 100
)

# One row per digit 0..9 (absent digits get 0%), one column per Label.
plot_df2 = (
    digit_counts1.pivot(index='last_digit_after_dec', columns='Label', values='percentage')
    .reindex(np.arange(10))
    .fillna(0)
)

# Create the plot
fig, ax = plt.subplots(figsize=(10, 7))
plot_df2.plot(kind='bar', ax=ax, width=0.8, align='center', alpha=0.7)

# Add a uniform distribution reference line
ax.axhline(y=10, color='red', linestyle='--', linewidth=2, label='Expected Uniform (10%)')

ax.set_xlabel('Last Digit of USD Amount (after decimal)')
ax.set_ylabel('Percentage of Transactions (%)')
# The bars are split by the fraud Label, not by Transaction_Type, so the
# title and legend say so.
ax.set_title('Last Digit Distribution by Fraud Label')
ax.set_xticklabels(plot_df2.index.astype(int), rotation=0)
ax.legend(title='Fraud Label')
ax.grid(axis='y', linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()




In [None]:

# Last digit before the decimal point (units place) of USD_amount, by fraud Label.
# The column 'last_digit_before_dec' is not created anywhere else (its
# assignment is commented out in the previous cell), so it is built here.
# It is computed numerically instead of from the text form of the float.
amounts = pd.to_numeric(df['USD_amount'], errors='coerce')
amounts = amounts.where(np.isfinite(amounts))      # inf / NaN -> NaN
units_digit = np.floor(amounts.abs()) % 10

# Work on a separate frame (df3) so df itself is left unchanged;
# .copy() avoids SettingWithCopyWarning on the assignment below.
df3 = df[units_digit.notna()].copy()
df3['last_digit_before_dec'] = units_digit.loc[df3.index].astype(int)

# Filter
#df = df[df['USD_amount'] >= 1]

# Count how often each units digit appears per fraud label
# (Label: 1 = fraud, 0 = not fraud).
digit_counts3 = (
    df3.groupby(['last_digit_before_dec', 'Label'])
      .size()
      .reset_index(name='count')
)

# Percentage within each Label group. transform('sum') keeps each percentage
# aligned with its own row.
digit_counts3['percentage'] = (
    digit_counts3['count'] / digit_counts3.groupby('Label')['count'].transform('sum') * 100
)

# One row per digit 0..9 (absent digits get 0%), one column per Label.
plot_df3 = (
    digit_counts3.pivot(index='last_digit_before_dec', columns='Label', values='percentage')
    .reindex(np.arange(10))
    .fillna(0)
)

# Create the plot
fig, ax = plt.subplots(figsize=(10, 7))
plot_df3.plot(kind='bar', ax=ax, width=0.8, align='center', alpha=0.7)

# Add a uniform distribution reference line
ax.axhline(y=10, color='red', linestyle='--', linewidth=2, label='Expected Uniform (10%)')

ax.set_xlabel('Last Digit of USD Amount (before decimal)')
ax.set_ylabel('Percentage of Transactions (%)')
# The bars are split by the fraud Label, not by Transaction_Type, so the
# title and legend say so.
ax.set_title('Last Digit Distribution by Fraud Label')
ax.set_xticklabels(plot_df3.index.astype(int), rotation=0)
ax.legend(title='Fraud Label')
ax.grid(axis='y', linestyle='--', alpha=0.6)

plt.tight_layout()
plt.show()





Sender_mean_quick_payments -Follows a U shaped pattern. Starting from bin 0 to bin 14, the fraud rate decreases at first, reaches a minimum around bins 6–8, then increases significantly toward the highest bin (14). The highest fraud percentage (25.6%) occurs in the top quantile. As the mean feature increases into the highest range, fraud likelihood increases sharply. The trend is not perfectly monotonic, the extremes show strong separation: low bins have around ~19–20% fraud, but the highest bin jumps to ~26%. Feature may have predictive power.


Sender_std_quick_payments - Fraud rate drops sharply from bin 0 to bin 5 and then spikes again in bins 7–8. The strong separation pattern tells us that variability in transaction amounts is predictive.


In [None]:
# Compare the relative fraud risk of QUICK-PAYMENT transactions across
# senders in a normalized (quantile-binned) way.
quick_payments_df = df[df['Transaction_Type'] == 'QUICK-PAYMENT'].copy()

# Per (Sender_Account, Label) pair: number of quick payments and the mean and
# standard deviation of their USD_amount.
sender_quick_stats = quick_payments_df.groupby(['Sender_Account', 'Label']).agg(
    count=('USD_amount', 'count'),
    mean=('USD_amount', 'mean'),
    std=('USD_amount', 'std')
).reset_index()

# A group with a single transaction has an undefined (NaN) std; treat it as 0.
sender_quick_stats['std'] = sender_quick_stats['std'].fillna(0)

#print(sender_quick_stats.head())

# Quantile-bin each statistic into (at most) `bins` bins; duplicates='drop'
# merges bins whose edges coincide, so fewer bins may come out.
bins = 15
for stat_name in ('count', 'mean', 'std'):
    sender_quick_stats[f'{stat_name}_bin'] = pd.qcut(
        sender_quick_stats[stat_name], bins, labels=False, duplicates='drop'
    )

# Share of each Label within every mean_bin. Each row of sender_quick_stats
# is one (Sender_Account, Label) pair, so these are shares of such pairs,
# expressed as fractions in [0, 1].
# NOTE(review): the axis text below says "% of transactions" -- confirm the
# intended unit and scale.
mean_fraud_dist = sender_quick_stats.groupby(['mean_bin', 'Label']).size()
# transform('sum') keeps the (mean_bin, Label) index intact, so unstack yields
# one row per bin (groupby.apply can prepend a duplicate index level).
mean_bin_totals = mean_fraud_dist.groupby(level='mean_bin').transform('sum')
mean_fraud_percent = (mean_fraud_dist / mean_bin_totals).unstack(fill_value=0)

# Plot
mean_fraud_percent.plot(kind='bar', figsize=(14, 6))
plt.title('Sender Mean Quick Pay by is fraud - % of transactions')
plt.xlabel('Sender Mean Quick Pay')
plt.ylabel('% of transactions')
plt.legend(title='is fraud')
plt.show()

In [None]:
# Share of each Label within every std_bin of sender_quick_stats
# (fractions in [0, 1]; each row there is one (Sender_Account, Label) pair).
std_fraud_dist = sender_quick_stats.groupby(['std_bin', 'Label']).size()
# transform('sum') keeps the (std_bin, Label) index intact, so unstack yields
# one row per bin (groupby.apply can prepend a duplicate index level).
std_bin_totals = std_fraud_dist.groupby(level='std_bin').transform('sum')
std_fraud_percent = (std_fraud_dist / std_bin_totals).unstack(fill_value=0)

# Plot
std_fraud_percent.plot(kind='bar', figsize=(14, 6))
plt.title('Sender STD Quick Pay by is fraud - % of transactions')
plt.xlabel('Sender STD Quick Pay')
plt.ylabel('% of transactions')
plt.legend(title='is fraud')
plt.show()

In [None]:
# # Calculate the total transaction count for each Sender_Country
# Sender_transactions_by_country = df['Sender_Country'].value_counts()
# #Bene_transactions_by_country = df['Bene_Country'].value_counts()
# # Print the result
# print(Sender_transactions_by_country)
# # Print the result
# #print(Bene_transactions_by_country)
# # Calculate the total count of fraud transactions (Label=1) for each Sender_Country
# fraud_transactions_by_country = (
#     df.groupby('Sender_Country')['Label']
#     .sum()  # Sums the '1's (fraudulent transactions) within each country group
#     .sort_values(ascending=False)
# )
# print(fraud_transactions_by_country)


Fraud rate by country 
The Financial Action Task Force (FATF) leads global action to tackle money laundering, terrorist and proliferation financing.
FATF suggest that some countries have higher fraud risk. 


I visualize top 50 offenders fraud rate by country. 

High-Volume Countries - countries that have a sufficient number of transactions, making their calculated rate statistically reliable. They will use their specific calculated country fraud rate.

Low-Volume Countries - countries with only a small sample of transactions, which makes their calculated rate noisy or statistically unreliable. The global average is used to smooth this rate toward the mean (Bayesian averaging), making it more stable and reliable.

Unseen/New Countries - countries that have no historical data on which to base a rate, so the calculated country-specific feature is NaN. The global average serves as the most logical, unbiased estimate for their rate.

The global fraud rate is 2.06%.


(If agreed on, I can add this to the main df and use the global average for countries with no historical data.)

In [None]:

# Factorize Sender_Country into numeric codes
#Sender_Country_df['sender_country_code'] = pd.factorize(Sender_Country_df['Sender_Country'])[0]

# Total transactions per country (value_counts skips missing countries).
total_transactions = df['Sender_Country'].value_counts().rename('Total Transactions per country')

# Fraudulent transactions per country: the sum of Label within each country.
fraud_transactions = df.groupby('Sender_Country')['Label'].sum().rename('Fraudulent Transactions per country')

# Combine the two Series into one DataFrame, aligned on country. The index is
# named explicitly so reset_index always yields a 'Sender_Country' column,
# whichever name the pandas version gives the value_counts index.
results_df = (
    pd.concat([total_transactions, fraud_transactions], axis=1)
    .rename_axis('Sender_Country')
    .reset_index()
)
results_df['Fraud Rate (%)'] = (results_df['Fraudulent Transactions per country'] / results_df['Total Transactions per country']) * 100

# Sort by Fraud Rate (%) and keep the top 50 offenders.
top_50_offenders = results_df.sort_values(by='Fraud Rate (%)', ascending=False).head(50)

# Display the result. A bare expression is only rendered when it is the last
# statement of a cell, so the table is displayed explicitly here.
display(top_50_offenders)

# 1. Prepare the data for plotting
countries = top_50_offenders['Sender_Country']
fraud_rates = top_50_offenders['Fraud Rate (%)']

# 2. Create the figure; a large size keeps 50 bars readable
plt.figure(figsize=(15, 8))

# 3. Create the bar chart
plt.bar(countries, fraud_rates)

# 4. Set labels and title
plt.xlabel('Sender Country', fontsize=12)
plt.ylabel('Fraud Rate (%)', fontsize=12)
plt.title('Top 50 Countries by Fraud Rate', fontsize=14)

# 5. Rotate the X-axis labels for better visibility
plt.xticks(rotation=90, ha='right', fontsize=10)

# 6. Adjust layout to prevent labels from being cut off
plt.tight_layout()

# 7. Save the plot (before showing it), then render it in the notebook
plt.savefig('top_50_fraud_rate_by_country.png')
print("Plot saved as top_50_fraud_rate_by_country.png")
plt.show()

In [None]:
# Imputation baseline: the global fraud rate, expressed in percent.

# Number of rows in df and the sum of their Label values.
global_total_tx = df.shape[0]
global_total_fraud = df['Label'].sum()
global_avg_rate = (global_total_fraud / global_total_tx) * 100
print(global_avg_rate)
# Fill NaN values (new/unseen countries) with the global average
#df['Country_Fraud_Rate_Feature'] = df['Country_Fraud_Rate_Feature'].fillna(global_avg_rate)

# For better smoothing, consider a weighted-average technique.


In [None]:
# Show the current contents of df.
df

Time related variables 

Everything below this line is not concrete. 

In [None]:

# Parse Time_step as datetimes.
df['Time_step'] = pd.to_datetime(df['Time_step'])

# Sort by sender and time so that consecutive rows of one sender are
# consecutive transactions; the diff below relies on this order.
df = df.sort_values(by=['Sender_Account', 'Time_step'])

# ===========
# Mean time between transactions: tells us whether a sender transacts
# frequently or infrequently (useful for separating usual behaviour from
# sudden shifts in behaviour).
# ==============
# Time since the same sender's previous transaction (NaT for the first one).
df['time_diff'] = df.groupby('Sender_Account')['Time_step'].diff()
df['time_diff_seconds'] = df['time_diff'].dt.total_seconds()
# Minutes / hours, if needed, are time_diff_seconds / 60 and / 3600.

# Mean gap per sender, in seconds. Senders with a single transaction have no
# gap at all and therefore a NaN mean.
mean_time_df = (
    df.groupby('Sender_Account')['time_diff_seconds']
      .mean()
      .rename('mean_time_between_transactions')
      .reset_index()
)

# Fill those NaN means with the median of the per-sender means. The result is
# assigned back explicitly: `frame[col].fillna(..., inplace=True)` operates on
# a temporary column object and is not guaranteed to update the frame in
# recent pandas versions.
median_time = mean_time_df['mean_time_between_transactions'].median()
mean_time_df['mean_time_between_transactions'] = (
    mean_time_df['mean_time_between_transactions'].fillna(median_time)
)

mean_time_df

In [None]:

#=======
#Transaction Velocity
#=======
# Sender velocity is the number of transactions inside a moving time window:
# velocity = number of transactions in a window / window duration.
# High velocity (bursty or rapid behaviour) can signal money laundering; it
# measures transaction intensity.
def compute_velocity(group, window='24h'):
    """Rolling count of transactions for one sender.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single sender with 'Time_step' (datetime) and
        'Transaction_Id' columns, ordered by Time_step (a time-based rolling
        window needs a monotonic time index).
    window : str
        Time-based window length passed to pandas rolling. The default '24h'
        is the same 24-hour window as before, spelled with the lowercase
        alias because the uppercase 'H' alias is deprecated in recent pandas.

    Returns
    -------
    numpy.ndarray
        For each row of `group`, in row order, the number of non-missing
        Transaction_Id values inside the window that ends at that row's
        Time_step.
    """
    return (
        group
        .set_index('Time_step')  # use Time_step as the index temporarily
        .rolling(window=window)['Transaction_Id']
        .count()
        .values  # plain array, positionally aligned with the rows of `group`
    )

# Compute the velocity sender by sender and attach every result to the row
# labels it came from. groupby.apply would return one array per sender indexed
# by Sender_Account, which cannot be aligned with the rows of df on assignment.
# Sorting by Time_step first guarantees the time order inside every group.
velocity_pieces = [
    pd.Series(compute_velocity(sender_rows, window='24h'), index=sender_rows.index)
    for _, sender_rows in df.sort_values('Time_step').groupby('Sender_Account')
]
df['velocity_last_24h'] = (
    pd.concat(velocity_pieces) if velocity_pieces else pd.Series(dtype=float)
)

# Rows that got no value (groupby skips rows with a missing Sender_Account)
# are set to 0.
df['velocity_last_24h'] = df['velocity_last_24h'].fillna(0)

print(df[['Sender_Account', 'Time_step', 'velocity_last_24h']].head(10))

In [None]:
# First ten rows of the sender / time / velocity columns.
velocity_preview_cols = ['Sender_Account', 'Time_step', 'velocity_last_24h']
print(df.loc[:, velocity_preview_cols].head(10))

In [None]:
# Merge the per-sender mean time between transactions onto every row of df.
# Any copy of the column left by an earlier run of this cell is dropped first,
# so re-running does not create 'mean_time_between_transactions_x' / '_y'
# duplicates. how='left' keeps every row of df.
df = (
    df.drop(columns=['mean_time_between_transactions'], errors='ignore')
      .merge(mean_time_df, on='Sender_Account', how='left')
)
# velocity_last_24h is assigned directly as a column of df by the velocity
# cell, so it needs no merge.




In [None]:
# Show the current contents of df.
df

In [None]:


# Two side-by-side boxplots comparing the time-based features across Label.
box_fig, (gap_ax, velocity_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: mean time between transactions
sns.boxplot(x='Label', y='mean_time_between_transactions', data=df, ax=gap_ax)
gap_ax.set_title('Mean Time Between Transactions vs Fraud Label')
gap_ax.set_xticks([0, 1])
gap_ax.set_xticklabels(['Non-Fraud', 'Fraud'])

# Right panel: velocity
sns.boxplot(x='Label', y='velocity_last_24h', data=df, ax=velocity_ax)
velocity_ax.set_title('Velocity (Last 24h) vs Fraud Label')
velocity_ax.set_xticks([0, 1])
velocity_ax.set_xticklabels(['Non-Fraud', 'Fraud'])

plt.tight_layout()
plt.show()


In [None]:
# Show the current contents of df.
df

In [None]:
# Clean up the '_x' / '_y' columns that appear when the merge cell has been
# run more than once. Column names must be passed as strings via `columns=`,
# and drop returns a new frame, so the result is assigned back to df.
dup_x = 'mean_time_between_transactions_x'
dup_y = 'mean_time_between_transactions_y'
# If only the suffixed copies exist, keep one of them under the plain name so
# the feature is not lost.
if dup_x in df.columns and 'mean_time_between_transactions' not in df.columns:
    df = df.rename(columns={dup_x: 'mean_time_between_transactions'})
# errors='ignore' makes this a no-op when the duplicates are not present.
df = df.drop(columns=[dup_x, dup_y], errors='ignore')
df

Recalculate this as two columns: non-fraud transaction rate = non-fraud transactions / total transactions, and fraud transaction rate = fraud transactions / total transactions.

In [None]:
# Fraud and non-fraud transaction rates per sender country.
# Label must NOT be one of the grouping keys: inside a (country, Label) group
# every row has the same Label, so a "fraud count / total count" ratio is
# always 1. Grouping by country only and summing Label (1 = fraud, 0 = not
# fraud) gives the number of fraudulent transactions per country.

# Filter out rows with missing Sender_Country
Sender_Country_df = df[df['Sender_Country'].notna()].copy()

# Factorize Sender_Country into numeric codes
Sender_Country_df['sender_country_code'] = pd.factorize(Sender_Country_df['Sender_Country'])[0]

# One row per country: labelled transactions and fraudulent transactions.
sender_country_fraud_stats = (
    Sender_Country_df
    .groupby(['Sender_Country', 'sender_country_code'])
    .agg(
        total_tx=('Label', 'count'),   # transactions with a Label
        fraud_tx=('Label', 'sum'),     # transactions with Label == 1
    )
    .reset_index()
)

# fraud rate = fraud transactions / total transactions
sender_country_fraud_stats['fraud_rate'] = (
    sender_country_fraud_stats['fraud_tx'] / sender_country_fraud_stats['total_tx']
)
# non-fraud rate = non-fraud transactions / total transactions
sender_country_fraud_stats['non_fraud_rate'] = (
    (sender_country_fraud_stats['total_tx'] - sender_country_fraud_stats['fraud_tx'])
    / sender_country_fraud_stats['total_tx']
)

# Sort by fraud rate and get the top 50 offenders
top_50_offenders1 = sender_country_fraud_stats.sort_values(by='fraud_rate', ascending=False).head(50)

# Display the result
top_50_offenders1