# Import master data and conditions files

In [None]:
#Import the necessary libraries
import pandas as pd
import numpy as np
import random

In [None]:
#Install Faker to enable generation of random customer names
!pip install faker

Collecting faker
  Downloading faker-37.5.3-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.5.3-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/1.9 MB[0m [31m22.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.5.3


In [None]:
from faker import Faker

# Seed every random source this notebook draws from, so that a fresh
# "Restart & Run All" regenerates the same synthetic data:
#   - Faker               -> customer names
#   - stdlib `random`     -> currency, amount, date and combination choices
#   - NumPy global state  -> np.random.* calls and pandas .sample() without random_state
SEED = 42

faker = Faker()
Faker.seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Workbook locations: master (reference) data and the allowed-combination rules.
master_data_path, conditions_data_path = "MasterData.xlsx", "Conditions.xlsx"

# sheet_name=None makes read_excel return a dict {sheet name: DataFrame}
# containing every sheet of the workbook.
master_data, conditions_data = (
    pd.read_excel(workbook_path, sheet_name=None)
    for workbook_path in (master_data_path, conditions_data_path)
)

# Show which sheets were loaded from each workbook.
print("Master Data Sheets:", list(master_data))
print("Conditions Data Sheets:", list(conditions_data))

Master Data Sheets: ['Customer Information', 'Account Information', 'Transaction data', 'Business', 'Geography', 'Customer Type', 'Product', 'Channels', 'TranType', 'Currency', 'watch list data']
Conditions Data Sheets: ['Prod-TranType Mapping', 'Channe-TranType Mapping']


In [None]:
# Pull each lookup sheet used below out of the workbook dict into its own DataFrame.
(
    business_master,
    currency_master,
    geo_master,
    ctype_master,
    prod_master,
    chann_master,
    tran_master,
) = (
    master_data[sheet_name]
    for sheet_name in (
        "Business",
        "Currency",
        "Geography",
        "Customer Type",
        "Product",
        "Channels",
        "TranType",
    )
)

In [None]:
#Check the business_master dataframe
business_master

Unnamed: 0,BusinessCode,BusinessDesc,RiskRating
0,1001,Food,LR
1,1002,Textiles,LR
2,1003,Machinary and Equipment,LR
3,1004,Chemicals,LR
4,1005,Electronics,LR
5,1006,Metal & Metal Products,LR
6,1007,Retail,LR
7,1008,Information Tech,LR
8,1009,Construction,LR
9,1010,Transport,MR


In [None]:
#Check the geo_master dataframe
geo_master

Unnamed: 0,GeoCode,GeoDesc,RiskRating
0,CAN,Canada,LR
1,AUT,Austria,LR
2,COL,Colambia,LR
3,BRA,Brazil,LR
4,NPL,Nepal,HR
5,IND,India,LR
6,USD,United States of America,LR
7,ARE,UAE,LR
8,AUS,Australia,LR
9,HKG,Hongkong,LR


# Generate customer data

In [None]:
# Helper that keeps the synthetic data at a realistic mix of High / Medium / Low risk
# master-data codes (Business Type, Geography, Customer Type, ...).
# Example split: High Risk 0.5%, Medium Risk 1%, Low Risk 98.5%.
def sample_with_risk_distribution(master_df, risk_col, code_col, target_dist, total_needed,
                                  shuffle=True, random_state=None):
    """Draw codes from a master table so each risk level gets a target share.

    Parameters
    ----------
    master_df : pandas.DataFrame
        Master table holding at least ``risk_col`` and ``code_col``.
    risk_col : str
        Column with the risk rating of each row (e.g. "HR" / "MR" / "LR").
    code_col : str
        Column with the code to sample.
    target_dist : dict
        Maps risk rating -> fraction of ``total_needed`` to draw from that rating.
    total_needed : int
        Nominal number of codes to draw across all ratings.
    shuffle : bool, default True
        Shuffle the combined result. Without shuffling the rows come back grouped
        by risk level in ``target_dist`` order, so callers that read the result
        by position (``.iloc[i % len(...)]``) would hand the first few entities
        the rare ratings for every attribute at once.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for reproducible draws.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (``code_col``) with a fresh 0..n-1 index. A rating
        with no rows in ``master_df`` is skipped, so the result can hold fewer
        than ``total_needed`` rows.

    Raises
    ------
    ValueError
        If none of the ratings in ``target_dist`` occurs in ``master_df``.
    """
    samples = []

    for risk_level, fraction in target_dist.items():
        subset = master_df[master_df[risk_col] == risk_level]
        if len(subset) == 0:
            continue  # Skip if no rows with this risk rating

        # The small epsilon keeps binary float error from dropping a row
        # (e.g. 100 * 0.29 evaluates to 28.999999999999996).
        n_samples = int(total_needed * fraction + 1e-9)

        # Sample with replacement only when the rating has too few distinct rows.
        sampled = subset.sample(
            n=n_samples,
            replace=(n_samples > len(subset)),
            random_state=random_state,
        )
        samples.append(sampled[[code_col]])

    if not samples:
        raise ValueError(
            f"No rows in master_df[{risk_col!r}] match any risk level in "
            f"target_dist {list(target_dist)}"
        )

    combined = pd.concat(samples, ignore_index=True)
    if shuffle:
        combined = combined.sample(frac=1, random_state=random_state).reset_index(drop=True)
    return combined

# Target share of each risk rating. The same split is used for business type,
# geography and customer type; each attribute gets its own dict.
business_risk_dist = {"HR": 0.005, "MR": 0.01, "LR": 0.985}
geo_risk_dist = dict(business_risk_dist)
cust_type_risk_dist = dict(business_risk_dist)

num_customers = 400

# One risk-weighted pool of codes per customer attribute.
business_sample = sample_with_risk_distribution(
    master_df=business_master,
    risk_col="RiskRating",
    code_col="BusinessCode",
    target_dist=business_risk_dist,
    total_needed=num_customers,
)

geo_sample = sample_with_risk_distribution(
    master_df=geo_master,
    risk_col="RiskRating",
    code_col="GeoCode",
    target_dist=geo_risk_dist,
    total_needed=num_customers,
)

cust_type_sample = sample_with_risk_distribution(
    master_df=ctype_master,
    risk_col="RiskRating",
    code_col="CustomerTypeCode",
    target_dist=cust_type_risk_dist,
    total_needed=num_customers,
)


In [None]:
# Generate `num_customers` synthetic customers.
# Each name gets a suffix that reflects the customer's type.

# Suffix appended to the generated name, keyed by customer type code.
# Built once, outside the loop, because it never changes.
suffix_map = {
    "PUBL": " Pub Ltd Company",
    "PVTL": " Pvt Ltd Company",
    "GOVT": " Govt Co",
    "SPRF": " Sole Prop firm",
    "PART": " and Partners",
    "LLPF": " Limited LP",
    "TRST": " Trust Co",
    "CLSO": " Club",
    "NBFC": " Non-Banking Fin Co",
    "BFIS": " Banking Ltd",
    "SGOV": " State Gov Co",
    "ASSO": " & Association",
    "MBNK": " Banking Ltd",
    "NGOS": " Non-Govt Org",
    "FCOM": " Foreign Co"
}

customers = []
# Use num_customers (the size the attribute pools were drawn for) rather than a
# second hard-coded 400, so the two cannot drift apart.
for i in range(num_customers):
    # The pools can be shorter than num_customers (risk levels missing from the
    # master data are skipped), hence the modulo wrap-around.
    cust_type = cust_type_sample.iloc[i % len(cust_type_sample)]["CustomerTypeCode"]
    # Customer types without an entry in suffix_map keep the bare name.
    name = faker.name() + suffix_map.get(cust_type, "")
    customers.append({
        "CustomerID": f"CUST{i+1:04}",
        "Name": name,
        # Roughly 2% of customers are flagged as politically exposed persons.
        "PEP_Flag": 'Y' if np.random.rand() < 0.02 else 'N',
        "Business": business_sample.iloc[i % len(business_sample)]["BusinessCode"],
        "Geography": geo_sample.iloc[i % len(geo_sample)]["GeoCode"],
        "CustomerType": cust_type
    })


df_customers = pd.DataFrame(customers)

In [None]:
#Check the structure of the customers dataframe
df_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    400 non-null    object
 1   Name          400 non-null    object
 2   PEP_Flag      400 non-null    object
 3   Business      400 non-null    int64 
 4   Geography     400 non-null    object
 5   CustomerType  400 non-null    object
dtypes: int64(1), object(5)
memory usage: 18.9+ KB


In [None]:
#Check the sample data in the customers dataframe
df_customers.head()

Unnamed: 0,CustomerID,Name,PEP_Flag,Business,Geography,CustomerType
0,CUST0001,Allison Hill Club,N,1014,NPL,CLSO
1,CUST0002,Noah Rhodes Club,N,1014,NPL,CLSO
2,CUST0003,Angie Henderson Sole Prop firm,N,1012,KEN,SPRF
3,CUST0004,Daniel Wagner Sole Prop firm,N,1012,ZAF,SPRF
4,CUST0005,Cristian Santos Sole Prop firm,N,1010,KEN,SPRF


In [None]:
#Download the synthetic customers dataframe to a csv file
df_customers.to_csv("customers.csv", index=False)

# Generate account Data

In [None]:
# Target risk mix for the account-level attributes, so the synthetic accounts
# carry a realistic share of high / medium / low risk master data.
# Products: 1% high risk, 2% medium risk, 97% low risk.
# Currencies: 0.5% high risk, 1% medium risk, 98.5% low risk.
product_risk_dist = {"HR": 0.01, "MR": 0.02, "LR": 0.97}
currency_risk_dist = {"HR": 0.005, "MR": 0.01, "LR": 0.985}
num_accounts = 1000

# Risk-weighted pools of product and currency codes, one entry per account.
product_sample = sample_with_risk_distribution(
    master_df=prod_master,
    risk_col="RiskRating",
    code_col="ProductCode",
    target_dist=product_risk_dist,
    total_needed=num_accounts,
)

currency_sample = sample_with_risk_distribution(
    master_df=currency_master,
    risk_col="RiskRating",
    code_col="CurrencyCode",
    target_dist=currency_risk_dist,
    total_needed=num_accounts,
)

In [None]:
# Settings for account generation. Account currencies are drawn with explicit
# weights (for example 70% of accounts in USD) to reflect a realistic composition.
num_accounts = 1000
num_customers = len(df_customers)
assert num_accounts >= num_customers, "Number of accounts must be >= number of customers"

# Account records are collected here. Phase 1 (one account per customer) and
# phase 2 (the remaining random accounts) both append to this list.
accounts = []

currency_list = currency_master["CurrencyCode"].tolist()

# Explicit per-currency weights; currencies not listed here share a small remainder.
explicit_weights = dict([
    ("USD", 0.7),
    ("EUR", 0.1),
    ("GBP", 0.05),
    ("CAD", 0.03),
    ("INR", 0.02),
    ("JPY", 0.02),
    ("AUD", 0.02),
])

def choose_currency_weighted(currency_list, weights_dict, other_weight=0.06):
    """Pick one currency code from ``currency_list`` using relative weights.

    Parameters
    ----------
    currency_list : list
        Candidate currency codes; the result is always one of these.
    weights_dict : dict
        Explicit weight per currency code.
    other_weight : float, default 0.06
        Total weight shared equally by the currencies in ``currency_list``
        that have no entry in ``weights_dict``.

    Returns
    -------
    One element of ``currency_list``.
    """
    # Count the unlisted currencies directly. Deriving the count as
    # len(currency_list) - len(weights_dict) is wrong (zero or negative) as soon
    # as weights_dict names a currency that is not in currency_list.
    others = [curr for curr in currency_list if curr not in weights_dict]
    per_other_weight = other_weight / len(others) if others else 0.0

    weights = [weights_dict.get(curr, per_other_weight) for curr in currency_list]
    # random.choices works with relative weights, so they need not sum to 1.
    return random.choices(currency_list, weights=weights, k=1)[0]

# Draw the risk-weighted product and currency pools before creating any account.
product_sample = sample_with_risk_distribution(
    prod_master, "RiskRating", "ProductCode", product_risk_dist, num_accounts
)
currency_sample = sample_with_risk_distribution(
    currency_master, "RiskRating", "CurrencyCode", currency_risk_dist, num_accounts
)


def _new_account(position, customer_id):
    """Build the record for the account at 0-based ``position`` owned by ``customer_id``."""
    return {
        "AccountNumber": f"ACC{position + 1:05}",
        "CustomerID": customer_id,
        # Wrap around in case the product pool is shorter than the account count.
        "ProductCode": product_sample.iloc[position % len(product_sample)]["ProductCode"],
        "CurrencyCode": choose_currency_weighted(currency_list, explicit_weights),
        "Balance": round(np.random.uniform(1000, 1000000), 2)
    }


# Phase 1: every customer gets exactly one account, in customer order.
accounts.extend(
    _new_account(position, owner_id)
    for position, owner_id in enumerate(df_customers["CustomerID"])
)

# Phase 2: the remaining accounts go to randomly chosen customers.
remaining_accounts = num_accounts - num_customers
for position in range(num_customers, num_accounts):
    accounts.append(_new_account(position, np.random.choice(df_customers["CustomerID"])))

df_accounts = pd.DataFrame(accounts)

In [None]:
#Check the structure of the accounts dataframe
df_accounts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   AccountNumber  1000 non-null   object 
 1   CustomerID     1000 non-null   object 
 2   ProductCode    1000 non-null   object 
 3   CurrencyCode   1000 non-null   object 
 4   Balance        1000 non-null   float64
dtypes: float64(1), object(4)
memory usage: 39.2+ KB


In [None]:
#Check the sample data in the accounts dataframe
df_accounts.head()

Unnamed: 0,AccountNumber,CustomerID,ProductCode,CurrencyCode,Balance
0,ACC00001,CUST0001,BUSL,USD,787832.95
1,ACC00002,CUST0002,BUSL,USD,431871.46
2,ACC00003,CUST0003,BUSL,USD,981516.44
3,ACC00004,CUST0004,BUSL,USD,202038.16
4,ACC00005,CUST0005,BUSL,USD,525102.99


In [None]:
#Download the synthetic accounts dataframe to a csv file
df_accounts.to_csv("accounts.csv", index=False)

# Generate transactions data

In [None]:
# Helper that applies risk-rating thresholds when sampling master-data codes
# (channel, currency, geography, transaction type).
def weighted_sample(df, rating_col, code_col, total=10000, weights=None):
    """Sample codes from ``df`` so each risk rating gets a target share of ``total``.

    Parameters
    ----------
    df : pandas.DataFrame
        Master table holding ``rating_col`` and ``code_col``.
    rating_col : str
        Column with the risk rating of each row.
    code_col : str
        Column with the code to sample.
    total : int, default 10000
        Nominal number of codes to draw across all ratings.
    weights : dict, optional
        Maps risk rating -> fraction of ``total``. Defaults to
        ``{"HR": 0.005, "MR": 0.01, "LR": 0.985}``.

    Returns
    -------
    list
        Sampled codes, grouped by rating in ``weights`` order (not shuffled).
        Ratings with no rows in ``df`` are skipped, so the list can be shorter
        than ``total``.
    """
    # None instead of a dict literal default: a mutable default is shared
    # between calls.
    if weights is None:
        weights = {"HR": 0.005, "MR": 0.01, "LR": 0.985}

    samples = []
    for risk, pct in weights.items():
        subset = df[df[rating_col] == risk]
        if subset.empty:
            continue
        # The epsilon keeps binary float error from dropping a row
        # (e.g. 100 * 0.29 evaluates to 28.999999999999996).
        count = int(total * pct + 1e-9)
        samples.extend(subset.sample(n=count, replace=True)[code_col].tolist())
    return samples

# Risk-weighted code pools for the transaction attributes. The channel, currency
# and geography pools are sized to the number of transaction types actually drawn.
tran_types = weighted_sample(
    df=tran_master, rating_col="RiskRating", code_col="TranTypeCode"
)
channels = weighted_sample(
    df=chann_master, rating_col="RiskRating", code_col="ChannelCode", total=len(tran_types)
)
currencies = weighted_sample(
    df=currency_master, rating_col="RiskRating", code_col="CurrencyCode", total=len(tran_types)
)
geos = weighted_sample(
    df=geo_master, rating_col="RiskRating", code_col="GeoCode", total=len(tran_types)
)

In [None]:
# Allowed combinations for transactions, one sheet per rule set:
# product -> transaction types and channel -> transaction types.
prod_tran_map, channel_tran_map = (
    conditions_data[sheet_name]
    for sheet_name in ("Prod-TranType Mapping", "Channe-TranType Mapping")
)

In [None]:
#Check the allowed combination for Channel-TranType mapping in the dataframe
channel_tran_map

Unnamed: 0,ChannelCode,Channel Name,TranTypeCode
0,BRAN,Branch,"CDEP, CWIT,FTDO,FTIN, LDIST"
1,CATM,ATM,"CDEP, CWIT,FEEC"
2,CPOS,POS,"MPAY, FEEC,LREP"
3,IBAN,Internet Banking,"FTDO, FTIN, CCPY,MPAY, LREP, BPAY, FEEC"
4,MBAN,Mobile Banking,"FTDO, FTIN, CCPY,MPAY, LREP, BPAY, FEEC"
5,CUPI,UPI,"CDEP, CWIT, LREP"
6,CHEQ,Cheque / DD,"FTDO,CCPY, LDIST,LREP,BPAY,FEEC"
7,CARD,Cards,"FTDO, FTIN, CCPY,MPAY, FEEC"
8,WALL,Wallets,"FTDO, FTIN, CCPY,MPAY,LDIST"
9,CUST,Customer Terminal,"FTDO, FTIN, CCPY,MPAY,LREP"


In [None]:
#Check the allowed combination for Product-TranType mapping in the dataframe
prod_tran_map

Unnamed: 0,ProductCode,Prod Name,TranTypeCode
0,FIXD,Fixed Deposit,"CDEP, CWIT, FTDO, BPAY, FEEC"
1,BUSL,Business Loans,"CDEP, CWIT, FTDO, FTIN,MPAY,CCPY, LDIST,LREP,B..."
2,SAVS,Savings,"CDEP, CWIT, CCPAY, FTDO, FTIN"
3,CHEK,Checking,"CDEP, CWIT, CCPAY, FTDO, FTIN, BPAY"
4,OVDF,Overdraft,"CDEP, CWIT, FTDO, FTIN,MPAY,CCPY"
5,AUTL,Auto Loan,"FTDO, CDEP, FEEC"
6,HOUL,Housing Loan,"FTDO, CDEP, FEEC"
7,BILLS,Bills Discounting,"CDEP, CWIT, CCPAY, FTDO, FTIN, BPAY"
8,GLON,Gold Loans,"CDEP, CWIT, FTDO, FTIN,MPAY,CCPY, LDIST,LREP,B..."
9,ELON,Education Loans,"CDEP, CWIT, FTDO, FTIN,MPAY,CCPY"


In [None]:
# Separate the allowed transaction types, which are stored as one comma-separated
# string per product / channel, into one row per transaction type.
def _explode_tran_types(mapping):
    """Return ``mapping`` with one row per code listed in its "TranTypeCode" column."""
    per_code = mapping.assign(
        TranTypeCode=mapping["TranTypeCode"].str.split(",")
    ).explode("TranTypeCode")
    # The source lists mix "A, B" and "A,B", so trim the leftover whitespace.
    per_code["TranTypeCode"] = per_code["TranTypeCode"].str.strip()
    return per_code


normalized_prod_tran_map = _explode_tran_types(prod_tran_map)
normalized_chan_tran_map = _explode_tran_types(channel_tran_map)

In [None]:
# Exchange rates used to convert a transaction amount into 'amount in local
# currency (LCY)': the amount is multiplied by the rate of its currency.
# NOTE(review): USD has rate 1.0, so LCY is presumably USD - confirm.
exchange_rates = dict([
    ("USD", 1.0),
    ("EUR", 1.1),
    ("GBP", 1.25),
    ("SGD", 0.78),
    ("INR", 0.012),
    ("VEF", 0.08),
    ("CAD", 0.73),
    ("JPY", 0.0068),
    ("ZAR", 0.056),
    ("CNY", 0.14),
    ("DKK", 0.15),
    ("EGP", 0.021),
    ("AUD", 0.65),
])

In [None]:
# Generate 10000 synthetic transactions with (a) thresholds on the currency of
# the transaction and (b) thresholds on the amount of the transaction.
from collections import defaultdict

num_transactions = 10000
transactions = []

# Every (TranType, Channel, Product) triple that both mapping sheets allow:
# for each product / transaction-type pair, add one entry per channel that
# permits that transaction type.
valid_combinations = [
    (tran, channel, row["ProductCode"])
    for _, row in normalized_prod_tran_map.iterrows()
    for tran in (code.strip() for code in row["TranTypeCode"].split(","))
    for channel in normalized_chan_tran_map.loc[
        normalized_chan_tran_map["TranTypeCode"] == tran, "ChannelCode"
    ].tolist()
]

currency_list = currency_master["CurrencyCode"].tolist()

# Explicit per-currency weights; currencies not listed here share a small remainder.
explicit_weights = dict([
    ("USD", 0.7),
    ("EUR", 0.1),
    ("GBP", 0.05),
    ("CAD", 0.03),
    ("INR", 0.02),
    ("JPY", 0.02),
    ("AUD", 0.02),
])

def choose_currency_weighted(currency_list, explicit_weights, other_weight=0.06):
    """Pick one currency code from ``currency_list`` using relative weights.

    Parameters
    ----------
    currency_list : list
        Candidate currency codes; the result is always one of these.
    explicit_weights : dict
        Explicit weight per currency code. Entries for codes that are not in
        ``currency_list`` are ignored.
    other_weight : float, default 0.06
        Total weight shared equally by the currencies in ``currency_list``
        that have no explicit weight.

    Returns
    -------
    One element of ``currency_list``.
    """
    all_weights = {}
    others = [c for c in currency_list if c not in explicit_weights]

    # Distribute the 'others' weight equally
    if others:
        per_other_weight = other_weight / len(others)
        for c in others:
            all_weights[c] = per_other_weight

    # Add explicit weights, but only for currencies that are actual candidates.
    # Otherwise a code missing from the currency master could be returned and
    # would later fail to match the currency lookup.
    candidates = set(currency_list)
    all_weights.update(
        {code: weight for code, weight in explicit_weights.items() if code in candidates}
    )

    # random.choices works with relative weights, so no normalisation is needed.
    currencies = list(all_weights.keys())
    weights = list(all_weights.values())
    return random.choices(currencies, weights=weights, k=1)[0]

# Transaction amount bands as ((low, high), probability of drawing the band).
# The probabilities should sum to 1.0.
_slab_table = [
    ((50, 1000), 0.4),
    ((1001, 5000), 0.3),
    ((5001, 20000), 0.2),
    ((20001, 100000), 0.1),
]
amount_slabs = [bounds for bounds, _ in _slab_table]
slab_weights = [probability for _, probability in _slab_table]

def generate_transaction_amount(slabs, weights):
    """Draw a transaction amount: pick a slab by weight, then a uniform value inside it.

    ``slabs`` is a sequence of (low, high) bounds and ``weights`` the matching
    relative weights. The result is rounded to 2 decimal places.
    """
    slab = random.choices(slabs, weights=weights)[0]
    lower, upper = slab[0], slab[1]
    amount = random.uniform(lower, upper)
    return round(amount, 2)

# Now generate transactions using these valid combinations

# The candidate transaction dates are the same for every transaction, so build
# the list once instead of on every loop iteration.
start_date = pd.to_datetime("2025-07-20")
end_date = pd.to_datetime("2025-08-02")
date_range = pd.date_range(start=start_date, end=end_date).to_list()

for i in range(num_transactions):
    tran_type, channel, product = random.choice(valid_combinations)
    amount = generate_transaction_amount(amount_slabs, slab_weights)
    currency = choose_currency_weighted(currency_list, explicit_weights)
    # Currencies without a configured rate are converted 1:1.
    exchange_rate = exchange_rates.get(currency, 1.0)
    tlcy_amount = round(amount * exchange_rate, 2)

    # Choose initiating and counterparty customers.
    # Sampling two rows without replacement yields two different customers
    # (CustomerID is unique per row). The initiating customer must not be
    # re-drawn after this point, or it could end up equal to the counterparty.
    customer_pair = df_customers.sample(2)
    initiating, counterparty = customer_pair.iloc[0], customer_pair.iloc[1]

    # Pick one of the initiating customer's accounts at random.
    initiating_accounts = df_accounts[df_accounts["CustomerID"] == initiating["CustomerID"]]
    initiating_account_id = initiating_accounts.sample(1).iloc[0]["AccountNumber"] if not initiating_accounts.empty else "Unknown"

    # Geography rules: FTDO keeps the counterparty in the initiating customer's
    # geography, FTIN forces a different geography, and every other transaction
    # type uses the counterparty's own geography.
    if tran_type == "FTDO":
        counter_geo = initiating["Geography"]
    elif tran_type == "FTIN":
        counter_geo = random.choice([g for g in geo_master["GeoCode"] if g != initiating["Geography"]])
    else:
        counter_geo = counterparty["Geography"]

    transaction = {
        "TransactionDate": random.choice(date_range).strftime("%Y-%m-%d"),
        "TransactionID": f"T{i+1:05d}",
        "InitiatingCustomer": initiating["CustomerID"],
        "AccountNumber": initiating_account_id,
        "Counterparty": counterparty["CustomerID"],
        "TranType": tran_type,
        "Channel": channel,
        "Currency": currency,
        "Credit/Debit": random.choice(["Cr", "Dr"]),
        "Amount": amount,
        "Amount in LCY":tlcy_amount,
        "Product": product,
        "InitiatingGeo": initiating["Geography"],
        "CounterpartyGeo": counter_geo
    }
    transactions.append(transaction)


In [None]:
#Save the transactions file in a csv format
transaction_df = pd.DataFrame(transactions)
transaction_df.to_csv("transactions.csv", index=False)
print("✅ Synthetic transaction dataset created with", len(transaction_df), "rows.")

✅ Synthetic transaction dataset created with 10000 rows.


# Generate watchlist data

In [None]:
# Build the watchlist: 10 randomly chosen customers, identified by name and
# all tagged with the same watch reason.
watchlist_customers = df_customers.sample(n=10)
df_watchlist_cust = watchlist_customers.loc[:, ["Name"]].assign(WatchReason="FATF")
# Optional export (disabled):
#df_watchlist_cust.to_csv("watchlist_cust.csv", index=False)

## 1.1 Import synthetic data

## 1.2 Enrich synthetic data with other relevant columns
By linking the transaction file with the customer and account files

In [None]:
# Short working names for the synthetic data generated above. These are aliases
# of the existing DataFrames (nothing is re-read from disk), and they replace the
# record lists that earlier cells built under the same names.
accounts, customers, transactions, watchlist = (
    df_accounts,
    df_customers,
    transaction_df,
    df_watchlist_cust,
)

In [None]:
# Enrich the base transaction file with further columns by joining the customer,
# account and master-data tables:
#   - customer fields: business code, customer type, PEP flag and name, for both
#     the initiating customer (IC) and the counterparty (CP)
#   - account field: the product code of the debited/credited account
#   - risk ratings for business, channel, geography, customer type, product,
#     transaction type and currency
# The joins run in a fixed order because later cells select columns by position.
def _left_join_on_index(frame, lookup, key, renames):
    """Left-join ``lookup`` (matched on its index) to ``frame[key]``, then rename columns."""
    joined = frame.merge(lookup, left_on=key, right_index=True, how="left")
    return joined.rename(columns=renames)


customer_attrs = customers.set_index("CustomerID")[["Business", "CustomerType", "PEP_Flag", "Name"]]

# Initiating-customer attributes.
transactions_merge = _left_join_on_index(
    transactions,
    customer_attrs,
    "InitiatingCustomer",
    {
        "Business": "BusinessCode_IC",
        "CustomerType": "CustTypeCode_IC",
        "Name": "ICName",
        "PEP_Flag": "PEP_Flag_IC",
    },
)

# Counterparty attributes.
transactions_merge = _left_join_on_index(
    transactions_merge,
    customer_attrs,
    "Counterparty",
    {
        "Business": "BusinessCode_CP",
        "CustomerType": "CustTypeCode_CP",
        "Name": "CounterpartyName",
        "PEP_Flag": "PEP_Flag_CP",
    },
)

# Product code of the transaction's account.
transactions_merge = transactions_merge.merge(
    accounts.set_index("AccountNumber")[["ProductCode"]],
    on="AccountNumber",
    how='left',
)

# Risk-rating lookups as (lookup indexed by code, transaction column holding the
# code, name of the resulting risk column), applied in this order.
business_risk = business_master.set_index("BusinessCode")["RiskRating"]
geo_risk = geo_master.set_index("GeoCode")["RiskRating"]
cust_type_risk = ctype_master.set_index("CustomerTypeCode")["RiskRating"]

risk_lookups = [
    (business_risk, "BusinessCode_IC", "IC_BusinessRisk"),
    (business_risk, "BusinessCode_CP", "CP_BusinessRisk"),
    (chann_master.set_index("ChannelCode")["RiskRating"], "Channel", "ChannelRisk"),
    (geo_risk, "InitiatingGeo", "IC_GeoRisk"),
    (geo_risk, "CounterpartyGeo", "CP_GeoRisk"),
    (cust_type_risk, "CustTypeCode_IC", "IC_CustTypeRisk"),
    (cust_type_risk, "CustTypeCode_CP", "CP_CustTypeRisk"),
    (prod_master.set_index("ProductCode")["RiskRating"], "Product", "ProductRisk"),
    (tran_master.set_index("TranTypeCode")["RiskRating"], "TranType", "TranTypeRisk"),
    (currency_master.set_index("CurrencyCode")[["RiskRating"]], "Currency", "CurrencyRisk"),
]
for risk_lookup, code_column, risk_column in risk_lookups:
    transactions_merge = _left_join_on_index(
        transactions_merge, risk_lookup, code_column, {"RiskRating": risk_column}
    )

# Watchlist flags: "Y" when the customer's name appears on the watchlist.
watchlist_names = set(watchlist["Name"])
_yes_no = {True: "Y", False: "N"}

# Add WL_IC flag
transactions_merge["WL_IC"] = transactions_merge["ICName"].isin(watchlist_names).map(_yes_no)

# Add WL_CP flag
transactions_merge["WL_CP"] = transactions_merge["CounterpartyName"].isin(watchlist_names).map(_yes_no)

In [None]:
#Check the transaction fields after merging the other relevant fields
transactions_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   TransactionDate     10000 non-null  object 
 1   TransactionID       10000 non-null  object 
 2   InitiatingCustomer  10000 non-null  object 
 3   AccountNumber       10000 non-null  object 
 4   Counterparty        10000 non-null  object 
 5   TranType            10000 non-null  object 
 6   Channel             10000 non-null  object 
 7   Currency            10000 non-null  object 
 8   Credit/Debit        10000 non-null  object 
 9   Amount              10000 non-null  float64
 10  Amount in LCY       10000 non-null  float64
 11  Product             10000 non-null  object 
 12  InitiatingGeo       10000 non-null  object 
 13  CounterpartyGeo     10000 non-null  object 
 14  BusinessCode_IC     10000 non-null  int64  
 15  CustTypeCode_IC     10000 non-null  object 
 16  PEP_F

## 1.3 Assign appropriate Risk levels for the relevant columns in the transaction file

In [None]:
# Replace the risk ratings with numeric scores:
# LR (low risk) -> 1, MR (medium risk) -> 5, HR (high risk) -> 10.
risk_map = {"LR": 1, "MR": 5, "HR": 10}
risk_columns = [col for col in transactions_merge.columns if col.endswith("Risk")]

# Element-wise lookup instead of DataFrame.replace(): replace() on these object
# columns triggers pandas' "downcasting in replace is deprecated" FutureWarning.
# Values that are not a key of risk_map are kept as they are, so re-running this
# cell leaves already-converted scores untouched.
transactions_merge[risk_columns] = transactions_merge[risk_columns].apply(
    lambda ratings: ratings.map(lambda rating: risk_map.get(rating, rating))
)

  transactions_merge[risk_columns] = transactions_merge[risk_columns].replace(risk_map)


In [None]:
# Feature subset: identifiers, amount in local currency, PEP flags, numeric risk
# scores and watchlist flags. Columns are selected by name rather than by
# position, so the selection stays correct if columns are added or reordered.
transactions_features_hr = transactions_merge[[
    "TransactionDate",
    "TransactionID",
    "InitiatingCustomer",
    "Amount in LCY",
    "PEP_Flag_IC",
    "PEP_Flag_CP",
    "IC_BusinessRisk",
    "CP_BusinessRisk",
    "ChannelRisk",
    "IC_GeoRisk",
    "CP_GeoRisk",
    "IC_CustTypeRisk",
    "CP_CustTypeRisk",
    "ProductRisk",
    "TranTypeRisk",
    "CurrencyRisk",
    "WL_IC",
    "WL_CP",
]]

In [None]:
#transactions_features_hr.drop(columns=["predlabel"],inplace=True)
transactions_features_hr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   TransactionDate     10000 non-null  object 
 1   TransactionID       10000 non-null  object 
 2   InitiatingCustomer  10000 non-null  object 
 3   Amount in LCY       10000 non-null  float64
 4   PEP_Flag_IC         10000 non-null  object 
 5   PEP_Flag_CP         10000 non-null  object 
 6   IC_BusinessRisk     10000 non-null  int64  
 7   CP_BusinessRisk     10000 non-null  int64  
 8   ChannelRisk         10000 non-null  int64  
 9   IC_GeoRisk          10000 non-null  int64  
 10  CP_GeoRisk          10000 non-null  int64  
 11  IC_CustTypeRisk     10000 non-null  int64  
 12  CP_CustTypeRisk     10000 non-null  int64  
 13  ProductRisk         10000 non-null  int64  
 14  TranTypeRisk        10000 non-null  int64  
 15  CurrencyRisk        10000 non-null  int64  
 16  WL_IC

# 2. Apply rule-based logic, generate alerts (A-P1, A-P2), and identify the transactions related to those alerts

A-P1 - Priority 1 alert
A-P2 - Priority 2 alert

In [None]:
# Calculate the overall transaction risk and store it in the merged transaction
# file as the new field 'Overall_Tranx_Risk'.
# Inputs: the counterparty's PEP / watchlist flags plus the numeric risk scores
# (1 = low, 5 = medium, 10 = high) of counterparty business, counterparty
# geography, product, currency and channel.
tranx_risk_scores = transactions_merge[
    ["CP_BusinessRisk", "CP_GeoRisk", "ProductRisk", "CurrencyRisk", "ChannelRisk"]
]
cp_flagged = transactions_merge["PEP_Flag_CP"].eq("Y") | transactions_merge["WL_CP"].eq("Y")
cp_clear = transactions_merge["PEP_Flag_CP"].eq("N") & transactions_merge["WL_CP"].eq("N")

# np.select evaluates the conditions in order and the first match wins.
conditions_tx = [
    # High Risk: flagged counterparty, or any score at the high level
    cp_flagged | tranx_risk_scores.eq(10).any(axis=1),
    # Medium Risk: counterparty not flagged and any score at the medium level
    cp_clear & tranx_risk_scores.eq(5).any(axis=1),
    # Low Risk: counterparty not flagged and any score at the low level
    cp_clear & tranx_risk_scores.eq(1).any(axis=1),
]

choices_tx = ["HighRisk", "MedRisk", "LowRisk"]

transactions_merge["Overall_Tranx_Risk"] = np.select(conditions_tx, choices_tx, default="Unknown")

In [None]:
# Calculate the initiating customer's overall risk and store it in the merged
# transaction file as the new field 'Overall_IC_Risk'.
# Inputs: the initiating customer's PEP / watchlist flags plus the numeric risk
# scores (1 = low, 5 = medium, 10 = high) of its business and geography.
ic_risk_scores = transactions_merge[["IC_BusinessRisk", "IC_GeoRisk"]]
ic_flagged = transactions_merge["PEP_Flag_IC"].eq("Y") | transactions_merge["WL_IC"].eq("Y")
ic_clear = transactions_merge["PEP_Flag_IC"].eq("N") & transactions_merge["WL_IC"].eq("N")

# np.select evaluates the conditions in order and the first match wins.
conditions_ic = [
    # High Risk: flagged customer, or any score at the high level
    ic_flagged | ic_risk_scores.eq(10).any(axis=1),
    # Medium Risk: customer not flagged and any score at the medium level
    ic_clear & ic_risk_scores.eq(5).any(axis=1),
    # Low Risk: customer not flagged and any score at the low level
    ic_clear & ic_risk_scores.eq(1).any(axis=1),
]

choices_ic = ["HighRisk", "MedRisk", "LowRisk"]

transactions_merge["Overall_IC_Risk"] = np.select(conditions_ic, choices_ic, default="Unknown")

In [None]:
#Check the merged transaction dataframe for the new fields
transactions_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 37 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   TransactionDate     10000 non-null  object 
 1   TransactionID       10000 non-null  object 
 2   InitiatingCustomer  10000 non-null  object 
 3   AccountNumber       10000 non-null  object 
 4   Counterparty        10000 non-null  object 
 5   TranType            10000 non-null  object 
 6   Channel             10000 non-null  object 
 7   Currency            10000 non-null  object 
 8   Credit/Debit        10000 non-null  object 
 9   Amount              10000 non-null  float64
 10  Amount in LCY       10000 non-null  float64
 11  Product             10000 non-null  object 
 12  InitiatingGeo       10000 non-null  object 
 13  CounterpartyGeo     10000 non-null  object 
 14  BusinessCode_IC     10000 non-null  int64  
 15  CustTypeCode_IC     10000 non-null  object 
 16  PEP_F

In [None]:
#Check the spread of initiating customers Risk profile
transactions_merge["Overall_IC_Risk"].value_counts()

Unnamed: 0_level_0,count
Overall_IC_Risk,Unnamed: 1_level_1
LowRisk,9457
HighRisk,444
MedRisk,99


In [None]:
#Check the spread of transactions Risk profile
transactions_merge["Overall_Tranx_Risk"].value_counts()

Unnamed: 0_level_0,count
Overall_Tranx_Risk,Unnamed: 1_level_1
LowRisk,5730
HighRisk,2754
MedRisk,1516


In [None]:
# Apply the rule-based logic to the merged transaction file
# (the logic itself is explained in the accompanying document).
# Step 1 - aggregate per initiating customer, Overall_IC_Risk and
# Overall_Tranx_Risk: the list of transaction IDs and the total amount.
transactions_merge["TransactionID"] = transactions_merge["TransactionID"].astype(str)

group_keys = ["InitiatingCustomer", "Overall_IC_Risk", "Overall_Tranx_Risk"]
agg_df = (
    transactions_merge.groupby(group_keys)
    .agg({
        "TransactionID": lambda ids: ",".join(ids),  # comma-separated IDs of the group
        "Amount in LCY": "sum",                      # total transaction amount
    })
    .reset_index()
    .rename(columns={
        "TransactionID": "Transaction IDs",
        "Amount in LCY": "Total Amount",
    })
)

# Number of transactions in the group = number of commas in the ID list + 1.
agg_df["Total Tranx"] = agg_df["Transaction IDs"].str.count(",") + 1

print(agg_df.head())

  InitiatingCustomer Overall_IC_Risk Overall_Tranx_Risk  \
0           CUST0001        HighRisk           HighRisk   
1           CUST0001        HighRisk            LowRisk   
2           CUST0001        HighRisk            MedRisk   
3           CUST0002        HighRisk           HighRisk   
4           CUST0002        HighRisk            LowRisk   

                                     Transaction IDs  Total Amount  \
0  T01617,T01780,T01894,T01936,T02732,T03453,T044...      39255.59   
1   T01218,T01458,T01555,T04695,T04786,T05237,T05369     137990.55   
2                                      T02355,T09122       3877.35   
3  T00227,T00505,T01384,T03237,T05523,T06055,T066...      16041.11   
4  T01669,T02364,T03421,T03940,T04612,T05747,T070...     152152.73   

   Total Tranx  
0           11  
1            7  
2            2  
3            9  
4           13  


In [None]:
# Rule-based logic - Step 2 - flag the alerts based on thresholds.
# Start from one row per initiating customer with the summed amount per
# transaction risk level as columns (0 where a customer has no such transactions).
amount_by_risk = transactions_merge.groupby(
    ['InitiatingCustomer', 'Overall_Tranx_Risk']
)['Amount in LCY'].sum()
risk_summary = amount_by_risk.unstack(fill_value=0).reset_index()

# Make sure every risk level has a column, even if no transaction carries it.
for level in ('HighRisk', 'MedRisk', 'LowRisk'):
    if level not in risk_summary:
        risk_summary[level] = 0

# Total transaction amount per customer (T) across the three risk levels.
high_amount = risk_summary['HighRisk']
med_amount = risk_summary['MedRisk']
risk_summary['TotalAmount'] = high_amount + med_amount + risk_summary['LowRisk']

# Share of the total that is high risk (H/T) and medium risk (M/T).
risk_summary['H_ratio'] = risk_summary['HighRisk'].div(risk_summary['TotalAmount'])
risk_summary['M_ratio'] = risk_summary['MedRisk'].div(risk_summary['TotalAmount'])

# Step 4: Prepare a mapping from InitiatingCustomer to Alert value
def determine_alert(row, high_ratio=0.80, high_amount=1500, med_ratio=0.60, med_amount=1000):
    """Classify one customer's risk summary into an alert level.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row or a dict)
        Must provide 'H_ratio', 'HighRisk', 'M_ratio' and 'MedRisk'.
    high_ratio, high_amount : float
        Minimum high-risk share and minimum high-risk amount for a priority-1 alert.
    med_ratio, med_amount : float
        Minimum medium-risk share and minimum medium-risk amount for a priority-2 alert.

    Returns
    -------
    str
        'A-P1' (priority 1), 'A-P2' (priority 2) or 'N' (no alert). The
        priority-1 rule is checked first. NaN ratios never satisfy a
        threshold, so such rows yield 'N'.
    """
    if row['H_ratio'] >= high_ratio and row['HighRisk'] >= high_amount:
        return 'A-P1'
    if row['M_ratio'] >= med_ratio and row['MedRisk'] >= med_amount:
        return 'A-P2'
    return 'N'

risk_summary['Alert'] = risk_summary.apply(determine_alert, axis=1)

# Step 5: Merge the Alert info back into agg_df
agg_df = agg_df.merge(
    risk_summary[['InitiatingCustomer', 'Alert']],
    on='InitiatingCustomer',
    how='left'
)

In [None]:
# Inspect agg_df after the merge: the Alert column should be present and fully populated
agg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   InitiatingCustomer  1190 non-null   object 
 1   Overall_IC_Risk     1190 non-null   object 
 2   Overall_Tranx_Risk  1190 non-null   object 
 3   Transaction IDs     1190 non-null   object 
 4   Total Amount        1190 non-null   float64
 5   Total Tranx         1190 non-null   int64  
 6   Alert               1190 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 65.2+ KB


In [None]:
# Customers flagged 'A-P2' (the M_ratio / MedRisk rule) in the customer-level summary
risk_summary[risk_summary['Alert']=='A-P2']

Overall_Tranx_Risk,InitiatingCustomer,HighRisk,LowRisk,MedRisk,TotalAmount,H_ratio,M_ratio,Alert
10,CUST0011,4835.82,86999.37,157961.07,249796.26,0.019359,0.63236,A-P2
147,CUST0148,32848.61,63993.54,206865.94,303708.09,0.108158,0.681134,A-P2
162,CUST0163,26303.46,16461.66,103396.17,146161.29,0.179962,0.707411,A-P2
163,CUST0164,13747.41,19260.94,130419.9,163428.25,0.084119,0.798025,A-P2
181,CUST0182,336.15,63525.89,108810.41,172672.45,0.001947,0.630155,A-P2
286,CUST0287,22143.91,43007.52,100272.09,165423.52,0.133862,0.606154,A-P2
387,CUST0388,29959.36,27145.89,203083.18,260188.43,0.115145,0.780523,A-P2


In [None]:
# Customers flagged 'A-P1' (the H_ratio / HighRisk rule) in the customer-level summary
risk_summary[risk_summary['Alert']=='A-P1']

Overall_Tranx_Risk,InitiatingCustomer,HighRisk,LowRisk,MedRisk,TotalAmount,H_ratio,M_ratio,Alert
44,CUST0045,161501.72,32855.53,4322.41,198679.66,0.812875,0.021756,A-P1
79,CUST0080,128858.02,9789.65,13615.97,152263.64,0.846282,0.089424,A-P1
90,CUST0091,98082.21,4111.65,7909.0,110102.86,0.890823,0.071833,A-P1
337,CUST0338,179639.58,33447.37,7904.22,220991.17,0.812881,0.035767,A-P1
346,CUST0347,236090.97,19138.33,1364.0,256593.3,0.920098,0.005316,A-P1


In [None]:
# Counts here are per agg_df row, not per customer: the customer-level Alert was
# merged on InitiatingCustomer only, so it repeats on every row of that customer
agg_df['Alert'].value_counts()

Unnamed: 0_level_0,count
Alert,Unnamed: 1_level_1
N,1154
A-P2,21
A-P1,15


In [None]:
#Rule based logic - Step 3 - Identify the transactions related to the alerts
# Keep only the aggregated rows that carry an A-P1 or A-P2 alert
filtered_agg_df = agg_df.loc[agg_df['Alert'].isin(['A-P1', 'A-P2'])].copy()

# Turn the comma-separated ID string into a list so it can be exploded
filtered_agg_df['Transaction IDs'] = filtered_agg_df['Transaction IDs'].str.split(',')

# One row per transaction ID, whitespace-trimmed, restricted to the columns
# needed downstream
flat_trx_df = (
    filtered_agg_df
    .explode('Transaction IDs')
    .reset_index(drop=True)
    .assign(**{'Transaction IDs': lambda frame: frame['Transaction IDs'].str.strip()})
    [['InitiatingCustomer', 'Transaction IDs', 'Alert']]
)

In [None]:
# Check the size and completeness of the exploded alert-transaction table
flat_trx_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246 entries, 0 to 245
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   InitiatingCustomer  246 non-null    object
 1   Transaction IDs     246 non-null    object
 2   Alert               246 non-null    object
dtypes: object(3)
memory usage: 5.9+ KB


In [None]:
# Number of individual transactions attached to each alert type
flat_trx_df["Alert"].value_counts()

Unnamed: 0_level_0,count
Alert,Unnamed: 1_level_1
A-P2,156
A-P1,90


In [None]:
# Persist the alert-linked transactions. With to_csv's defaults the integer row
# index is also written, as an unnamed first column.
flat_trx_df.to_csv("transaction_alerts.csv")

# 3. Build & train a Variational Auto-Encoder (VAE) model

In [None]:
#Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import layers, Model

In [None]:
# Model input columns, selected by name instead of by position so that adding or
# reordering columns in transactions_merge cannot silently change the features.
# These are the columns that positions [10,20,24,25,27,30,32,34] resolved to.
VAE_FEATURE_COLUMNS = [
    "Amount in LCY",
    "PEP_Flag_CP",
    "CP_BusinessRisk",
    "ChannelRisk",
    "CP_GeoRisk",
    "ProductRisk",
    "CurrencyRisk",
    "WL_CP",
]
transactions_features = transactions_merge[VAE_FEATURE_COLUMNS]

In [None]:
# Verify the selected feature columns, their dtypes and that none contain nulls
transactions_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Amount in LCY    10000 non-null  float64
 1   PEP_Flag_CP      10000 non-null  object 
 2   CP_BusinessRisk  10000 non-null  int64  
 3   ChannelRisk      10000 non-null  int64  
 4   CP_GeoRisk       10000 non-null  int64  
 5   ProductRisk      10000 non-null  int64  
 6   CurrencyRisk     10000 non-null  int64  
 7   WL_CP            10000 non-null  object 
dtypes: float64(1), int64(5), object(2)
memory usage: 625.1+ KB


In [None]:
# Row positions travel through the split alongside the features so that any
# test-set row can later be traced back to its row in the full frame
all_indices = np.arange(transactions_features.shape[0])

X_train_raw, X_test_raw, idx_train, idx_test = train_test_split(
    transactions_features,
    all_indices,
    random_state=55,
    test_size=0.2,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import keras

# Step 1: Define feature groups
numeric_features = [
    "Amount in LCY",
    "CP_BusinessRisk",
    "ChannelRisk",
    "CP_GeoRisk",
    "ProductRisk",
    "CurrencyRisk",
]
binary_features = ["PEP_Flag_CP", "WL_CP"]

# Step 2: Define transformers
# Numeric columns are min-max scaled; the flag columns are one-hot encoded, and
# drop='if_binary' keeps a single indicator column for any two-valued feature.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), numeric_features),
        ("bin", OneHotEncoder(drop='if_binary', dtype=int), binary_features),
    ]
)

# Step 3: Set up pipeline
# The pipeline is fitted on the training split only and then applied unchanged
# to the test split.
pipeline = Pipeline(steps=[("preprocessor", preprocessor)])
X_train = pipeline.fit_transform(X_train_raw)
X_test = pipeline.transform(X_test_raw)

In [None]:
@keras.saving.register_keras_serializable()
class VAE(Model):
    """Fully connected variational auto-encoder that attaches its own loss in ``call``.

    The encoder maps an ``input_dim``-wide vector through Dense/ReLU layers
    (512 down to 8 units) to ``2 * latent_dim`` values, which are split into
    ``z_mean`` and ``z_log_var``. The decoder mirrors the encoder and ends in a
    sigmoid layer, so reconstructions lie in (0, 1), which suits inputs scaled
    to [0, 1].

    Args:
        input_dim: Number of features in each input row.
        latent_dim: Size of the latent vector ``z``.
        **kwargs: Forwarded to ``keras.Model``.
    """

    def __init__(self, input_dim, latent_dim, **kwargs):
        super().__init__(**kwargs)
        # Stored so get_config() can rebuild the model when it is reloaded
        self.input_dim = input_dim
        self.latent_dim = latent_dim

        self.encoder = tf.keras.Sequential([
            layers.Input(shape=(input_dim,)),
            layers.Dense(512, activation='relu'),
            layers.Dense(256, activation='relu'),
            layers.Dense(128, activation='relu'),
            layers.Dense(64, activation='relu'),
            layers.Dense(32, activation='relu'),
            layers.Dense(16, activation='relu'),
            layers.Dense(8, activation='relu'),
            layers.Dense(latent_dim * 2),  # z_mean and z_log_var
        ])

        self.decoder = tf.keras.Sequential([
            layers.Input(shape=(latent_dim,)),
            layers.Dense(8, activation='relu'),
            layers.Dense(16, activation='relu'),
            layers.Dense(32, activation='relu'),
            layers.Dense(64, activation='relu'),
            layers.Dense(128, activation='relu'),
            layers.Dense(256, activation='relu'),
            layers.Dense(512, activation='relu'),
            layers.Dense(input_dim, activation='sigmoid'),
        ])

    def sample(self, z_mean, z_log_var):
        """Draw ``z = z_mean + exp(0.5 * z_log_var) * eps`` with ``eps ~ N(0, 1)``."""
        eps = tf.random.normal(shape=tf.shape(z_mean))
        return z_mean + tf.exp(0.5 * z_log_var) * eps

    def call(self, inputs, training=None):
        """Encode, decode and register the VAE loss; return the reconstruction.

        Args:
            inputs: Batch of feature rows, ``input_dim`` values per row.
            training: Truthy while fitting. A latent vector is sampled only in
                that case; otherwise the posterior mean is decoded so that
                ``predict()`` / ``evaluate()`` give the same reconstruction for
                the same input on every call.

        Returns:
            The reconstructed batch, same width as ``inputs``.
        """
        # Encode
        z_params = self.encoder(inputs)
        z_mean, z_log_var = tf.split(z_params, num_or_size_splits=2, axis=1)
        # Sampling noise at inference would make reconstruction errors (and any
        # anomaly flags derived from them) vary from call to call.
        z = self.sample(z_mean, z_log_var) if training else z_mean

        # Decode
        reconstructed = self.decoder(z)

        # Compute VAE Loss: per-row mean squared reconstruction error plus the
        # KL term, which is averaged (not summed) over the latent dimensions
        reconstruction_loss = tf.reduce_mean(tf.square(inputs - reconstructed), axis=1)
        kl_loss = -0.5 * tf.reduce_mean(
            1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=1
        )
        total_loss = tf.reduce_mean(reconstruction_loss + kl_loss)
        self.add_loss(total_loss)

        return reconstructed

    def get_config(self):
        """Return the base config extended with the two constructor arguments."""
        config = super().get_config()
        config.update({
            "input_dim": self.input_dim,
            "latent_dim": self.latent_dim
        })
        return config

In [None]:
# NOTE(review): within the visible notebook `vae` is first assigned in the
# following cell, so on a fresh kernel (Restart & Run All) this call would raise
# NameError unless `vae` is defined earlier — this cell likely belongs after the
# training cell.
vae.summary()

In [None]:
#Compile and train
# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation,
# batch shuffling and latent sampling repeat from run to run.
tf.keras.utils.set_random_seed(55)

vae = VAE(input_dim=X_train.shape[1], latent_dim=6)
# No `loss` is passed to compile(): the model registers its loss itself via add_loss()
vae.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005))
# Keep the History object so the loss curves can be inspected or plotted afterwards
history = vae.fit(X_train, X_train, epochs=50, batch_size=32, validation_data=(X_test, X_test))

Epoch 1/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - loss: 0.0821 - val_loss: 0.0427
Epoch 2/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - loss: 0.0415 - val_loss: 0.0423
Epoch 3/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0407 - val_loss: 0.0419
Epoch 4/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0420 - val_loss: 0.0419
Epoch 5/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0398 - val_loss: 0.0419
Epoch 6/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0414 - val_loss: 0.0421
Epoch 7/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0407 - val_loss: 0.0419
Epoch 8/50
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0401 - val_loss: 0.0419
Epoch 9/50
[1m250/250[0m [32m━━━━━━

<keras.src.callbacks.history.History at 0x7a7ee6932b90>

In [None]:
# Percentile of the per-row reconstruction error above which a row is flagged.
# At 87.7 roughly the top 12.3% of rows are flagged.
ANOMALY_PERCENTILE = 87.7

reconstructed = vae.predict(X_test)
# Per-row mean squared error between the preprocessed input and its reconstruction
reconstruction_error = tf.reduce_mean(tf.square(X_test - reconstructed), axis=1).numpy()
# NOTE(review): the threshold is a percentile of the same rows it is applied to,
# so a fixed share of rows is flagged however well the model reconstructs them.
threshold = np.percentile(reconstruction_error, ANOMALY_PERCENTILE)

anomalies = reconstruction_error > threshold
print(f"Anomalies detected: {np.sum(anomalies)} / {len(X_test)}")

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Anomalies detected: 246 / 2000


In [None]:
# Row positions within X_test whose reconstruction error exceeded the threshold
(anomaly_indices,) = np.where(anomalies)

# The preprocessed feature vectors of those flagged rows
anomalous_transactions = X_test[anomaly_indices]


In [None]:
# These positions are relative to X_test, not to the full transaction table
print("Anomaly found at row(s):", anomaly_indices)

Anomaly found at row(s): [   2    6   17   18   27   35   40   47   49   50   63   65   67   71
   72   82   86   87  101  125  128  149  151  155  156  162  163  171
  185  191  192  196  204  224  238  242  244  249  254  264  268  287
  301  306  309  323  325  327  336  367  375  377  387  390  392  403
  412  417  418  419  422  434  435  439  440  445  450  462  464  465
  485  486  512  529  539  547  556  565  569  578  579  581  595  604
  617  625  634  641  651  663  667  682  686  692  697  716  717  727
  739  749  768  785  795  802  803  807  809  811  822  841  861  862
  876  886  891  893  909  921  931  965  988  992  993  998 1000 1008
 1009 1030 1035 1040 1042 1052 1070 1073 1096 1101 1106 1111 1120 1124
 1125 1137 1147 1165 1169 1171 1178 1186 1187 1188 1199 1201 1203 1215
 1216 1218 1227 1233 1236 1239 1244 1247 1251 1252 1256 1279 1296 1297
 1301 1302 1319 1323 1328 1334 1347 1348 1352 1369 1372 1373 1374 1381
 1389 1398 1400 1411 1412 1417 1424 1426 1428 1439 1

In [None]:
# Step 3: Translate test-set positions into row positions of the full dataset
global_anomaly_indices = idx_test[anomaly_indices]

# Step 4: Fetch those rows, with all their columns, from transactions_merge
# (positional lookup: idx_test holds row positions of transactions_features,
# which was sliced column-wise from transactions_merge and so shares its row order)
anomaly_transactions_df = transactions_merge.iloc[global_anomaly_indices]

# Optional: View it with every column visible, without changing the display
# option for the rest of the notebook
with pd.option_context('display.max_columns', None):
    display(anomaly_transactions_df)

Unnamed: 0,TransactionDate,TransactionID,InitiatingCustomer,AccountNumber,Counterparty,TranType,Channel,Currency,Credit/Debit,Amount,Amount in LCY,Product,InitiatingGeo,CounterpartyGeo,BusinessCode_IC,CustTypeCode_IC,PEP_Flag_IC,ICName,BusinessCode_CP,CustTypeCode_CP,PEP_Flag_CP,CounterpartyName,ProductCode,IC_BusinessRisk,CP_BusinessRisk,ChannelRisk,IC_GeoRisk,CP_GeoRisk,IC_CustTypeRisk,CP_CustTypeRisk,ProductRisk,TranTypeRisk,CurrencyRisk,WL_IC,WL_CP,Overall_Tranx_Risk,Overall_IC_Risk
2751,2025-07-29,T02752,CUST0144,ACC00144,CUST0070,FTDO,MBAN,VEF,Cr,2606.47,208.52,PLON,ARE,ARE,1020,MBNK,N,Christopher Rubio Banking Ltd,1008,SGOV,N,William Baker State Gov Co,OVDF,1,1,5,1,1,1,1,1,5,10,N,N,HighRisk,LowRisk
4798,2025-07-20,T04799,CUST0393,ACC00393,CUST0205,FTDO,MBAN,USD,Dr,502.92,502.92,BUSL,ARE,ARE,1015,NBFC,N,Brian Smith Non-Banking Fin Co,1013,PVTL,N,Erin Warner Pvt Ltd Company,PLON,1,1,5,1,1,1,1,10,5,1,Y,N,HighRisk,HighRisk
1284,2025-07-24,T01285,CUST0105,ACC00562,CUST0065,MPAY,IBAN,USD,Cr,849.94,849.94,GLON,CHN,USD,1002,NBFC,N,Richard Henson Non-Banking Fin Co,1022,INDM,Y,Jessica Callahan,BILLS,1,1,10,1,1,1,1,1,1,1,N,N,HighRisk,LowRisk
1990,2025-07-20,T01991,CUST0305,ACC00808,CUST0183,FTDO,CUST,VEF,Dr,1186.68,94.93,BUSL,BRA,BRA,1019,GOVT,N,Maria Parker Govt Co,1019,PART,N,Angela Vaughn and Partners,SAVS,1,1,1,1,1,1,1,10,5,10,N,N,HighRisk,LowRisk
4449,2025-07-21,T04450,CUST0203,ACC00203,CUST0273,FEEC,IBAN,USD,Cr,15073.67,15073.67,BUSL,COL,CAN,1005,BFIS,N,Elizabeth Perkins Banking Ltd,1024,MBNK,N,Jonathan Lawrence Banking Ltd,ELON,1,1,10,1,1,1,1,10,1,1,N,N,HighRisk,LowRisk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3185,2025-07-24,T03186,CUST0132,ACC00698,CUST0226,BPAY,IBAN,USD,Cr,12471.58,12471.58,BUSL,ARE,GBR,1006,LLPF,N,Dustin Jordan Limited LP,1011,MBNK,N,Benjamin Smith Banking Ltd,SAVS,1,1,10,1,1,1,1,10,1,1,N,N,HighRisk,LowRisk
4464,2025-07-28,T04465,CUST0258,ACC00992,CUST0288,BPAY,CHEQ,INR,Dr,22486.55,269.84,PLON,CHN,HKG,1025,ASSO,N,Kathleen Moran & Association,1013,PUBL,Y,Carolyn Miller Pub Ltd Company,CHEK,1,1,1,1,1,1,1,1,1,1,N,N,HighRisk,LowRisk
9705,2025-07-30,T09706,CUST0001,ACC00001,CUST0380,FTDO,CUST,USD,Dr,7266.25,7266.25,BILLS,NPL,NPL,1014,CLSO,N,Allison Hill Club,1019,BFIS,N,Megan Nelson Banking Ltd,BUSL,10,1,1,10,10,10,1,1,5,1,N,N,HighRisk,HighRisk
2261,2025-07-22,T02262,CUST0115,ACC00424,CUST0117,FEEC,MBAN,USD,Dr,58381.93,58381.93,BUSL,AUS,IND,1009,FCOM,N,Garrett Lin Foreign Co,1007,SGOV,N,William Herrera State Gov Co,PLON,1,1,5,1,1,1,1,10,1,1,N,N,HighRisk,LowRisk


# 4. Compare the rule-based alerts with the AI-based (VAE) model results (limited comparison)

In [None]:
# Step 1: Extract transaction IDs from both sources
# NOTE(review): the VAE IDs come only from the held-out test rows, whereas the
# rule-based IDs appear to be drawn from the whole transaction set (via agg_df)
# — confirm; if so the two populations differ and the overlap is understated.
vae_anomaly_ids = anomaly_transactions_df["TransactionID"].astype(str).unique()
rule_based_ids = flat_trx_df["Transaction IDs"].astype(str).unique()

# Step 2: Find matches (overlap) and mismatches
matched_ids = np.intersect1d(vae_anomaly_ids, rule_based_ids)
vae_only_ids = np.setdiff1d(vae_anomaly_ids, rule_based_ids)
rule_only_ids = np.setdiff1d(rule_based_ids, vae_anomaly_ids)

# Step 3: Print results
print(f"Total VAE anomalies: {len(vae_anomaly_ids)}")
print(f"Total Rule-based alerts: {len(rule_based_ids)}")
print(f"Matched (VAE ∩ Rule-based): {len(matched_ids)}")
# Report 0% instead of raising ZeroDivisionError when the VAE flagged nothing
match_pct = len(matched_ids) / len(vae_anomaly_ids) * 100 if len(vae_anomaly_ids) else 0.0
print(f"Match % from VAE anomalies: {match_pct:.2f}%")

# Step 4 (Optional): Get full details of matches and mismatches
matched_df = anomaly_transactions_df[anomaly_transactions_df["TransactionID"].isin(matched_ids)]
vae_only_df = anomaly_transactions_df[anomaly_transactions_df["TransactionID"].isin(vae_only_ids)]
rule_only_tranx = flat_trx_df[flat_trx_df["Transaction IDs"].isin(rule_only_ids)]
rule_only_tranx = rule_only_tranx.rename(columns={'Transaction IDs': 'TransactionID'})
# NOTE(review): both frames carry 'InitiatingCustomer' and the join key is
# TransactionID only, so the result holds InitiatingCustomer_x / InitiatingCustomer_y.
rule_only_df = rule_only_tranx.merge(transactions_merge, on='TransactionID', how='left')


Total VAE anomalies: 246
Total Rule-based alerts: 246
Matched (VAE ∩ Rule-based): 6
Match % from VAE anomalies: 2.44%


In [None]:
#Save the model for use on new/unseen data in the other notebook
# NOTE(review): only the Keras model is written here; the fitted preprocessing
# `pipeline` is not persisted in this cell — confirm it is saved elsewhere, since
# new data must be transformed identically before scoring.
vae.save("vae.keras")