In [2]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

fake = Faker()

# Configuration
num_records = 1_000_000      # number of trade records to generate
bad_data_percentage = 0.05   # probability that a record is passed through corrupt_data
random_seed = 42

# Seed every random source this notebook draws from -- the stdlib `random`
# module, NumPy's legacy global generator, and Faker's shared generator --
# so that Restart & Run All reproduces the same random draws.
# Note: trade dates are computed as offsets from datetime.now(), so the
# calendar values still shift with the day the notebook is run.
random.seed(random_seed)
np.random.seed(random_seed)
Faker.seed(random_seed)

In [3]:
# Fixed vocabularies that the categorical trade fields are drawn from.
sides = ["Buy", "Sell"]
statuses = ["Matched", "Unmatched", "Partially Matched"]

# 500 ISIN-like identifiers: the prefix "IN" followed by a random 10-digit integer.
isins = [
    "IN" + str(fake.random_int(1_000_000_000, 9_999_999_999))
    for _ in range(500)
]

# 100 company names produced by Faker (duplicates are not filtered out).
counterparties = [fake.company() for _ in range(100)]

# Broker_1 .. Broker_20 and Custodian_1 .. Custodian_20.
brokers = ["Broker_%d" % n for n in range(1, 21)]
custodians = ["Custodian_%d" % n for n in range(1, 21)]

In [6]:
def create_trade_record(i):
    """Generate one synthetic trade record as a dict.

    Args:
        i: Sequence number; rendered zero-padded to width 7 in the
            "Trade ID" field (e.g. 12 -> "TID0000012").

    Returns:
        dict with the keys "Trade ID", "Trade Date", "ISIN", "Quantity",
        "Price", "Side", "Counterparty", "Broker", "Custodian",
        "Settlement Date" and "Status", in that order. Dates are
        "YYYY-MM-DD" strings; the trade date lies 0-365 days before the
        current time and the settlement date 1-3 days after the trade date.
    """
    date_format = "%Y-%m-%d"
    trade_dt = datetime.now() - timedelta(days=random.randint(0, 365))

    # Fields are filled in one at a time, in the original order, so the
    # sequence of random draws and the resulting key order stay the same.
    trade = {"Trade ID": f"TID{i:07}"}
    trade["Trade Date"] = trade_dt.strftime(date_format)
    trade["ISIN"] = random.choice(isins)
    trade["Quantity"] = np.random.randint(1, 10000)  # upper bound is exclusive: 1..9999
    trade["Price"] = round(random.uniform(10.0, 1000.0), 2)
    trade["Side"] = random.choice(sides)
    trade["Counterparty"] = random.choice(counterparties)
    trade["Broker"] = random.choice(brokers)
    trade["Custodian"] = random.choice(custodians)

    settlement_lag = timedelta(days=random.choice([1, 2, 3]))
    trade["Settlement Date"] = (trade_dt + settlement_lag).strftime(date_format)
    trade["Status"] = random.choice(statuses)

    return trade

In [7]:
def corrupt_data(trade):
    """Overwrite a random subset of a trade record's fields with bad values.

    Between one and three fields (never more than the record contains) are
    chosen at random. Each chosen field is replaced with either ``None`` or
    the string ``"###INVALID###"``.

    Args:
        trade: Mutable mapping of field name to value. It is modified in place.

    Returns:
        The same ``trade`` object after corruption. An empty record is
        returned unchanged.
    """
    fields = list(trade.keys())
    if not fields:
        # Nothing to corrupt; random.sample would raise on an empty population.
        return trade

    # Clamp the drawn count to the number of fields: random.sample raises
    # ValueError when asked for more items than the population holds.
    num_to_corrupt = min(random.randint(1, 3), len(fields))
    fields_to_corrupt = random.sample(fields, num_to_corrupt)

    for field in fields_to_corrupt:
        corruption_type = random.choice(["null", "invalid"])

        if corruption_type == "null":
            trade[field] = None
        elif corruption_type == "invalid":
            trade[field] = "###INVALID###"

    return trade

In [8]:
# Accumulate plain dicts so the DataFrame can be built in a single call.
trade_data = []
for i in range(num_records):
    record = create_trade_record(i)

    # One uniform draw per record decides whether it gets corrupted.
    is_bad = random.random() < bad_data_percentage
    record = corrupt_data(record) if is_bad else record

    trade_data.append(record)

In [9]:
# Build the DataFrame in one call from the accumulated list of record dicts.
df = pd.DataFrame(trade_data)

In [10]:
# Preview the first 10 rows of the generated data.
df.head(10)

Unnamed: 0,Trade ID,Trade Date,ISIN,Quantity,Price,Side,Counterparty,Broker,Custodian,Settlement Date,Status
0,TID0000000,2025-03-14,IN7980409158,5871,897.9,Sell,Cain PLC,Broker_9,Custodian_1,2025-03-15,Unmatched
1,TID0000001,2024-12-26,IN3151462597,6088,829.82,Sell,"Bell, Thornton and Martinez",Broker_3,Custodian_2,2024-12-28,Unmatched
2,TID0000002,2025-05-22,IN5685554172,6170,329.32,Sell,"Fowler, Harper and Boone",Broker_12,Custodian_19,2025-05-25,Matched
3,TID0000003,2024-12-18,IN9260049363,186,36.15,Buy,Morse Group,Broker_1,Custodian_10,2024-12-21,Unmatched
4,TID0000004,2025-01-27,IN7107327976,6645,376.87,Buy,Leach PLC,Broker_13,Custodian_7,2025-01-28,Partially Matched
5,TID0000005,2025-04-26,IN9720929919,6592,168.37,Sell,Best-Hawkins,Broker_5,Custodian_18,2025-04-29,Matched
6,TID0000006,2025-01-02,IN4151525322,5286,621.49,Sell,Smith Group,Broker_9,Custodian_9,2025-01-05,Unmatched
7,TID0000007,2025-01-11,IN3411789667,5060,551.05,Sell,Johnson-Smith,Broker_3,Custodian_5,2025-01-13,Matched
8,TID0000008,2025-02-14,IN9264646233,2035,355.98,Buy,Carrillo-Pruitt,Broker_6,Custodian_13,2025-02-16,Partially Matched
9,TID0000009,2024-12-16,IN1861768195,1686,648.46,Buy,Hernandez PLC,Broker_19,Custodian_18,2024-12-18,Unmatched


In [11]:
# Write the dataset to a CSV file (path is relative to the working directory);
# index=False leaves the DataFrame's row index out of the file.
df.to_csv("synthetic_trade_data.csv", index=False)