In [4]:
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta

fake = Faker()

def generate_isin():
    return ''.join(random.choices('ABCDEFGHIJKLMNOPQRSTUVWXYZ', k=2)) + ''.join(random.choices('0123456789', k=10))

def generate_broker_data_without_common_id(n=5000):
    sides = ['Buy', 'Sell']
    custodians = ['State Street', 'BNY Mellon', 'Northern Trust', 'HDFC Custodian']
    broker_ids = ['BRK001', 'BRK002', 'BRK003', 'BRK004']
    
    data = []
    
    for i in range(n):
        trade_id = f'BKR{i:06d}'
        isin = generate_isin()
        quantity = random.randint(100, 10000)
        price = round(random.uniform(50.0, 1000.0), 2)
        trade_date = fake.date_between(start_date='-180d', end_date='today')
        settlement_date = trade_date + timedelta(days=random.randint(1, 3))
        side = random.choice(sides)
        counter_party = random.choice(custodians)
        broker_id = random.choice(broker_ids)
        
        data.append({
            'trade_id': trade_id,
            'isin': isin,
            'quantity': quantity,
            'price': price,
            'trade_date': trade_date.strftime('%Y-%m-%d'),
            'settlement_date': settlement_date.strftime('%Y-%m-%d'),
            'side': side,
            'counter_party': counter_party,
            'broker_id': broker_id
        })
    
    return pd.DataFrame(data)

# Generate and save broker data without common_trade_id
broker_df = generate_broker_data_without_common_id(5000)
broker_df.to_csv("broker_data_raw.csv", index=False)
print("✅ Broker data generated without common_trade_id.")


✅ Broker data generated without common_trade_id.


In [5]:
def generate_common_trade_id(i):
    return random.choice('ABCDEFGHIJKLMNOPQRSTUVWXYZ') + \
           f"{random.randint(1000, 9999)}" + \
           random.choice('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

def add_common_trade_id_to_file(file_path, output_path):
    df = pd.read_csv(file_path)
    df.insert(0, 'common_trade_id', [generate_common_trade_id(i) for i in range(len(df))])
    df.to_csv(output_path, index=False)
    print("✅ common_trade_id added to broker data.")
    return df

# Add common_trade_id to broker file
broker_with_common_id_df = add_common_trade_id_to_file("broker_data_raw.csv", "broker_data_with_common_id.csv")
print(broker_with_common_id_df.head())


✅ common_trade_id added to broker data.
  common_trade_id   trade_id          isin  quantity   price  trade_date  \
0          F6046V  BKR000000  JS2665624158       267  312.07  2025-02-24   
1          S8302U  BKR000001  LY0049801248      7467  322.06  2025-05-28   
2          W9017L  BKR000002  QF5297575190      7104  801.17  2025-03-04   
3          E9667Q  BKR000003  ZQ1716858536      3459  748.74  2025-01-15   
4          P7895A  BKR000004  CX4878820103      5060  863.57  2025-02-20   

  settlement_date  side   counter_party broker_id  
0      2025-02-25  Sell    State Street    BRK002  
1      2025-05-31   Buy  Northern Trust    BRK003  
2      2025-03-06  Sell  HDFC Custodian    BRK004  
3      2025-01-17  Sell  HDFC Custodian    BRK004  
4      2025-02-21  Sell      BNY Mellon    BRK004  


In [6]:
def generate_custodian_data(broker_df):
    custodian_data = []

    for _, row in broker_df.iterrows():
        match_type = random.choices(
            ['matched', 'partial', 'unmatched'], weights=[0.7, 0.2, 0.1])[0]

        custodian_trade_id = f'CST{random.randint(100000,999999)}'

        # Start from broker data
        custodian_row = row.copy()
        custodian_row['trade_id'] = custodian_trade_id
        custodian_row['source'] = 'Custodian'

        # Apply differences based on match type
        if match_type == 'partial':
            custodian_row['quantity'] = int(row['quantity'] * random.uniform(0.7, 0.95))
        elif match_type == 'unmatched':
            custodian_row['isin'] = generate_isin()
            custodian_row['quantity'] = random.randint(100, 10000)
            custodian_row['price'] = round(random.uniform(50.0, 1000.0), 2)
            custodian_row['side'] = 'Buy' if row['side'] == 'Sell' else 'Sell'
            custodian_row['counter_party'] = fake.company()

        custodian_data.append(custodian_row)

    custodian_df = pd.DataFrame(custodian_data)
    return custodian_df

# Generate custodian data from broker data with common_trade_id
custodian_df = generate_custodian_data(broker_with_common_id_df)
custodian_df.to_csv("custodian_data.csv", index=False)
print("✅ Custodian data generated.")
print(custodian_df.head())


✅ Custodian data generated.
  common_trade_id   trade_id          isin  quantity   price  trade_date  \
0          F6046V  CST920701  JS2665624158       267  312.07  2025-02-24   
1          S8302U  CST182608  LY0049801248      7467  322.06  2025-05-28   
2          W9017L  CST403606  QF5297575190      7104  801.17  2025-03-04   
3          E9667Q  CST343863  ZQ1716858536      3459  748.74  2025-01-15   
4          P7895A  CST957612  CX4878820103      3646  863.57  2025-02-20   

  settlement_date  side   counter_party broker_id     source  
0      2025-02-25  Sell    State Street    BRK002  Custodian  
1      2025-05-31   Buy  Northern Trust    BRK003  Custodian  
2      2025-03-06  Sell  HDFC Custodian    BRK004  Custodian  
3      2025-01-17  Sell  HDFC Custodian    BRK004  Custodian  
4      2025-02-21  Sell      BNY Mellon    BRK004  Custodian  
