Generating seed data for the model to train

In [1]:
import pandas as pd
import random
from faker import Faker
from datetime import date

In [2]:
fake = Faker()
today = date.today()

In [3]:
companies_df = pd.read_csv("company_details.csv")
company_ids = companies_df['company_id'].tolist()

In [4]:
def generate_settlement_instructions(n=100):
    records = []
    used_security_ids = set()
    
    for i in range(n):
        sender = random.choice(company_ids)
        receiver = random.choice([c for c in company_ids if c != sender])

        # Ensure unique settlement_id and security_id
        settlement_id = f"STL{random.randint(100000, 999999)}"
        security_id = fake.unique.bothify(text='SEC########')

        record = {
            "settlement_id": settlement_id,
            "sender_id": sender,
            "receiver_id": receiver,
            "security_id": security_id,
            "settlement_date": today
        }
        records.append(record)
    
    return pd.DataFrame(records)

settlement_df = generate_settlement_instructions(100)


In [5]:
settlement_df.head(10)

Unnamed: 0,settlement_id,sender_id,receiver_id,security_id,settlement_date
0,STL635737,gsch9157,msly3284,SEC83476316,2025-06-25
1,STL116785,bnym4392,jpmc7624,SEC32466378,2025-06-25
2,STL967457,msly3284,bnym4392,SEC92526768,2025-06-25
3,STL876744,bnpp7891,jpmc7624,SEC16236506,2025-06-25
4,STL567399,msly3284,bnym4392,SEC11175111,2025-06-25
5,STL188171,msly3284,bnpp7891,SEC06202888,2025-06-25
6,STL782201,gsch9157,jpmc7624,SEC07530448,2025-06-25
7,STL723671,gsch9157,bnym4392,SEC23978461,2025-06-25
8,STL697030,msly3284,jpmc7624,SEC85416912,2025-06-25
9,STL575802,msly3284,gsch9157,SEC62299738,2025-06-25


In [6]:
settlement_df.to_csv("settlement_instructions.csv", index=False)


CTGAN Model training based on the seed data

need to setup the sdv in bash or in the lap


In [7]:
import sdv
print(sdv.__version__)


1.3.0


In [12]:
import sdv.single_table 
print(dir(sdv.single_table))

['CTGANSynthesizer', 'CopulaGANSynthesizer', 'GaussianCopulaSynthesizer', 'TVAESynthesizer', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', 'base', 'copulagan', 'copulas', 'ctgan', 'errors', 'utils']


In [18]:
from sdv.single_table import CTGANSynthesizer  # ✅ Correct for SDV ≥ 1.0


In [21]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

# Create metadata from your full dataframe (including 'settlement_date')
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=settlement_df)

# Initialize CTGAN with metadata
model = CTGANSynthesizer(metadata)

# Train the model
model.fit(settlement_df)

# Generate synthetic data
synthetic_data = model.sample(1000)


In [23]:
synthetic_data.head(20)

Unnamed: 0,settlement_id,sender_id,receiver_id,security_id,settlement_date
0,STL635737,bnym4392,msly3284,SEC96594615,2025-06-25
1,STL744166,msly3284,bnym4392,SEC81696933,2025-06-25
2,STL829327,gsch9157,bnym4392,SEC39713534,2025-06-25
3,STL276191,gsch9157,gsch9157,SEC21362411,2025-06-25
4,STL479544,bnym4392,bnym4392,SEC80224851,2025-06-25
5,STL886064,bnpp7891,bnym4392,SEC78838772,2025-06-25
6,STL886064,gsch9157,bnpp7891,SEC23978461,2025-06-25
7,STL869533,gsch9157,gsch9157,SEC90499683,2025-06-25
8,STL656658,bnym4392,bnym4392,SEC53826964,2025-06-25
9,STL518951,gsch9157,bnpp7891,SEC80224851,2025-06-25


In [24]:
synthetic_data.to_csv("synthetic_settlement_data.csv", index=False)


trade table

In [36]:
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta

fake = Faker()

# 🔹 Load settlement instruction data
settlement_df = pd.read_csv("settlement_instructions.csv")
settlement_df['settlement_date'] = pd.to_datetime(settlement_df['settlement_date'])

# 🔹 Load company details
company_details_df = pd.read_csv("company_details.csv")  # Must have at least: company_id, company_name
all_company_ids = company_details_df['company_id'].tolist()

# 🔹 Create mapping for company_id → company_name
company_name_map = dict(zip(
    company_details_df['company_id'],
    company_details_df.get('company_name', [fake.company() for _ in range(len(company_details_df))])
))

# ✅ Trade generation function
def generate_trades_from_settlements(settlement_df, trades_per_settlement=1):
    trade_records = []
    used_trade_ids = set()

    for _, row in settlement_df.iterrows():
        quantity = random.randint(100, 1000)
        price = round(random.uniform(50, 500), 2)
        amount = round(quantity * price, 2)

        maker = f"MKR{random.randint(10000, 99999)}"
        checker = f"CKR{random.randint(10000, 99999)}"
        supervisor = f"SPR{random.randint(10000, 99999)}"

        for _ in range(trades_per_settlement):
            # Ensure unique trade_id
            while True:
                trade_id = f"TRD{random.randint(10000, 99999)}"
                if trade_id not in used_trade_ids:
                    used_trade_ids.add(trade_id)
                    break

            # ✅ Pick company_id from full company list
            company_id = random.choice(all_company_ids)

            # ✅ Create timestamp within last 3 days
            created_at = datetime.now() - timedelta(days=random.randint(0, 2), seconds=random.randint(0, 86400))

            trade_records.append({
                'trade_id': trade_id,
                'settlement_id': row['settlement_id'],
                'maker_id': maker,
                'checker_id': checker,
                'supervisor_id': supervisor,
                'quantity': quantity,
                'price': price,
                'amount': amount,
                'type': random.choice(['long', 'short']),
                'status': 'pending',
                'company_id': company_id,
                'created_at': created_at
            })

    return pd.DataFrame(trade_records)

# 🔹 Generate trades (change number per settlement if needed)
trade_df = generate_trades_from_settlements(settlement_df, trades_per_settlement=1)

# 🔹 Add company name from mapping
trade_df['company_name'] = trade_df['company_id'].map(company_name_map)




In [37]:
# Show first 10 trades
trade_df.head(10)

Unnamed: 0,trade_id,settlement_id,maker_id,checker_id,supervisor_id,quantity,price,amount,type,status,company_id,created_at,company_name
0,TRD25269,STL635737,MKR56529,CKR29058,SPR88347,767,321.46,246559.82,long,pending,gsch9157,2025-06-23 00:52:50.028799,Goldman Scahs
1,TRD28907,STL116785,MKR38144,CKR59944,SPR84033,291,308.7,89831.7,short,pending,msly3284,2025-06-23 20:00:42.028799,Morgan Stanely
2,TRD87356,STL967457,MKR77232,CKR85561,SPR20150,285,262.03,74678.55,long,pending,bnpp7891,2025-06-25 05:12:39.028799,BNP Paribas
3,TRD90371,STL876744,MKR31205,CKR11324,SPR73497,582,247.97,144318.54,long,pending,msly3284,2025-06-25 03:41:34.028799,Morgan Stanely
4,TRD47128,STL567399,MKR45413,CKR76423,SPR94202,799,70.4,56249.6,short,pending,gsch9157,2025-06-24 14:54:51.028799,Goldman Scahs
5,TRD50501,STL188171,MKR59013,CKR75697,SPR62635,199,106.44,21181.56,long,pending,jpmc7624,2025-06-25 21:01:26.028799,JP Morgan
6,TRD10749,STL782201,MKR13609,CKR78598,SPR45281,130,295.7,38441.0,short,pending,jpmc7624,2025-06-24 00:54:40.028799,JP Morgan
7,TRD37800,STL723671,MKR94192,CKR63349,SPR78243,262,55.75,14606.5,long,pending,bnym4392,2025-06-24 16:31:47.028799,BNY Mellon
8,TRD22138,STL697030,MKR90174,CKR48441,SPR21704,651,426.76,277820.76,short,pending,jpmc7624,2025-06-24 20:52:36.028799,JP Morgan
9,TRD95362,STL575802,MKR24866,CKR31161,SPR17662,476,480.09,228522.84,long,pending,bnpp7891,2025-06-23 03:48:20.028799,BNP Paribas


In [38]:
trade_df.to_csv('Trade_table.csv', index=False)