In [1]:
import os
import random
from datetime import datetime

import numpy as np
import pandas as pd
from faker import Faker

# Initialize the Faker generator and pin both random sources to one seed
# so that repeated runs draw the same pseudo-random sequence.
fake = Faker()
SEED = 42
Faker.seed(SEED)
random.seed(SEED)

# Parameters: how many users and transactions to simulate
num_users = 10_000
num_transactions = 500_000

# Constants
# Each corridor is a (send_country, receive_country) pair.
country_corridors = [
    ('NG', 'UK'),
    ('GH', 'CA'),
    ('KE', 'US'),
    ('UG', 'UK'),
    ('NG', 'US'),
]
# Currency code recorded for each country code that appears in a corridor.
currencies = {
    'NG': 'NGN',
    'GH': 'GHS',
    'KE': 'KES',
    'UG': 'UGX',
    'UK': 'GBP',
    'US': 'USD',
    'CA': 'CAD',
}
transaction_types = [
    'Wallet Top-Up',
    'Send Money',
    'Bill Payment',
    'Bank Withdrawal',
]
devices = ['Mobile', 'Web']
kyc_methods = [
    'Document Upload',
    'Bank Verification',
    'Utility Bill',
]
kyc_statuses = ['Verified', 'Pending', 'Failed']
compliance_outcomes = ['Pass', 'Review', 'Escalated']

# Generate Users
# Builds `users` (a list of dicts, reused later to pick a user per
# transaction) and `users_df`, then writes the frame to CSV.
print("Generating users...")

# Pin "today" once: it is both the upper bound of the signup window and the
# reference date for tenure, so tenure_days cannot go negative even if the
# loop happens to run across midnight.
today = datetime.now().date()

users = []
for user_id in range(1, num_users + 1):
    country = random.choice(['NG', 'GH', 'KE', 'UG'])
    signup_date = fake.date_between(start_date='-1y', end_date=today)
    tenure_days = (today - signup_date).days
    risk_score = round(random.uniform(0, 1), 3)
    users.append({
        'user_id': user_id,
        'name': fake.name(),
        'email': fake.email(),
        'signup_date': signup_date,
        'country': country,
        # Weighted draw: 70% Verified, 20% Pending, 10% Failed
        'kyc_status': random.choices(kyc_statuses, weights=[0.7, 0.2, 0.1])[0],
        'kyc_method': random.choice(kyc_methods),
        'tenure_days': tenure_days,
        'risk_score': risk_score
    })

users_df = pd.DataFrame(users)

# The path is relative to the kernel's working directory, and to_csv does not
# create missing folders, so make sure the target directory exists first.
users_csv_path = "../data/raw_users.csv"
os.makedirs(os.path.dirname(users_csv_path), exist_ok=True)
users_df.to_csv(users_csv_path, index=False)

# Generate Transactions
# Builds `transactions_df` with one row per simulated transaction, each tied
# to a randomly chosen entry of `users`, then writes the frame to CSV.
print("Generating transactions...")

# Index corridors by sending country so that a transaction's send_country and
# send_currency agree with the country recorded on the user who makes it.
corridors_by_origin = {}
for origin, destination in country_corridors:
    corridors_by_origin.setdefault(origin, []).append((origin, destination))

transactions = []
for txn_id in range(1, num_transactions + 1):
    user = random.choice(users)
    # Fall back to the full corridor list only when the user's country has no
    # outgoing corridor defined.
    corridor = random.choice(corridors_by_origin.get(user['country'], country_corridors))
    amount = round(random.uniform(5, 5000), 2)
    timestamp = fake.date_time_between(start_date='-6mo', end_date='now')
    exchange_rate = round(random.uniform(0.005, 1.2), 4)
    # Roughly 4% of transactions are flagged
    is_flagged = random.choices([True, False], weights=[0.04, 0.96])[0]

    transactions.append({
        'transaction_id': txn_id,
        'user_id': user['user_id'],
        'timestamp': timestamp,
        'send_country': corridor[0],
        'receive_country': corridor[1],
        'send_currency': currencies[corridor[0]],
        'receive_currency': currencies[corridor[1]],
        'amount': amount,
        'converted_amount': round(amount * exchange_rate, 2),
        'transaction_type': random.choice(transaction_types),
        'device': random.choice(devices),
        'is_flagged': is_flagged,
        'exchange_rate': exchange_rate,
        # Weighted draw: 90% Pass, 8% Review, 2% Escalated
        'compliance_result': random.choices(compliance_outcomes, weights=[0.9, 0.08, 0.02])[0]
    })

transactions_df = pd.DataFrame(transactions)

# The path is relative to the kernel's working directory, and to_csv does not
# create missing folders, so make sure the target directory exists first.
transactions_csv_path = "../data/raw_transactions.csv"
output_dir = os.path.dirname(transactions_csv_path)
os.makedirs(output_dir, exist_ok=True)
transactions_df.to_csv(transactions_csv_path, index=False)

# Report the resolved folder so it is clear where the relative path landed.
print(f"✅ Data simulation complete. CSVs saved to {os.path.abspath(output_dir)}")

Generating users...
Generating transactions...
✅ Data simulation complete. CSVs saved to /data/


In [2]:
# Summarize the generated transactions: row count, columns, non-null counts and dtypes.
transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 14 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   transaction_id     500000 non-null  int64         
 1   user_id            500000 non-null  int64         
 2   timestamp          500000 non-null  datetime64[ns]
 3   send_country       500000 non-null  object        
 4   receive_country    500000 non-null  object        
 5   send_currency      500000 non-null  object        
 6   receive_currency   500000 non-null  object        
 7   amount             500000 non-null  float64       
 8   converted_amount   500000 non-null  float64       
 9   transaction_type   500000 non-null  object        
 10  device             500000 non-null  object        
 11  is_flagged         500000 non-null  bool          
 12  exchange_rate      500000 non-null  float64       
 13  compliance_result  500000 non-null  object  

In [6]:
import os

# Relative paths in this notebook resolve against this directory.
working_dir = os.getcwd()
print("Current working directory:", working_dir)

Current working directory: /Users/carsell/LemFi


In [8]:
import os

# Make sure the data folder is there before anything is written into it
os.makedirs("data", exist_ok=True)

# Write each frame to its CSV, users first and then transactions
csv_exports = (
    ("data/raw_users.csv", users_df),
    ("data/raw_transactions.csv", transactions_df),
)
for csv_path, frame in csv_exports:
    frame.to_csv(csv_path, index=False)