In [1]:
import numpy as np
import pandas as pd
import random
import pickle
from datetime import datetime, timedelta
from tqdm import trange, tqdm
import os


random.seed(42)

In [2]:
ver = str(random.randint(10, 500))
csv_file_path = './csv_files/' + ver
ver = '_'+ver+'_'
print(ver)
# Create a single directory
os.makedirs(csv_file_path, exist_ok=True)  # Will not raise an error if it already exists


_337_


In [3]:
all_taxpayers_df = pd.read_csv('all_taxpayers.csv')
all_taxpayers_df.head()
print(len(all_taxpayers_df))
#load sector details

with open('sector_details.pkl', 'rb') as f:
    sector_details = pickle.load(f)

n_transactions = 200000
invalid_transactions = set()
undeclared_ratio=0.05
n_declared = int(n_transactions * (1 - undeclared_ratio))
n_undeclared = n_transactions - n_declared

10100


In [4]:
allowed_sector_transactions = allowed_links = {
    'Retail': ['Wholesale Trade', 'Manufacturing', 'Agriculture', 'Transportation & Logistics', 'ICT', 'Finance'],
    'Manufacturing': ['Agriculture', 'Mining & Quarrying', 'Energy', 'Transportation & Logistics', 'Wholesale Trade', 'Construction'],
    'Agriculture': ['Manufacturing', 'Wholesale Trade', 'Retail', 'Water & Sanitation', 'Finance'],
    'Construction': ['Manufacturing', 'Wholesale Trade', 'Energy', 'Transportation & Logistics', 'Finance', 'Legal & Professional Services'],
    'ICT': ['Finance', 'Telecommunications', 'Education', 'Healthcare', 'Legal & Professional Services', 'Retail', 'Entertainment & Media'],
    'Finance': ['All'],  # Often connects to all sectors
    'Hospitality': ['Agriculture', 'Wholesale Trade', 'Retail', 'Transportation & Logistics', 'Entertainment & Media'],
    'Healthcare': ['Pharmaceuticals', 'Manufacturing', 'ICT', 'Education', 'Energy', 'Waste Management'],
    'Education': ['ICT', 'Finance', 'Publishing', 'Retail', 'Public Administration'],
    'Real Estate': ['Construction', 'Finance', 'Legal & Professional Services', 'Public Administration'],
    'Transportation & Logistics': ['Wholesale Trade', 'Retail', 'Manufacturing', 'Energy', 'Agriculture', 'Mining & Quarrying'],
    'Telecommunications': ['ICT', 'Finance', 'Education', 'Entertainment & Media'],
    'Energy': ['Manufacturing', 'Mining & Quarrying', 'Construction', 'Transportation & Logistics', 'Water & Sanitation'],
    'Legal & Professional Services': ['Finance', 'Real Estate', 'Construction', 'Public Administration', 'Healthcare'],
    'Mining & Quarrying': ['Manufacturing', 'Construction', 'Energy', 'Transportation & Logistics'],
    'Entertainment & Media': ['Retail', 'ICT', 'Arts & Culture', 'Telecommunications', 'Public Administration'],
    'Public Administration': ['All'],  # Purchases from many sectors
    'Water & Sanitation': ['Construction', 'Agriculture', 'Public Administration', 'Healthcare'],
    'Waste Management': ['Healthcare', 'Construction', 'Public Administration', 'Water & Sanitation'],
    'Security Services': ['Retail', 'Finance', 'Public Administration', 'Real Estate', 'Healthcare'],
    'Wholesale Trade': ['Manufacturing', 'Agriculture', 'Retail', 'Transportation & Logistics'],
    'Arts & Culture': ['Education', 'Entertainment & Media', 'Retail', 'Public Administration'],
    'Transportation': ['Retail', 'Wholesale Trade', 'Agriculture', 'Construction', 'Healthcare']
}

locations = [
    'Nairobi', 'Mombasa', 'Kisumu', 'Eldoret', 'Nakuru', 'Thika', 'Machakos',
    'Kericho', 'Nyeri', 'Garissa', 'Meru', 'Kitale'
]

def is_valid_transaction(seller_sector, buyer_sector):
    allowed = allowed_links.get(seller_sector, [])
    return buyer_sector in allowed or 'All' in allowed or random.random() < 0.05  # 5% noise


# === Generate VAT Transactions ===

invoice_statuses = ['Paid', 'Pending', 'Cancelled', 'Disputed']
payment_methods = ['Bank Transfer', 'Mobile Money', 'Cash', 'Cheque', 'Credit']

existing_pairs = []
pair_reuse_probability = 0.3  # 30% of transactions will reuse a pair

transactions = []

#KNBS’s Economic Survey (2024, 2023) offers seasonal insights across agriculture, trade, tourism, construction, and more 
#Kenya National Bureau of Statistics (KNBS) – Sectoral GDP by Quarter 

sector_burst_weights = {
    'Retail': [0.12, 0.07, 0.06, 0.06, 0.06, 0.10, 0.05, 0.05, 0.06, 0.08, 0.12, 0.17],  # Jan, Jun, Nov-Dec
    'Agriculture': [0.05, 0.05, 0.10, 0.10, 0.10, 0.05, 0.05, 0.05, 0.07, 0.13, 0.12, 0.08],  # Mar-May, Oct-Nov
    'Hospitality': [0.08, 0.06, 0.06, 0.12, 0.07, 0.07, 0.07, 0.12, 0.06, 0.06, 0.06, 0.17],  # Apr, Aug, Dec
    'Transport & Logistics': [0.06, 0.06, 0.06, 0.12, 0.08, 0.06, 0.06, 0.12, 0.06, 0.06, 0.06, 0.16],  # Apr, Aug, Dec
    'Construction': [0.10, 0.10, 0.10, 0.06, 0.06, 0.06, 0.10, 0.10, 0.10, 0.05, 0.04, 0.03],  # Jan-Mar, Jul-Sep
    
    # Additions below
    'Entertainment & Media': [0.10, 0.06, 0.05, 0.08, 0.08, 0.06, 0.06, 0.08, 0.06, 0.08, 0.10, 0.19],  # Peaks during school holidays, Dec
    'Energy': [0.08, 0.08, 0.08, 0.09, 0.09, 0.08, 0.08, 0.08, 0.08, 0.09, 0.09, 0.08],  # Relatively stable, slight demand spikes in Apr, Oct
    'Real Estate': [0.09, 0.09, 0.09, 0.07, 0.07, 0.07, 0.10, 0.10, 0.10, 0.07, 0.08, 0.07],  # Activity picks up in Jan-Mar, Jul-Sep
    'Other': [0.05] * 12  # Default: uniform, no specific seasonal peaks

}


# Function to sample date with seasonality and burst variations
def sample_seasonal_date(sector):
    # Get the sector's seasonal burst weights
    monthly_weights = sector_burst_weights.get(sector, sector_burst_weights['Other'])

    # Choose month with seasonal probability
    month = random.choices(range(12), weights=monthly_weights)[0]

    # Optionally introduce bursts during specific weeks for certain sectors
    burst_weeks = [10, 25, 27, 33, 48, 50]  # Defined burst weeks
    burst_probability = 0.1  # 10% chance for a burst event

    if random.random() < burst_probability:
        week = random.choice(burst_weeks)
        weekday = random.randint(0, 6)  # Random day in the week
        date = datetime.strptime(f'2022-W{week:02d}-{weekday}', "%Y-W%W-%w")
    else:
        # Sample a date based on monthly weights
        day = random.randint(1, 28)  # Safe for all months
        date = datetime(2022, month + 1, day)
    
    return date


#generate a valid transaction
def generate_transaction(seller_id, buyer_id, declared = 1):
    transaction = {}
    goods_list = sector_details.get(seller['economic_sector'], {}).get('outputs', ['General Item'])
    description = random.choice(goods_list)

    sales_amount = round(random.uniform(500, 100000), 2)
    invoice_date = sample_seasonal_date(seller['economic_sector'])
    invoice_status = random.choices(invoice_statuses, weights=[0.7, 0.2, 0.05, 0.05])[0]
    payment_method = random.choice(payment_methods)

    transaction = {
    'seller_id': seller['taxpayer_id'],
    'buyer_id': buyer['taxpayer_id'],
    'description_of_goods': description,
    'sales_amount': sales_amount,
    'invoice_date': invoice_date,
    'invoice_status': invoice_status,
    'payment_method': payment_method,
    'location': seller['location'],
    'economic_sector': seller['economic_sector'],
    'buyer_size_category': buyer['size_category'],
    'seller_size_category': seller['size_category'],
    'declared' : declared
}

    return transaction

In [5]:
#generate 

# # Function to simulate declared transactions
# def simulate_transactions(taxpayers, n_transactions=10000, undeclared_ratio=0.05):
buyers = all_taxpayers_df.copy()
sellers = all_taxpayers_df.copy()

power_law_transactions = []
high_degree_buyers = buyers.nlargest(100, 'size')['taxpayer_id'].tolist()



for _ in trange(n_declared):
    # Power-law selection: buyers are skewed toward large firms
    # Preselect a small number of dominant buyers
    buyer_id = random.choice(high_degree_buyers)
    buyer = buyers[buyers['taxpayer_id'] == buyer_id].iloc[0]
    seller = sellers.sample(weights=sellers['norm_size']).iloc[0]

    # Prevent self-transactions
    if seller['taxpayer_id'] == buyer['taxpayer_id']:
        continue

    if (seller['taxpayer_id'], buyer['taxpayer_id']) in invalid_transactions:
        continue

    elif not is_valid_transaction(seller['economic_sector'], seller['economic_sector']):
        invalid_transactions.add((seller['taxpayer_id'], buyer['taxpayer_id']))
        continue

    power_law_transactions.append(
        generate_transaction(seller['taxpayer_id'], buyer['taxpayer_id'])
    )

power_law_df = pd.DataFrame(power_law_transactions)
power_law_df.head()


# Create undeclared transactions
n_undeclared = int(n_transactions * undeclared_ratio)
declared_df = power_law_df[power_law_df['declared'] == 1]

undeclared_power_law_transactions = []

for _ in trange(n_undeclared):
    # Choose a random declared transaction with a large buyer
    ref_tx = declared_df.sample(1).iloc[0]
    buyer_id = ref_tx['buyer_id']
    buyer = all_taxpayers_df[all_taxpayers_df['taxpayer_id'] == buyer_id].iloc[0]

    # Find small sellers (e.g., in bottom 30% size)
    small_sellers = all_taxpayers_df[
            (all_taxpayers_df['size'] < all_taxpayers_df['size'].quantile(0.3)) &
            (all_taxpayers_df['location'] == ref_tx['location']) &
            (all_taxpayers_df['economic_sector'] == ref_tx['economic_sector']) &
            (all_taxpayers_df['taxpayer_id'] != buyer_id)
        ]

    if small_sellers.empty:
        continue

    seller = small_sellers.sample(1).iloc[0]

    undeclared_power_law_transactions.append(
        generate_transaction(seller['taxpayer_id'], buyer['taxpayer_id'], 0)
    )

undeclared_power_law_df = pd.DataFrame(undeclared_power_law_transactions)

power_law_df = pd.concat([power_law_df, undeclared_power_law_df], ignore_index=True)

power_law_df.to_csv(csv_file_path+'/power_law_transactions'+ver+'.csv')

print(len(power_law_df))


100%|██████████| 190000/190000 [03:55<00:00, 805.26it/s]
100%|██████████| 10000/10000 [00:39<00:00, 252.23it/s]


33056


In [6]:
# Location and sector trade flow mapping (example)


location_sector_flows = {
    ('Thika', 'Nairobi'): [ 'Manufacturing', 'Agriculture', 'Retail', 'Wholesale', 'Construction'],
    ('Nakuru', 'Nairobi'): ['Agriculture', 'Retail', 'Construction'],
    ('Kisumu', 'Nairobi'): ['Retail', 'ICT', 'Wholesale'],
    ('Nyeri', 'Kericho'): ['Manufacturing', 'Agriculture', 'Energy'],
    ('Eldoret', 'Thika'): ['Agriculture', 'Manufacturing', 'Transport'],
    ('Machakos', 'Nairobi'): ['Construction', 'Logistics'],
    ('Nairobi', 'Mombasa'): ['ICT', 'Finance', 'Trade'],
    ('Mombasa', 'Nairobi'): ['Logistics', 'Wholesale', 'Hospitality', 'Manufacturing', 'Construction', 'Energy', 'Transport', 'Trade']
}


location_sector_transactions = []
# Declared transactions
for _ in trange(n_declared):
    seller = sellers.sample(weights=sellers['norm_size']).iloc[0]
    origin = seller['location']

    # Check if origin has a structured flow
    matches = [k for k in location_sector_flows if k[0] == origin]
    if matches:
        # Pick a destination and sector this origin sells to
        dest_pair = random.choice(matches)
        dest_location = dest_pair[1]
        expected_sector = location_sector_flows[dest_pair]

        # Seller must be from the expected sector
        if seller['economic_sector'] != expected_sector:
            continue

        buyers = all_taxpayers_df[
            (all_taxpayers_df['location'] == dest_location) &
            (all_taxpayers_df['taxpayer_id'] != seller['taxpayer_id'])
        ]
    else:
        # Random fallback
        buyers = all_taxpayers_df[
            (all_taxpayers_df['location'] != origin) &
            (all_taxpayers_df['taxpayer_id'] != seller['taxpayer_id'])
        ]

    if buyers.empty:
        continue

    buyer = buyers.sample(weights=buyers['norm_size']).iloc[0]

    location_sector_transactions.append(
        generate_transaction(seller['taxpayer_id'], buyer['taxpayer_id'], 1)
    )


# Undeclared transactions: sellers from correct sector & location don't declare
for _ in trange(n_undeclared):
    dest_pair = random.choice(list(location_sector_flows.keys()))
    origin, dest = dest_pair
    sector = location_sector_flows[dest_pair]

    small_sellers = all_taxpayers_df[
        (all_taxpayers_df['location'] == origin) &
        (all_taxpayers_df['economic_sector'] == sector) &
        (all_taxpayers_df['size'] < all_taxpayers_df['size'].quantile(0.3))
    ]
    buyers = all_taxpayers_df[all_taxpayers_df['location'] == dest]

    if small_sellers.empty or buyers.empty:
        continue

    seller = small_sellers.sample(1).iloc[0]
    buyer = buyers.sample(1).iloc[0]

    location_sector_transactions.append(
        generate_transaction(seller['taxpayer_id'], buyer['taxpayer_id'], 0)
    )

location_sector_transactions_df = pd.DataFrame(location_sector_transactions)

location_sector_transactions_df.to_csv(csv_file_path+'/location_sector_transactions'+ver+'.csv')

print(len(location_sector_transactions_df))

100%|██████████| 190000/190000 [05:24<00:00, 585.59it/s]
100%|██████████| 10000/10000 [00:30<00:00, 326.89it/s]

108960


In [7]:
#sector burst sales

sector_burst_transactions = []

# def normalize(w): return [x / sum(w) for x in w]

for sector, weights in sector_burst_weights.items():
    sector_taxpayers = all_taxpayers_df[all_taxpayers_df['economic_sector'] == sector]
    if sector_taxpayers.empty:
        continue

    small_taxpayers = sector_taxpayers[sector_taxpayers['size'] < sector_taxpayers['size'].quantile(0.5)]
    if len(small_taxpayers) < 2:
        continue

    n_sector_txns = int(n_transactions * (len(sector_taxpayers) / len(all_taxpayers_df)))
    n_declared = int(n_sector_txns * (1 - undeclared_ratio))
    n_undeclared = n_sector_txns - n_declared

    # --- Valid pairs for declared txns ---
    valid_pairs = [
        (s['taxpayer_id'], b['taxpayer_id'])
        for i, s in small_taxpayers.iterrows()
        for j, b in small_taxpayers.iterrows()
        if s['taxpayer_id'] != b['taxpayer_id']
        and (s['taxpayer_id'], b['taxpayer_id']) not in invalid_transactions
        and is_valid_transaction(s['economic_sector'], b['economic_sector'])
    ]

    if not valid_pairs:
        continue

    sampled_declared = random.choices(valid_pairs, k=n_declared)

    for seller_id, buyer_id in tqdm(sampled_declared, desc="Generating declared transactions"):
        sector_burst_transactions.append(generate_transaction(seller_id, buyer_id, 1))

    # --- Undeclared transactions ---
    sampled_undeclared = random.choices(valid_pairs, k=n_undeclared)

    for seller_id, buyer_id in tqdm(sampled_undeclared, desc="Generating undeclared transactions"):
        sector_burst_transactions.append(generate_transaction(seller_id, buyer_id, 0))

# Convert to DataFrame
sector_burst_transactions_df = pd.DataFrame(sector_burst_transactions)

sector_burst_transactions_df.to_csv(csv_file_path+'/sector_burst_transactions'+ver+'.csv')

print(len(sector_burst_transactions_df))


Generating declared transactions: 100%|██████████| 8107/8107 [00:00<00:00, 46467.92it/s]
Generating undeclared transactions: 100%|██████████| 427/427 [00:00<00:00, 44566.96it/s]
Generating declared transactions: 100%|██████████| 7976/7976 [00:00<00:00, 45739.92it/s]
Generating undeclared transactions: 100%|██████████| 420/420 [00:00<00:00, 44241.49it/s]
Generating declared transactions: 100%|██████████| 8088/8088 [00:00<00:00, 45713.99it/s]
Generating undeclared transactions: 100%|██████████| 426/426 [00:00<00:00, 43155.65it/s]
Generating declared transactions:   0%|          | 0/8295 [00:00<?, ?it/s]

34176


In [8]:
all_transactions_df = pd.concat([power_law_df,location_sector_transactions_df, sector_burst_transactions_df], axis = 0) 
print(len(all_transactions_df))
all_transactions_df.to_csv(csv_file_path+'/all_transactions'+ver+'.csv')

176192


In [11]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import Data
from pathlib import Path

# Initialize ML client
ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="2550d0cd-923a-4266-9fd3-c574cbc5929e",
    resource_group_name="brianombega-rg",
    workspace_name="Masters_Ombega"
)

# Define the data asset
data_asset = Data(
    path=Path(csv_file_path+'/all_transactions'+ver+'.csv'),  # local path to your CSV file
    type="uri_file",           # or "uri_folder" if uploading a folder
    name="my-generated-vat-data",        # unique name for the asset
    description="My dataset as a CSV",
    version="22"
    )
# Register (upload) the data asset
ml_client.data.create_or_update(data_asset)

Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented


Data({'path': 'azureml://subscriptions/2550d0cd-923a-4266-9fd3-c574cbc5929e/resourcegroups/brianombega-rg/workspaces/Masters_Ombega/datastores/workspaceblobstore/paths/LocalUpload/b05954d04e3c4a1d12c53febae32faee/all_transactions_337_.csv', 'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': None, 'type': 'uri_file', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'my-generated-vat-data', 'description': 'My dataset as a CSV', 'tags': {}, 'properties': {}, 'print_as_yaml': False, 'id': '/subscriptions/2550d0cd-923a-4266-9fd3-c574cbc5929e/resourceGroups/brianombega-rg/providers/Microsoft.MachineLearningServices/workspaces/Masters_Ombega/data/my-generated-vat-data/versions/22', 'Resource__source_path': '', 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/brianombega3/code/Users/brianombega', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7978cc16bd90>, 'serialize': <msrest.seriali