# 01. Data Generation & Synthesis
## Overview
In this notebook, we generate a synthetic dataset of financial transactions.
Since real-world banking data is highly sensitive and subject to GDPR and banking secrecy laws, we create a mock dataset that mimics the structure of real bank statements.

### Goal:
Create a dataset with transaction descriptions and their corresponding categories (e.g., Food, Transport, Utilities) to train a classification model.

## Imports

In [1]:
import pandas as pd
import numpy as np
import random
import time
import string
import os

## Defining Transaction Samples

In [3]:
# Ensuring reproducibility
random.seed(42)
np.random.seed(42)

# Configuration
N_RECORDS = 5000  # Number of transactions
OUTPUT_PATH = '../data/raw'

# Knowledge Base
categories_config = {
    'Food & Drinks': {
        'mccs': [5411, 5812, 5814],
        'items': [
            ('Silpo', 'ТОВ СІЛЬПО-ФУД', '5411'),
            ('McDonalds', 'ПП МАКДОНАЛЬДЗ ЮКРЕЙН', '5814'),
            ('ATB-Market', 'ТОВ АТБ-МАРКЕТ', '5411'),
            ('Novus', 'ТОВ НОВУС УКРАЇНА', '5411'),
            ('Puzata Hata', 'ТОВ ПУЗАТА ХАТА', '5812'),
            ('Starbucks', 'STARBUCKS COF', '5812')
        ],
        'comments': ['за каву', 'обід', 'продукти додому', 'вечеря', 'fast food', ''],
        'amount_range': (5000, 150000)
    },
    'Transport': {
        'mccs': [4121, 5541, 4111],
        'items': [
            ('Uber Trip', 'UBER BV', '4121'),
            ('Uklon Kyiv', 'ТОВ УКЛОН УКРАЇНА', '4121'),
            ('WOG Fuel', 'ТОВ ВЕСТ ПЕТРОЛ КАРТ', '5541'),
            ('OKKO Fuel', 'ПП ОККО-ДРАЙВ', '5541'),
            ('Bolt Taxi', 'BOLT SERVICES UA', '4121')
        ],
        'comments': ['на роботу', 'додому', 'taxi', 'паливо', 'заправка', ''],
        'amount_range': (10000, 300000)
    },
    'Utilities & Bills': {
        'mccs': [4814, 4900, 4899],
        'items': [
            ('Kyivstar Pay', 'ПрАТ КИЇВСТАР', '4814'),
            ('Lifecell', 'ТОВ ЛАЙФСЕЛЛ', '4814'),
            ('Yasno Energy', 'ТОВ ДНІПРОВСЬКІ ЕНЕРГЕТИЧНІ ПОСЛУГИ', '4900'),
            ('Netflix', 'NETFLIX.COM', '4899'),
            ('Megogo', 'ТОВ МЕГОГО', '4899')
        ],
        'comments': ['за інет', 'мобільний', 'світло', 'підписка', ''],
        'amount_range': (15000, 450000)
    },
    'Shopping': {
        'mccs': [5311, 5651, 5912],
        'items': [
            ('Zara Kyiv', 'ITX MERCHANDISING', '5651'),
            ('Rozetka', 'ТОВ РОЗЕТКА.УА', '5311'),
            ('Epicentr K', 'ТОВ ЕПІЦЕНТР К', '5311'),
            ('Apteka ANC', 'ТОВ АПТЕКА НИЗЬКИХ ЦІН', '5912'),
            ('H&M Mall', 'H&M HENNES', '5651')
        ],
        'comments': ['одяг', 'ліки', 'техніка', 'для дому', 'подарунок', ''],
        'amount_range': (20000, 1500000)
    },
    'Crypto & Gambling': {
        'mccs': [6051, 7995],
        'items': [
            ('Binance', 'BINANCE LTD', '6051'),
            ('Favbet', 'ТОВ БК ФАВБЕТ', '7995'),
            ('Cosmolot', 'ТОВ СПЕЙСИКС', '7995'),
            ('WhiteBIT', 'WHITEBIT EX', '6051')
        ],
        'comments': ['dep', 'crypto buy', 'на удачу', 'deposit', ''],
        'amount_range': (50000, 5000000)
    }
}

def generate_random_id(length=12):
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))

def generate_noisy_description(base_desc):
    noise_id = random.randint(1000, 999999)
    city = random.choice(['Kyiv', 'Lviv', 'Odesa', 'Online', 'Dnipro', 'London'])
    formats = [
        f"{base_desc} {city}",
        f"{base_desc} POS {noise_id}",
        f"WWW.{base_desc.upper().replace(' ', '')}.COM",
        f"{city} {base_desc} REF:{noise_id}"
    ]
    return random.choice(formats)

records = []
current_time = int(time.time())

for _ in range(N_RECORDS):
    category = random.choice(list(categories_config.keys()))
    config = categories_config[category]

    brand, counter_name, mcc_default = random.choice(config['items'])

    # Emulate Monobank API structure
    item = {
        "id": generate_random_id(10) + "=",
        "time": current_time - random.randint(0, 2592000), # за останній місяць
        "description": generate_noisy_description(brand),
        "mcc": int(mcc_default),
        "originalMcc": int(mcc_default),
        "hold": False,
        "amount": -random.randint(*config['amount_range']),
        "operationAmount": -random.randint(*config['amount_range']),
        "currencyCode": 980,
        "commissionRate": 0,
        "cashbackAmount": random.randint(0, 5000),
        "balance": random.randint(100000, 10000000),
        "comment": random.choice(config['comments']) if random.random() > 0.5 else "",
        "receiptId": f"{random.randint(1000,9999)}-{random.randint(1000,9999)}-{random.randint(1000,9999)}-{random.randint(1000,9999)}",
        "invoiceId": f"{random.randint(1000,9999)}.в.{random.randint(10,99)}",
        "counterEdrpou": str(random.randint(10000000, 99999999)),
        "counterIban": "UA" + str(random.randint(10**25, 10**26-1)),
        "counterName": counter_name,
        # Наш таргет для ML
        "category": category
    }
    records.append(item)

df = pd.DataFrame(records)

## Data Storage

In [4]:
# Create directory if it doesn't exist
output_path = '../data/raw'
os.makedirs(output_path, exist_ok=True)

# Save to CSV
df.to_csv(f'{output_path}/monobank_transactions.csv', index=False)
print(f"Data saved to {output_path}/transactions.csv")

Data saved to ../data/raw/transactions.csv


## Data Preview

In [5]:
print("First 5 rows:")
print(df.head())

print("\nCategory Distribution:")
print(df['category'].value_counts())

First 5 rows:
            id        time               description   mcc  originalMcc  \
0  TpigTHKbfo=  1766328785           Silpo POS 28824  5411         5411   
1  0LYTH8xIZM=  1767337027           ATB-Market Lviv  5411         5411   
2  NHpCq5QnuV=  1768619214        WWW.PUZATAHATA.COM  5812         5812   
3  3yn9FfcgMX=  1767083167  Kyiv WhiteBIT REF:626380  6051         6051   
4  VFgM0Fmxk7=  1766629370            WWW.MEGOGO.COM  4899         4899   

    hold   amount  operationAmount  currencyCode  commissionRate  \
0  False  -147853          -114974           980               0   
1  False   -17012           -64742           980               0   
2  False  -110162           -75186           980               0   
3  False -3251254         -3976287           980               0   
4  False  -271170           -25211           980               0   

   cashbackAmount  balance    comment            receiptId  invoiceId  \
0            1805  7636477    за каву  3615-7924-6574