Синтетический датасет был сгенерирован с помощью ИИ: модель создала структуру данных и заложила в них часть инсайтов для последующей работы.

In [1]:
!pip install faker

Collecting faker
  Downloading faker-37.4.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.4.0-py3-none-any.whl (1.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.9/1.9 MB 20.6 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-37.4.0


In [4]:
import pandas as pd
import numpy as np
import random
from faker import Faker
import uuid

In [5]:
# Initialization: seed every random source so the generated dataset is
# reproducible from run to run.
np.random.seed(42)
random.seed(42)
# Faker's first positional argument is the locale, not a seed, so the
# previous `Faker(42)` left the generator unseeded and produced different
# vendor names on every run. Seed it through the documented class method.
fake = Faker()
Faker.seed(42)

n_samples = 600

# --- Lookup lists of allowed values ---
transaction_types = ['income', 'expense', 'transfer', 'refund', 'investment']
departments = ['Sales', 'Marketing', 'R&D', 'HR', 'Finance', 'IT']
vendors = [fake.company() for _ in range(30)]
payment_methods = ['card', 'bank_transfer', 'cash', 'crypto']
categories = ['salary', 'software', 'hardware', 'services', 'travel', 'training', 'taxes', 'operational']
approvers = ['CEO', 'CFO', 'CTO', 'COO', 'HR_Manager']
project_codes = ['PRJ_' + str(i).zfill(3) for i in range(1, 21)]

# --- Dates: sampled with replacement from a daily calendar ---
date_range = pd.date_range(start="2021-01-01", end="2024-06-01", freq='D')
dates = np.random.choice(date_range, size=n_samples)

# --- Assemble the transactions ---
# The `p=` lists are per-value sampling probabilities, in the same order as
# the corresponding lookup list; columns without `p=` are sampled uniformly.
data = {
    # uuid.uuid4() reads OS entropy and ignores the seeds above, so build
    # version-4 UUIDs from the seeded `random` module to keep IDs reproducible.
    "Transaction_ID": [
        str(uuid.UUID(int=random.getrandbits(128), version=4))
        for _ in range(n_samples)
    ],
    "Date": dates,
    "Transaction_Type": np.random.choice(transaction_types, size=n_samples, p=[0.25, 0.55, 0.08, 0.07, 0.05]),
    "Department": np.random.choice(departments, size=n_samples, p=[0.2, 0.2, 0.15, 0.15, 0.15, 0.15]),
    "Vendor": np.random.choice(vendors, size=n_samples),
    "Payment_Method": np.random.choice(payment_methods, size=n_samples, p=[0.3, 0.5, 0.1, 0.1]),
    "Category": np.random.choice(categories, size=n_samples),
    "Approved_by": np.random.choice(approvers, size=n_samples),
    "Recurring": np.random.choice([True, False], size=n_samples, p=[0.25, 0.75]),
    "Project_Code": np.random.choice(project_codes, size=n_samples),
}

df = pd.DataFrame(data)

# --- Amount generation ---
# (mean, standard deviation) of the normal distribution drawn from for each
# transaction type; negative means represent outgoing money.
_AMOUNT_DISTRIBUTIONS = {
    'income': (10000, 3000),
    'expense': (-5000, 2000),
    'transfer': (-1000, 500),
    'refund': (500, 200),
    'investment': (-15000, 5000),
}


def generate_amount(ttype):
    """Draw a random amount for one transaction of the given type.

    The amount is sampled from a normal distribution whose mean and standard
    deviation depend on ``ttype`` and is rounded to two decimals. The draw
    uses NumPy's global random state, so results depend on the current seed.

    Args:
        ttype: Transaction type label ('income', 'expense', 'transfer',
            'refund' or 'investment').

    Returns:
        The rounded amount, or 0 when ``ttype`` is not a known type (no
        random number is consumed in that case).
    """
    distribution = _AMOUNT_DISTRIBUTIONS.get(ttype)
    if distribution is None:
        return 0
    mean, std_dev = distribution
    return round(np.random.normal(mean, std_dev), 2)

# One random amount per row, drawn according to the row's transaction type.
df['Amount'] = df['Transaction_Type'].apply(generate_amount)

# --- Inject fraud labels ---
df['Is_Fraudulent'] = False

# Primary fraud candidates: crypto/cash payments with a large outgoing amount,
# in a suspicious category, approved by HR_Manager or COO.
fraud_candidates = df[
    (df['Payment_Method'].isin(['crypto', 'cash'])) &
    (df['Amount'] < -3000) &
    (df['Category'].isin(['travel', 'training', 'hardware'])) &
    (df['Approved_by'].isin(['HR_Manager', 'COO']))
]

# Target number of fraud rows: 7% of the dataset, but never fewer than 35.
n_frauds = max(35, int(n_samples * 0.07))
if len(fraud_candidates) >= n_frauds:
    fraud_indices = fraud_candidates.sample(n=n_frauds, random_state=42).index
else:
    # Not enough rows pass the strict filter: top up with other suspicious
    # transactions (cash/crypto with an even larger outgoing amount).
    additional_needed = n_frauds - len(fraud_candidates)
    other_pool = df[
        (df['Amount'] < -4000) &
        (df['Payment_Method'].isin(['cash', 'crypto'])) &
        (~df.index.isin(fraud_candidates.index))
    ]
    # sample(replace=False) raises ValueError when n exceeds the pool size
    # (likely for small n_samples), so never ask for more rows than exist.
    n_extra = min(additional_needed, len(other_pool))
    if n_extra < additional_needed:
        print(
            f"Only {len(fraud_candidates) + n_extra} fraud rows available; "
            f"target was {n_frauds}."
        )
    other_candidates = other_pool.sample(n=n_extra, random_state=43, replace=False).index

    fraud_indices = fraud_candidates.index.union(other_candidates)

df.loc[fraud_indices, 'Is_Fraudulent'] = True

# --- Inject an anomalous vendor (planted insight) ---
# Reassign 10% of all rows to one randomly chosen vendor so that it shows up
# with an unusually high transaction frequency.
high_freq_vendor = random.choice(vendors)
df.loc[df.sample(frac=0.1, random_state=1).index, 'Vendor'] = high_freq_vendor

# --- Inject a small share of missing values (NaNs) ---
def insert_nans(df, column, frac=0.02):
    """Blank out a random subset of one column, modifying ``df`` in place.

    The number of affected rows is ``int(len(df) * frac)``; they are picked
    without replacement from ``df.index`` using NumPy's global random state.

    Args:
        df: DataFrame to modify.
        column: Label of the column that receives the NaN values.
        frac: Fraction of rows to blank out.

    Returns:
        None. The frame is changed in place.
    """
    row_count = len(df)
    how_many = int(row_count * frac)
    picked_labels = np.random.choice(df.index, size=how_many, replace=False)
    df.loc[picked_labels, column] = np.nan

# Columns that receive injected NaNs, and the share of rows blanked in each
columns_with_nans = ['Vendor', 'Approved_by', 'Category', 'Project_Code']
nan_fraction = 0.02
for col in columns_with_nans:
    insert_nans(df, col, frac=nan_fraction)

# --- Shuffle the rows and renumber the index from zero ---
shuffled = df.sample(frac=1, random_state=42)
df = shuffled.reset_index(drop=True)

# --- Save to CSV (without the index column) ---
output_path = "synthetic_financial_transactions.csv"
df.to_csv(output_path, index=False)