# MIDST Challenge
Black-box membership inference attack (MIA) on a single table, given synthetic data and holdout real data

# Import packages

In [1]:
# Standard
import sys

sys.path.append("..")

from pathlib import Path

# 3rd party
import pandas as pd

# Local
from src.data import process

# 1. Configuration

In [2]:
# Root folder of the MIDST black-box TabSyn data
data_dir = Path("/data8/projets/dev_synthetic_data/data/MIDST/tabsyn_black_box")

# Ids of the train sets to collect: 1 to 30 inclusive
data_id = range(1, 31)

# Folder (inside data_dir) where the assembled data is saved
dest_dir = data_dir.joinpath("population")

# 2. Load data
Collect all the real data used to train the models

In [3]:
# Locate the train_with_id.csv of every tabsyn_{i} folder listed in data_id.
train_files = [
    data_dir / "train" / f"tabsyn_{i}" / "train_with_id.csv" for i in data_id
]

# Read all files and stack them with a single concat call. Concatenating
# inside the loop re-copied the accumulated frame on every pass and relied on
# the first id being exactly 1 to initialise `df_real`.
# Row indexes of the individual files are kept as-is (no ignore_index).
df_real = pd.concat([pd.read_csv(train_file) for train_file in train_files])

In [4]:
# Remove fully identical rows (all columns compared), keeping the first
# occurrence of each.
df_real = df_real.drop_duplicates(keep="first")

In [5]:
df_real.shape

(461083, 10)

In [6]:
df_real.head()

Unnamed: 0,trans_id,account_id,trans_date,trans_type,operation,amount,balance,k_symbol,bank,account
0,330530,1126,336,0,3,2400.0,20515.0,1,0,0
1,50642,169,2129,2,4,14.6,65847.0,6,0,0
2,931992,3178,1641,2,4,14.6,13507.4,6,0,0
3,1058060,3617,515,2,4,14.6,36742.7,6,0,0
4,564471,1921,1984,0,2,3650.0,16299.2,1,8,78194778


In [7]:
# Make sure the destination folder exists first: to_csv does not create
# missing parent directories and raises an OSError if the folder is absent.
dest_dir.mkdir(parents=True, exist_ok=True)

# Save the collected real data (id columns included) without the index column.
df_real.to_csv(
    dest_dir / "real_all.csv",
    index=False,
)

# 3. Split data
- Sample a subset of the real data
- Split the sampled data into train, validation and test sets

In [7]:
# Draw a reproducible subset of 100,000 rows without replacement; the sampled
# frame is re-indexed from 0 (ignore_index=True).
df_real_sample = df_real.sample(
    n=100_000,
    random_state=42,
    replace=False,
    ignore_index=True,
)

In [8]:
# Split the sample into train / val / test with the project helper.
# NOTE(review): presumably stratifies on `trans_type`, gives the remaining
# share (0.25) to the test set and writes the splits under `save_folder` —
# confirm against src.data.process.split_real_data.
df_train, df_val, df_test = process.split_real_data(
    df_real_sample,
    proportion={"train": 0.5, "val": 0.25},
    var_to_stratify="trans_type",
    seed=42,
    save_folder=dest_dir,
)

In [9]:
df_train.shape, df_val.shape, df_test.shape

((50000, 10), (25000, 10), (25000, 10))

In [10]:
# Remove id

# Identifier columns stripped from the full data set and from every split.
id_columns = ["trans_id", "account_id"]

df_real_all_no_id = df_real.drop(columns=id_columns)
df_train_no_id = df_train.drop(columns=id_columns)
df_val_no_id = df_val.drop(columns=id_columns)
df_test_no_id = df_test.drop(columns=id_columns)

In [11]:
# Save files

# Target file name -> id-free table; written in this order into dest_dir.
outputs_no_id = {
    "real_all_no_id.csv": df_real_all_no_id,
    "real_train_no_id.csv": df_train_no_id,
    "real_val_no_id.csv": df_val_no_id,
    "real_test_no_id.csv": df_test_no_id,
}

for csv_name, table in outputs_no_id.items():
    # The index column is not written.
    table.to_csv(dest_dir / csv_name, index=False)
