# MIDST Challenge
Black-box MIA on single table given synthetic and holdout real data

# Import packages

In [1]:
# Standard
import sys

sys.path.append("..")

from pathlib import Path

# 3rd party
import pandas as pd

# Local
from src.data import process

# 1.Configuration

In [23]:
# Paths
data_dir = Path("../data")

# Id of all train sets
data_id = range(1, 31)

# Destination folder to save population data
dest_dir = Path("../input") / "population"

# 2.Load data
Collect all the real data used to train the models in the different scenarios

In [3]:
population = []

In [4]:
def collect_data(attack_type, data_dir, data_id, is_single_table):
    """Load and merge the real training data of every model of one scenario.

    For each id ``i`` in ``data_id`` the file
    ``data_dir / attack_type / "train" / f"{gen_name}_{i}" / <file>`` is read,
    where ``gen_name`` is the part of ``attack_type`` before the first
    underscore and ``<file>`` is ``train_with_id.csv`` when
    ``is_single_table`` is true and ``trans.csv`` otherwise.

    Args:
        attack_type: Scenario folder name, e.g. ``"tabddpm_black_box"``.
        data_dir: Root folder holding the scenario folders; must support the
            ``/`` operator (e.g. ``pathlib.Path``).
        data_id: Iterable of model ids to load. It may start at any value.
        is_single_table: Selects which CSV file name is read (see above).

    Returns:
        A DataFrame with the rows of all files stacked in ``data_id`` order
        and exact duplicate rows dropped.

    Raises:
        ValueError: If ``data_id`` is empty.
    """
    gen_name = attack_type.split("_")[0]
    file_name = "train_with_id.csv" if is_single_table else "trans.csv"
    train_dir = data_dir / attack_type / "train"

    # Read everything first and concatenate once: this does not depend on the
    # first id being 1 and avoids re-copying the accumulated frame on every
    # iteration.
    frames = [pd.read_csv(train_dir / f"{gen_name}_{i}" / file_name) for i in data_id]

    if not frames:
        raise ValueError("data_id is empty: no training data to collect")

    return pd.concat(frames).drop_duplicates()

## 2.1 Single table

### 2.1.1 Black box

#### I.TabDDPM

In [5]:
df_real = collect_data(
    attack_type="tabddpm_black_box",
    data_dir=data_dir,
    data_id=data_id,
    is_single_table=True,
)

population.append(df_real)

In [6]:
df_real.shape

(460972, 10)

#### II.TabSyn

In [7]:
df_real = collect_data(
    attack_type="tabsyn_black_box",
    data_dir=data_dir,
    data_id=data_id,
    is_single_table=True,
)

population.append(df_real)

In [8]:
df_real.shape

(461083, 10)

### 2.1.2 White box

#### I.TabDDPM

In [9]:
df_real = collect_data(
    attack_type="tabddpm_white_box",
    data_dir=data_dir,
    data_id=data_id,
    is_single_table=True,
)

population.append(df_real)

In [10]:
df_real.shape

(460972, 10)

#### II.TabSyn

In [11]:
df_real = collect_data(
    attack_type="tabsyn_white_box",
    data_dir=data_dir,
    data_id=data_id,
    is_single_table=True,
)

population.append(df_real)

In [12]:
df_real.shape

(461083, 10)

## 2.2 Multi tables

### 2.2.1 Black box

In [13]:
df_real = collect_data(
    attack_type="clavaddpm_black_box",
    data_dir=data_dir,
    data_id=data_id,
    is_single_table=False,
)

population.append(df_real)

In [14]:
df_real.shape

(461127, 10)

### 2.2.2 White box

In [15]:
df_real = collect_data(
    attack_type="clavaddpm_white_box",
    data_dir=data_dir,
    data_id=data_id,
    is_single_table=False,
)

population.append(df_real)

In [16]:
df_real.shape

(461127, 10)

## 2.3 Combine data

In [17]:
df_population = pd.concat(population)

In [18]:
df_population = df_population.drop_duplicates()

In [19]:
df_population.shape

(867494, 10)

In [20]:
# Drop ids
df_population_no_id = df_population.drop(columns=["trans_id", "account_id"])

In [24]:
# Make sure the destination folder exists before the first write:
# DataFrame.to_csv raises OSError when the parent directory is missing.
dest_dir.mkdir(parents=True, exist_ok=True)

# Full population, ids included
df_population.to_csv(
    dest_dir / "population_all.csv",
    index=False,
)

# Same population with the id columns dropped
df_population_no_id.to_csv(
    dest_dir / "population_all_no_id.csv",
    index=False,
)

## 2.4 Add/remove challenge points
Create population data with and without all the challenge points

In [26]:
def collect_challenge_points(attack_type, data_dir, data_id, dataset):
    """Load and merge the challenge points of every model of one scenario.

    For each id ``i`` in ``data_id`` the file
    ``data_dir / attack_type / dataset / f"{gen_name}_{i}" / "challenge_with_id.csv"``
    is read, where ``gen_name`` is the part of ``attack_type`` before the
    first underscore.

    Args:
        attack_type: Scenario folder name, e.g. ``"tabddpm_black_box"``.
        data_dir: Root folder holding the scenario folders; must support the
            ``/`` operator (e.g. ``pathlib.Path``).
        data_id: Iterable of model ids to load.
        dataset: Sub-folder of the scenario to read from (the notebook uses
            ``"train"``, ``"dev"`` and ``"final"``).

    Returns:
        A DataFrame with the rows of all files stacked in ``data_id`` order
        and exact duplicate rows dropped.

    Raises:
        ValueError: If ``data_id`` is empty.
    """
    gen_name = attack_type.split("_")[0]
    dataset_dir = data_dir / attack_type / dataset

    # Read everything first and concatenate once instead of growing the
    # frame pairwise, which re-copies the accumulated rows on every step.
    frames = [
        pd.read_csv(dataset_dir / f"{gen_name}_{i}" / "challenge_with_id.csv")
        for i in data_id
    ]

    if not frames:
        raise ValueError("data_id is empty: no challenge points to collect")

    return pd.concat(frames).drop_duplicates()

In [27]:
# All the challenge points from train

data_id = range(1, 31)
df_train_folder = collect_challenge_points(
    attack_type="tabddpm_black_box",
    data_dir=data_dir,
    data_id=data_id,
    dataset = "train"
)

In [28]:
df_train_folder.shape

(5978, 10)

In [29]:
# All the challenge points from dev

data_id = list(range(51, 61)) + list(range(91, 101))
df_dev_folder = collect_challenge_points(
    attack_type="tabddpm_black_box",
    data_dir=data_dir,
    data_id=data_id,
    dataset = "dev"
)

In [30]:
df_dev_folder.shape

(3996, 10)

In [31]:
# All the challenge points from final

data_id = list(range(61, 71)) + list(range(101, 111))
df_final_folder = collect_challenge_points(
    attack_type="tabddpm_black_box",
    data_dir=data_dir,
    data_id=data_id,
    dataset = "final"
)

In [32]:
df_final_folder.shape

(3991, 10)

In [33]:
# All the challenge points
df_challenge_all = pd.concat([df_train_folder, df_dev_folder, df_final_folder]).drop_duplicates()
df_challenge_all.shape

(13896, 10)

In [34]:
# Save all the challenge points
df_challenge_all.to_csv(
    dest_dir / "challenge_points_all.csv",
    index=False,
)

In [40]:
# Keep only the population rows whose trans_id does not appear among the challenge points
df_population_no_challenge = df_population[~df_population["trans_id"].isin(df_challenge_all["trans_id"])]

In [41]:
df_population_no_challenge.shape

(855644, 10)

In [42]:
df_population_no_challenge.to_csv(
    dest_dir / "population_all_no_challenge.csv",
    index=False,
)

In [43]:
df_population_no_challenge_no_id = df_population_no_challenge.drop(columns=["trans_id", "account_id"])

In [45]:
df_population_no_challenge_no_id.to_csv(
    dest_dir / "population_all_no_challenge_no_id.csv",
    index=False,
)

In [46]:
# Append every collected challenge point (train, dev and final folders) to the
# challenge-free population
df_population_with_challenge = pd.concat([df_population_no_challenge, df_challenge_all])
df_population_with_challenge.shape

(869540, 10)

In [47]:
df_population_with_challenge[df_population_with_challenge["trans_id"].isin(df_challenge_all["trans_id"])].shape

(13896, 10)

In [48]:
df_population_with_challenge.to_csv(
    dest_dir / "population_all_with_challenge.csv",
    index=False,
)

In [49]:
df_population_with_challenge_no_id = df_population_with_challenge.drop(columns=["trans_id", "account_id"])
df_population_with_challenge_no_id.shape

(869540, 8)

In [50]:
df_population_with_challenge_no_id.to_csv(
    dest_dir / "population_all_with_challenge_no_id.csv",
    index=False,
)