In [6]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score 
from sklearn.preprocessing import LabelEncoder

import os

In [7]:
file_name = "adult_original_test.csv"
org_test_df = pd.read_csv(f"../data_split/original_test_dataset/{file_name}")

org_test_df.head()

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,32,Others,33117,11,Married,Others,White,Male,0,0,40,United-States,<=50K
1,51,Others,233149,12,Married,White Collar,White,Male,0,0,45,United-States,>50K
2,84,Others,188328,9,Single,White Collar,White,Female,0,0,16,United-States,<=50K
3,45,Others,158685,9,Single,White Collar,White,Female,0,0,40,United-States,<=50K
4,44,Others,112847,4,Married,Blue Collar,Asian-Pac-Islander,Male,0,0,40,Philippines,<=50K


In [8]:
file_name = "adult_original_train.csv"
org_train_df = pd.read_csv(f"../data_split/original_train_dataset/{file_name}")

org_train_df.head()

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,22,Others,51362,13,Single,White Collar,White,Female,0,0,16,United-States,<=50K
1,41,Others,203217,4,Single,Blue Collar,White,Male,0,0,40,Mexico,<=50K
2,25,Others,51498,8,Single,Others,White,Male,0,0,40,United-States,<=50K
3,28,Others,179512,9,Single,White Collar,White,Female,0,0,50,United-States,<=50K
4,42,Others,177989,9,Married,Blue Collar,White,Male,0,0,40,United-States,<=50K


In [9]:
len(org_train_df) + len(org_test_df)

45164

In [11]:
dataset_folder = "adult"
models_name = os.listdir(f"../generated_datasets/{dataset_folder}")
models_name

['great_gpt2_12_layer_adult',
 'great_gpt2_6_layer_adult',
 'realtab_gpt2_12_layer_adult',
 'realtab_gpt2_6_layer_adult',
 'realtab_gpt_bigcode_12_layer_adult',
 'realtab_gpt_bigcode_6_layer_adult',
 'realtab_gpt_neo_2_layer_adult',
 'realtab_gpt_neo_4_layer_adult',
 'realtab_gpt_neo_6_layer_adult',
 'realtab_gpt_neo_8_layer_adult',
 'realtab_llama_1_layer_adult',
 'realtab_llama_2_layer_adult']

In [14]:
outputs = []
org_train_df["original"] = [1]*len(org_train_df)
org_test_df["original"] = [1]*len(org_test_df)

for model in models_name:

    print(model)
    model_output = {"model_name" : model}
    for i in range(1, 6):

        print(i)
        synt_test_df = pd.read_csv(f"../generated_datasets/{dataset_folder}/{model}/samples/test/sample{i}.csv", sep=",")[:len(org_test_df)]
        synt_train_df = pd.read_csv(f"../generated_datasets/{dataset_folder}/{model}/samples/train/sample{i}.csv", sep=",")[:len(org_train_df)]        

        if "Unnamed: 0" in synt_test_df.columns:
            synt_test_df = synt_test_df.drop('Unnamed: 0', axis=1)

        if "Unnamed: 0" in synt_train_df.columns:
            synt_train_df = synt_train_df.drop('Unnamed: 0', axis=1)

        synt_test_df["original"] = [0]*len(synt_test_df)
        synt_train_df["original"] = [0]*len(synt_train_df)

        train_set = pd.concat([org_train_df, synt_train_df], ignore_index=True)
        test_set = pd.concat([org_test_df, synt_test_df], ignore_index=True)

        le_workclass = LabelEncoder()
        le_marital_status = LabelEncoder()
        le_occupation = LabelEncoder()
        le_race = LabelEncoder()
        le_gender = LabelEncoder()
        le_native_country = LabelEncoder()
        le_income = LabelEncoder()

        column_name = "workclass"
        train_set[column_name] = le_workclass.fit_transform(train_set[column_name])
        test_set[column_name] = le_workclass.transform(test_set[column_name])

        column_name = "marital-status"
        train_set[column_name] = le_marital_status.fit_transform(train_set[column_name])
        test_set[column_name] = le_marital_status.transform(test_set[column_name])

        column_name = "occupation"
        train_set[column_name] = le_occupation.fit_transform(train_set[column_name])
        test_set[column_name] = le_occupation.transform(test_set[column_name])

        column_name = "race"
        train_set[column_name] = le_race.fit_transform(train_set[column_name])
        test_set[column_name] = le_race.transform(test_set[column_name])

        column_name = "gender"
        train_set[column_name] = le_gender.fit_transform(train_set[column_name])
        test_set[column_name] = le_gender.transform(test_set[column_name])

        column_name = "native-country"
        train_set[column_name] = le_native_country.fit_transform(train_set[column_name])
        test_set[column_name] = le_native_country.transform(test_set[column_name])

        column_name = "income"
        train_set[column_name] = le_income.fit_transform(train_set[column_name])
        test_set[column_name] = le_income.transform(test_set[column_name])

        X_train = train_set.drop("original", axis=1)
        Y_train = train_set["original"]
        X_test = test_set.drop("original", axis=1)
        Y_test = test_set["original"]

        ml = RandomForestClassifier()
        ml.fit(X_train, Y_train)
        y_predict = ml.predict(X_test)

        accuracy_score_ = accuracy_score(Y_test, y_predict)

        model_output[i] = {
            "accuracy": accuracy_score_,
        }

    outputs.append(model_output)


great_gpt2_12_layer_adult
1
2
3
4
5
great_gpt2_6_layer_adult
1
2
3
4
5
realtab_gpt2_12_layer_adult
1
2
3
4
5
realtab_gpt2_6_layer_adult
1
2
3
4
5
realtab_gpt_bigcode_12_layer_adult
1
2
3
4
5
realtab_gpt_bigcode_6_layer_adult
1
2
3
4
5
realtab_gpt_neo_2_layer_adult
1
2
3
4
5
realtab_gpt_neo_4_layer_adult
1
2
3
4
5
realtab_gpt_neo_6_layer_adult
1
2
3
4
5
realtab_gpt_neo_8_layer_adult
1
2
3
4
5
realtab_llama_1_layer_adult
1
2
3
4
5
realtab_llama_2_layer_adult
1
2
3
4
5


In [15]:
stats = {}

for model in outputs:
    accuracy_ = []

    for i in range(1, 6):
        accuracy_.append(model[i]["accuracy"])
    
    accuracy_mean = np.mean(accuracy_)
    accuracy_std = np.std(accuracy_, ddof=1)

    stats[model["model_name"]] = {
        "accuracy": accuracy_mean,
        "accuracy_std": accuracy_std,
    }


In [16]:
pd.DataFrame(stats).T

Unnamed: 0,accuracy,accuracy_std
great_gpt2_12_layer_adult,0.676167,0.003556
great_gpt2_6_layer_adult,0.680119,0.00275
realtab_gpt2_12_layer_adult,0.546565,0.00469
realtab_gpt2_6_layer_adult,0.545887,0.001257
realtab_gpt_bigcode_12_layer_adult,0.550289,0.002086
realtab_gpt_bigcode_6_layer_adult,0.542458,0.002026
realtab_gpt_neo_2_layer_adult,0.593268,0.005595
realtab_gpt_neo_4_layer_adult,0.566023,0.002381
realtab_gpt_neo_6_layer_adult,0.566472,0.001248
realtab_gpt_neo_8_layer_adult,0.528469,0.003089


## Using general_dm function

In [17]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score 
from sklearn.preprocessing import LabelEncoder

import os
from general_dm import dm_adult

In [18]:
file_name = "adult_original_test.csv"
org_test_df = pd.read_csv(f"../data_split/original_test_dataset/{file_name}")

org_test_df.head()

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,32,Others,33117,11,Married,Others,White,Male,0,0,40,United-States,<=50K
1,51,Others,233149,12,Married,White Collar,White,Male,0,0,45,United-States,>50K
2,84,Others,188328,9,Single,White Collar,White,Female,0,0,16,United-States,<=50K
3,45,Others,158685,9,Single,White Collar,White,Female,0,0,40,United-States,<=50K
4,44,Others,112847,4,Married,Blue Collar,Asian-Pac-Islander,Male,0,0,40,Philippines,<=50K


In [19]:
file_name = "adult_original_train.csv"
org_train_df = pd.read_csv(f"../data_split/original_train_dataset/{file_name}")

org_train_df.head()

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,22,Others,51362,13,Single,White Collar,White,Female,0,0,16,United-States,<=50K
1,41,Others,203217,4,Single,Blue Collar,White,Male,0,0,40,Mexico,<=50K
2,25,Others,51498,8,Single,Others,White,Male,0,0,40,United-States,<=50K
3,28,Others,179512,9,Single,White Collar,White,Female,0,0,50,United-States,<=50K
4,42,Others,177989,9,Married,Blue Collar,White,Male,0,0,40,United-States,<=50K


In [20]:
temp = dm_adult(org_train_df=org_train_df, org_test_df=org_test_df, ml_model=RandomForestClassifier(random_state=32), 
                path_to_synthetic_dataset="../generated_datasets/adult")

great_gpt2_12_layer_adult
1
2
3
4
5
great_gpt2_6_layer_adult
1
2
3
4
5
realtab_gpt2_12_layer_adult
1
2
3
4
5
realtab_gpt2_6_layer_adult
1
2
3
4
5
realtab_gpt_bigcode_12_layer_adult
1
2
3
4
5
realtab_gpt_bigcode_6_layer_adult
1
2
3
4
5
realtab_gpt_neo_2_layer_adult
1
2
3
4
5
realtab_gpt_neo_4_layer_adult
1
2
3
4
5
realtab_gpt_neo_6_layer_adult
1
2
3
4
5
realtab_gpt_neo_8_layer_adult
1
2
3
4
5
realtab_llama_1_layer_adult
1
2
3
4
5
realtab_llama_2_layer_adult
1
2
3
4
5


In [21]:
pd.DataFrame(temp).T

Unnamed: 0,accuracy,accuracy_std
great_gpt2_12_layer_adult,0.676275,0.002833
great_gpt2_6_layer_adult,0.680475,0.001406
realtab_gpt2_12_layer_adult,0.546773,0.002217
realtab_gpt2_6_layer_adult,0.545095,0.002983
realtab_gpt_bigcode_12_layer_adult,0.54994,0.003127
realtab_gpt_bigcode_6_layer_adult,0.54003,0.001413
realtab_gpt_neo_2_layer_adult,0.593019,0.00414
realtab_gpt_neo_4_layer_adult,0.565815,0.001737
realtab_gpt_neo_6_layer_adult,0.566694,0.002325
realtab_gpt_neo_8_layer_adult,0.528019,0.003808
