## Random Forest Classifier

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score 
from sklearn.preprocessing import LabelEncoder

import os

In [2]:
file_name = "adult_original_test.csv"
org_test_df = pd.read_csv(f"../data_split/original_test_dataset/{file_name}")

org_test_df.head()

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,32,Others,33117,11,Married,Others,White,Male,0,0,40,United-States,<=50K
1,51,Others,233149,12,Married,White Collar,White,Male,0,0,45,United-States,>50K
2,84,Others,188328,9,Single,White Collar,White,Female,0,0,16,United-States,<=50K
3,45,Others,158685,9,Single,White Collar,White,Female,0,0,40,United-States,<=50K
4,44,Others,112847,4,Married,Blue Collar,Asian-Pac-Islander,Male,0,0,40,Philippines,<=50K


In [3]:
file_name = "adult_original_train.csv"
org_train_df = pd.read_csv(f"../data_split/original_train_dataset/{file_name}")

org_train_df.head()

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,22,Others,51362,13,Single,White Collar,White,Female,0,0,16,United-States,<=50K
1,41,Others,203217,4,Single,Blue Collar,White,Male,0,0,40,Mexico,<=50K
2,25,Others,51498,8,Single,Others,White,Male,0,0,40,United-States,<=50K
3,28,Others,179512,9,Single,White Collar,White,Female,0,0,50,United-States,<=50K
4,42,Others,177989,9,Married,Blue Collar,White,Male,0,0,40,United-States,<=50K


In [4]:
print("Len of original test dataset:", len(org_test_df))
print("Len of original train dataset:", len(org_train_df))

Len of original test dataset: 14904
Len of original train dataset: 30260


In [5]:
le_workclass = LabelEncoder()
le_marital_status = LabelEncoder()
le_occupation = LabelEncoder()
le_race = LabelEncoder()
le_gender = LabelEncoder()
le_native_country = LabelEncoder()
le_income = LabelEncoder()

column_name = "workclass"
org_train_df[column_name] = le_workclass.fit_transform(org_train_df[column_name])
org_test_df[column_name] = le_workclass.transform(org_test_df[column_name])

column_name = "marital-status"
org_train_df[column_name] = le_marital_status.fit_transform(org_train_df[column_name])
org_test_df[column_name] = le_marital_status.transform(org_test_df[column_name])

column_name = "occupation"
org_train_df[column_name] = le_occupation.fit_transform(org_train_df[column_name])
org_test_df[column_name] = le_occupation.transform(org_test_df[column_name])

column_name = "race"
org_train_df[column_name] = le_race.fit_transform(org_train_df[column_name])
org_test_df[column_name] = le_race.transform(org_test_df[column_name])

column_name = "gender"
org_train_df[column_name] = le_gender.fit_transform(org_train_df[column_name])
org_test_df[column_name] = le_gender.transform(org_test_df[column_name])

column_name = "native-country"
org_train_df[column_name] = le_native_country.fit_transform(org_train_df[column_name])
org_test_df[column_name] = le_native_country.transform(org_test_df[column_name])

column_name = "income"
org_train_df[column_name] = le_income.fit_transform(org_train_df[column_name])
org_test_df[column_name] = le_income.transform(org_test_df[column_name])



In [6]:
X_train = org_train_df.drop("income", axis=1)
Y_train = org_train_df["income"]

X_test_org = org_test_df.drop("income", axis=1)
Y_test_org = org_test_df["income"]


In [7]:
ml = RandomForestClassifier(random_state=32)
ml.fit(X_train, Y_train)

In [8]:
y_predict_test_org= ml.predict(X_test_org)


In [9]:
current_path = os.getcwd()
current_path

'c:\\__Local Disk D\\master thesis submit version\\execution\\results_analysis_code'

In [10]:
dataset_folder = "adult"
dataset_folder = f"../generated_datasets/{dataset_folder}"
models_name = os.listdir(dataset_folder)
models_name

['great_gpt2_12_layer_adult',
 'great_gpt2_6_layer_adult',
 'realtab_gpt2_12_layer_adult',
 'realtab_gpt2_6_layer_adult',
 'realtab_gpt_bigcode_12_layer_adult',
 'realtab_gpt_bigcode_6_layer_adult',
 'realtab_gpt_neo_2_layer_adult',
 'realtab_gpt_neo_4_layer_adult',
 'realtab_gpt_neo_6_layer_adult',
 'realtab_gpt_neo_8_layer_adult',
 'realtab_llama_1_layer_adult',
 'realtab_llama_2_layer_adult']

In [11]:
outputs = []

for model in models_name:

    model_output = {"model_name" : model}
    # print(model)

    for i in range(1, 6):
        # print(i)
        synt_test_df = pd.read_csv(f"../generated_datasets/{dataset_folder}/{model}/samples/test/sample{i}.csv", sep=",")[:14904]

        if "Unnamed: 0" in synt_test_df.columns:
            synt_test_df = synt_test_df.drop('Unnamed: 0', axis=1)

        column_name = "workclass"
        synt_test_df[column_name] = le_workclass.transform(synt_test_df[column_name])
        column_name = "marital-status"
        synt_test_df[column_name] = le_marital_status.transform(synt_test_df[column_name])
        column_name = "occupation"
        synt_test_df[column_name] = le_occupation.transform(synt_test_df[column_name])
        column_name = "race"
        synt_test_df[column_name] = le_race.transform(synt_test_df[column_name])
        column_name = "gender"
        synt_test_df[column_name] = le_gender.transform(synt_test_df[column_name])
        column_name = "native-country"
        synt_test_df[column_name] = le_native_country.transform(synt_test_df[column_name])
        column_name = "income"
        synt_test_df[column_name] = le_income.transform(synt_test_df[column_name])

        X_test_synt = synt_test_df.drop("income", axis=1)
        Y_test_synt = synt_test_df["income"]

        y_predict_syn_test = ml.predict(X_test_synt)
        f1_score_micro = f1_score(Y_test_synt, y_predict_syn_test, average='micro')
        f1_score_macro = f1_score(Y_test_synt, y_predict_syn_test, average='macro')
        f1_score_weighted = f1_score(Y_test_synt, y_predict_syn_test, average='weighted')
        accuracy_ = accuracy_score(Y_test_synt, y_predict_syn_test)
        precision_ = precision_score(Y_test_synt, y_predict_syn_test)
        recall_ = recall_score(Y_test_synt, y_predict_syn_test)

        model_output[i] = {
            "f1_score_micro": f1_score_micro,
            "f1_score_macro": f1_score_macro,
            "f1_score_weighted": f1_score_weighted,
            "accuracy": accuracy_,
            "precision": precision_,
            "recall": recall_,
        }

    outputs.append(model_output)


In [12]:
stats = {}

for model in outputs:
    f1_score_micro = []
    f1_score_macro = []
    f1_score_weighted = []
    accuracy_ = []
    precision_ = []
    recall_ = []

    for i in range(1, 6):
        f1_score_micro.append(model[i]["f1_score_micro"])
        f1_score_macro.append(model[i]["f1_score_macro"])
        f1_score_weighted.append(model[i]["f1_score_weighted"])
        accuracy_.append(model[i]["accuracy"])
        precision_.append(model[i]["precision"])
        recall_.append(model[i]["recall"])
    
    f1_score_micro_mean = np.mean(f1_score_micro)
    f1_score_micro_std = np.std(f1_score_micro, ddof=1)
    f1_score_macro_mean = np.mean(f1_score_macro)
    f1_score_macro_std = np.std(f1_score_macro, ddof=1)
    f1_score_weighted_mean = np.mean(f1_score_weighted)
    f1_score_weighted_std = np.std(f1_score_weighted, ddof=1)
    accuracy_mean = np.mean(accuracy_)
    accuracy_std = np.std(accuracy_, ddof=1)
    precision_mean = np.mean(precision_)
    precision_std = np.std(precision_, ddof=1)
    recall_mean = np.mean(recall_)
    recall_std = np.std(recall_, ddof=1)

    stats[model["model_name"]] = {
        "f1_score_micro": f1_score_micro_mean, 
        "f1_score_micro_std": f1_score_micro_std,
        "f1_score_macro": f1_score_macro_mean,
        "f1_score_macro_std": f1_score_macro_std,
        "f1_score_weighted": f1_score_weighted_mean,
        "f1_score_weighted_std": f1_score_weighted_std,
        "accuracy": accuracy_mean,
        "accuracy_std": accuracy_std,
        "precision": precision_mean,
        "precision_std": precision_std,
        "recall": recall_mean,
        "recall_std": recall_std        
    }



In [13]:
f1_score_micro = f1_score(Y_test_org, y_predict_test_org, average='micro')
f1_score_macro = f1_score(Y_test_org, y_predict_test_org, average='macro')
f1_score_weighted = f1_score(Y_test_org, y_predict_test_org, average='weighted')
accuracy_ = accuracy_score(Y_test_org, y_predict_test_org)
precision_ = precision_score(Y_test_org, y_predict_test_org)
recall_ = recall_score(Y_test_org, y_predict_test_org)

original = {
                "f1_score_micro" : f1_score_micro,
                "f1_score_micro_std" : 0,
                "f1_score_macro" : f1_score_macro,
                "f1_score_macro_std" : 0,
                "f1_score_weighted" : f1_score_weighted,
                "f1_score_weighted_std" : 0,
                "accuracy" : accuracy_,
                "accuracy_std" : 0,
                "precision" : precision_,
                "precision_std" : 0,
                "recall" : recall_,
                "recall_std" : 0
            }

stats["original"] = original



In [14]:
pd.DataFrame(stats).loc[["f1_score_macro", "f1_score_macro_std"]].T

Unnamed: 0,f1_score_macro,f1_score_macro_std
great_gpt2_12_layer_adult,0.786043,0.004309
great_gpt2_6_layer_adult,0.77703,0.003556
realtab_gpt2_12_layer_adult,0.795454,0.001883
realtab_gpt2_6_layer_adult,0.801522,0.002989
realtab_gpt_bigcode_12_layer_adult,0.792441,0.004352
realtab_gpt_bigcode_6_layer_adult,0.806099,0.005174
realtab_gpt_neo_2_layer_adult,0.793311,0.017591
realtab_gpt_neo_4_layer_adult,0.784295,0.001649
realtab_gpt_neo_6_layer_adult,0.767416,0.003105
realtab_gpt_neo_8_layer_adult,0.773803,0.004173


In [15]:
pd.DataFrame(stats).loc[["accuracy", "accuracy_std"]].T

Unnamed: 0,accuracy,accuracy_std
great_gpt2_12_layer_adult,0.847692,0.003734
great_gpt2_6_layer_adult,0.841908,0.001113
realtab_gpt2_12_layer_adult,0.838218,0.001094
realtab_gpt2_6_layer_adult,0.86099,0.001876
realtab_gpt_bigcode_12_layer_adult,0.852402,0.002712
realtab_gpt_bigcode_6_layer_adult,0.857568,0.003572
realtab_gpt_neo_2_layer_adult,0.85045,0.012629
realtab_gpt_neo_4_layer_adult,0.833897,0.001962
realtab_gpt_neo_6_layer_adult,0.846095,0.00189
realtab_gpt_neo_8_layer_adult,0.846122,0.001943


## Logistic Regression

In [16]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score 
from sklearn.preprocessing import LabelEncoder

import os

from general_mle import mle_adult

In [17]:
file_name = "adult_original_test.csv"
org_test_df = pd.read_csv(f"../data_split/original_test_dataset/{file_name}")

file_name = "adult_original_train.csv"
org_train_df = pd.read_csv(f"../data_split/original_train_dataset/{file_name}")

dataset_folder = "adult"
dataset_folder = f"../generated_datasets/{dataset_folder}"


In [18]:
file_name = "adult_original_train.csv"
org_train_df = pd.read_csv(f"../data_split/original_train_dataset/{file_name}")

org_train_df.head()

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,22,Others,51362,13,Single,White Collar,White,Female,0,0,16,United-States,<=50K
1,41,Others,203217,4,Single,Blue Collar,White,Male,0,0,40,Mexico,<=50K
2,25,Others,51498,8,Single,Others,White,Male,0,0,40,United-States,<=50K
3,28,Others,179512,9,Single,White Collar,White,Female,0,0,50,United-States,<=50K
4,42,Others,177989,9,Married,Blue Collar,White,Male,0,0,40,United-States,<=50K


In [19]:
temp = mle_adult(org_test_df=org_test_df, org_train_df=org_train_df, ml_model=LogisticRegression(random_state=32),
                            path_to_synthetic_data=dataset_folder)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
pd.DataFrame(stats).loc[["f1_score_macro", "f1_score_macro_std"]].T

Unnamed: 0,f1_score_macro,f1_score_macro_std
great_gpt2_12_layer_adult,0.786043,0.004309
great_gpt2_6_layer_adult,0.77703,0.003556
realtab_gpt2_12_layer_adult,0.795454,0.001883
realtab_gpt2_6_layer_adult,0.801522,0.002989
realtab_gpt_bigcode_12_layer_adult,0.792441,0.004352
realtab_gpt_bigcode_6_layer_adult,0.806099,0.005174
realtab_gpt_neo_2_layer_adult,0.793311,0.017591
realtab_gpt_neo_4_layer_adult,0.784295,0.001649
realtab_gpt_neo_6_layer_adult,0.767416,0.003105
realtab_gpt_neo_8_layer_adult,0.773803,0.004173


In [21]:
pd.DataFrame(stats).loc[["accuracy", "accuracy_std"]].T

Unnamed: 0,accuracy,accuracy_std
great_gpt2_12_layer_adult,0.847692,0.003734
great_gpt2_6_layer_adult,0.841908,0.001113
realtab_gpt2_12_layer_adult,0.838218,0.001094
realtab_gpt2_6_layer_adult,0.86099,0.001876
realtab_gpt_bigcode_12_layer_adult,0.852402,0.002712
realtab_gpt_bigcode_6_layer_adult,0.857568,0.003572
realtab_gpt_neo_2_layer_adult,0.85045,0.012629
realtab_gpt_neo_4_layer_adult,0.833897,0.001962
realtab_gpt_neo_6_layer_adult,0.846095,0.00189
realtab_gpt_neo_8_layer_adult,0.846122,0.001943


## Catboost

In [22]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score 
from sklearn.preprocessing import LabelEncoder

import os

from general_mle import mle_adult

In [23]:
file_name = "adult_original_test.csv"
org_test_df = pd.read_csv(f"../data_split/original_test_dataset/{file_name}")

file_name = "adult_original_train.csv"
org_train_df = pd.read_csv(f"../data_split/original_train_dataset/{file_name}")

dataset_folder = "adult"
dataset_folder = f"../generated_datasets/{dataset_folder}"


In [24]:
temp = mle_adult(org_test_df=org_test_df, org_train_df=org_train_df, ml_model=CatBoostClassifier(random_state=32),
                            path_to_synthetic_data=dataset_folder)

Learning rate set to 0.044184
0:	learn: 0.6503175	total: 142ms	remaining: 2m 21s
1:	learn: 0.6096526	total: 152ms	remaining: 1m 15s
2:	learn: 0.5782235	total: 160ms	remaining: 53.3s
3:	learn: 0.5474095	total: 167ms	remaining: 41.7s
4:	learn: 0.5199013	total: 175ms	remaining: 34.7s
5:	learn: 0.4994793	total: 182ms	remaining: 30.1s
6:	learn: 0.4787114	total: 190ms	remaining: 27s
7:	learn: 0.4630303	total: 198ms	remaining: 24.6s
8:	learn: 0.4470104	total: 204ms	remaining: 22.5s
9:	learn: 0.4331414	total: 212ms	remaining: 21s
10:	learn: 0.4216798	total: 219ms	remaining: 19.6s
11:	learn: 0.4108738	total: 229ms	remaining: 18.8s
12:	learn: 0.4022098	total: 236ms	remaining: 18s
13:	learn: 0.3945132	total: 244ms	remaining: 17.2s
14:	learn: 0.3880601	total: 251ms	remaining: 16.5s
15:	learn: 0.3818304	total: 258ms	remaining: 15.9s
16:	learn: 0.3771999	total: 264ms	remaining: 15.3s
17:	learn: 0.3712705	total: 271ms	remaining: 14.8s
18:	learn: 0.3670281	total: 278ms	remaining: 14.3s
19:	learn: 0.36

In [25]:
pd.DataFrame(stats).loc[["f1_score_macro", "f1_score_macro_std"]].T

Unnamed: 0,f1_score_macro,f1_score_macro_std
great_gpt2_12_layer_adult,0.786043,0.004309
great_gpt2_6_layer_adult,0.77703,0.003556
realtab_gpt2_12_layer_adult,0.795454,0.001883
realtab_gpt2_6_layer_adult,0.801522,0.002989
realtab_gpt_bigcode_12_layer_adult,0.792441,0.004352
realtab_gpt_bigcode_6_layer_adult,0.806099,0.005174
realtab_gpt_neo_2_layer_adult,0.793311,0.017591
realtab_gpt_neo_4_layer_adult,0.784295,0.001649
realtab_gpt_neo_6_layer_adult,0.767416,0.003105
realtab_gpt_neo_8_layer_adult,0.773803,0.004173


In [26]:
pd.DataFrame(stats).loc[["accuracy", "accuracy_std"]].T

Unnamed: 0,accuracy,accuracy_std
great_gpt2_12_layer_adult,0.847692,0.003734
great_gpt2_6_layer_adult,0.841908,0.001113
realtab_gpt2_12_layer_adult,0.838218,0.001094
realtab_gpt2_6_layer_adult,0.86099,0.001876
realtab_gpt_bigcode_12_layer_adult,0.852402,0.002712
realtab_gpt_bigcode_6_layer_adult,0.857568,0.003572
realtab_gpt_neo_2_layer_adult,0.85045,0.012629
realtab_gpt_neo_4_layer_adult,0.833897,0.001962
realtab_gpt_neo_6_layer_adult,0.846095,0.00189
realtab_gpt_neo_8_layer_adult,0.846122,0.001943


## Decision Tree

In [27]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score 
from sklearn.preprocessing import LabelEncoder

import os

from general_mle import mle_adult

In [28]:
file_name = "adult_original_test.csv"
org_test_df = pd.read_csv(f"../data_split/original_test_dataset/{file_name}")

org_test_df.head()

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,32,Others,33117,11,Married,Others,White,Male,0,0,40,United-States,<=50K
1,51,Others,233149,12,Married,White Collar,White,Male,0,0,45,United-States,>50K
2,84,Others,188328,9,Single,White Collar,White,Female,0,0,16,United-States,<=50K
3,45,Others,158685,9,Single,White Collar,White,Female,0,0,40,United-States,<=50K
4,44,Others,112847,4,Married,Blue Collar,Asian-Pac-Islander,Male,0,0,40,Philippines,<=50K


In [29]:
file_name = "adult_original_train.csv"
org_train_df = pd.read_csv(f"../data_split/original_train_dataset/{file_name}")

org_train_df.head()

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,22,Others,51362,13,Single,White Collar,White,Female,0,0,16,United-States,<=50K
1,41,Others,203217,4,Single,Blue Collar,White,Male,0,0,40,Mexico,<=50K
2,25,Others,51498,8,Single,Others,White,Male,0,0,40,United-States,<=50K
3,28,Others,179512,9,Single,White Collar,White,Female,0,0,50,United-States,<=50K
4,42,Others,177989,9,Married,Blue Collar,White,Male,0,0,40,United-States,<=50K


In [None]:
dataset_folder = "adult"
dataset_folder = f"../generated_datasets/{dataset_folder}"


temp = mle_adult(org_test_df=org_test_df, org_train_df=org_train_df, ml_model=DecisionTreeClassifier(random_state=32), 
        path_to_synthetic_data=dataset_folder)

In [31]:
temp

{'great_gpt2_12_layer_adult': {'f1_score_micro': 0.8149490069779924,
  'f1_score_micro_std': 0.003727608546234778,
  'f1_score_macro': 0.7477966232972572,
  'f1_score_macro_std': 0.0046195799768698215,
  'f1_score_weighted': 0.8125291930424314,
  'f1_score_weighted_std': 0.0035911783703025113,
  'accuracy': 0.8149490069779924,
  'accuracy_std': 0.003727608546234778,
  'precision': 0.642426371600226,
  'precision_std': 0.010492776098655067,
  'recall': 0.5947787343021197,
  'recall_std': 0.005711077601585539},
 'great_gpt2_6_layer_adult': {'f1_score_micro': 0.8106816961889425,
  'f1_score_micro_std': 0.0025268601683942557,
  'f1_score_macro': 0.7416808242490783,
  'f1_score_macro_std': 0.004801703992005672,
  'f1_score_weighted': 0.8081157381017876,
  'f1_score_weighted_std': 0.002525806672215223,
  'accuracy': 0.8106816961889425,
  'accuracy_std': 0.0025268601683942557,
  'precision': 0.6334420242938806,
  'precision_std': 0.011250292738751586,
  'recall': 0.5849140478403381,
  'recall

In [32]:
pd.DataFrame(temp).loc[["f1_score_macro", "f1_score_macro_std"]].T

Unnamed: 0,f1_score_macro,f1_score_macro_std
great_gpt2_12_layer_adult,0.747797,0.00462
great_gpt2_6_layer_adult,0.741681,0.004802
realtab_gpt2_12_layer_adult,0.761733,0.003693
realtab_gpt2_6_layer_adult,0.756342,0.004022
realtab_gpt_bigcode_12_layer_adult,0.754808,0.004432
realtab_gpt_bigcode_6_layer_adult,0.76425,0.004963
realtab_gpt_neo_2_layer_adult,0.765208,0.023871
realtab_gpt_neo_4_layer_adult,0.75013,0.003301
realtab_gpt_neo_6_layer_adult,0.725451,0.003147
realtab_gpt_neo_8_layer_adult,0.735677,0.004798


In [33]:
pd.DataFrame(temp).loc[["accuracy", "accuracy_std"]].T

Unnamed: 0,accuracy,accuracy_std
great_gpt2_12_layer_adult,0.814949,0.003728
great_gpt2_6_layer_adult,0.810682,0.002527
realtab_gpt2_12_layer_adult,0.803999,0.002444
realtab_gpt2_6_layer_adult,0.819646,0.003379
realtab_gpt_bigcode_12_layer_adult,0.816895,0.003617
realtab_gpt_bigcode_6_layer_adult,0.817888,0.004345
realtab_gpt_neo_2_layer_adult,0.821081,0.018839
realtab_gpt_neo_4_layer_adult,0.798162,0.002313
realtab_gpt_neo_6_layer_adult,0.806616,0.001935
realtab_gpt_neo_8_layer_adult,0.809286,0.002633
