## RandomForestClassifier

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score 
from sklearn.preprocessing import LabelEncoder

import os

In [2]:
file_name = "Customer_travel_original_test.csv"
org_test_df = pd.read_csv(f"../data_split/original_test_dataset/{file_name}")

org_test_df.head()

Unnamed: 0,Age,FrequentFlyer,AnnualIncomeClass,ServicesOpted,AccountSyncedToSocialMedia,BookedHotelOrNot,Target
0,35,Yes,High Income,6,No,Yes,1
1,33,Yes,High Income,1,No,No,0
2,31,No,Low Income,1,Yes,Yes,0
3,34,Yes,Low Income,5,No,Yes,0
4,28,No,Middle Income,4,No,No,1


In [3]:
file_name = "Customer_travel_original_train.csv"
org_train_df = pd.read_csv(f"../data_split/original_train_dataset/{file_name}")

org_train_df.head()

Unnamed: 0,Age,FrequentFlyer,AnnualIncomeClass,ServicesOpted,AccountSyncedToSocialMedia,BookedHotelOrNot,Target
0,38,Yes,Low Income,6,No,Yes,0
1,34,Yes,Low Income,5,Yes,No,1
2,37,No,Low Income,1,No,No,0
3,31,Yes,High Income,1,No,No,0
4,30,No,Low Income,1,Yes,No,0


In [4]:
print("Len of original test dataset:", len(org_test_df))
print("Len of original train dataset:", len(org_train_df))

Len of original test dataset: 315
Len of original train dataset: 639


In [5]:
le_frequent_flyer = LabelEncoder()
le_annual_income_class = LabelEncoder()
le_account_syncted = LabelEncoder()
le_booked = LabelEncoder()

column_name = "FrequentFlyer"
org_train_df[column_name] = le_frequent_flyer.fit_transform(org_train_df[column_name])
org_test_df[column_name] = le_frequent_flyer.transform(org_test_df[column_name])

column_name = "AnnualIncomeClass"
org_train_df[column_name] = le_annual_income_class.fit_transform(org_train_df[column_name])
org_test_df[column_name] = le_annual_income_class.transform(org_test_df[column_name])

column_name = "AccountSyncedToSocialMedia"
org_train_df[column_name] = le_account_syncted.fit_transform(org_train_df[column_name])
org_test_df[column_name] = le_account_syncted.transform(org_test_df[column_name])

column_name = "BookedHotelOrNot"
org_train_df[column_name] = le_booked.fit_transform(org_train_df[column_name])
org_test_df[column_name] = le_booked.transform(org_test_df[column_name])


In [6]:
X_train = org_train_df.drop("Target", axis=1)
Y_train = org_train_df["Target"]

X_test_org = org_test_df.drop("Target", axis=1)
Y_test_org = org_test_df["Target"]


In [7]:
ml = RandomForestClassifier(random_state=32)
ml.fit(X_train, Y_train)

In [8]:
y_predict_test_org= ml.predict(X_test_org)


In [9]:
current_path = os.getcwd()
current_path

'c:\\__Local Disk D\\master thesis submit version\\execution\\results_analysis_code'

In [10]:
dataset_folder = "customer_travel"
dataset_folder = f"../generated_datasets/{dataset_folder}"
models_name = os.listdir(dataset_folder)
# models_name

In [11]:
outputs = []

for model in models_name:

    model_output = {"model_name" : model}
    # print(model)

    for i in range(1, 6):
        # print(i)
        synt_test_df = pd.read_csv(f"../generated_datasets/{dataset_folder}/{model}/samples/test/sample{i}.csv", sep=",")[:315]

        if "Unnamed: 0" in synt_test_df.columns:
            synt_test_df = synt_test_df.drop('Unnamed: 0', axis=1)

        column_name = "FrequentFlyer"
        synt_test_df[column_name] = le_frequent_flyer.transform(synt_test_df[column_name])
        column_name = "AnnualIncomeClass"
        synt_test_df[column_name] = le_annual_income_class.transform(synt_test_df[column_name])
        column_name = "AccountSyncedToSocialMedia"
        synt_test_df[column_name] = le_account_syncted.transform(synt_test_df[column_name])
        column_name = "BookedHotelOrNot"
        synt_test_df[column_name] = le_booked.transform(synt_test_df[column_name])

        X_test_synt = synt_test_df.drop("Target", axis=1)
        Y_test_synt = synt_test_df["Target"]

        y_predict_syn_test = ml.predict(X_test_synt)
        f1_score_micro = f1_score(Y_test_synt, y_predict_syn_test, average='micro')
        f1_score_macro = f1_score(Y_test_synt, y_predict_syn_test, average='macro')
        f1_score_weighted = f1_score(Y_test_synt, y_predict_syn_test, average='weighted')
        accuracy_ = accuracy_score(Y_test_synt, y_predict_syn_test)
        precision_ = precision_score(Y_test_synt, y_predict_syn_test)
        recall_ = recall_score(Y_test_synt, y_predict_syn_test)

        model_output[i] = {
            "f1_score_micro": f1_score_micro,
            "f1_score_macro": f1_score_macro,
            "f1_score_weighted": f1_score_weighted,
            "accuracy": accuracy_,
            "precision": precision_,
            "recall": recall_,
        }

    outputs.append(model_output)


In [12]:
stats = {}

for model in outputs:
    f1_score_micro = []
    f1_score_macro = []
    f1_score_weighted = []
    accuracy_ = []
    precision_ = []
    recall_ = []

    for i in range(1, 6):
        f1_score_micro.append(model[i]["f1_score_micro"])
        f1_score_macro.append(model[i]["f1_score_macro"])
        f1_score_weighted.append(model[i]["f1_score_weighted"])
        accuracy_.append(model[i]["accuracy"])
        precision_.append(model[i]["precision"])
        recall_.append(model[i]["recall"])
    
    f1_score_micro_mean = np.mean(f1_score_micro)
    f1_score_micro_std = np.std(f1_score_micro, ddof=1)
    f1_score_macro_mean = np.mean(f1_score_macro)
    f1_score_macro_std = np.std(f1_score_macro, ddof=1)
    f1_score_weighted_mean = np.mean(f1_score_weighted)
    f1_score_weighted_std = np.std(f1_score_weighted, ddof=1)
    accuracy_mean = np.mean(accuracy_)
    accuracy_std = np.std(accuracy_, ddof=1)
    precision_mean = np.mean(precision_)
    precision_std = np.std(precision_, ddof=1)
    recall_mean = np.mean(recall_)
    recall_std = np.std(recall_, ddof=1)

    stats[model["model_name"]] = {
        "f1_score_micro": f1_score_micro_mean, 
        "f1_score_micro_std": f1_score_micro_std,
        "f1_score_macro": f1_score_macro_mean,
        "f1_score_macro_std": f1_score_macro_std,
        "f1_score_weighted": f1_score_weighted_mean,
        "f1_score_weighted_std": f1_score_weighted_std,
        "accuracy": accuracy_mean,
        "accuracy_std": accuracy_std,
        "precision": precision_mean,
        "precision_std": precision_std,
        "recall": recall_mean,
        "recall_std": recall_std        
    }



In [13]:
f1_score_micro = f1_score(Y_test_org, y_predict_test_org, average='micro')
f1_score_macro = f1_score(Y_test_org, y_predict_test_org, average='macro')
f1_score_weighted = f1_score(Y_test_org, y_predict_test_org, average='weighted')
accuracy_ = accuracy_score(Y_test_org, y_predict_test_org)
precision_ = precision_score(Y_test_org, y_predict_test_org)
recall_ = recall_score(Y_test_org, y_predict_test_org)

original = {
                "f1_score_micro" : f1_score_micro,
                "f1_score_micro_std" : 0,
                "f1_score_macro" : f1_score_macro,
                "f1_score_macro_std" : 0,
                "f1_score_weighted" : f1_score_weighted,
                "f1_score_weighted_std" : 0,
                "accuracy" : accuracy_,
                "accuracy_std" : 0,
                "precision" : precision_,
                "precision_std" : 0,
                "recall" : recall_,
                "recall_std" : 0
            }

stats["original"] = original



In [14]:
pd.DataFrame(stats).loc[["f1_score_macro", "f1_score_macro_std"]].T

Unnamed: 0,f1_score_macro,f1_score_macro_std
great_gpt2_12_layer_customer_travel,0.583633,0.03051
great_gpt2_6_layer_customer_travel,0.528157,0.034146
great_gpt_bigcode_12_layer_customer_travel,0.591151,0.048742
great_gpt_bigcode_6_layer_customer_travel,0.6206,0.019976
great_gpt_j_1_layer_customer_travel,0.636292,0.04481
great_gpt_neox_1_layer_customer_travel,0.709753,0.043359
great_gpt_neo_2_layer_customer_travel,0.741934,0.049679
great_gpt_neo_4_layer_customer_travel,0.807451,0.008041
great_gpt_neo_6_layer_customer_travel,0.791393,0.043702
great_gpt_neo_8_layer_customer_travel,0.766197,0.019003


In [15]:
pd.DataFrame(stats).loc[["accuracy", "accuracy_std"]].T

Unnamed: 0,accuracy,accuracy_std
great_gpt2_12_layer_customer_travel,0.787302,0.012295
great_gpt2_6_layer_customer_travel,0.753016,0.006882
great_gpt_bigcode_12_layer_customer_travel,0.773968,0.018835
great_gpt_bigcode_6_layer_customer_travel,0.781587,0.014444
great_gpt_j_1_layer_customer_travel,0.809524,0.025198
great_gpt_neox_1_layer_customer_travel,0.83873,0.024936
great_gpt_neo_2_layer_customer_travel,0.846349,0.02382
great_gpt_neo_4_layer_customer_travel,0.88127,0.009681
great_gpt_neo_6_layer_customer_travel,0.867302,0.027893
great_gpt_neo_8_layer_customer_travel,0.857778,0.010384


## Logistic Regression

In [16]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score 
from sklearn.preprocessing import LabelEncoder

import os

from general_mle import mle_customer_travel

In [17]:
file_name = "Customer_travel_original_test.csv"
org_test_df = pd.read_csv(f"../data_split/original_test_dataset/{file_name}")

org_test_df.head()

Unnamed: 0,Age,FrequentFlyer,AnnualIncomeClass,ServicesOpted,AccountSyncedToSocialMedia,BookedHotelOrNot,Target
0,35,Yes,High Income,6,No,Yes,1
1,33,Yes,High Income,1,No,No,0
2,31,No,Low Income,1,Yes,Yes,0
3,34,Yes,Low Income,5,No,Yes,0
4,28,No,Middle Income,4,No,No,1


In [18]:
file_name = "Customer_travel_original_train.csv"
org_train_df = pd.read_csv(f"../data_split/original_train_dataset/{file_name}")

org_train_df.head()

Unnamed: 0,Age,FrequentFlyer,AnnualIncomeClass,ServicesOpted,AccountSyncedToSocialMedia,BookedHotelOrNot,Target
0,38,Yes,Low Income,6,No,Yes,0
1,34,Yes,Low Income,5,Yes,No,1
2,37,No,Low Income,1,No,No,0
3,31,Yes,High Income,1,No,No,0
4,30,No,Low Income,1,Yes,No,0


In [19]:
dataset_folder = "customer_travel"
dataset_folder = f"../generated_datasets/{dataset_folder}"

In [20]:
temp = mle_customer_travel(org_test_df=org_test_df, org_train_df=org_train_df, ml_model=LogisticRegression(random_state=32), 
                        path_to_synthetic_data=dataset_folder)

In [21]:
temp

{'great_gpt2_12_layer_customer_travel': {'f1_score_micro': 0.8095238095238095,
  'f1_score_micro_std': 0.01076560314781788,
  'f1_score_macro': 0.6142345052347322,
  'f1_score_macro_std': 0.027123338062725617,
  'f1_score_weighted': 0.7737268517818152,
  'f1_score_weighted_std': 0.011937863303565641,
  'accuracy': 0.8095238095238095,
  'accuracy_std': 0.01076560314781788,
  'precision': 0.6190062111801242,
  'precision_std': 0.06765394876609965,
  'recall': 0.2352120826033869,
  'recall_std': 0.04719805879678247},
 'great_gpt2_6_layer_customer_travel': {'f1_score_micro': 0.777142857142857,
  'f1_score_micro_std': 0.016556704521149584,
  'f1_score_macro': 0.5424024961475616,
  'f1_score_macro_std': 0.03800752898622831,
  'f1_score_weighted': 0.7304498824969702,
  'f1_score_weighted_std': 0.021986988931745256,
  'accuracy': 0.777142857142857,
  'accuracy_std': 0.016556704521149584,
  'precision': 0.42328431372549014,
  'precision_std': 0.09269924093276992,
  'recall': 0.1463599096053457,

In [22]:
pd.DataFrame(temp).loc[["f1_score_macro", "f1_score_macro_std"]].T

Unnamed: 0,f1_score_macro,f1_score_macro_std
great_gpt2_12_layer_customer_travel,0.614235,0.027123
great_gpt2_6_layer_customer_travel,0.542402,0.038008
great_gpt_bigcode_12_layer_customer_travel,0.651869,0.022528
great_gpt_bigcode_6_layer_customer_travel,0.628127,0.035862
great_gpt_j_1_layer_customer_travel,0.619544,0.027201
great_gpt_neox_1_layer_customer_travel,0.641495,0.02512
great_gpt_neo_2_layer_customer_travel,0.730721,0.035648
great_gpt_neo_4_layer_customer_travel,0.733292,0.040945
great_gpt_neo_6_layer_customer_travel,0.741226,0.037978
great_gpt_neo_8_layer_customer_travel,0.714594,0.031945


In [23]:
pd.DataFrame(temp).loc[["accuracy", "accuracy_std"]].T

Unnamed: 0,accuracy,accuracy_std
great_gpt2_12_layer_customer_travel,0.809524,0.010766
great_gpt2_6_layer_customer_travel,0.777143,0.016557
great_gpt_bigcode_12_layer_customer_travel,0.808254,0.012005
great_gpt_bigcode_6_layer_customer_travel,0.79746,0.02956
great_gpt_j_1_layer_customer_travel,0.813333,0.021225
great_gpt_neox_1_layer_customer_travel,0.822222,0.017388
great_gpt_neo_2_layer_customer_travel,0.846984,0.023372
great_gpt_neo_4_layer_customer_travel,0.848889,0.026806
great_gpt_neo_6_layer_customer_travel,0.850159,0.023155
great_gpt_neo_8_layer_customer_travel,0.83873,0.011534


## Catboost

In [24]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score 
from sklearn.preprocessing import LabelEncoder

import os

In [25]:
file_name = "Customer_travel_original_test.csv"
org_test_df = pd.read_csv(f"../data_split/original_test_dataset/{file_name}")

file_name = "Customer_travel_original_train.csv"
org_train_df = pd.read_csv(f"../data_split/original_train_dataset/{file_name}")

dataset_folder = "customer_travel"
dataset_folder = f"../generated_datasets/{dataset_folder}"

In [26]:
temp = mle_customer_travel(org_test_df=org_test_df, org_train_df=org_train_df, ml_model=CatBoostClassifier(random_state=32), 
                        path_to_synthetic_data=dataset_folder)

Learning rate set to 0.008509
0:	learn: 0.6873141	total: 125ms	remaining: 2m 4s
1:	learn: 0.6811414	total: 127ms	remaining: 1m 3s
2:	learn: 0.6737528	total: 129ms	remaining: 43s
3:	learn: 0.6686202	total: 130ms	remaining: 32.4s
4:	learn: 0.6635424	total: 133ms	remaining: 26.4s
5:	learn: 0.6539587	total: 134ms	remaining: 22.2s
6:	learn: 0.6470545	total: 135ms	remaining: 19.2s
7:	learn: 0.6399767	total: 136ms	remaining: 16.9s
8:	learn: 0.6337496	total: 137ms	remaining: 15.1s
9:	learn: 0.6291790	total: 138ms	remaining: 13.7s
10:	learn: 0.6249056	total: 139ms	remaining: 12.5s
11:	learn: 0.6166350	total: 140ms	remaining: 11.5s
12:	learn: 0.6105164	total: 142ms	remaining: 10.8s
13:	learn: 0.6022871	total: 145ms	remaining: 10.2s
14:	learn: 0.5935144	total: 147ms	remaining: 9.65s
15:	learn: 0.5844371	total: 148ms	remaining: 9.1s
16:	learn: 0.5795776	total: 149ms	remaining: 8.62s
17:	learn: 0.5760754	total: 150ms	remaining: 8.19s
18:	learn: 0.5726619	total: 151ms	remaining: 7.8s
19:	learn: 0.56

In [27]:
temp

{'great_gpt2_12_layer_customer_travel': {'f1_score_micro': 0.7898412698412699,
  'f1_score_micro_std': 0.017154610902992123,
  'f1_score_macro': 0.6025011472927202,
  'f1_score_macro_std': 0.0243360779317567,
  'f1_score_weighted': 0.7608361149142621,
  'f1_score_weighted_std': 0.017602608663406235,
  'accuracy': 0.7898412698412699,
  'accuracy_std': 0.017154610902992123,
  'precision': 0.5010513739545998,
  'precision_std': 0.06411466310308962,
  'recall': 0.24775176514306949,
  'recall_std': 0.04978583834820156},
 'great_gpt2_6_layer_customer_travel': {'f1_score_micro': 0.7536507936507937,
  'f1_score_micro_std': 0.01067157861204543,
  'f1_score_macro': 0.5256266042533775,
  'f1_score_macro_std': 0.036495684681751475,
  'f1_score_weighted': 0.7147958739065056,
  'f1_score_weighted_std': 0.011351789803672872,
  'accuracy': 0.7536507936507937,
  'accuracy_std': 0.01067157861204543,
  'precision': 0.3227039627039627,
  'precision_std': 0.12429146182796515,
  'recall': 0.1417663341294173

In [28]:
pd.DataFrame(temp).loc[["f1_score_macro", "f1_score_macro_std"]].T

Unnamed: 0,f1_score_macro,f1_score_macro_std
great_gpt2_12_layer_customer_travel,0.602501,0.024336
great_gpt2_6_layer_customer_travel,0.525627,0.036496
great_gpt_bigcode_12_layer_customer_travel,0.596462,0.043547
great_gpt_bigcode_6_layer_customer_travel,0.631087,0.025198
great_gpt_j_1_layer_customer_travel,0.656945,0.040252
great_gpt_neox_1_layer_customer_travel,0.722066,0.043915
great_gpt_neo_2_layer_customer_travel,0.744388,0.044576
great_gpt_neo_4_layer_customer_travel,0.822234,0.015836
great_gpt_neo_6_layer_customer_travel,0.79308,0.044941
great_gpt_neo_8_layer_customer_travel,0.795194,0.023226


In [29]:
pd.DataFrame(temp).loc[["accuracy", "accuracy_std"]].T

Unnamed: 0,accuracy,accuracy_std
great_gpt2_12_layer_customer_travel,0.789841,0.017155
great_gpt2_6_layer_customer_travel,0.753651,0.010672
great_gpt_bigcode_12_layer_customer_travel,0.773333,0.020769
great_gpt_bigcode_6_layer_customer_travel,0.785397,0.019258
great_gpt_j_1_layer_customer_travel,0.813968,0.024754
great_gpt_neox_1_layer_customer_travel,0.841905,0.026504
great_gpt_neo_2_layer_customer_travel,0.845714,0.023177
great_gpt_neo_4_layer_customer_travel,0.888254,0.012172
great_gpt_neo_6_layer_customer_travel,0.866667,0.028127
great_gpt_neo_8_layer_customer_travel,0.871111,0.01358


## Decision Tree

In [30]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score 
from sklearn.preprocessing import LabelEncoder

import os

In [31]:
file_name = "Customer_travel_original_test.csv"
org_test_df = pd.read_csv(f"../data_split/original_test_dataset/{file_name}")

file_name = "Customer_travel_original_train.csv"
org_train_df = pd.read_csv(f"../data_split/original_train_dataset/{file_name}")

dataset_folder = "customer_travel"
dataset_folder = f"../generated_datasets/{dataset_folder}"

In [32]:
temp = mle_customer_travel(org_test_df=org_test_df, org_train_df=org_train_df, ml_model=DecisionTreeClassifier(random_state=32), 
                        path_to_synthetic_data=dataset_folder)

In [33]:
temp

{'great_gpt2_12_layer_customer_travel': {'f1_score_micro': 0.7815873015873016,
  'f1_score_micro_std': 0.019231912674109196,
  'f1_score_macro': 0.5974237523998044,
  'f1_score_macro_std': 0.021702124828412248,
  'f1_score_weighted': 0.7552647834983122,
  'f1_score_weighted_std': 0.019946595603000224,
  'accuracy': 0.7815873015873016,
  'accuracy_std': 0.019231912674109196,
  'precision': 0.4673189808838896,
  'precision_std': 0.06481413514064534,
  'recall': 0.2515565111217285,
  'recall_std': 0.04305129095703371},
 'great_gpt2_6_layer_customer_travel': {'f1_score_micro': 0.7447619047619047,
  'f1_score_micro_std': 0.01465140413868485,
  'f1_score_macro': 0.5298598915100218,
  'f1_score_macro_std': 0.03587979694580315,
  'f1_score_weighted': 0.7126481324415124,
  'f1_score_weighted_std': 0.015003829642095774,
  'accuracy': 0.7447619047619047,
  'accuracy_std': 0.01465140413868485,
  'precision': 0.3072786862260547,
  'precision_std': 0.10067696296804361,
  'recall': 0.1623438206501087

In [34]:
pd.DataFrame(temp).loc[["f1_score_macro", "f1_score_macro_std"]].T

Unnamed: 0,f1_score_macro,f1_score_macro_std
great_gpt2_12_layer_customer_travel,0.597424,0.021702
great_gpt2_6_layer_customer_travel,0.52986,0.03588
great_gpt_bigcode_12_layer_customer_travel,0.600235,0.043298
great_gpt_bigcode_6_layer_customer_travel,0.61578,0.0187
great_gpt_j_1_layer_customer_travel,0.669056,0.030076
great_gpt_neox_1_layer_customer_travel,0.71832,0.048054
great_gpt_neo_2_layer_customer_travel,0.72932,0.045094
great_gpt_neo_4_layer_customer_travel,0.813091,0.012243
great_gpt_neo_6_layer_customer_travel,0.79262,0.039017
great_gpt_neo_8_layer_customer_travel,0.776107,0.00796


In [35]:
pd.DataFrame(temp).loc[["accuracy", "accuracy_std"]].T

Unnamed: 0,accuracy,accuracy_std
great_gpt2_12_layer_customer_travel,0.781587,0.019232
great_gpt2_6_layer_customer_travel,0.744762,0.014651
great_gpt_bigcode_12_layer_customer_travel,0.773333,0.021601
great_gpt_bigcode_6_layer_customer_travel,0.776508,0.017761
great_gpt_j_1_layer_customer_travel,0.817778,0.02089
great_gpt_neox_1_layer_customer_travel,0.839365,0.025946
great_gpt_neo_2_layer_customer_travel,0.839365,0.022848
great_gpt_neo_4_layer_customer_travel,0.88254,0.010039
great_gpt_neo_6_layer_customer_travel,0.866667,0.02365
great_gpt_neo_8_layer_customer_travel,0.861587,0.006583
