## RandomForestClassifier

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score 
from sklearn.preprocessing import LabelEncoder

import os

In [2]:
file_name = "stroke_healthcare_original_test.csv"
org_test_df = pd.read_csv(f"../data_split/original_test_dataset/{file_name}")

org_test_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Female,81.0,1,0,Yes,Private,Rural,74.02,25.0,no smoking,1
1,Female,24.0,0,0,No,Private,Urban,71.63,22.0,no smoking,0
2,Female,55.0,0,0,Yes,Private,Urban,71.02,21.2,no smoking,0
3,Female,32.0,0,0,Yes,Private,Rural,71.8,26.5,no smoking,0
4,Female,57.0,0,0,Yes,Private,Urban,83.14,31.9,no smoking,0


In [3]:
file_name = "stroke_healthcare_original_train.csv"
org_train_df = pd.read_csv(f"../data_split/original_train_dataset/{file_name}")

org_train_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Female,65.0,0,0,Yes,Private,Rural,95.87,29.8,no smoking,0
1,Female,21.0,0,0,No,Private,Rural,74.24,32.7,no smoking,0
2,Female,25.0,0,0,No,Private,Rural,92.82,24.1,no smoking,0
3,Female,24.0,0,0,Yes,Private,Urban,75.23,29.0,no smoking,0
4,Female,43.0,0,0,Yes,Govt_job,Rural,72.13,42.6,no smoking,0


In [4]:
print("Len of original test dataset:", len(org_test_df))
print("Len of original train dataset:", len(org_train_df))

Len of original test dataset: 1108
Len of original train dataset: 2249


In [5]:
le_gender = LabelEncoder()
le_ever_married = LabelEncoder()
le_work_type = LabelEncoder()
le_residence_type = LabelEncoder()
le_smoking_status = LabelEncoder()

column_name = "gender"
org_train_df[column_name] = le_gender.fit_transform(org_train_df[column_name])
org_test_df[column_name] = le_gender.transform(org_test_df[column_name])

column_name = "ever_married"
org_train_df[column_name] = le_ever_married.fit_transform(org_train_df[column_name])
org_test_df[column_name] = le_ever_married.transform(org_test_df[column_name])

column_name = "work_type"
org_train_df[column_name] = le_work_type.fit_transform(org_train_df[column_name])
org_test_df[column_name] = le_work_type.transform(org_test_df[column_name])

column_name = "Residence_type"
org_train_df[column_name] = le_residence_type.fit_transform(org_train_df[column_name])
org_test_df[column_name] = le_residence_type.transform(org_test_df[column_name])

column_name = "smoking_status"
org_train_df[column_name] = le_smoking_status.fit_transform(org_train_df[column_name])
org_test_df[column_name] = le_smoking_status.transform(org_test_df[column_name])


In [6]:
X_train = org_train_df.drop("stroke", axis=1)
Y_train = org_train_df["stroke"]

X_test_org = org_test_df.drop("stroke", axis=1)
Y_test_org = org_test_df["stroke"]


In [7]:
ml = RandomForestClassifier(random_state=32)
ml.fit(X_train, Y_train)

In [8]:
y_predict_test_org= ml.predict(X_test_org)


In [9]:
current_path = os.getcwd()
current_path

'c:\\__Local Disk D\\master thesis submit version\\execution\\results_analysis_code'

In [10]:
dataset_folder = "stroke_healthcare"
dataset_folder = f"../generated_datasets/{dataset_folder}"
models_name = os.listdir(dataset_folder)
# models_name

In [11]:
outputs = []

for model in models_name:

    model_output = {"model_name" : model}
    # print(model)

    for i in range(1, 6):
        # print(i)
        synt_test_df = pd.read_csv(f"../generated_datasets/{dataset_folder}/{model}/samples/test/sample{i}.csv", sep=",")[:315]

        if "Unnamed: 0" in synt_test_df.columns:
            synt_test_df = synt_test_df.drop('Unnamed: 0', axis=1)

        column_name = "gender"
        synt_test_df[column_name] = le_gender.transform(synt_test_df[column_name])
        column_name = "ever_married"
        synt_test_df[column_name] = le_ever_married.transform(synt_test_df[column_name])
        column_name = "work_type"
        synt_test_df[column_name] = le_work_type.transform(synt_test_df[column_name])
        column_name = "Residence_type"
        synt_test_df[column_name] = le_residence_type.transform(synt_test_df[column_name])
        column_name = "smoking_status"
        synt_test_df[column_name] = le_smoking_status.transform(synt_test_df[column_name])
        
        X_test_synt = synt_test_df.drop("stroke", axis=1)
        Y_test_synt = synt_test_df["stroke"]

        y_predict_syn_test = ml.predict(X_test_synt)
        f1_score_micro = f1_score(Y_test_synt, y_predict_syn_test, average='micro')
        f1_score_macro = f1_score(Y_test_synt, y_predict_syn_test, average='macro')
        f1_score_weighted = f1_score(Y_test_synt, y_predict_syn_test, average='weighted')
        accuracy_ = accuracy_score(Y_test_synt, y_predict_syn_test)
        precision_ = precision_score(Y_test_synt, y_predict_syn_test)
        recall_ = recall_score(Y_test_synt, y_predict_syn_test)

        model_output[i] = {
            "f1_score_micro": f1_score_micro,
            "f1_score_macro": f1_score_macro,
            "f1_score_weighted": f1_score_weighted,
            "accuracy": accuracy_,
            "precision": precision_,
            "recall": recall_,
        }

    outputs.append(model_output)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [12]:
stats = {}

for model in outputs:
    f1_score_micro = []
    f1_score_macro = []
    f1_score_weighted = []
    accuracy_ = []
    precision_ = []
    recall_ = []

    for i in range(1, 6):
        f1_score_micro.append(model[i]["f1_score_micro"])
        f1_score_macro.append(model[i]["f1_score_macro"])
        f1_score_weighted.append(model[i]["f1_score_weighted"])
        accuracy_.append(model[i]["accuracy"])
        precision_.append(model[i]["precision"])
        recall_.append(model[i]["recall"])
    
    f1_score_micro_mean = np.mean(f1_score_micro)
    f1_score_micro_std = np.std(f1_score_micro, ddof=1)
    f1_score_macro_mean = np.mean(f1_score_macro)
    f1_score_macro_std = np.std(f1_score_macro, ddof=1)
    f1_score_weighted_mean = np.mean(f1_score_weighted)
    f1_score_weighted_std = np.std(f1_score_weighted, ddof=1)
    accuracy_mean = np.mean(accuracy_)
    accuracy_std = np.std(accuracy_, ddof=1)
    precision_mean = np.mean(precision_)
    precision_std = np.std(precision_, ddof=1)
    recall_mean = np.mean(recall_)
    recall_std = np.std(recall_, ddof=1)

    stats[model["model_name"]] = {
        "f1_score_micro": f1_score_micro_mean, 
        "f1_score_micro_std": f1_score_micro_std,
        "f1_score_macro": f1_score_macro_mean,
        "f1_score_macro_std": f1_score_macro_std,
        "f1_score_weighted": f1_score_weighted_mean,
        "f1_score_weighted_std": f1_score_weighted_std,
        "accuracy": accuracy_mean,
        "accuracy_std": accuracy_std,
        "precision": precision_mean,
        "precision_std": precision_std,
        "recall": recall_mean,
        "recall_std": recall_std        
    }



In [13]:
f1_score_micro = f1_score(Y_test_org, y_predict_test_org, average='micro')
f1_score_macro = f1_score(Y_test_org, y_predict_test_org, average='macro')
f1_score_weighted = f1_score(Y_test_org, y_predict_test_org, average='weighted')
accuracy_ = accuracy_score(Y_test_org, y_predict_test_org)
precision_ = precision_score(Y_test_org, y_predict_test_org)
recall_ = recall_score(Y_test_org, y_predict_test_org)

original = {
                "f1_score_micro" : f1_score_micro,
                "f1_score_micro_std" : 0,
                "f1_score_macro" : f1_score_macro,
                "f1_score_macro_std" : 0,
                "f1_score_weighted" : f1_score_weighted,
                "f1_score_weighted_std" : 0,
                "accuracy" : accuracy_,
                "accuracy_std" : 0,
                "precision" : precision_,
                "precision_std" : 0,
                "recall" : recall_,
                "recall_std" : 0
            }

stats["original"] = original



In [14]:
pd.DataFrame(stats).loc[["f1_score_macro", "f1_score_macro_std"]].T

Unnamed: 0,f1_score_macro,f1_score_macro_std
great_gpt2_12_layer_stroke_healthcare,0.494686,0.026927
great_gpt2_6_layer_stroke_healthcare,0.496714,0.025817
great_gpt_bigcode_12_layer_stroke_healthcare,0.483578,0.004326
great_gpt_bigcode_6_layer_stroke_healthcare,0.490746,0.016133
great_gpt_j_1_layer_stroke_healthcare,0.487459,0.0028
great_gpt_neox_1_layer_stroke_healthcare,0.517746,0.035965
great_gpt_neo_2_layer_stroke_healthcare,0.497246,0.032999
great_gpt_neo_4_layer_stroke_healthcare,0.498186,0.029922
great_gpt_neo_6_layer_stroke_healthcare,0.503439,0.032094
great_gpt_neo_8_layer_stroke_healthcare,0.495021,0.026805


In [15]:
pd.DataFrame(stats).loc[["accuracy", "accuracy_std"]].T

Unnamed: 0,accuracy,accuracy_std
great_gpt2_12_layer_stroke_healthcare,0.936508,0.014374
great_gpt2_6_layer_stroke_healthcare,0.944127,0.013204
great_gpt_bigcode_12_layer_stroke_healthcare,0.936508,0.016187
great_gpt_bigcode_6_layer_stroke_healthcare,0.934603,0.016888
great_gpt_j_1_layer_stroke_healthcare,0.951111,0.010672
great_gpt_neox_1_layer_stroke_healthcare,0.937778,0.01733
great_gpt_neo_2_layer_stroke_healthcare,0.937778,0.015159
great_gpt_neo_4_layer_stroke_healthcare,0.937778,0.012817
great_gpt_neo_6_layer_stroke_healthcare,0.935238,0.018181
great_gpt_neo_8_layer_stroke_healthcare,0.937778,0.015649


## Logistic Regression

In [16]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score 
from sklearn.preprocessing import LabelEncoder

import os

from general_mle import mle_stroke_healthcare

In [17]:
file_name = "stroke_healthcare_original_test.csv"
org_test_df = pd.read_csv(f"../data_split/original_test_dataset/{file_name}")

file_name = "stroke_healthcare_original_train.csv"
org_train_df = pd.read_csv(f"../data_split/original_train_dataset/{file_name}")

dataset_folder = "stroke_healthcare"
dataset_folder = f"../generated_datasets/{dataset_folder}"

In [18]:
temp = mle_stroke_healthcare(org_test_df=org_test_df, org_train_df=org_train_df, ml_model=LogisticRegression(random_state=32),
                            path_to_synthetic_data=dataset_folder)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.

In [19]:
pd.DataFrame(temp).loc[["f1_score_macro", "f1_score_macro_std"]].T

Unnamed: 0,f1_score_macro,f1_score_macro_std
great_gpt2_12_layer_stroke_healthcare,0.486272,0.001671
great_gpt2_6_layer_stroke_healthcare,0.485836,0.002607
great_gpt_bigcode_12_layer_stroke_healthcare,0.486129,0.001655
great_gpt_bigcode_6_layer_stroke_healthcare,0.486177,0.001605
great_gpt_j_1_layer_stroke_healthcare,0.486173,0.002174
great_gpt_neox_1_layer_stroke_healthcare,0.486128,0.001779
great_gpt_neo_2_layer_stroke_healthcare,0.486224,0.001656
great_gpt_neo_4_layer_stroke_healthcare,0.486223,0.001796
great_gpt_neo_6_layer_stroke_healthcare,0.489611,0.008079
great_gpt_neo_8_layer_stroke_healthcare,0.493684,0.011778


In [20]:
pd.DataFrame(temp).loc[["accuracy", "accuracy_std"]].T

Unnamed: 0,accuracy,accuracy_std
great_gpt2_12_layer_stroke_healthcare,0.94657,0.006331
great_gpt2_6_layer_stroke_healthcare,0.944946,0.009845
great_gpt_bigcode_12_layer_stroke_healthcare,0.946029,0.006266
great_gpt_bigcode_6_layer_stroke_healthcare,0.946209,0.006074
great_gpt_j_1_layer_stroke_healthcare,0.946209,0.008213
great_gpt_neox_1_layer_stroke_healthcare,0.946029,0.006736
great_gpt_neo_2_layer_stroke_healthcare,0.94639,0.006272
great_gpt_neo_4_layer_stroke_healthcare,0.94639,0.006802
great_gpt_neo_6_layer_stroke_healthcare,0.94639,0.006589
great_gpt_neo_8_layer_stroke_healthcare,0.94657,0.006944


## CatboostClassifier

In [21]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score 
from sklearn.preprocessing import LabelEncoder

import os

from general_mle import mle_stroke_healthcare

In [22]:
file_name = "stroke_healthcare_original_test.csv"
org_test_df = pd.read_csv(f"../data_split/original_test_dataset/{file_name}")

file_name = "stroke_healthcare_original_train.csv"
org_train_df = pd.read_csv(f"../data_split/original_train_dataset/{file_name}")

dataset_folder = "stroke_healthcare"
dataset_folder = f"../generated_datasets/{dataset_folder}"

In [23]:
temp = mle_stroke_healthcare(org_test_df=org_test_df, org_train_df=org_train_df, ml_model=CatBoostClassifier(random_state=32),
                            path_to_synthetic_data=dataset_folder)

Learning rate set to 0.014562
0:	learn: 0.6717896	total: 139ms	remaining: 2m 18s
1:	learn: 0.6499052	total: 141ms	remaining: 1m 10s
2:	learn: 0.6287460	total: 143ms	remaining: 47.6s
3:	learn: 0.6095198	total: 145ms	remaining: 36s
4:	learn: 0.5911751	total: 147ms	remaining: 29.2s
5:	learn: 0.5724726	total: 148ms	remaining: 24.5s
6:	learn: 0.5585707	total: 149ms	remaining: 21.1s
7:	learn: 0.5433985	total: 151ms	remaining: 18.7s
8:	learn: 0.5288750	total: 152ms	remaining: 16.8s
9:	learn: 0.5138256	total: 154ms	remaining: 15.3s
10:	learn: 0.4985372	total: 156ms	remaining: 14s
11:	learn: 0.4859396	total: 158ms	remaining: 13s
12:	learn: 0.4730450	total: 160ms	remaining: 12.2s
13:	learn: 0.4622292	total: 162ms	remaining: 11.4s
14:	learn: 0.4516858	total: 163ms	remaining: 10.7s
15:	learn: 0.4380244	total: 165ms	remaining: 10.1s
16:	learn: 0.4272659	total: 166ms	remaining: 9.63s
17:	learn: 0.4170491	total: 168ms	remaining: 9.17s
18:	learn: 0.4088089	total: 169ms	remaining: 8.73s
19:	learn: 0.40

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
pd.DataFrame(temp).loc[["f1_score_macro", "f1_score_macro_std"]].T

Unnamed: 0,f1_score_macro,f1_score_macro_std
great_gpt2_12_layer_stroke_healthcare,0.496388,0.016713
great_gpt2_6_layer_stroke_healthcare,0.498504,0.008263
great_gpt_bigcode_12_layer_stroke_healthcare,0.488406,0.005951
great_gpt_bigcode_6_layer_stroke_healthcare,0.485984,0.001994
great_gpt_j_1_layer_stroke_healthcare,0.501887,0.004671
great_gpt_neox_1_layer_stroke_healthcare,0.523136,0.01548
great_gpt_neo_2_layer_stroke_healthcare,0.49539,0.008361
great_gpt_neo_4_layer_stroke_healthcare,0.499989,0.01843
great_gpt_neo_6_layer_stroke_healthcare,0.502635,0.025129
great_gpt_neo_8_layer_stroke_healthcare,0.528809,0.022001


In [25]:
pd.DataFrame(temp).loc[["accuracy", "accuracy_std"]].T

Unnamed: 0,accuracy,accuracy_std
great_gpt2_12_layer_stroke_healthcare,0.94639,0.006207
great_gpt2_6_layer_stroke_healthcare,0.944765,0.010259
great_gpt_bigcode_12_layer_stroke_healthcare,0.943682,0.00454
great_gpt_bigcode_6_layer_stroke_healthcare,0.945487,0.00754
great_gpt_j_1_layer_stroke_healthcare,0.944765,0.008989
great_gpt_neox_1_layer_stroke_healthcare,0.945487,0.006272
great_gpt_neo_2_layer_stroke_healthcare,0.945487,0.005438
great_gpt_neo_4_layer_stroke_healthcare,0.945487,0.005324
great_gpt_neo_6_layer_stroke_healthcare,0.944765,0.007669
great_gpt_neo_8_layer_stroke_healthcare,0.947653,0.006814


## Decision Tree

In [26]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score 
from sklearn.preprocessing import LabelEncoder

import os

from general_mle import mle_stroke_healthcare

In [27]:
file_name = "stroke_healthcare_original_test.csv"
org_test_df = pd.read_csv(f"../data_split/original_test_dataset/{file_name}")

file_name = "stroke_healthcare_original_train.csv"
org_train_df = pd.read_csv(f"../data_split/original_train_dataset/{file_name}")

dataset_folder = "stroke_healthcare"
dataset_folder = f"../generated_datasets/{dataset_folder}"

In [28]:
temp = mle_stroke_healthcare(org_test_df=org_test_df, org_train_df=org_train_df, ml_model=DecisionTreeClassifier(random_state=32),
                            path_to_synthetic_data=dataset_folder)

In [29]:
pd.DataFrame(temp).loc[["f1_score_macro", "f1_score_macro_std"]].T

Unnamed: 0,f1_score_macro,f1_score_macro_std
great_gpt2_12_layer_stroke_healthcare,0.537369,0.01366
great_gpt2_6_layer_stroke_healthcare,0.557129,0.036827
great_gpt_bigcode_12_layer_stroke_healthcare,0.526589,0.02742
great_gpt_bigcode_6_layer_stroke_healthcare,0.508755,0.011014
great_gpt_j_1_layer_stroke_healthcare,0.533732,0.017564
great_gpt_neox_1_layer_stroke_healthcare,0.557506,0.020607
great_gpt_neo_2_layer_stroke_healthcare,0.539951,0.021411
great_gpt_neo_4_layer_stroke_healthcare,0.551038,0.023483
great_gpt_neo_6_layer_stroke_healthcare,0.560666,0.009332
great_gpt_neo_8_layer_stroke_healthcare,0.57133,0.029079


In [30]:
pd.DataFrame(temp).loc[["accuracy", "accuracy_std"]].T

Unnamed: 0,accuracy,accuracy_std
great_gpt2_12_layer_stroke_healthcare,0.918412,0.009173
great_gpt2_6_layer_stroke_healthcare,0.912274,0.011494
great_gpt_bigcode_12_layer_stroke_healthcare,0.906498,0.006891
great_gpt_bigcode_6_layer_stroke_healthcare,0.905596,0.008741
great_gpt_j_1_layer_stroke_healthcare,0.913177,0.005863
great_gpt_neox_1_layer_stroke_healthcare,0.920397,0.004877
great_gpt_neo_2_layer_stroke_healthcare,0.912455,0.006601
great_gpt_neo_4_layer_stroke_healthcare,0.91426,0.004602
great_gpt_neo_6_layer_stroke_healthcare,0.920397,0.005722
great_gpt_neo_8_layer_stroke_healthcare,0.919134,0.005658
