In [None]:
import sys
# !{sys.executable} -m pip install mlxtend
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle5 as pickle
from mlxtend.classifier import EnsembleVoteClassifier
from tpot.metrics import balanced_accuracy
from sklearn.metrics import average_precision_score, roc_auc_score, precision_recall_curve, auc, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# Read the dataset from the working directory; the "Participant_Id" column
# becomes the row index instead of a feature column.
data = pd.read_csv(
    "kenya_stunted_data.csv",
    index_col="Participant_Id",
)

In [None]:
# Target: the stunting indicator column.
y = data["Stunted [EUPATH_0035062]"]
# Features: every remaining column once the target has been removed.
x = data.drop(columns=["Stunted [EUPATH_0035062]"])
# Print column names, dtypes and non-null counts of the feature frame.
x.info()

In [None]:
# Hold out 25% of the rows for testing. The split is stratified on the target
# so both partitions keep the same class proportions, and the fixed
# random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [None]:
# Each seed string names both a directory and the suffix of the pickled
# pipeline stored inside it.
random_seeds = ["100", '12', '124', '24', '34', '44', '50', '68', '72', '75']

# Load one pickled pipeline per seed, in the same order as `random_seeds`.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load pickle files that come from a trusted source.
files = []
for seed in random_seeds:
    filepath = f"{seed}/kenya_stuntedPipeline_balanced_accuracy_{seed}.pkl"
    # The context manager closes the file handle even if unpickling fails.
    with open(filepath, 'rb') as pickle_file:
        pipeline = pickle.load(pickle_file)
    files.append(pipeline)

In [None]:
# Soft-voting ensemble over the already-loaded pipelines.
# The weights are equal and derived from the number of pipelines, so the list
# cannot fall out of sync with `files` if the set of seeds changes.
# fit_base_estimators=False is passed so the pipelines are used as loaded
# (presumably without refitting -- confirm against the mlxtend docs).
eclf = EnsembleVoteClassifier(
    clfs=files,
    voting='soft',
    weights=[1] * len(files),
    fit_base_estimators=False,
)

In [None]:
# Fit the ensemble wrapper on the training split. The ensemble was built with
# fit_base_estimators=False, so this presumably only prepares the wrapper
# (e.g. label handling) rather than retraining the pipelines -- TODO confirm.
eclf.fit(X=x_train, y=y_train)

In [None]:
# Display the ensemble's score on the held-out test split.
eclf.score(X=x_test, y=y_test)

In [None]:
# Evaluate the soft-voting ensemble on the held-out test split.
YtestPred = eclf.predict(x_test)
# Probability column at index 1 (assumed to be the positive class -- confirm
# against the label encoding of the target).
YtestProba = eclf.predict_proba(x_test)[:, 1]

# Precision/recall pairs over all thresholds, used for the PR-curve area below.
precisionTest, recallTest, _ = precision_recall_curve(y_test, YtestProba)
# For a binary target the flattened confusion matrix is ordered TN, FP, FN, TP.
TN, FP, FN, TP = confusion_matrix(y_test, YtestPred).ravel()

# One result row: label, probability-based metrics, label-based metrics, then
# the four confusion-matrix counts.
row = [
    "Soft_Voting",
    average_precision_score(y_test, YtestProba),
    roc_auc_score(y_test, YtestProba),
    auc(recallTest, precisionTest),
    accuracy_score(y_test, YtestPred),
    balanced_accuracy(y_test, YtestPred),
    precision_score(y_test, YtestPred),
    recall_score(y_test, YtestPred),
    f1_score(y_test, YtestPred),
    TN,
    FP,
    FN,
    TP,
]

# Start the comparison table with the ensemble row.
rows = [row]

In [None]:
# Score every individual pipeline on the same test split and add one row per
# pipeline after the ensemble row.

# rows[0] is the ensemble row built in the previous cell; dropping anything
# after it makes this cell safe to re-run without duplicating pipeline rows.
del rows[1:]

# Pair each pipeline with its seed instead of relying on a hard-coded count.
for seed, model in zip(random_seeds, files):
    # Probability column at index 1 (assumed to be the positive class).
    YtestProba = model.predict_proba(x_test)[:, 1]
    YtestPred = model.predict(x_test)

    precisionTest, recallTest, _ = precision_recall_curve(y_test, YtestProba)
    # For a binary target the flattened confusion matrix is TN, FP, FN, TP.
    TN, FP, FN, TP = confusion_matrix(y_test, YtestPred).ravel()

    row = [
        "pipeline_" + seed,
        average_precision_score(y_test, YtestProba),
        roc_auc_score(y_test, YtestProba),
        auc(recallTest, precisionTest),
        accuracy_score(y_test, YtestPred),
        balanced_accuracy(y_test, YtestPred),
        precision_score(y_test, YtestPred),
        recall_score(y_test, YtestPred),
        f1_score(y_test, YtestPred),
        TN,
        FP,
        FN,
        TP,
    ]
    rows.append(row)

# Each row mixes a string label with numeric metrics. Without dtype=object
# NumPy would coerce every value to a string, so the metrics would lose their
# numeric type in the array and in anything built from it.
compArray = np.array(rows, dtype=object)
compArray

In [None]:
# Wrap the comparison array in a labelled table. The column order matches the
# order in which the values were added to each result row.
compDF = pd.DataFrame(
    data=compArray,
    columns=[
        'pipeline',
        "average_precision_score",
        "roc_auc_score",
        "auc",
        "accuracy_score",
        "balanced_accuracy",
        "precision_score",
        "recall_score",
        "f1_score",
        "TN",
        'FP',
        'FN',
        'TP',
    ],
)
compDF

In [None]:
# Write the comparison table to disk, leaving out the positional row index.
compDF.to_csv(
    path_or_buf="accuracy_scores.csv",
    index=False,
)

In [None]:
# Persist the ensemble. The context manager guarantees the file is flushed and
# closed, so the pickle on disk is complete even if a later cell fails.
with open('kenya_softVoting.pkl', 'wb') as pickle_file:
    pickle.dump(eclf, pickle_file)