In [2]:
import warnings
# Install a blanket 'ignore' filter: no warning of any category is shown by the
# cells that run after this one, including warnings emitted by imported libraries.
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
import json

In [4]:
# Load the dataset index. The benchmark cell reads the first entry of `files`
# and uses its 'file', 'target' and 'name' keys.
# JSON text is UTF-8, so the encoding is pinned instead of relying on the
# platform's locale default. The handle gets its own name because `file` is
# used further down for an index entry.
with open('../datasets/index.json', 'r', encoding='utf-8') as index_file:
    files = json.load(index_file)

In [5]:
# Classifiers to benchmark, keyed by estimator instance with a short display
# name as the value (the benchmark loop iterates `algorithms.items()`).
# The ensemble models draw random numbers while fitting, so they are seeded with
# the same value (33) the notebook passes to shuffle() and StratifiedKFold;
# without a seed their scores change from one run to the next.
# Commented-out entries are candidates that are currently disabled.
algorithms = {
    #svm.SVC(kernel='linear'): 'Support Vector Machines (SVM)',
    KNeighborsClassifier(n_neighbors=3): 'KNN',
    LogisticRegression(): 'LR',
    #DecisionTreeClassifier(): 'Decision Trees',
    RandomForestClassifier(random_state=33): 'RF',
    GaussianNB(): 'NB',
    GradientBoostingClassifier(random_state=33): 'GB',
    #MLPClassifier(): 'MLP',
    AdaBoostClassifier(random_state=33): 'ADA'
}

In [6]:
# Benchmark: cross-validate every estimator in `algorithms` on one dataset and
# collect one result dict per evaluated chunk in `performances`.
performances = []

file = files[0]  # only the first index entry is evaluated in this cell
dataset = pd.read_csv(f"../datasets/{file['file']}")

# Separate the features from the target column, then shuffle both together.
X, y = shuffle(
    dataset.drop(file["target"], axis=1),
    dataset[file["target"]],
    random_state=33,
)

# Frames with more than 60000 rows are evaluated in consecutive 30000-row
# chunks; rows after the last full chunk are not evaluated.
splitted = len(X) > 60000
n_splits = len(X) // 30000 if splitted else 1

# The splitter is configured with a fixed integer seed, so a single instance is
# shared by every cross-validation run below.
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=33)

for split in range(n_splits):
    if splitted:
        chunk = slice(split * 30000, (split + 1) * 30000)
        X_split, y_split = X.iloc[chunk], y.iloc[chunk]
    else:
        X_split, y_split = X, y

    # One record per chunk: the chunk label plus the mean CV score per algorithm.
    performance = {'dataset': f"{file['name']}_{split+1}_of_{n_splits}"}

    for algorithm, algorithm_name in algorithms.items():
        print(algorithm_name)
        accuracies = cross_val_score(algorithm, X_split, y_split, cv=stratified_kfold)
        performance[algorithm_name] = accuracies.mean()

    print(performance)
    performances.append(performance)

KNN
LR
RF
NB
GB
ADA
{'dataset': 'covid19_1_of_34', 'KNN': 0.9321666666666666, 'LR': 0.9353666666666667, 'RF': 0.9445666666666666, 'NB': 0.9033666666666667, 'GB': 0.9500333333333332, 'ADA': 0.9459333333333333}
KNN
LR
RF
NB
GB
ADA
{'dataset': 'covid19_2_of_34', 'KNN': 0.9361333333333333, 'LR': 0.9374, 'RF': 0.9446, 'NB': 0.8959666666666666, 'GB': 0.951, 'ADA': 0.9469}
KNN
LR
RF
NB
GB
ADA
{'dataset': 'covid19_3_of_34', 'KNN': 0.9330333333333332, 'LR': 0.9350999999999999, 'RF': 0.9434000000000001, 'NB': 0.8946333333333334, 'GB': 0.9480000000000001, 'ADA': 0.9456333333333333}
KNN
LR
RF
NB
GB
ADA
{'dataset': 'covid19_4_of_34', 'KNN': 0.9342666666666666, 'LR': 0.9380666666666666, 'RF': 0.943, 'NB': 0.8970333333333332, 'GB': 0.9490000000000001, 'ADA': 0.9474666666666666}
KNN
LR
RF
NB
GB
ADA
{'dataset': 'covid19_5_of_34', 'KNN': 0.9337333333333333, 'LR': 0.9347666666666667, 'RF': 0.9425666666666667, 'NB': 0.8927333333333334, 'GB': 0.9502333333333333, 'ADA': 0.9460666666666666}
KNN
LR
RF
NB
GB
A

In [33]:
# Turn the list of per-chunk result dicts into a table (one row per chunk, one
# column per dict key); the bare name on the last line displays it.
# NOTE(review): the saved output lists several datasets and "_of_69" chunk
# labels, while the benchmark cell evaluates only files[0] and printed "_of_34"
# labels — confirm which run populated `performances` before trusting the table.
performances_df = pd.DataFrame(performances)
performances_df

Unnamed: 0,dataset,KNN,LR,RF,NB,GB,ADA
0,covid19_1_of_69,0.934658,0.939133,0.944990,0.906496,0.950583,0.946437
1,covid19_2_of_69,0.928341,0.933803,0.941830,0.902612,0.948871,0.945384
2,covid19_3_of_69,0.928276,0.930907,0.939199,0.918406,0.946108,0.942949
3,covid19_4_of_69,0.927618,0.934527,0.943476,0.886031,0.949333,0.947490
4,covid19_5_of_69,0.928999,0.935119,0.942818,0.879977,0.950977,0.947293
...,...,...,...,...,...,...,...
121,arrhythmiaBinary_4_of_6,0.965735,0.934986,0.984506,0.935165,0.979441,0.972946
122,arrhythmiaBinary_5_of_6,0.968119,0.933556,0.985162,0.931589,0.980216,0.968476
123,arrhythmiaBinary_6_of_6,0.966927,0.931410,0.984804,0.937072,0.979203,0.969907
124,cervicalCancer_1_of_1,0.932402,0.958038,0.951034,0.871787,0.956875,0.948728


In [34]:
def reorder(row, columns):
    """Rank the labels in ``columns`` by their value in ``row``, highest first.

    Parameters
    ----------
    row : pandas.Series
        One record holding a score under each label in ``columns``.
    columns : list-like
        Labels of ``row`` to rank; other entries of ``row`` are ignored.

    Returns
    -------
    pandas.Series
        The labels ordered from highest to lowest score, indexed ``v1``,
        ``v2``, ... Missing scores sort last (pandas' default ``na_position``).
    """
    # A stable sort keeps tied scores in the order given by ``columns``, so the
    # ranking of ties does not depend on the sort implementation.
    ranked = row.loc[columns].sort_values(ascending=False, kind='mergesort').index
    positions = [f'v{i}' for i in range(1, len(ranked) + 1)]
    return pd.Series(ranked.values, index=positions)

In [35]:
# Every column after the first one ('dataset') holds one algorithm's score.
score_columns = performances_df.columns[1:]
algorithm_ranking = performances_df.apply(reorder, axis=1, columns=score_columns)

# Put the dataset label back in front of the ranked algorithm names.
performance_df_sorted = pd.concat(
    [performances_df[['dataset']], algorithm_ranking], axis=1
)
performance_df_sorted

Unnamed: 0,dataset,v1,v2,v3,v4,v5,v6
0,covid19_1_of_69,GB,ADA,RF,LR,KNN,NB
1,covid19_2_of_69,GB,ADA,RF,LR,KNN,NB
2,covid19_3_of_69,GB,ADA,RF,LR,KNN,NB
3,covid19_4_of_69,GB,ADA,RF,LR,KNN,NB
4,covid19_5_of_69,GB,ADA,RF,LR,KNN,NB
...,...,...,...,...,...,...,...
121,arrhythmiaBinary_4_of_6,RF,GB,ADA,KNN,NB,LR
122,arrhythmiaBinary_5_of_6,RF,GB,ADA,KNN,LR,NB
123,arrhythmiaBinary_6_of_6,RF,GB,ADA,KNN,NB,LR
124,cervicalCancer_1_of_1,LR,GB,RF,ADA,KNN,NB


In [36]:
# Count how many rows each algorithm tops: 'v1' holds the highest-scoring
# algorithm name for every row of the ranking table.
performance_df_sorted['v1'].value_counts()

GB     78
LR     17
RF     16
ADA    11
NB      2
KNN     2
Name: v1, dtype: int64

In [37]:
# Write the per-chunk score table to CSV; index=False leaves out the row index.
performances_df.to_csv('../performances/splitted/1_of_1.csv',index=False)

In [38]:
# Write the per-chunk algorithm ranking (dataset, v1..vN) to CSV without the row index.
performance_df_sorted.to_csv('../performances/splitted/1_of_1_sort.csv',index=False)