In [28]:
import warnings
warnings.filterwarnings('ignore')

In [29]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
import json

In [30]:
# Load the dataset index. Each entry is read further down for its 'file',
# 'target' and 'name' keys.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead
# of relying on the platform's locale default. The handle gets its own name so
# it is not confused with the `file` loop variable used by the evaluation loop.
with open('../datasets/index.json', 'r', encoding='utf-8') as index_file:
    files = json.load(index_file)

In [31]:
# Classifiers to benchmark. Keys are estimator instances and values are the
# short labels used as result column names (the evaluation loop unpacks
# `.items()` as estimator, label).
# The randomized ensembles (RF, GB, ADA) are seeded with the same value the
# cross-validation splitters use, so re-running the notebook reproduces the
# same scores instead of drifting from run to run.
algorithms = {
    #svm.SVC(kernel='linear'): 'Support Vector Machines (SVM)',
    KNeighborsClassifier(n_neighbors=3): 'KNN',
    LogisticRegression(): 'LR',
    #DecisionTreeClassifier(): 'Decision Trees',
    RandomForestClassifier(random_state=33): 'RF',
    GaussianNB(): 'NB',
    GradientBoostingClassifier(random_state=33): 'GB',
    #MLPClassifier(): 'MLP',
    AdaBoostClassifier(random_state=33): 'ADA'
}

In [32]:
# Score every classifier in `algorithms` on every dataset listed in `files`.
# Datasets with more than 30000 rows are cut into len // 15000 disjoint
# stratified chunks and each chunk is scored on its own; smaller datasets are
# scored whole. Each result row holds the chunk label plus one mean
# 5-fold cross-validation score per algorithm.
performances = []

# With an integer seed every .split() call reproduces the same folds, so a
# single splitter can serve all cross-validation runs.
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=33)

for entry in files:

    dataset = pd.read_csv(f"../datasets/{entry['file']}")

    target = entry["target"]
    X = dataset.drop(target, axis=1)
    y = dataset[target]

    if len(X) > 30000:
        n_splits = len(X) // 15000
        chunker = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=33)
        # Only the test side of each fold is kept: together the test folds
        # partition the rows into n_splits disjoint chunks.
        chunk_rows = [test_rows for _, test_rows in chunker.split(X, y)]
    else:
        n_splits = 1
        # None stands for "use the whole dataset unchanged"
        chunk_rows = [None]

    for chunk_number, rows in enumerate(chunk_rows, start=1):

        X_split = X if rows is None else X.iloc[rows]
        y_split = y if rows is None else y.iloc[rows]

        performance = {'dataset': f"{entry['name']}_{chunk_number}_of_{n_splits}"}

        for algorithm, algorithm_name in algorithms.items():

            print(algorithm_name)

            accuracies = cross_val_score(algorithm, X_split, y_split, cv=cv_folds)

            performance[algorithm_name] = accuracies.mean()

        print(performance)
        performances.append(performance)

KNN
LR
RF
NB
GB
ADA
{'dataset': 'covid19_1_of_69', 'KNN': 0.9346581934846989, 'LR': 0.9391329817633917, 'RF': 0.9449895005282208, 'NB': 0.906496142255936, 'GB': 0.9505825150239865, 'ADA': 0.9464371070816231}
KNN
LR
RF
NB
GB
ADA
{'dataset': 'covid19_2_of_69', 'KNN': 0.928340975216917, 'LR': 0.9338026488976638, 'RF': 0.9418304800748167, 'NB': 0.9026121819850713, 'GB': 0.9488713609047297, 'ADA': 0.9453842373703261}
KNN
LR
RF
NB
GB
ADA
{'dataset': 'covid19_3_of_69', 'KNN': 0.9282756187111412, 'LR': 0.9309074790876501, 'RF': 0.9391985980499126, 'NB': 0.9184056173256439, 'GB': 0.9461079865260388, 'ADA': 0.942949442337334}
KNN
LR
RF
NB
GB
ADA
{'dataset': 'covid19_4_of_69', 'KNN': 0.9276177672710899, 'LR': 0.9345266145373305, 'RF': 0.943476234391507, 'NB': 0.8860308532931539, 'GB': 0.9493327098595452, 'ADA': 0.9474902582220606}
KNN
LR
RF
NB
GB
ADA
{'dataset': 'covid19_5_of_69', 'KNN': 0.9289992596248766, 'LR': 0.9351185249649296, 'RF': 0.9428180149287335, 'NB': 0.8799765331393636, 'GB': 0.9509

In [3]:
# One row per evaluated chunk: the 'dataset' label followed by one score
# column per algorithm.
performances_df = pd.DataFrame(data=performances)
performances_df

NameError: name 'pd' is not defined

In [34]:
def reorder(row, columns):
    """Rank the score columns of one result row from best to worst.

    Parameters
    ----------
    row : pandas.Series
        One row of the results table, indexed by column name.
    columns : list-like of labels
        The labels in ``row`` that hold the scores to rank.

    Returns
    -------
    pandas.Series
        The labels from ``columns`` ordered by descending score, indexed
        ``'v1'`` (highest score), ``'v2'``, ... up to ``len(columns)``.
    """
    scores = row.loc[columns]
    # The default quicksort is not a stable sort, so labels with equal scores
    # could come out in an unspecified order. Mergesort is stable, which makes
    # the ranking of tied algorithms reproducible.
    ranked_labels = scores.sort_values(ascending=False, kind='mergesort').index
    positions = [f'v{rank}' for rank in range(1, len(ranked_labels) + 1)]
    return pd.Series(ranked_labels.values, index=positions)

In [35]:
# Every column after the first ('dataset') holds an algorithm's score.
score_columns = performances_df.columns[1:]

# Per row, replace the scores with the algorithm labels ordered best to worst
# (v1 = best), then put the dataset label back in front.
rankings = performances_df.apply(reorder, axis=1, columns=score_columns)
dataset_labels = performances_df[['dataset']]
performance_df_sorted = pd.concat([dataset_labels, rankings], axis=1)
performance_df_sorted

Unnamed: 0,dataset,v1,v2,v3,v4,v5,v6
0,covid19_1_of_69,GB,ADA,RF,LR,KNN,NB
1,covid19_2_of_69,GB,ADA,RF,LR,KNN,NB
2,covid19_3_of_69,GB,ADA,RF,LR,KNN,NB
3,covid19_4_of_69,GB,ADA,RF,LR,KNN,NB
4,covid19_5_of_69,GB,ADA,RF,LR,KNN,NB
...,...,...,...,...,...,...,...
121,arrhythmiaBinary_4_of_6,RF,GB,ADA,KNN,NB,LR
122,arrhythmiaBinary_5_of_6,RF,GB,ADA,KNN,LR,NB
123,arrhythmiaBinary_6_of_6,RF,GB,ADA,KNN,NB,LR
124,cervicalCancer_1_of_1,LR,GB,RF,ADA,KNN,NB


In [36]:
# Count how often each algorithm took first place (column v1)
first_place = performance_df_sorted['v1']
first_place.value_counts()

GB     78
LR     17
RF     16
ADA    11
NB      2
KNN     2
Name: v1, dtype: int64

In [37]:
# Write the per-chunk score table to disk without the row index
performances_df.to_csv(path_or_buf='../performances/splitted/1_of_1.csv', index=False)

In [38]:
# Write the per-chunk algorithm ranking to disk without the row index
performance_df_sorted.to_csv(path_or_buf='../performances/splitted/1_of_1_sort.csv', index=False)