In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import json
import os

import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle

In [3]:
# Load the dataset index. Each entry is later read for its 'file', 'target'
# and 'name' keys by the benchmark loop.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode a UTF-8 JSON file.
# The handle is named `index_file` so it is not confused with the per-dataset
# loop variable used further down.
with open('../datasets/index.json', 'r', encoding='utf-8') as index_file:
    files = json.load(index_file)

In [4]:
# Classifiers to benchmark, stored as {estimator instance: short label}.
# The short label becomes the column name in the results table.
#
# The ensemble models draw random numbers while fitting, so they are seeded
# with the same value (33) the cross-validation splitters use; otherwise the
# scores, and the rankings derived from them, change on every run.
#
# Commented-out entries are candidates that are currently disabled.
# NOTE(review): `svm` is not among the imports shown in this notebook; import
# it before re-enabling the SVC entry.
algorithms = {
    #svm.SVC(kernel='linear'): 'Support Vector Machines (SVM)',
    KNeighborsClassifier(n_neighbors=3): 'KNN',
    LogisticRegression(): 'LR',
    #DecisionTreeClassifier(): 'Decision Trees',
    RandomForestClassifier(random_state=33): 'RF',
    GaussianNB(): 'NB',
    GradientBoostingClassifier(random_state=33): 'GB',
    #MLPClassifier(): 'MLP',
    AdaBoostClassifier(random_state=33): 'ADA'
}

In [5]:
# Benchmark every classifier in `algorithms` on stratified subsets of each
# dataset listed in `files`.
#
# For each dataset that passes the size filter:
#   1. an outer StratifiedKFold cuts it into `n_splits` disjoint subsets
#      (the test-fold indices of each outer split);
#   2. every subset is treated as a small dataset of its own, and each
#      classifier is scored on it with 5-fold stratified cross-validation;
#   3. the mean score per classifier is stored in one record per subset.
#
# `n_splits` is assigned once, before the loop. It is read again after the
# loop to build the output file names, so it must not depend on whether the
# last dataset happened to be skipped by the size filter.
n_splits = 10        # subsets per dataset
MIN_ROWS = 900       # datasets with fewer rows are skipped
MAX_ROWS = 15000     # datasets with more rows are skipped
SEED = 33            # shared seed for both splitters

# The splitters are seeded with an integer, so they produce the same folds
# for the same input on every call and can be created once.
subset_splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

performances = []

for dataset_info in files:

    dataset = pd.read_csv(f"../datasets/{dataset_info['file']}")

    # Size filter: only datasets with MIN_ROWS..MAX_ROWS rows are benchmarked.
    if not (MIN_ROWS <= len(dataset) <= MAX_ROWS):
        continue

    X = dataset.drop(dataset_info["target"], axis=1)
    y = dataset[dataset_info["target"]]

    # Keep only the test-fold indices: together they partition the dataset
    # into n_splits disjoint, class-stratified subsets.
    subset_indices = [test_idx for _, test_idx in subset_splitter.split(X, y)]

    for split, row_positions in enumerate(subset_indices):

        X_split, y_split = X.iloc[row_positions], y.iloc[row_positions]

        performance = {'dataset': f"{dataset_info['name']}_{split+1}_of_{n_splits}"}

        for algorithm, algorithm_name in algorithms.items():

            print(algorithm_name)

            # cross_val_score uses the estimator's default scorer.
            accuracies = cross_val_score(algorithm, X_split, y_split, cv=cv_splitter)

            performance[algorithm_name] = accuracies.mean()

        print(performance)
        performances.append(performance)

KNN
LR
RF
NB
GB
ADA
{'dataset': 'symptoms_1_of_10', 'KNN': 0.8841269841269842, 'LR': 0.833353947639662, 'RF': 0.9512059369202227, 'NB': 0.8516594516594516, 'GB': 0.9349824778396207, 'ADA': 0.07318078746650175}
KNN
LR
RF
NB
GB
ADA
{'dataset': 'symptoms_2_of_10', 'KNN': 0.8882498453927026, 'LR': 0.8516594516594516, 'RF': 0.9472273757988043, 'NB': 0.8741496598639455, 'GB': 0.9431869717584004, 'ADA': 0.0670995670995671}
KNN
LR
RF
NB
GB
ADA
{'dataset': 'symptoms_3_of_10', 'KNN': 0.900247371675943, 'LR': 0.8881467738610596, 'RF': 0.9533292104720676, 'NB': 0.8638425066996496, 'GB': 0.9553494124922697, 'ADA': 0.07520098948670377}
KNN
LR
RF
NB
GB
ADA
{'dataset': 'symptoms_4_of_10', 'KNN': 0.8943104514533087, 'LR': 0.8556586270871985, 'RF': 0.9532261389404246, 'NB': 0.8780457637600495, 'GB': 0.9511853226138941, 'ADA': 0.08536384250669964}
KNN
LR
RF
NB
GB
ADA
{'dataset': 'symptoms_5_of_10', 'KNN': 0.9003916718202433, 'LR': 0.8454545454545455, 'RF': 0.9614100185528758, 'NB': 0.8048237476808906, 'G

In [6]:
# One row per dataset subset: the 'dataset' label followed by one score
# column per classifier, built in a single step from the collected records.
performances_df = pd.DataFrame(data=performances)
performances_df

Unnamed: 0,dataset,KNN,LR,RF,NB,GB,ADA
0,symptoms_1_of_10,0.884127,0.833354,0.951206,0.851659,0.934982,0.073181
1,symptoms_2_of_10,0.888250,0.851659,0.947227,0.874150,0.943187,0.067100
2,symptoms_3_of_10,0.900247,0.888147,0.953329,0.863843,0.955349,0.075201
3,symptoms_4_of_10,0.894310,0.855659,0.953226,0.878046,0.951185,0.085364
4,symptoms_5_of_10,0.900392,0.845455,0.961410,0.804824,0.953226,0.079262
...,...,...,...,...,...,...,...
95,epilepsy_6_of_10,0.864348,0.596522,0.960000,0.960870,0.959130,0.958261
96,epilepsy_7_of_10,0.886957,0.574783,0.956522,0.966957,0.947826,0.933913
97,epilepsy_8_of_10,0.873043,0.572174,0.953043,0.952174,0.949565,0.937391
98,epilepsy_9_of_10,0.879130,0.598261,0.953913,0.956522,0.942609,0.933913


In [7]:
def reorder(row, columns):
    """Return the labels in ``columns`` ordered by their value in ``row``, highest first.

    Parameters
    ----------
    row : pd.Series
        One record; only the entries selected by ``columns`` are compared.
    columns : list-like of labels
        Labels of ``row`` to rank.

    Returns
    -------
    pd.Series
        Indexed ``v1`` .. ``vN`` (N = number of ranked labels); ``v1`` holds
        the label with the largest value, ``vN`` the one with the smallest.
    """
    ranked_labels = row.loc[columns].sort_values(ascending=False).index.values
    positions = [f'v{rank}' for rank, _ in enumerate(ranked_labels, start=1)]
    return pd.Series(ranked_labels, index=positions)

In [8]:
# For every row, rank the classifier columns (everything after the first
# column) from best to worst score, then put the 'dataset' label back in
# front of the resulting v1..vN ranking columns.
performance_df_sorted = pd.concat(
    [
        performances_df.loc[:, ['dataset']],
        performances_df.apply(reorder, axis=1, columns=performances_df.columns[1:]),
    ],
    axis=1,
)
performance_df_sorted

Unnamed: 0,dataset,v1,v2,v3,v4,v5,v6
0,symptoms_1_of_10,RF,GB,KNN,NB,LR,ADA
1,symptoms_2_of_10,RF,GB,KNN,NB,LR,ADA
2,symptoms_3_of_10,GB,RF,KNN,LR,NB,ADA
3,symptoms_4_of_10,RF,GB,KNN,NB,LR,ADA
4,symptoms_5_of_10,RF,GB,KNN,LR,NB,ADA
...,...,...,...,...,...,...,...
95,epilepsy_6_of_10,NB,RF,GB,ADA,KNN,LR
96,epilepsy_7_of_10,NB,RF,GB,ADA,KNN,LR
97,epilepsy_8_of_10,RF,NB,GB,ADA,KNN,LR
98,epilepsy_9_of_10,NB,RF,GB,ADA,KNN,LR


In [9]:
# How often each classifier ranked first (column 'v1') across all subsets.
performance_df_sorted.loc[:, 'v1'].value_counts()

RF     42
GB     18
LR     16
NB      9
KNN     9
ADA     6
Name: v1, dtype: int64

In [10]:
# Persist the raw scores and the per-subset rankings as CSV files.
# The target directory is created first because to_csv does not create
# missing parent directories and would fail on a fresh checkout.
output_dir = '../performances/splitted'
os.makedirs(output_dir, exist_ok=True)

performances_df.to_csv(f'{output_dir}/1_of_{n_splits}.csv', index=False)
performance_df_sorted.to_csv(f'{output_dir}/1_of_{n_splits}_sort.csv', index=False)