In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
import json

In [3]:
# Load the dataset index. Later cells read the 'file', 'name' and 'target'
# keys of each entry. The encoding is pinned to UTF-8 so the result does not
# depend on the platform's locale default.
with open('../datasets/index.json', 'r', encoding='utf-8') as index_file:
    files = json.load(index_file)

In [4]:
# Candidate classifiers, keyed by estimator instance and mapped to the short
# label used as a column name in the results table.
# The randomized ensembles are seeded (same seed as the CV splitters) so that a
# fresh Restart & Run All reproduces the same scores.
algorithms = {
    #svm.SVC(kernel='linear'): 'Support Vector Machines (SVM)',
    KNeighborsClassifier(n_neighbors=3): 'KNN',
    LogisticRegression(): 'LR',
    #DecisionTreeClassifier(): 'Decision Trees',
    RandomForestClassifier(random_state=33): 'RF',
    GaussianNB(): 'NB',
    GradientBoostingClassifier(random_state=33): 'GB',
    #MLPClassifier(): 'MLP',
    AdaBoostClassifier(random_state=33): 'ADA'
}

In [5]:
# Evaluate every classifier on disjoint, stratified subsets of each eligible
# dataset. Each subset is scored with 5-fold stratified cross-validation and
# the mean score per algorithm is recorded as one row of `performances`.
performances = []

# Fixed once, outside the loop: the export cell reads `n_splits` afterwards, so
# its value must not depend on whether the last dataset was skipped.
n_splits = 4

# Both splitters use an integer seed, so every call to .split() yields the same
# partition; they can be built once and reused for all datasets.
subset_splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=33)
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=33)

for file in files:

    dataset = pd.read_csv(f"../datasets/{file['file']}")

    # Only datasets with 400 to 15000 rows (inclusive) are evaluated.
    if len(dataset) > 15000 or len(dataset) < 400:
        continue

    X = dataset.drop(file["target"], axis=1)
    y = dataset[file["target"]]

    # The held-out indices of each fold partition the rows into `n_splits`
    # disjoint stratified subsets; the training indices are not used.
    subsets = [held_out for _, held_out in subset_splitter.split(X, y)]

    for split, subset_rows in enumerate(subsets):

        X_split, y_split = X.iloc[subset_rows], y.iloc[subset_rows]

        performance = {'dataset': f"{file['name']}_{split+1}_of_{n_splits}"}

        for algorithm, algorithm_name in algorithms.items():

            print(algorithm_name)

            # cross_val_score fits a fresh clone per fold, so the shared
            # estimator instances in `algorithms` are never fitted in place.
            accuracies = cross_val_score(algorithm, X_split, y_split, cv=stratified_kfold)

            performance[algorithm_name] = accuracies.mean()

        print(performance)
        performances.append(performance)

KNN
LR
RF
NB
GB
ADA
{'dataset': 'diabetes_1_of_4', 'KNN': 0.7133603238866397, 'LR': 0.7802968960863698, 'RF': 0.7489878542510121, 'NB': 0.7699055330634279, 'GB': 0.7492577597840755, 'ADA': 0.7443994601889339}
KNN
LR
RF
NB
GB
ADA
{'dataset': 'diabetes_2_of_4', 'KNN': 0.7133603238866397, 'LR': 0.7136302294197032, 'RF': 0.7507422402159245, 'NB': 0.7554655870445345, 'GB': 0.7346828609986505, 'ADA': 0.7139001349527665}
KNN
LR
RF
NB
GB
ADA
{'dataset': 'diabetes_3_of_4', 'KNN': 0.7238866396761134, 'LR': 0.7502024291497976, 'RF': 0.7709851551956814, 'NB': 0.7394062078272604, 'GB': 0.7653171390013496, 'ADA': 0.7136302294197031}
KNN
LR
RF
NB
GB
ADA
{'dataset': 'diabetes_4_of_4', 'KNN': 0.7086369770580296, 'LR': 0.7813765182186234, 'RF': 0.7340080971659919, 'NB': 0.7658569500674763, 'GB': 0.7337381916329285, 'ADA': 0.7342780026990553}
KNN
LR
RF
NB
GB
ADA
{'dataset': 'symptoms_1_of_4', 'KNN': 0.9617886178861788, 'LR': 0.8699186991869918, 'RF': 0.983739837398374, 'NB': 0.8902439024390244, 'GB': 0.9

In [6]:
# One row per evaluated subset: its label in 'dataset' plus one column of mean
# cross-validation score per algorithm.
performances_df = pd.DataFrame(data=performances)
performances_df

Unnamed: 0,dataset,KNN,LR,RF,NB,GB,ADA
0,diabetes_1_of_4,0.713360,0.780297,0.748988,0.769906,0.749258,0.744399
1,diabetes_2_of_4,0.713360,0.713630,0.750742,0.755466,0.734683,0.713900
2,diabetes_3_of_4,0.723887,0.750202,0.770985,0.739406,0.765317,0.713630
3,diabetes_4_of_4,0.708637,0.781377,0.734008,0.765857,0.733738,0.734278
4,symptoms_1_of_4,0.961789,0.869919,0.983740,0.890244,0.982927,0.086179
...,...,...,...,...,...,...,...
71,cervicalCancer_4_of_4,0.920709,0.939313,0.934662,0.846512,0.934662,0.920709
72,epilepsy_1_of_4,0.906087,0.603826,0.967304,0.962435,0.955826,0.948174
73,epilepsy_2_of_4,0.903652,0.605913,0.965217,0.950261,0.954783,0.936696
74,epilepsy_3_of_4,0.904000,0.614957,0.964522,0.958957,0.955478,0.942609


In [7]:
def reorder(row, columns):
    """Rank the labels in ``columns`` by their value in ``row``, best first.

    Parameters
    ----------
    row : pd.Series
        A single record whose index contains every label in ``columns``.
    columns : list-like
        Labels of the entries to compare.

    Returns
    -------
    pd.Series
        The labels from ``columns`` ordered by descending value, indexed
        'v1', 'v2', ... where 'v1' holds the label with the highest value.
    """
    descending = row.loc[columns].sort_values(ascending=False)
    ranked_labels = descending.index.values
    rank_names = ['v' + str(position) for position in range(1, len(ranked_labels) + 1)]
    return pd.Series(ranked_labels, index=rank_names)

In [8]:
# Rank the algorithms within each row (every column after the first, 'dataset')
# and put the dataset label back in front of the resulting v1..vN columns.
score_columns = performances_df.columns[1:]
rankings = performances_df.apply(reorder, axis=1, columns=score_columns)
performance_df_sorted = pd.concat([performances_df[['dataset']], rankings], axis=1)
performance_df_sorted

Unnamed: 0,dataset,v1,v2,v3,v4,v5,v6
0,diabetes_1_of_4,LR,NB,GB,RF,ADA,KNN
1,diabetes_2_of_4,NB,RF,GB,ADA,LR,KNN
2,diabetes_3_of_4,RF,GB,LR,NB,KNN,ADA
3,diabetes_4_of_4,LR,NB,ADA,RF,GB,KNN
4,symptoms_1_of_4,RF,GB,KNN,NB,LR,ADA
...,...,...,...,...,...,...,...
71,cervicalCancer_4_of_4,LR,RF,GB,ADA,KNN,NB
72,epilepsy_1_of_4,RF,NB,GB,ADA,KNN,LR
73,epilepsy_2_of_4,RF,GB,NB,ADA,KNN,LR
74,epilepsy_3_of_4,RF,NB,GB,ADA,KNN,LR


In [9]:
# How many times each algorithm is ranked first across all evaluated subsets.
top_ranked = performance_df_sorted['v1']
top_ranked.value_counts()

RF     28
LR     25
GB     13
NB      4
ADA     3
KNN     3
Name: v1, dtype: int64

In [10]:
# Persist both tables (raw scores and per-row rankings) without the row index.
for frame, destination in (
    (performances_df, f'../performances/splitted/1_of_{n_splits}.csv'),
    (performance_df_sorted, f'../performances/splitted/1_of_{n_splits}_sort.csv'),
):
    frame.to_csv(destination, index=False)