## Import Libraries and Database

In [10]:
# Import the libraries
import sklearn as sk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from scipy.stats import kruskal, mannwhitneyu

# Import the models
import src.DT_model as dt
import src.KNN_model as knn
import src.NB_model as nb
import src.SVM_model as svm
import src.MLP_model as mlp
import src.BC_model as bc

# Read the CTG dataset from disk, then discard whatever its first column is
raw_data = pd.read_csv("CTG.csv")
first_column = raw_data.columns[0]
data = raw_data.drop(columns=first_column)

# Preview the first rows of the loaded table
data.head()

Unnamed: 0,LB,AC,FM,UC,ASTV,MSTV,ALTV,MLTV,DL,DS,...,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,NSP
0,120,0,0,0,73,0.5,43,2.4,0,0,...,62,126,2,0,120,137,121,73,1,2
1,132,4,0,4,17,2.1,0,10.4,2,0,...,68,198,6,1,141,136,140,12,0,1
2,133,2,0,5,16,2.1,0,13.4,2,0,...,68,198,5,1,141,135,138,13,0,1
3,134,2,0,6,16,2.4,0,23.0,2,0,...,53,170,11,0,137,134,137,13,1,1
4,132,4,0,5,16,2.4,0,19.9,0,0,...,53,170,9,0,137,136,138,11,1,1


## Create the Dataframe

In [11]:
# Feature table: every column of `data` except "NSP"
df_data = pd.DataFrame(data).drop(columns="NSP")

# Summarize column names, non-null counts and dtypes
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 21 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   LB        2126 non-null   int64  
 1   AC        2126 non-null   int64  
 2   FM        2126 non-null   int64  
 3   UC        2126 non-null   int64  
 4   ASTV      2126 non-null   int64  
 5   MSTV      2126 non-null   float64
 6   ALTV      2126 non-null   int64  
 7   MLTV      2126 non-null   float64
 8   DL        2126 non-null   int64  
 9   DS        2126 non-null   int64  
 10  DP        2126 non-null   int64  
 11  Width     2126 non-null   int64  
 12  Min       2126 non-null   int64  
 13  Max       2126 non-null   int64  
 14  Nmax      2126 non-null   int64  
 15  Nzeros    2126 non-null   int64  
 16  Mode      2126 non-null   int64  
 17  Mean      2126 non-null   int64  
 18  Median    2126 non-null   int64  
 19  Variance  2126 non-null   int64  
 20  Tendency  2126 non-null   int64  

## Data Separation

In [12]:
from sklearn.model_selection import train_test_split

# Function to create a new train/validation/test split on every run
def data_shufle(df_data, labels=None):
    """Shuffle the samples and split them into train/validation/test sets.

    Features and labels are shuffled with the same permutation so that every
    row keeps its own label. The shuffled data is then split in half (train
    vs. the rest), and the rest is split in half again (validation vs. test).

    Parameters
    ----------
    df_data : pandas.DataFrame
        Feature table, one row per sample.
    labels : array-like, optional
        Target values, positionally aligned with ``df_data``. When omitted,
        the ``"NSP"`` column of the notebook-level ``data`` frame is used.

    Returns
    -------
    tuple
        ``(x_train, x_validation, x_test, y_train, y_validation, y_test)``.
    """
    if labels is None:
        labels = data["NSP"]

    # Shuffle features and labels TOGETHER. train_test_split pairs rows by
    # position, so shuffling only the features would attach each sample to
    # another sample's label.
    x_shuffled, y_shuffled = sk.utils.shuffle(df_data, labels)

    # First split: half of the samples for training, half held out
    x_train, x_temp, y_train, y_temp = train_test_split(
        x_shuffled, y_shuffled, test_size=0.5, random_state=42
    )

    # Second split: the held-out half is divided evenly into validation and test
    x_validation, x_test, y_validation, y_test = train_test_split(
        x_temp, y_temp, test_size=0.5, random_state=42
    )

    return x_train, x_validation, x_test, y_train, y_validation, y_test

## Run the Models

In [5]:

# Imported once, before the loop, instead of on every iteration
from sklearn.ensemble import VotingClassifier

# Number of independent shuffle / train / evaluate repetitions
N_RUNS = 20

# One accuracy column per classifier: the five monolithic models followed by
# the three ensembles (MV = majority voting, SV = sum voting, BC = Borda count)
acc_out = pd.DataFrame(columns=["KNN", "DT", "NB", "SVM", "MLP", "MV", "SV", "BC"])

# Run the models
for i in range(N_RUNS):

    # Shuffle the data
    x_train, x_validation, x_test, y_train, y_validation, y_test = data_shufle(df_data)

    # ----------------------
    # Monolithic classifiers
    # ----------------------

    # KNN execution
    knn_acc, knn_model, knn_params = knn.KNN(x_train, y_train, x_validation, y_validation, x_test, y_test)

    # DT execution
    dt_acc, dt_model, dt_params = dt.DT(x_train, y_train, x_validation, y_validation, x_test, y_test)

    # NB execution (returns no parameter set, unlike the other models)
    nb_acc, nb_model = nb.NB(x_train, y_train, x_validation, y_validation, x_test, y_test)

    # SVM execution
    svm_acc, svm_model, svm_params = svm.SVM(x_train, y_train, x_validation, y_validation, x_test, y_test)

    # MLP execution
    mlp_acc, mlp_model, mlp_params = mlp.MLP(x_train, y_train, x_validation, y_validation, x_test, y_test)

    # --------------------
    # Multiple classifiers
    # --------------------
    estimators = [('KNN', knn_model), ('DT', dt_model), ('NB', nb_model), ('SVM', svm_model), ('MLP', mlp_model)]
    y_test = np.ascontiguousarray(y_test)
    x_test = np.ascontiguousarray(x_test)

    # Majority Voting
    majority_voting = VotingClassifier(estimators, voting='hard')
    majority_voting.fit(x_train, y_train)
    majority_voting_acc = sk.metrics.accuracy_score(y_test, majority_voting.predict(x_test))

    # Sum Voting
    sum_voting = VotingClassifier(estimators, voting='soft')
    sum_voting.fit(x_train, y_train)
    sum_voting_acc = sk.metrics.accuracy_score(y_test, sum_voting.predict(x_test))

    # Borda Count
    borda_count = bc.BordaCountClassifier(estimators)
    borda_count.fit(x_train, y_train)
    borda_count_acc = sk.metrics.accuracy_score(y_test, borda_count.predict(x_test))

    # Save the results, in the same order as the acc_out columns. The Borda
    # count accuracy is stored too, so the ensemble analysis can read "BC".
    acc_out.loc[i] = [
        knn_acc, dt_acc, nb_acc, svm_acc, mlp_acc,
        majority_voting_acc, sum_voting_acc, borda_count_acc,
    ]


# Generate the CSV file
acc_out.to_csv("acc_out.csv", index=False)
    
    
    

KNN
DT
NB
SVM
MLP




## Mann-Whitney U test

In [None]:
def mannwhitneyu_pair(df, model_col, a):
    """Print a two-sided Mann-Whitney U test for every pair of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding one sample per column.
    model_col : sequence of str
        Names of the columns to compare; each unordered pair is tested once,
        in the order the names appear.
    a : float
        Significance level; a p-value below it is reported as "Reject H0".

    Returns
    -------
    None
        Results are only printed.
    """
    n_models = len(model_col)

    # Every (earlier, later) combination of column names, in listing order
    pairs = [
        (model_col[first], model_col[second])
        for first in range(n_models)
        for second in range(first + 1, n_models)
    ]

    print("Mann-Whitney U test")
    for left, right in pairs:
        u_statistic, pVal = mannwhitneyu(df[left], df[right], alternative='two-sided')
        verdict = "Reject H0" if pVal < a else "Fail to reject H0"

        print("============Mann-Whitney============")
        print(left, "x", right, ":", u_statistic, "\nPval:", pVal)
        print(verdict)
        print("====================================")

## Analyze the Results - Monolithic Classifiers

In [None]:
# Read the output file
output_df = pd.read_csv("acc_out.csv")

alpha = 0.05

# Columns holding the per-run accuracies of the monolithic classifiers.
# They are read straight from output_df rather than bound to names such as
# `knn` or `dt`, which would shadow the imported model modules and break a
# re-run of the model cell.
model_col = ["KNN", "DT", "NB", "SVM", "MLP"]

# Kruskal-Wallis test over the full per-run accuracy samples from the CSV
# (not the single knn_acc/dt_acc/... values left over from the last loop
# iteration).
stat, p = kruskal(*(output_df[col] for col in model_col))

print('Kruskal Statistics=%.3f, P=%.3f' % (stat, p))

if p < alpha:
    print("Reject H0. Significant differences between the groups\n")
    # Follow up with pairwise comparisons to see which models differ
    mannwhitneyu_pair(output_df, model_col, alpha)
else:
    print("Fail to reject H0. No significant differences between the groups\n")

## Analyze the Results - Multiple Classifiers

In [None]:
# Read the output file
output_df = pd.read_csv("acc_out.csv")

alpha = 0.05

# Columns expected for the ensemble classifiers. Only those actually present
# in the file are tested, so a results file written without the "BC" column
# does not abort the analysis with a KeyError. The columns are not bound to
# names such as `bc`, which would shadow the imported Borda-count module.
expected_col = ["MV", "SV", "BC"]
model_col = [col for col in expected_col if col in output_df.columns]
missing_col = [col for col in expected_col if col not in output_df.columns]
if missing_col:
    print("Warning: columns missing from acc_out.csv and skipped:", missing_col)

# Kruskal-Wallis test over the per-run accuracy samples of the ensembles
stat, p = kruskal(*(output_df[col] for col in model_col))

print('Kruskal Statistics=%.3f, P=%.3f' % (stat, p))

if p < alpha:
    print("Reject H0. Significant differences between the groups\n")
    # Follow up with pairwise comparisons to see which ensembles differ
    mannwhitneyu_pair(output_df, model_col, alpha)
else:
    print("Fail to reject H0. No significant differences between the groups\n")