In [21]:
import numpy as np
import pandas as pd

# Load the Parkinson's Disease dataset
# Load the Parkinson's Disease dataset
df = pd.read_csv('pd_speech_features.csv')

# Data cleaning
df = df.drop('id', axis=1)  # Remove the 'name' column

# Data preprocessing
X = df.drop('class', axis=1)  # Features
Y = df['class']  # Target variable

In [22]:
from imblearn.over_sampling import SVMSMOTE
from mrmr import mrmr_classif
svmsmote = SVMSMOTE(random_state=42)

# Oversampling the data
X, Y = svmsmote.fit_resample(X, Y)
#Shuffling the data
X['class'] = Y
X=X.sample(frac=1,random_state=0).reset_index(drop=True)
Y=X['class']
X=X.drop('class', axis=1)

print(df.columns)
# Select the features
features = mrmr_classif(X,Y, K=528)

# Print the selected features
print(features)

Index(['gender', 'PPE', 'DFA', 'RPDE', 'numPulses', 'numPeriodsPulses',
       'meanPeriodPulses', 'stdDevPeriodPulses', 'locPctJitter',
       'locAbsJitter',
       ...
       'tqwt_kurtosisValue_dec_28', 'tqwt_kurtosisValue_dec_29',
       'tqwt_kurtosisValue_dec_30', 'tqwt_kurtosisValue_dec_31',
       'tqwt_kurtosisValue_dec_32', 'tqwt_kurtosisValue_dec_33',
       'tqwt_kurtosisValue_dec_34', 'tqwt_kurtosisValue_dec_35',
       'tqwt_kurtosisValue_dec_36', 'class'],
      dtype='object', length=754)


100%|██████████| 528/528 [01:15<00:00,  7.00it/s]

['std_6th_delta_delta', 'tqwt_stdValue_dec_26', 'mean_2nd_delta', 'tqwt_TKEO_mean_dec_33', 'tqwt_maxValue_dec_11', 'std_9th_delta_delta', 'std_delta_delta_log_energy', 'mean_MFCC_2nd_coef', 'std_7th_delta_delta', 'tqwt_entropy_log_dec_12', 'tqwt_kurtosisValue_dec_33', 'std_8th_delta_delta', 'tqwt_entropy_log_dec_26', 'std_6th_delta', 'tqwt_minValue_dec_12', 'std_11th_delta_delta', 'DFA', 'std_9th_delta', 'tqwt_maxValue_dec_12', 'locPctJitter', 'std_10th_delta_delta', 'tqwt_energy_dec_6', 'std_8th_delta', 'tqwt_energy_dec_25', 'std_7th_delta', 'tqwt_entropy_log_dec_13', 'app_det_TKEO_mean_7_coef', 'std_10th_delta', 'tqwt_maxValue_dec_13', 'std_11th_delta', 'tqwt_stdValue_dec_12', 'std_4th_delta_delta', 'tqwt_stdValue_dec_11', 'tqwt_meanValue_dec_25', 'tqwt_entropy_log_dec_16', 'std_4th_delta', 'tqwt_entropy_log_dec_27', 'app_LT_TKEO_std_7_coef', 'std_12th_delta_delta', 'tqwt_minValue_dec_13', 'std_delta_log_energy', 'tqwt_minValue_dec_11', 'std_5th_delta_delta', 'app_TKEO_std_8_coef', '




In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_temp, X_test, Y_temp, Y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

X_train, X_val, Y_train, Y_val = train_test_split(X_temp, Y_temp, test_size=0.25, random_state=42)

scaler=StandardScaler()
columns=X_train.columns

X_train=pd.DataFrame(scaler.fit_transform(X_train), columns=columns)
X_val=pd.DataFrame(scaler.fit_transform(X_val), columns=columns)
X_test=pd.DataFrame(scaler.fit_transform(X_test), columns=columns)
print(X_train.columns)


Index(['gender', 'PPE', 'DFA', 'RPDE', 'numPulses', 'numPeriodsPulses',
       'meanPeriodPulses', 'stdDevPeriodPulses', 'locPctJitter',
       'locAbsJitter',
       ...
       'tqwt_kurtosisValue_dec_27', 'tqwt_kurtosisValue_dec_28',
       'tqwt_kurtosisValue_dec_29', 'tqwt_kurtosisValue_dec_30',
       'tqwt_kurtosisValue_dec_31', 'tqwt_kurtosisValue_dec_32',
       'tqwt_kurtosisValue_dec_33', 'tqwt_kurtosisValue_dec_34',
       'tqwt_kurtosisValue_dec_35', 'tqwt_kurtosisValue_dec_36'],
      dtype='object', length=753)


In [24]:
import random
k=15 #Number of bags

# Randomly select 753 elements from the list, allowing duplicates
X_train_sets=[]
X_val_sets=[]
X_test_sets=[]
random.seed(0)
for i in range(k):
    resampled_features = random.choices(features, k=753)
    X_train_sets.append(X_train[resampled_features])
    X_val_sets.append(X_val[resampled_features])
    X_test_sets.append(X_test[resampled_features])
    print(X_train_sets[i].columns)

Index(['tqwt_TKEO_std_dec_23', 'tqwt_TKEO_std_dec_28',
       'app_LT_entropy_log_9_coef', 'app_LT_entropy_shannon_5_coef',
       'app_entropy_log_3_coef', 'std_0th_delta', 'det_LT_TKEO_std_10_coef',
       'tqwt_entropy_log_dec_20', 'GNE_SNR_SEO', 'tqwt_stdValue_dec_5',
       ...
       'GQ_std_cycle_closed', 'tqwt_entropy_log_dec_34',
       'app_LT_TKEO_std_10_coef', 'mean_MFCC_11th_coef',
       'tqwt_stdValue_dec_19', 'tqwt_entropy_shannon_dec_10',
       'mean_MFCC_9th_coef', 'app_entropy_shannon_9_coef',
       'app_entropy_log_8_coef', 'tqwt_kurtosisValue_dec_13'],
      dtype='object', length=753)


Index(['tqwt_energy_dec_25', 'tqwt_TKEO_std_dec_32', 'std_12th_delta_delta',
       'tqwt_TKEO_mean_dec_12', 'tqwt_entropy_shannon_dec_36',
       'tqwt_entropy_log_dec_13', 'det_LT_entropy_log_9_coef',
       'tqwt_medianValue_dec_7', 'tqwt_TKEO_mean_dec_12',
       'tqwt_kurtosisValue_dec_27',
       ...
       'app_entropy_shannon_5_coef', 'tqwt_TKEO_mean_dec_25',
       'std_delta_delta_0th', 'locDbShimmer', 'app_entropy_log_6_coef',
       'GNE_SNR_TKEO', 'tqwt_TKEO_std_dec_35', 'Ea2', 'tqwt_minValue_dec_26',
       'app_LT_entropy_shannon_10_coef'],
      dtype='object', length=753)
Index(['app_entropy_shannon_1_coef', 'app_entropy_log_8_coef',
       'tqwt_entropy_log_dec_7', 'std_MFCC_11th_coef', 'tqwt_maxValue_dec_18',
       'app_LT_entropy_log_6_coef', 'std_MFCC_6th_coef', 'mean_Log_energy',
       'tqwt_entropy_log_dec_23', 'meanAutoCorrHarmonicity',
       ...
       'tqwt_TKEO_std_dec_32', 'tqwt_maxValue_dec_33', 'tqwt_meanValue_dec_11',
       'Ed2_7_coef', 'tqwt_stdVal

In [25]:
from sklearn.model_selection import KFold

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression,RidgeClassifier

from sklearn.metrics import accuracy_score

models=[]
for i in range(len(X_train_sets)):
    models.append([SVC(), GaussianNB(), KNeighborsClassifier(),\
                    DecisionTreeClassifier(random_state=42), LogisticRegression(random_state=42), RidgeClassifier(random_state=42)])

k_fold=KFold(n_splits=5, shuffle=True, random_state=42)

for i in range(len(X_train_sets)):
    X_train_fromsets=X_train_sets[i]
    
    for train_index, val_index in k_fold.split(X_train_fromsets):
        X_train_kfold, X_val_kfold = X_train_fromsets.iloc[train_index].values, X_train_fromsets.iloc[val_index].values
        Y_train_kfold, Y_val_kfold = Y_train.iloc[train_index].values, Y_train.iloc[val_index].values

        for model in models[i]:
            model.fit(X_train_kfold, Y_train_kfold)
        

In [26]:
best_models=[]
for i in range(len(X_val_sets)):
    X_val_fromsets=X_val_sets[i]
    accuracies=[]
    print(f"Bag {i+1}:")
    for model in models[i]:
        Y_pred = model.predict(X_val_fromsets.values)
        acc=accuracy_score(Y_val, Y_pred)
        accuracies.append(acc)
        print(f"{type(model).__name__} : ",acc)
    best_model=models[i][accuracies.index(max(accuracies))]
    best_models.append(best_model)
    print(f"Best model: {type(best_model).__name__}")
        
    print("\n")

    

Bag 1:
SVC :  0.8757396449704142
GaussianNB :  0.7337278106508875
KNeighborsClassifier :  0.8106508875739645
DecisionTreeClassifier :  0.757396449704142
LogisticRegression :  0.8816568047337278
RidgeClassifier :  0.7988165680473372
Best model: LogisticRegression


Bag 2:
SVC :  0.8994082840236687
GaussianNB :  0.7514792899408284
KNeighborsClassifier :  0.7869822485207101
DecisionTreeClassifier :  0.7869822485207101
LogisticRegression :  0.8698224852071006
RidgeClassifier :  0.7633136094674556
Best model: SVC


Bag 3:
SVC :  0.8994082840236687
GaussianNB :  0.7633136094674556
KNeighborsClassifier :  0.834319526627219
DecisionTreeClassifier :  0.7337278106508875
LogisticRegression :  0.863905325443787
RidgeClassifier :  0.7396449704142012
Best model: SVC


Bag 4:
SVC :  0.893491124260355
GaussianNB :  0.7455621301775148
KNeighborsClassifier :  0.8224852071005917
DecisionTreeClassifier :  0.7751479289940828
LogisticRegression :  0.863905325443787
RidgeClassifier :  0.7041420118343196
Best

In [27]:
Y_pred_sets=pd.DataFrame()
for i in range(len(X_test_sets)):
    X_test_fromsets=X_test_sets[i]
    best_model=best_models[i]
    Y_pred = best_model.predict(X_test_fromsets.values)
    Y_pred_sets[f"Y_pred_{i}"]=Y_pred
    acc=accuracy_score(Y_test, Y_pred)
    print(f"{type(best_model).__name__} : ",acc)
    print("\n")

LogisticRegression :  0.8805309734513275


SVC :  0.8849557522123894


SVC :  0.8761061946902655


SVC :  0.8716814159292036


SVC :  0.8783185840707964


SVC :  0.8761061946902655


SVC :  0.8761061946902655


SVC :  0.8915929203539823


SVC :  0.8650442477876106


SVC :  0.8871681415929203


SVC :  0.8716814159292036


SVC :  0.8871681415929203


SVC :  0.8849557522123894


SVC :  0.8805309734513275


SVC :  0.8783185840707964




In [28]:
from sklearn.metrics import confusion_matrix

Y_pred_sets['mode']=Y_pred_sets.mode(axis=1)
# print(Y_pred_sets)K

tn, fp, fn, tp = confusion_matrix(Y_test, Y_pred_sets['mode']).ravel()

sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
fpr = fp / (fp + tn)
ppv = tp / (tp + fp)
npv = tn / (tn + fn)
accuracy = (tp + tn) / (tp + fp + tn + fn)

print(f"Sensitivity: {sensitivity}")
print(f"Specificity: {specificity}")
print(f"False Positive Rate: {fpr}")
print(f"Positive Predictive Value: {ppv}")
print(f"Negative Predictive Value: {npv}")
print(f"\nMajority Voting Accuracy : ",accuracy)
Y_pred_sets.drop('mode', axis=1, inplace=True)






Sensitivity: 0.8716814159292036
Specificity: 0.9026548672566371
False Positive Rate: 0.09734513274336283
Positive Predictive Value: 0.8995433789954338
Negative Predictive Value: 0.8755364806866953

Majority Voting Accuracy :  0.8871681415929203


In [29]:
Y_pred_val_sets=pd.DataFrame()
for i in range(len(X_val_sets)):
    X_val_fromsets=X_val_sets[i]
    best_model=best_models[i]
    Y_pred_val = best_model.predict(X_val_fromsets.values)
    Y_pred_val_sets[f"Y_pred_{i}"]=Y_pred_val
    acc=accuracy_score(Y_val, Y_pred_val)

model=DecisionTreeClassifier(random_state=42)
model.fit(Y_pred_val_sets, Y_val)
Y_pred_val = model.predict(Y_pred_val_sets)
acc=accuracy_score(Y_val, Y_pred_val)
print("Validation Accuracy : ",acc)

Validation Accuracy :  0.9644970414201184


In [30]:
final_preds=model.predict(Y_pred_sets)
acc=accuracy_score(Y_test, final_preds)
print("Final Accuracy : ",acc)

Final Accuracy :  0.9070796460176991
