# Ensemble - Voting Classifier  

### The idea behind the VotingClassifier is to combine conceptually different machine learning classifiers and use a majority vote or the average predicted probabilities (soft vote) to predict the class labels. Such a classifier can be useful for a set of equally well performing models in order to balance out their individual weaknesses.  

### - **Majority Class Labels (Majority/Hard Voting):** In majority voting, the predicted class label for a particular sample is the class label that represents the majority (mode) of the class labels predicted by each individual classifier.<br> E.g. If the prediction for a given sample is:  
- classifier 1 --> class 1
- classifier 2 --> class 1
- classifier 3 --> class 2  
### The Voting Classifier would classify the sample as 'class 1' based on the majority class label.  
### In the case of a tie, the *VotingClassifier* will select the class based on the ascending sort order. <br> E.g. in the following scenario:
- classifier 1 --> class 2
- classifier 2 --> class 1
### the class label 1 will be assigned to the sample. 


## Usage:

The following example shows how to fit the majority rule classifier:

In [3]:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.ensemble import VotingClassifier

import warnings

# Importing the modules
import numpy as np
import pandas as pd

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn import datasets

from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, recall_score, precision_score

from imblearn.over_sampling import SMOTE 

# iris = datasets.load_iris()
# X, y = iris.data[:, 1:3], iris.target

# # Generating the 3 classifiers
# clf1 = LogisticRegression(random_state=12)
# clf2 = RandomForestClassifier(n_estimators=50, random_state=12)
# clf3 = GaussianNB()

# # Creating the ensamble classifier
# eclf = VotingClassifier(
#     estimators=[('lr', clf1), ('rf', clf2), ('nb', clf3)], 
#     voting='hard') # Voting = hard  ->  Majority voting

# # Training and testing the models with the cross validation procedure
# for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'Naive Bayes', 'Ensable']):
#     scores = cross_val_score(clf, X, y, scoring='accuracy', cv=5)
#     print(f'Accuracy: {scores.mean():.2} (+/- {scores.std():.2f}) [{label}]')

## **Trying to do the Ensemble without the Voting Classifier**

In [4]:
# Breast-cancer dataset shipped with scikit-learn, used by the hand-rolled
# ensemble experiment below
cancer = datasets.load_breast_cancer()
X = cancer.data
y = cancer.target

# Individual (non-ensemble) learners that will vote together
svm_model = SVC(C=100, gamma=1, kernel='rbf')
nb_model = GaussianNB()
knn_model = KNeighborsClassifier(n_neighbors=6)
dt_model = DecisionTreeClassifier()

# Learners that are already ensembles on their own
rf_model = RandomForestClassifier(n_jobs=-1, n_estimators=100)
xgb_model = xgb.XGBClassifier(eval_metric='logloss', n_estimators=100)

# Name -> estimator lookup for the individual learners
models_dict = dict(SVM=svm_model, NB=nb_model, kNN=knn_model, DT=dt_model)

In [7]:
def cross_validation_ensemble_00(ensemble, X, y, folds=10):
    """Cross-validate a hand-rolled hard-voting ensemble and print its mean scores.

    On every stratified fold each model of ``ensemble`` is fitted on the
    training split and predicts the test split. The label predicted most often
    for a sample is the ensemble prediction; on a tie the label that shows up
    first in model order wins.

    Args:
        ensemble: Iterable of estimators exposing ``fit`` and ``predict``.
        X: Feature matrix, indexed with the index arrays of the CV splitter.
        y: Label array, indexed the same way as ``X``.
        folds: Number of stratified, shuffled folds.

    Returns:
        None. The mean of every metric over the folds is printed as a dict.
    """
    splitter = StratifiedKFold(n_splits=folds, shuffle=True)

    # Metric name -> scoring function, evaluated in this order on every fold
    metric_fns = {
        'auc': roc_auc_score,
        'accuracy': accuracy_score,
        'f1': f1_score,
        'precision': precision_score,
        'recall': recall_score,
    }
    scores = {name: [] for name in metric_fns}

    for train_idx, test_idx in splitter.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # One prediction vector per model, kept in ensemble order
        per_model = []
        for model in ensemble:
            model.fit(X_train, y_train)
            per_model.append(model.predict(X_test))

        # Transpose to one tuple of votes per sample; max() returns the first
        # maximal element, so ties go to the earliest model's label
        voted = [max(votes, key=votes.count) for votes in zip(*per_model)]

        truth = list(y_test)
        for name, metric in metric_fns.items():
            scores[name].append(metric(truth, voted))

    # Collapse the per-fold values into one mean per metric and show them
    scores = {name: np.mean(values) for name, values in scores.items()}

    print(scores)


# Score the full voting ensemble, then two of its members on their own
for label, members in (
        ('Ensemble:', [svm_model, nb_model, knn_model, dt_model]),
        ('SVM:', [svm_model]),
        ('NB:', [nb_model])):
    print(label)
    cross_validation_ensemble_00(members, X, y, folds=10)


Ensemble:
{'auc': 0.8992568542568543, 'accuracy': 0.9225877192982456, 'f1': 0.9417465274711129, 'precision': 0.8972561531611596, 'recall': 0.9915873015873016}
SVM:
{'auc': 0.5, 'accuracy': 0.6274122807017544, 'f1': 0.7710324738375229, 'precision': 0.6274122807017544, 'recall': 1.0}
NB:
{'auc': 0.93258658008658, 'accuracy': 0.9420426065162907, 'f1': 0.9545395306077561, 'precision': 0.9409422934655133, 'recall': 0.9692857142857143}


## **Trying with the 192 proteins dataset and with the corresponding best features**

In [8]:
# Importing the Dataset
lisbon_coimbra_df = pd.read_excel('190 proteins quantified_analysis.xls', sheet_name='Lisbon_Coimbra_proteomics')

# Generating the y array
y = np.array([1 if i == 'Amyloid-Positive' else 2 for i in lisbon_coimbra_df['Group']]) # 1 -> A+ ; 2 -> A-
# print(f'- The number of Amyloid-Positive patients is: {list(y).count(1)} \n- The number of Amyloid-Negative patients is: {list(y).count(2)} \n')

# Dropping the useless columns
lisbon_coimbra_df = lisbon_coimbra_df.drop(['Sample Name','Group'], axis=1)

# # Generating the X array 
X = lisbon_coimbra_df.values
# print(f'The final X matrix is made out of: {X.shape[0]} examples and {X.shape[1]} features')

In [9]:
# Column subsets of lisbon_coimbra_df, one per model
svm_dataset = lisbon_coimbra_df[[
    'P02768_ALBU', 'P02765_FETUA',
]]

knn_dataset = lisbon_coimbra_df[[
    'P02765_FETUA', 'Q16270_IBP7', 'P02749_APOH', 'P05090_APOD', 'P09486_SPRC',
    'P51693_APLP1', 'P00338_LDHA', 'P05155_IC1', 'P13987_CD59', 'P41271_NBL1',
    'P01011_AACT', 'P06727_APOA4', 'Q9UBX5_FBLN5', 'P02750_A2GL',
]]

nb_dataset = lisbon_coimbra_df[[
    'P02765_FETUA', 'Q16270_IBP7', 'P09486_SPRC', 'P41271_NBL1', 'P01011_AACT',
    'P13987_CD59', 'Q9UBX5_FBLN5', 'P24592_IBP6', 'P02766_TTHY', 'P05155_IC1',
    'P08294_SODE', 'Q08380_LG3BP', 'P02747_C1QC', 'Q969P0_IGSF8', 'P06396_GELS',
    'P05090_APOD', 'P61916_NPC2', 'P01024_CO3', 'P22352_GPX3', 'P10643_CO7',
    'P02751_FINC', 'P02749_APOH', 'P25311_ZA2G',
]]

rf_dataset = lisbon_coimbra_df[[
    'P02765_FETUA', 'Q16270_IBP7', 'P09486_SPRC', 'P41271_NBL1', 'P01011_AACT',
    'P13987_CD59', 'Q9UBX5_FBLN5',
]]

xgb_dataset = lisbon_coimbra_df[[
    'P24592_IBP6', 'P02766_TTHY', 'P05155_IC1', 'P08294_SODE', 'Q08380_LG3BP',
    'P02747_C1QC', 'Q969P0_IGSF8', 'P06396_GELS', 'P05090_APOD', 'P61916_NPC2',
    'P01024_CO3', 'P22352_GPX3', 'P10643_CO7', 'P02751_FINC',
]]

# name -> (estimator, its feature subset) for the individual learners
model_dataset_dict = {
    'svm': (svm_model, svm_dataset),
    'knn': (knn_model, knn_dataset),
    'nb': (nb_model, nb_dataset),
}

# name -> (estimator, its feature subset) for the learners that are ensembles themselves
ensemble_dataset_dict = {
    'rf': (rf_model, rf_dataset),
    'xgb': (xgb_model, xgb_dataset),
}

In [10]:
def cross_validation_ensemble(model_dataset_dict, X, y, folds=10):
    """Cross-validate a hard-voting ensemble whose members use different features.

    Args:
        model_dataset_dict: Mapping name -> ``(model, dataset)``. ``dataset.values``
            is the feature matrix that particular model is trained/tested on.
        X: Only handed to the CV splitter to generate the fold indices.
        y: Label array, indexed with the fold indices.
        folds: Number of stratified, shuffled folds.

    Returns:
        tuple: ``(scores, y_pred_ensemble)``. ``scores`` maps every metric name
        to its mean over the folds; ``y_pred_ensemble`` contains the voted
        labels of the LAST fold only.
    """
    splitter = StratifiedKFold(n_splits=folds, shuffle=True)

    # Metric name -> scoring function, evaluated in this order on every fold
    metric_fns = {
        'auc': roc_auc_score,
        'accuracy': accuracy_score,
        'f1': f1_score,
        'precision': precision_score,
        'recall': recall_score,
    }
    scores = {name: [] for name in metric_fns}

    for train_idx, test_idx in splitter.split(X, y):

        # One prediction vector per model, in dict order
        fold_votes = []

        for entry in model_dataset_dict.values():
            model, features = entry[0], entry[1].values

            # Same row split for every model, but on that model's own columns
            X_train, X_test = features[train_idx], features[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            model.fit(X_train, y_train)
            fold_votes.append(model.predict(X_test))

        # Most frequent label per sample; on a tie the earliest model's label wins
        y_pred_ensemble = [max(votes, key=votes.count) for votes in zip(*fold_votes)]

        print(y_pred_ensemble)

        for name, metric in metric_fns.items():
            scores[name].append(metric(y_test, y_pred_ensemble))

    # Average every metric over the folds
    scores = {name: np.mean(values) for name, values in scores.items()}

    return scores, y_pred_ensemble
    

# Run the ensemble CV; y_ensemble receives the voted labels of the last fold
final_scores, y_ensemble = cross_validation_ensemble(model_dataset_dict, X, y, folds=10)

print('final ensemble:', y_ensemble, sep='\n')

[1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2]
[2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1]
[2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1]
[1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1]
[2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1]
[1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1]
[1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1]
[1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1]
[2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1]
[1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1]
final ensemble:
[1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1]


In [94]:
def cross_validation_ensemble(model_dataset_dict, X, y, folds=10):
    """Cross-validate a hard-voting ensemble whose members use different features.

    On every stratified fold each model is fitted on the training rows of its
    own dataset and predicts the test rows. The label predicted most often for
    a sample is the ensemble prediction; on a tie the label of the earliest
    model (dict order) wins.

    Args:
        model_dataset_dict: Non-empty mapping name -> ``(model, dataset)``.
            ``dataset.values`` is the feature matrix used for that model.
        X: Only handed to the CV splitter to generate the fold indices.
        y: Label array, indexed with the fold indices.
        folds: Number of stratified, shuffled folds.

    Returns:
        tuple: ``(scores, y_pred_ensemble)``. ``scores`` maps every metric name
        to its mean over the folds. ``y_pred_ensemble`` contains the voted
        labels of the LAST fold only (it is rebuilt on every fold), so it is
        not an out-of-fold prediction for the whole dataset.

    Raises:
        ValueError: If ``model_dataset_dict`` is empty.
    """
    # Without any model there is nothing to vote on; fail with a clear message
    # instead of an unbound-variable error further down
    if not model_dataset_dict:
        raise ValueError('model_dataset_dict must contain at least one (model, dataset) pair')

    cv = StratifiedKFold(n_splits=folds, shuffle=True)

    # Metric name -> scoring function, evaluated in this order on every fold
    metrics = {
        'auc': roc_auc_score,
        'accuracy': accuracy_score,
        'f1': f1_score,
        'precision': precision_score,
        'recall': recall_score,
    }
    scores = {name: [] for name in metrics}
    y_pred_ensemble = []

    for train_idx, test_idx in cv.split(X, y):

        # The labels do not depend on the model: slice them once per fold
        y_train, y_test = y[train_idx], y[test_idx]

        # One prediction vector per model, in dict order
        per_model_pred = []
        for entry in model_dataset_dict.values():
            model, features = entry[0], entry[1].values

            # Same row split for every model, but on that model's own columns
            X_train, X_test = features[train_idx], features[test_idx]

            model.fit(X_train, y_train)
            per_model_pred.append(model.predict(X_test))

        # Transpose to one tuple of votes per sample; max() returns the first
        # maximal element, which keeps the "earliest model wins ties" rule
        y_pred_ensemble = [max(votes, key=votes.count) for votes in zip(*per_model_pred)]

        # Computing the scoring metrics for this fold
        for name, metric in metrics.items():
            scores[name].append(metric(y_test, y_pred_ensemble))

    # Computing the mean over the folds for each scoring metric
    scores = {name: np.mean(values) for name, values in scores.items()}

    return scores, y_pred_ensemble

# Run the ensemble CV again with the redefinition above; y_ensemble holds the voted labels of the last fold only
final_scores, y_ensemble = cross_validation_ensemble(model_dataset_dict, X, y, folds=10)

# Ensemble of ensemble

In [144]:
# cv = StratifiedKFold(n_splits=10, shuffle=True)

# for train_idx, test_idx in cv.split(X,y): 
#     continue

def ensemble_test(model_dataset_dict, train_idx, test_idx, y):
    """Fit every model on one train split and majority-vote the test predictions.

    Args:
        model_dataset_dict: Mapping name -> ``(model, dataset)``. ``dataset.values``
            is the feature matrix used for that model.
        train_idx: Row indices used for fitting.
        test_idx: Row indices to predict.
        y: Label array, indexed with ``train_idx``.

    Returns:
        list: One voted label per test row (empty when no model is given). On a
        tie the label predicted by the earliest model (dict order) is kept.
    """
    votes_by_model = []

    for entry in model_dataset_dict.values():
        model, features = entry[0], entry[1].values

        # Same row split for every model, but on that model's own columns
        X_train, X_test = features[train_idx], features[test_idx]

        model.fit(X_train, y[train_idx])
        votes_by_model.append(model.predict(X_test))

    # Transpose to one tuple of votes per sample and keep the most frequent
    # label; max() returns the first maximal element
    return [max(votes, key=votes.count) for votes in zip(*votes_by_model)]

# y_pred_ensemble = ensemble_test(model_dataset_dict, train_idx, test_idx, y)

# print(y_pred_ensemble)

In [145]:
def cross_validation_ensemble_1(model_dataset_dict, ensemble_dataset_dict, X, y, folds=10):
    """Cross-validate a two-level hard-voting ensemble.

    On every stratified fold the individual models of ``model_dataset_dict``
    are combined by ``ensemble_test`` into a single vote per sample. That vote
    is then added to the predictions of the models in ``ensemble_dataset_dict``
    and the most frequent label per sample is the final prediction. On a tie
    the earliest vote wins (models of ``ensemble_dataset_dict`` in dict order
    first, the combined vote last).

    Args:
        model_dataset_dict: Mapping name -> ``(model, dataset)`` of the
            individual learners; forwarded to ``ensemble_test``.
        ensemble_dataset_dict: Mapping name -> ``(model, dataset)`` of the
            learners that vote directly at the top level.
        X: Only handed to the CV splitter to generate the fold indices.
        y: Label array, indexed with the fold indices.
        folds: Number of stratified, shuffled folds.

    Returns:
        dict: Metric name -> mean score over the folds.

    Raises:
        ValueError: If both mappings are empty.
    """
    if not model_dataset_dict and not ensemble_dataset_dict:
        raise ValueError('at least one of model_dataset_dict / ensemble_dataset_dict must be non-empty')

    cv = StratifiedKFold(n_splits=folds, shuffle=True)

    # Metric name -> scoring function, evaluated in this order on every fold
    metrics = {
        'auc': roc_auc_score,
        'accuracy': accuracy_score,
        'f1': f1_score,
        'precision': precision_score,
        'recall': recall_score,
    }
    scores = {name: [] for name in metrics}

    for train_idx, test_idx in cv.split(X, y):

        # The labels do not depend on the model: slice them once per fold, so
        # y_test exists even when ensemble_dataset_dict is empty
        y_train, y_test = y[train_idx], y[test_idx]

        # First-level vote of the individual learners
        y_ensemble = ensemble_test(model_dataset_dict, train_idx, test_idx, y)

        # One prediction vector per top-level voter
        voters = []
        for entry in ensemble_dataset_dict.values():
            model, features = entry[0], entry[1].values

            # Same row split for every model, but on that model's own columns
            X_train, X_test = features[train_idx], features[test_idx]

            model.fit(X_train, y_train)
            voters.append(model.predict(X_test))

        # The combined first-level vote takes part as the last voter. It is
        # skipped when empty, otherwise zip() below would truncate everything.
        if y_ensemble:
            voters.append(y_ensemble)

        # Transpose to one tuple of votes per sample; max() returns the first
        # maximal element, which keeps the "earliest vote wins ties" rule
        y_pred_ensemble = [max(votes, key=votes.count) for votes in zip(*voters)]

        for name, metric in metrics.items():
            scores[name].append(metric(y_test, y_pred_ensemble))

    # Mean over the folds for each scoring metric
    scores = {name: np.mean(values) for name, values in scores.items()}

    return scores

# 5-fold CV of the two-level ensemble: the models of ensemble_dataset_dict vote together with the combined vote of model_dataset_dict
final_scores = cross_validation_ensemble_1(model_dataset_dict, ensemble_dataset_dict, X, y, folds=5)

print(final_scores)

{'auc': 0.926073926073926, 'accuracy': 0.9283076923076923, 'f1': 0.9353360560257112, 'precision': 0.9052106227106227, 'recall': 0.9703296703296704}


# Logistic Regression (LR)

In [143]:
from sklearn.linear_model import LogisticRegression
# train_test_split is used below but is otherwise first imported in a later
# cell, so running the file top to bottom would fail here without this import
from sklearn.model_selection import train_test_split

# Single hold-out evaluation of a default logistic regression on the current
# X / y (one random 67% / 33% split, no cross-validation)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Creating and fitting the model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Predicting the labels of the held-out rows
y_pred = lr_model.predict(X_test)

# Every metric is kept in a one-element list, the same layout the
# cross-validation helpers use for their per-fold scores
scores = {
    'auc': [roc_auc_score(y_test, y_pred)],
    'accuracy': [accuracy_score(y_test, y_pred)],
    'f1': [f1_score(y_test, y_pred)],
    'precision': [precision_score(y_test, y_pred)],
    'recall': [recall_score(y_test, y_pred)],
}

print(scores)

{'auc': [0.5], 'accuracy': [0.5238095238095238], 'f1': [0.6875000000000001], 'precision': [0.5238095238095238], 'recall': [1.0]}


# Standard Deviation (Standard Error)

In [201]:
def cross_validate_balancing(estimator, X, y, balance = SMOTE(), folds=10):
    """Run one stratified k-fold CV, optionally re-balancing each training split.

    Args:
        estimator: Model exposing ``fit`` and ``predict``; it is re-fitted in
            place on every fold.
        X: Feature matrix, indexed with the fold indices.
        y: Label array, indexed with the fold indices.
        balance: Resampler exposing ``fit_resample``; it is applied to the
            training split only, never to the test split. Pass the string
            ``'None'`` (historical switch used by the callers) or ``None`` to
            disable balancing. The default ``SMOTE()`` instance is created once
            when the function is defined and shared by all calls.
        folds: Number of stratified, shuffled folds.

    Returns:
        dict: Metric name -> list with one score per fold (not averaged).
    """
    cv = StratifiedKFold(n_splits=folds, shuffle=True)

    # Metric name -> scoring function, evaluated in this order on every fold
    metrics = {
        'auc': roc_auc_score,
        'accuracy': accuracy_score,
        'f1': f1_score,
        'precision': precision_score,
        'recall': recall_score,
    }
    scores = {name: [] for name in metrics}

    # A real None used to crash with AttributeError on ``None.fit_resample``;
    # it is now treated like the 'None' string
    apply_balancing = balance is not None and balance != 'None'

    for train_idx, test_idx in cv.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        if apply_balancing:
            X_train, y_train = balance.fit_resample(X_train, y_train)

        estimator.fit(X_train, y_train)
        y_predicted = estimator.predict(X_test)

        for name, metric in metrics.items():
            scores[name].append(metric(y_test, y_predicted))

    return scores



def cv_iteration(model, X, y, iterations=10, folds=10):
    """Repeat the k-fold CV several times and summarise the per-run means.

    Every run calls ``cross_validate_balancing`` without balancing and reduces
    its per-fold scores to one mean per metric.

    Args:
        model: Estimator forwarded to ``cross_validate_balancing``.
        X: Feature matrix.
        y: Label array.
        iterations: Number of CV repetitions.
        folds: Number of folds of each repetition.

    Returns:
        tuple: ``(means, sds)``, two dicts keyed by metric name holding the
        mean and the standard deviation (``np.std``) of the per-run means.
    """
    # metric name -> list with the mean score of every run
    per_run_means = {}

    for run in range(iterations):

        # Lightweight progress indicator on every second run
        if run % 2 == 0:
            print(f'{(run/iterations)*100}%', end=' -> ')

        fold_scores = cross_validate_balancing(model, X, y, balance='None', folds=folds)

        for metric, values in fold_scores.items():
            per_run_means.setdefault(metric, []).append(np.mean(values))

    # Spread and centre of the per-run means
    sd_dict = {metric: np.std(values) for metric, values in per_run_means.items()}
    iter_result_no_smote = {metric: np.mean(values) for metric, values in per_run_means.items()}

    return iter_result_no_smote, sd_dict
    


def cross_validate(models_dict, X, y, iterations=10, folds=10):
    """Evaluate several models with repeated CV and tabulate ``mean +/- sd``.

    Args:
        models_dict: Mapping name -> estimator.
        X: Feature matrix.
        y: Label array.
        iterations: Number of CV repetitions per model.
        folds: Number of folds of each repetition.

    Returns:
        pandas.DataFrame: One row per model (indexed by its name) and one
        column per metric, with ``auc`` and ``accuracy`` first. Every cell is
        the string ``'<mean> +/- <sd>'``. Rows are sorted by the ``auc``
        column, descending; the sort compares the formatted strings. An empty
        ``models_dict`` yields an empty frame.
    """
    rows = {}

    for key, model in models_dict.items():

        print('\n', key, '--> Executing...')

        # Getting the results - Iterating n times the CV procedure
        means, sd_dict = cv_iteration(model, X, y, iterations=iterations, folds=folds)

        rows[key] = {
            metric: str(round(value, 3)) + ' +/- ' + str(round(sd_dict[metric], 2))
            for metric, value in means.items()
        }

    print('Done')

    # Build the table in one go: DataFrame.append was removed in pandas 2.0
    results_no_smote_df = pd.DataFrame(list(rows.values()), index=list(rows.keys()))

    if results_no_smote_df.empty:
        return results_no_smote_df

    # Put auc and accuracy first by name. The old position swap only worked
    # because DataFrame.append happened to sort the columns alphabetically.
    leading = [col for col in ('auc', 'accuracy') if col in results_no_smote_df.columns]
    trailing = [col for col in results_no_smote_df.columns if col not in leading]
    results_no_smote_df = results_no_smote_df[leading + trailing]

    # Sorting the models by the results of the AUC metric
    results_no_smote_df = results_no_smote_df.sort_values(['auc'], ascending=False)

    return results_no_smote_df

In [202]:
# Load the proteomics sheet again (same workbook and sheet as the earlier cells)
lisbon_coimbra_df = pd.read_excel(
    '190 proteins quantified_analysis.xls',
    sheet_name='Lisbon_Coimbra_proteomics',
)

# Target vector: 1 -> A+ (Amyloid-Positive) ; 2 -> A- (every other group value)
y = np.where(lisbon_coimbra_df['Group'] == 'Amyloid-Positive', 1, 2)

# Identifier and label columns are not features
lisbon_coimbra_df = lisbon_coimbra_df.drop(columns=['Sample Name', 'Group'])

# Feature matrix built from all remaining columns
X = lisbon_coimbra_df.values

# Individual learners to compare
models_dict = dict(SVM=svm_model, NB=nb_model, kNN=knn_model, DT=dt_model)

# 10 repetitions of a 10-fold CV per model, without balancing
lisbon_coimbra_no_smote_df = cross_validate(models_dict, X, y, iterations=10, folds=10)


 SVM --> Executing...
0.0% -> 20.0% -> 40.0% -> 60.0% -> 80.0% -> 
 NB --> Executing...
0.0% -> 20.0% -> 40.0% -> 60.0% -> 80.0% -> 
 kNN --> Executing...
0.0% -> 20.0% -> 40.0% -> 60.0% -> 80.0% -> 
 DT --> Executing...
0.0% -> 20.0% -> 40.0% -> 60.0% -> 80.0% -> Done


In [203]:
# Bare expression: in the notebook this renders the results table as the cell output
lisbon_coimbra_no_smote_df

Unnamed: 0,auc,accuracy,f1,precision,recall
NB,0.832 +/- 0.01,0.834 +/- 0.01,0.844 +/- 0.01,0.838 +/- 0.01,0.868 +/- 0.01
DT,0.824 +/- 0.02,0.825 +/- 0.02,0.83 +/- 0.02,0.847 +/- 0.02,0.838 +/- 0.03
SVM,0.735 +/- 0.02,0.742 +/- 0.01,0.778 +/- 0.01,0.731 +/- 0.01,0.846 +/- 0.02
kNN,0.652 +/- 0.02,0.662 +/- 0.02,0.719 +/- 0.01,0.654 +/- 0.02,0.816 +/- 0.02


In [196]:
# Scratch check of the 'mean +/- sd' formatting used when building the results table
model_result = {'auc': 0.650452380952381, 'accuracy': 0.6610897435897436, 'f1': 0.715760515885903, 'precision': 0.6497922077922078, 'recall': 0.8135714285714}
sd = {'auc': 0.016051645672584026, 'accuracy': 0.016677018481450392, 'f1': 0.016981700695697755, 'precision': 0.012600357189387828, 'recall': 0.028433197358336958}

# The original cell read a global `key` that is not defined anywhere at module
# level (a leftover of the notebook session). The recorded output shows it was
# 'xgb', so it is set explicitly to keep the cell runnable.
key = 'xgb'

# Build the formatted values in a new dict instead of rewriting the one being iterated
model_result = {
    k: str(round(v, 3)) + ' +/- ' + str(round(sd[k], 2))
    for k, v in model_result.items()
}

new_row = pd.Series(data=model_result, name=key)
print(new_row)

auc           0.65 +/- 0.02
accuracy     0.661 +/- 0.02
f1           0.716 +/- 0.02
precision     0.65 +/- 0.01
recall       0.814 +/- 0.03
Name: xgb, dtype: object


# Recursive Feature Elimination & Addition

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from IPython.display import clear_output
from sklearn.utils import shuffle

In [12]:
# Read the proteomics sheet from the Excel workbook
lisbon_coimbra_df = pd.read_excel(
    '190 proteins quantified_analysis.xls',
    sheet_name='Lisbon_Coimbra_proteomics',
)

# TEMPORARY (to be removed): shrink the dataset to keep the experiments below fast.
# Keep 30 randomly chosen rows, renumbered from 0 ...
shuffled_df = shuffle(lisbon_coimbra_df)
lisbon_coimbra_df = shuffled_df[:30].reset_index().drop('index', axis=1)
# ... and only the first 31 columns
lisbon_coimbra_df = lisbon_coimbra_df.iloc[:, :31]

n_rows, n_cols = lisbon_coimbra_df.shape[0], lisbon_coimbra_df.shape[1]
print(f'The dataset has: {n_rows} examples and {n_cols} features \n')

# Target vector: 1 -> A+ (Amyloid-Positive) ; 2 -> A- (every other group value)
y = np.array([1 if group == 'Amyloid-Positive' else 2 for group in lisbon_coimbra_df['Group']])
labels = list(y)
print(f'- The number of Amyloid-Positive patients is: {labels.count(1)} \n- The number of Amyloid-Negative patients is: {labels.count(2)} \n')

# Identifier and label columns are not features
lisbon_coimbra_df = lisbon_coimbra_df.drop(columns=['Sample Name', 'Group'])

# Feature matrix built from all remaining columns
X = lisbon_coimbra_df.values
print(f'The final X matrix is made out of: {X.shape[0]} examples and {X.shape[1]} features')

The dataset has: 30 examples and 31 features 

- The number of Amyloid-Positive patients is: 13 
- The number of Amyloid-Negative patients is: 17 

The final X matrix is made out of: 30 examples and 29 features


#### Defining the function that I will use

In [13]:
def svm_feat_importance(model, Dataset, X, y, iterations=100):
    """Average the permutation importance of every feature over random splits.

    Args:
        model: Estimator that is re-fitted on every iteration.
        Dataset: DataFrame whose column labels name the columns of ``X``
            (position ``idx`` of the importances maps to ``Dataset.keys()[idx]``).
        X: Feature matrix.
        y: Label array.
        iterations: Number of random 75% / 25% train/test splits.

    Returns:
        tuple: ``(features_svm_sorted, feature_importance_name_svm)``, a dict
        feature name -> mean importance ordered from most to least important,
        and the list of its keys in the same order.
    """
    feature_names = Dataset.keys()

    # feature name -> importance obtained on every split
    collected = {}

    for step in range(iterations):

        # Refresh the progress display every 50 iterations
        if step % 50 == 0:
            clear_output(wait=True)
            print(f'{(step/iterations)*100}%')

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

        model.fit(X_train, y_train)

        # Mean importance of every feature, measured on the held-out rows
        feat_importance_svm = permutation_importance(model, X_test, y_test)['importances_mean']

        print(feat_importance_svm)

        for idx, score in enumerate(feat_importance_svm):
            collected.setdefault(feature_names[idx], []).append(score)

    mean_scores = {name: np.mean(values) for name, values in collected.items()}

    # Most important feature first
    ranked = sorted(mean_scores.items(), key=lambda item: item[1], reverse=True)
    features_svm_sorted = dict(ranked)

    return features_svm_sorted, list(features_svm_sorted.keys())

## **Feature Addition**

In [25]:
def feature_addition_svm(Dataset, model, y, iterations=15, test_size=0.25):
    """Rank the features by repeatedly extracting the most important one.

    In every round the model is fitted on ``iterations`` random splits of the
    remaining columns, the permutation importances are averaged per feature,
    and the feature with the highest mean is appended to the ranking and
    removed from the working frame. The caller's DataFrame is not modified
    (``drop`` returns a new frame that is rebound locally).

    Args:
        Dataset: DataFrame with one column per candidate feature.
        model: Estimator that is re-fitted on every split.
        y: Label array aligned with the rows of ``Dataset``.
        iterations: Number of random splits averaged per round (was the
            hard-coded value 15).
        test_size: Test fraction of every split (was the hard-coded value 0.25).

    Returns:
        None. The final ranking (best feature first) is printed.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError('iterations must be >= 1')

    final_rank = []

    for i in range(len(Dataset.keys())):

        print(i)
        clear_output(wait=True)

        # Both only change when a column is dropped: compute them once per round
        feature_names = Dataset.keys()
        X = Dataset.values

        # feature name -> importance obtained on every split of this round
        score_feat_imp_svm = {}

        for _ in range(iterations):

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

            # Fitting the model
            model.fit(X_train, y_train)

            # Importance of every remaining feature, measured on the held-out rows
            imps = permutation_importance(model, X_test, y_test)

            for idx, score in enumerate(imps['importances_mean']):
                score_feat_imp_svm.setdefault(feature_names[idx], []).append(score)

        score_feat_imp_svm = {k: np.mean(v) for k, v in score_feat_imp_svm.items()}

        # Most important first; the stable sort keeps column order among equal scores
        features_svm_sorted = dict(sorted(score_feat_imp_svm.items(), key=lambda x: x[1], reverse=True))

        best_feat = list(features_svm_sorted.keys())[0]

        final_rank.append(best_feat)

        Dataset = Dataset.drop(best_feat, axis=1)

    print(final_rank)

In [26]:
# Rank the features by repeatedly extracting the most important one (the ranking is printed)
feature_addition_svm(lisbon_coimbra_df, svm_model, y)

['P02768_ALBU', 'P01011_AACT', 'P01019_ANGT', 'P01024_CO3', 'P02766_TTHY', 'P00450_CERU', 'P0DOX5_IGG1', 'P41222_PTGDS', 'P10909_CLUS', 'P02787_TRFE', 'P02649_APOE', 'P02790_HEMO', 'P02774_VTDB', 'P02647_APOA1', 'P01009_A1AT', 'O94985_CSTN1', 'P02749_APOH', 'P09871_C1S', 'P06396_GELS', 'P01023_A2MG', 'P02751_FINC', 'P23142_FBLN1', 'Q13822_ENPP2', 'Q92823_NRCAM', 'P02679_FIBG', 'P05155_IC1', 'Q96KN2_CNDP1', 'Q12860_CNTN1', 'P08603_CFAH']


## **Feature Elimination**

In [30]:
def feature_elimination_svm(Dataset, model, y, iterations=10, test_size=0.25):
    """Rank the features by repeatedly discarding the least important one.

    In every round the model is fitted on ``iterations`` random splits of the
    remaining columns, the permutation importances are averaged per feature,
    and the feature with the lowest mean is put at the front of the ranking
    and removed from the working frame. The feature eliminated last therefore
    ends up first. The caller's DataFrame is not modified (``drop`` returns a
    new frame that is rebound locally).

    Args:
        Dataset: DataFrame with one column per candidate feature.
        model: Estimator that is re-fitted on every split.
        y: Label array aligned with the rows of ``Dataset``.
        iterations: Number of random splits averaged per round (was the
            hard-coded value 10).
        test_size: Test fraction of every split (was the hard-coded value 0.25).

    Returns:
        None. The round number and the sorted importances of every round are
        printed, followed by the final ranking.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError('iterations must be >= 1')

    final_rank = []

    for i in range(len(Dataset.keys())):

        print(i)

        # Both only change when a column is dropped: compute them once per round
        feature_names = Dataset.keys()
        X = Dataset.values

        # feature name -> importance obtained on every split of this round
        score_feat_imp_svm = {}

        for _ in range(iterations):

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

            # Fitting the model
            model.fit(X_train, y_train)

            # Importance of every remaining feature, measured on the held-out rows
            imps = permutation_importance(model, X_test, y_test)

            for idx, score in enumerate(imps['importances_mean']):
                score_feat_imp_svm.setdefault(feature_names[idx], []).append(score)

        score_feat_imp_svm = {k: np.mean(v) for k, v in score_feat_imp_svm.items()}

        # Most important first; the stable sort keeps column order among equal scores
        features_svm_sorted = dict(sorted(score_feat_imp_svm.items(), key=lambda x: x[1], reverse=True))

        print(features_svm_sorted)

        # Last entry of the descending order, i.e. the last column among the
        # features sharing the lowest score
        worst_feat = list(features_svm_sorted.keys())[-1]

        final_rank.insert(0, worst_feat)

        Dataset = Dataset.drop(worst_feat, axis=1)

    print(final_rank)

In [31]:
# Rank the features by repeatedly discarding the least important one (the ranking is printed)
feature_elimination_svm(lisbon_coimbra_df, svm_model, y)

0
{'P02768_ALBU': 0.0225, 'P01024_CO3': 0.0075, 'P02787_TRFE': 0.005, 'P01011_AACT': 0.0025, 'P02766_TTHY': 0.0025, 'P10909_CLUS': 0.0025, 'P01009_A1AT': 0.0, 'P41222_PTGDS': 0.0, 'P00450_CERU': 0.0, 'P06396_GELS': 0.0, 'P02790_HEMO': 0.0, 'P01023_A2MG': 0.0, 'P09871_C1S': 0.0, 'P02774_VTDB': 0.0, 'P23142_FBLN1': 0.0, 'P01019_ANGT': 0.0, 'Q96KN2_CNDP1': 0.0, 'P02749_APOH': 0.0, 'P02751_FINC': 0.0, 'P0DOX5_IGG1': 0.0, 'Q92823_NRCAM': 0.0, 'P02647_APOA1': 0.0, 'O94985_CSTN1': 0.0, 'P05155_IC1': 0.0, 'P02679_FIBG': 0.0, 'P08603_CFAH': 0.0, 'Q13822_ENPP2': 0.0, 'Q12860_CNTN1': 0.0, 'P02649_APOE': -0.017499999999999998}
1
{'P01024_CO3': 0.0, 'P01011_AACT': 0.0, 'P02766_TTHY': 0.0, 'P41222_PTGDS': 0.0, 'P10909_CLUS': 0.0, 'P00450_CERU': 0.0, 'P02787_TRFE': 0.0, 'P06396_GELS': 0.0, 'P02790_HEMO': 0.0, 'P01023_A2MG': 0.0, 'P09871_C1S': 0.0, 'P02774_VTDB': 0.0, 'P23142_FBLN1': 0.0, 'P01019_ANGT': 0.0, 'Q96KN2_CNDP1': 0.0, 'P02749_APOH': 0.0, 'P02751_FINC': 0.0, 'Q92823_NRCAM': 0.0, 'P02647_APOA