# Libraries

In [None]:
#Classification Methods
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

#Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from yellowbrick.classifier import ClassificationReport 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score

#Tools
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection
import string 
import time as tm
import os
from scipy.sparse import csr_matrix 
from yellowbrick.model_selection import FeatureImportances

#Class balance
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTEN
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import KMeansSMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.under_sampling import ClusterCentroids 
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Parameters for the audible "cell finished" signal: later cells call
# winsound.Beep(freq, duration) after each scenario has run.
# NOTE(review): winsound is a Windows-only standard-library module, so this
# import is expected to fail on other platforms -- confirm the target OS.
import winsound
duration = 1000  # milliseconds
freq = 440  # Hz

## Functions

In [None]:
def classifier_metrics(X_train, X_test, y_train, y_test, HO=True, CV=True, models=None):
    """Fit every classifier and print hold-out and/or cross-validation metrics.

    For each estimator the function prints a separator and the estimator's
    repr, converts the feature matrices to the representation used for that
    estimator (dense ndarray for GaussianNB / LDA / QDA, CSR sparse matrix
    otherwise), and then runs the requested evaluations.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices; anything ``scipy.sparse.csr_matrix`` accepts.
    y_train, y_test : array-like
        Class labels (numeric or strings).
    HO : bool, default True
        Run the hold-out evaluation: fit on the train split, predict the test
        split and print timings, accuracy, weighted F1 / recall / precision,
        MSE, ROC-AUC and Cohen's kappa.
    CV : bool, default True
        Run 10-fold cross-validation (accuracy) on train and test stacked
        together, printing the elapsed time, mean and standard deviation.
    models : iterable of estimators, optional
        Estimators to evaluate. When omitted, the module-level ``classifiers``
        list is used, which is what the original implementation always did.

    Returns
    -------
    None
        All results are printed.
    """
    # Estimators that are fed dense arrays instead of CSR matrices.
    dense_only = ("GaussianNB", "LinearDiscriminantAnalysis", "QuadraticDiscriminantAnalysis")

    if models is None:
        # Backward-compatible fallback: the notebook defines `classifiers`
        # at module level right before calling this function.
        models = classifiers

    def needs_dense(model):
        # Detection is by substring of the estimator's repr, as in the original.
        model_repr = str(model)
        return any(tag in model_repr for tag in dense_only)

    def hold_out(model, X_tr, X_te):
        print("\nHold-Out in process...")
        start_time = tm.time()
        model.fit(X_tr, y_train)
        print("Time, Training: {0:.4f} [seconds]".format(tm.time() - start_time))
        start_time = tm.time()
        y_pred = model.predict(X_te)
        print("Time, Prediction: {0:.4f} [seconds]".format(tm.time() - start_time))

        print('accuracy_score: {0:.4f}'.format(accuracy_score(y_test, y_pred)))
        print('f1_score: {0:.4f}'.format(f1_score(y_test, y_pred, average='weighted')))
        print('recall_score: {0:.4f}'.format(recall_score(y_test, y_pred, average='weighted')))
        print('precision_score: {0:.4f}'.format(precision_score(y_test, y_pred, average='weighted')))

        class_labels = np.unique(np.asarray(y_train))

        # MSE needs numbers, so string labels are integer-encoded first.
        # isinstance() covers both Python str (object arrays) and np.str_
        # (unicode arrays); comparing the type *name* to 'str' missed np.str_.
        if isinstance(class_labels[0], str):
            encoder = LabelEncoder().fit(list(class_labels))
            mse_s = MSE(encoder.transform(y_test), encoder.transform(y_pred))
        else:
            mse_s = MSE(y_test, y_pred)
        print('MSE: {0:.4f}'.format(mse_s))

        # Multiclass ROC-AUC takes the full probability matrix; the binary
        # case takes only the second column of predict_proba.
        y_score = model.predict_proba(X_te)
        if len(class_labels) <= 2:
            y_score = y_score[:, 1]
        roc_s = roc_auc_score(y_test, y_score, multi_class='ovo', average='weighted')
        print('ROC_AUC: {0:.4f}'.format(roc_s))

        print('CK: {0:.4f}'.format(cohen_kappa_score(y_test, y_pred)))

    def cross_validate(model, X_tr, X_te):
        print('\nCross-Validation in process...')
        start_time = tm.time()
        kfold = model_selection.KFold(n_splits=10)
        # Cross-validation runs on the train and test splits stacked together.
        y_all = np.concatenate((y_train, y_test))
        if needs_dense(model):
            X_all = np.concatenate((X_tr, X_te))
        else:
            X_all = csr_matrix(np.concatenate((X_tr.toarray(), X_te.toarray())))
        cv_results = np.array(model_selection.cross_val_score(
            model, X_all, y_all, cv=kfold, scoring='accuracy', n_jobs=-3))

        # Folds whose score is NaN are dropped before aggregating.
        cv_results = cv_results[np.logical_not(np.isnan(cv_results))]
        print("Time, CV: {0:.4f} [seconds]".format(tm.time() - start_time))
        print('CV: {0:.4f} {1:.4f}'.format(cv_results.mean(), cv_results.std()))

    for model in models:
        print("---------------------------------------------------------------------------------\n")
        print(str(model))

        # Always go through CSR first (as the original did), then densify
        # only for the estimators listed in `dense_only`.
        X_tr = csr_matrix(X_train)
        X_te = csr_matrix(X_test)
        if needs_dense(model):
            X_tr = X_tr.toarray()
            X_te = X_te.toarray()

        if HO:
            hold_out(model, X_tr, X_te)
        if CV:
            cross_validate(model, X_tr, X_te)
        print()


In [None]:
def class_balance_over_sampling(features, labels, HO=False, CV=True, methods_list=("RandomOverSampler",)):
    """Evaluate the classifiers after over-sampling the training split.

    For every recognised name in ``methods_list`` the data is split 80/20
    (stratified, ``random_state=21``), only the training split is resampled
    with the corresponding imbalanced-learn sampler, the class counts are
    printed, and ``classifier_metrics`` is run on the result. The test split
    is never resampled.

    Parameters
    ----------
    features : array-like
        Feature matrix.
    labels : array-like
        Class labels, also used to stratify the split.
    HO, CV : bool
        Forwarded unchanged to ``classifier_metrics``.
    methods_list : iterable of str
        Sampler names, processed in order. Recognised names are
        "RandomOverSampler", "SMOTE", "SMOTEN", "ADASYN", "BorderlineSMOTE",
        "KMeansSMOTE" and "SVMSMOTE"; any other entry is silently skipped.

    Returns
    -------
    None
        All results are printed.
    """
    # Sampler factories keyed by method name. They are lambdas so a sampler
    # is only constructed when requested, and so KMeansSMOTE can size
    # k_neighbors from the number of classes in the test split.
    # SVMSMOTE intentionally keeps random_state=8, as in the original code.
    # NOTE(review): recent imbalanced-learn releases deprecate the `n_jobs`
    # argument of the SMOTE-family samplers -- confirm against the installed
    # version before upgrading.
    sampler_factories = {
        "RandomOverSampler": lambda y_te: RandomOverSampler(random_state=21),
        "SMOTE": lambda y_te: SMOTE(random_state=21, n_jobs=-1),
        "SMOTEN": lambda y_te: SMOTEN(random_state=21, n_jobs=-1),
        "ADASYN": lambda y_te: ADASYN(random_state=21, n_jobs=-1),
        "BorderlineSMOTE": lambda y_te: BorderlineSMOTE(random_state=21, n_jobs=-1),
        "KMeansSMOTE": lambda y_te: KMeansSMOTE(random_state=21, n_jobs=-1,
                                                k_neighbors=np.unique(y_te).shape[0]),
        "SVMSMOTE": lambda y_te: SVMSMOTE(random_state=8, n_jobs=-1),
    }

    for method in methods_list:
        make_sampler = sampler_factories.get(method)
        if make_sampler is None:
            # Unknown method names are ignored, matching the original behavior.
            continue

        print(method)
        print("originals labels unique: ", np.unique(labels, return_counts=True))
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=0.20, random_state=21, stratify=labels)

        sampler = make_sampler(y_test)
        X_train, y_train = sampler.fit_resample(X_train, y_train)
        print("y_train labels unique:   ", np.unique(y_train, return_counts=True))
        print("y_test labels unique:    ", np.unique(y_test, return_counts=True))
        classifier_metrics(X_train, X_test, y_train, y_test, HO=HO, CV=CV)

In [None]:
# Confusion Matrix
def CM_viz(y_test, y_pred, classes, name,
           path_img_base='./images', nrows=1, ncols=1, size_text_legend=25, size_text_title=25, title="",
           size_text_xy_labels=25, size_text_xy_tick=25,
           size_num_inter=25):
    """Draw a confusion-matrix heatmap and, for a standalone plot, save it as PDF.

    Parameters
    ----------
    y_test, y_pred : array-like
        True and predicted labels passed to ``confusion_matrix``.
    classes : array-like
        Labels whose unique values are used as the tick labels of both axes.
    name : str
        File-name stem; a standalone figure is saved as ``<name>_CM.pdf``.
    path_img_base : str
        Output directory; created if it does not exist.
    nrows, ncols : int
        When both are 1 a new 20x20 figure is created and saved. Otherwise
        the heatmap is drawn on the current axes and nothing is saved.
    size_text_legend : int
        Accepted for interface compatibility; not used by this function.
    size_text_title, size_text_xy_labels, size_text_xy_tick, size_num_inter : int
        Font sizes for the title, axis labels, tick labels (including the
        colorbar) and the numbers inside the cells.
    title : str
        Axes title.

    Returns
    -------
    The axes object returned by ``seaborn.heatmap``.
    """
    os.makedirs(path_img_base, exist_ok=True)

    standalone = nrows == 1 and ncols == 1
    if standalone:
        # New figure so the heatmap is not drawn onto whatever axes are current.
        plt.figure(figsize=(20, 20))

    conf = confusion_matrix(y_test, y_pred)
    annot_kws = {'fontsize': size_num_inter, 'verticalalignment': 'center'}
    tick_labels = np.unique(classes)
    ax = sns.heatmap(conf, annot=True, cmap='Blues', fmt='d', annot_kws=annot_kws,
                     xticklabels=tick_labels, yticklabels=tick_labels)

    cbar = ax.collections[0].colorbar  # matplotlib.colorbar.Colorbar object
    cbar.ax.tick_params(labelsize=size_text_xy_tick)
    ax.xaxis.set_tick_params(labelsize=size_text_xy_tick, rotation=90)
    ax.yaxis.set_tick_params(labelsize=size_text_xy_tick, rotation=0)
    ax.set_xlabel('Predicted Values', fontsize=size_text_xy_labels)
    ax.set_ylabel('Actual Values', fontsize=size_text_xy_labels)
    ax.set_title(title, fontsize=size_text_title)

    if standalone:
        ax.figure.subplots_adjust(right=0.8)
        # Save into the directory created above. The original referenced an
        # undefined `path_figures` name here, which raised NameError.
        ax.figure.savefig(path_img_base + "/" + name + "_CM" + ".pdf",
                          bbox_inches="tight", format='pdf')
    return ax

In [None]:
def load_data_complete_s1(path):
    """Load the CSV for Scenario 1 (Tumor_Core vs Tumor_Periphery).

    Rows whose ``classes`` value is "NP" are removed. Returns a tuple
    ``(features, labels)``: the remaining rows without the ``classes``
    column (original row index kept) and the ``classes`` values as an array.
    """
    raw = pd.read_csv(path)

    # Keep everything except the N_Periphery samples.
    kept = raw[raw.classes != "NP"]

    features = kept.drop(columns=['classes'])
    labels = kept['classes'].values
    return features, labels

def load_data_complete_s2(path):
    """Load the CSV for Scenario 2 (Normal_Periphery vs Tumor_Periphery).

    Rows whose ``classes`` value is "TC" are removed. Returns a tuple
    ``(features, labels)``: the remaining rows without the ``classes``
    column (original row index kept) and the ``classes`` values as an array.
    """
    raw = pd.read_csv(path)

    # Keep everything except the T_Core samples.
    kept = raw[raw.classes != "TC"]

    features = kept.drop(columns=['classes'])
    labels = kept['classes'].values
    return features, labels

def load_data_complete_s3(path):
    """Load the CSV for Scenario 3 (Tumor_Periphery&Core vs Normal_Periphery).

    The result stacks two pieces, in this order, with a fresh 0..n-1 index:
    first the rows that are neither "TP" nor "TC", with their labels
    untouched; then the rows that are not "NP", all relabelled "TPC".

    Returns a tuple ``(features, labels)``: the stacked rows without the
    ``classes`` column and the ``classes`` values as an array.
    """
    raw = pd.read_csv(path)

    # Isolate the N_Periphery side: drop both tumor classes.
    normal_part = raw[~raw.classes.isin(["TP", "TC"])]

    # Tumor side: everything that is not N_Periphery, merged under one label.
    tumor_part = raw[raw.classes != "NP"].assign(classes="TPC")

    merged = pd.concat([normal_part, tumor_part]).reset_index(drop=True)

    features = merged.drop(columns=['classes'])
    labels = merged['classes'].values
    return features, labels

def load_data_complete_s4(path):
    """Load the CSV for Scenario 4 (Tumor_Core, Tumor_Periphery and N_Periphery).

    No rows are filtered. Returns a tuple ``(features, labels)``: every
    column except ``classes``, and the ``classes`` values as an array.
    """
    raw = pd.read_csv(path)

    features = raw.drop(columns=['classes'])
    labels = raw['classes'].values
    return features, labels

# Dataset

In [None]:
# Location of the complete dataset CSV, relative to the notebook's working directory.
path = '../Data/DATA_Complete_GBM.csv'

# Build one (features, labels) pair per scenario. Each loader re-reads the
# CSV and applies its own class filtering / relabelling.
featuress1,labelss1=load_data_complete_s1(path)
featuress2,labelss2=load_data_complete_s2(path)
featuress3,labelss3=load_data_complete_s3(path)
featuress4,labelss4=load_data_complete_s4(path)


# Machine learning application

## Scenario 1

In [None]:
# Scenario 1 baseline, without class balancing:
# stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(featuress1, labelss1, test_size=0.20, random_state=21, stratify=labelss1)

# Sanity check: split shapes and per-class counts in each split.
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape) 
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True)) 

# ML models. `classifiers` must be a module-level name: classifier_metrics
# iterates over it as a global.
classifiers=[
    XGBClassifier(eval_metric='mlogloss',n_jobs=-1),
    LogisticRegression(solver='liblinear',n_jobs=-1),
    SVC(probability=True,kernel='linear'),
    GradientBoostingClassifier()  
    ] 

# Print the hold-out metrics for every model (cross-validation disabled).
classifier_metrics(X_train,X_test,y_train,y_test,HO=True,CV=False) 

In [None]:
# ML models for Scenario 1 with over-sampling. `classifiers` is read as a
# global by classifier_metrics, which class_balance_over_sampling calls.
classifiers=[
    XGBClassifier(eval_metric='mlogloss',n_jobs=-1),
    LogisticRegression(solver='liblinear',n_jobs=-1),
    SVC(probability=True,kernel='linear'),
    GradientBoostingClassifier()    
    ] 

# The string literal below is not executed as code; it is kept as a
# reference list of the over-sampling method names.
'''
methods_list=["RandomOverSampler",  
              "SMOTE",
              "SMOTEN",
              "ADASYN",
              "BorderlineSMOTE",
              #"KMeansSMOTE", #No in scenario 1,2, and 4, only use if you have a lot of data
              "SVMSMOTE"
             ]
'''

# Hold-out evaluation (no cross-validation) once per over-sampling method,
# in the order listed.
class_balance_over_sampling(featuress1, labelss1, HO=True, CV=False, 
                            methods_list=["ADASYN","SVMSMOTE","SMOTE","RandomOverSampler",
                                          "SMOTEN","BorderlineSMOTE"])

In [None]:
# Audible signal that the Scenario 1 cells have finished running.
winsound.Beep(freq, duration)

## Scenario 2

In [None]:
# Scenario 2 baseline, without class balancing:
# stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(featuress2, labelss2, test_size=0.20, random_state=21, stratify=labelss2)

# Sanity check: split shapes and per-class counts in each split.
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape) 
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True)) 

# ML models. `classifiers` must be a module-level name: classifier_metrics
# iterates over it as a global.
classifiers=[
    LogisticRegression(solver='liblinear',n_jobs=-1),
    SVC(probability=True,kernel='linear'),
    RandomForestClassifier(n_jobs=-1,random_state=8),
    ExtraTreesClassifier()    
    ] 

# Print the hold-out metrics for every model (cross-validation disabled).
classifier_metrics(X_train,X_test,y_train,y_test,HO=True,CV=False) 

In [None]:
# ML models for Scenario 2 with over-sampling. `classifiers` is read as a
# global by classifier_metrics, which class_balance_over_sampling calls.
classifiers=[
    LogisticRegression(solver='liblinear',n_jobs=-1),
    SVC(probability=True,kernel='linear'),
    RandomForestClassifier(n_jobs=-1,random_state=8),
    ExtraTreesClassifier()   
    ] 

# The string literal below is not executed as code; it is kept as a
# reference list of the over-sampling method names.
'''
methods_list=["RandomOverSampler",  
              "SMOTE",
              "SMOTEN",
              "ADASYN",
              "BorderlineSMOTE",
              #"KMeansSMOTE", #No in scenario 1,2, and 4, only use if you have a lot of data
              "SVMSMOTE"
             ]
'''

# Hold-out evaluation (no cross-validation) once per over-sampling method,
# in the order listed.
class_balance_over_sampling(featuress2, labelss2, HO=True, CV=False, 
                            methods_list=["ADASYN","SVMSMOTE","SMOTE","RandomOverSampler",
                                          "SMOTEN","BorderlineSMOTE"])

In [None]:
# Audible signal that the Scenario 2 cells have finished running.
winsound.Beep(freq, duration)

## Scenario 3

In [None]:
# Scenario 3 baseline, without class balancing:
# stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(featuress3, labelss3, test_size=0.20, random_state=21, stratify=labelss3)

# Sanity check: split shapes and per-class counts in each split.
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape) 
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True)) 

# ML models. `classifiers` must be a module-level name: classifier_metrics
# iterates over it as a global.
classifiers=[
    GradientBoostingClassifier(random_state=8),
    XGBClassifier(eval_metric='mlogloss',n_jobs=-1),
    RandomForestClassifier(n_jobs=-1,random_state=32),
    SVC(probability=True,kernel='linear')
    ] 

# Print the hold-out metrics for every model (cross-validation disabled).
classifier_metrics(X_train,X_test,y_train,y_test,HO=True,CV=False) 

In [None]:
# ML models for Scenario 3 with over-sampling. `classifiers` is read as a
# global by classifier_metrics, which class_balance_over_sampling calls.
classifiers=[
    GradientBoostingClassifier(random_state=8),
    XGBClassifier(eval_metric='mlogloss',n_jobs=-1),
    RandomForestClassifier(n_jobs=-1,random_state=32),
    SVC(probability=True,kernel='linear')
    ] 

# The string literal below is not executed as code; it is kept as a
# reference list of the over-sampling method names.
'''
methods_list=["RandomOverSampler",  
              "SMOTE",
              "SMOTEN",
              "ADASYN",
              "BorderlineSMOTE",
              #"KMeansSMOTE", #No in scenario 1,2, and 4, only use if you have a lot of data
              "SVMSMOTE"
             ]
'''

# Hold-out evaluation (no cross-validation) once per over-sampling method,
# in the order listed.
class_balance_over_sampling(featuress3, labelss3, HO=True, CV=False, 
                            methods_list=["ADASYN","SVMSMOTE","SMOTE","RandomOverSampler",
                                          "SMOTEN","BorderlineSMOTE"])

In [None]:
# Audible signal that the Scenario 3 cells have finished running.
winsound.Beep(freq, duration)

## Scenario 4

In [None]:
# Scenario 4 baseline, without class balancing:
# stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(featuress4, labelss4, test_size=0.20, random_state=21, stratify=labelss4)

# Sanity check: split shapes and per-class counts in each split.
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape) 
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True)) 

# ML models. `classifiers` must be a module-level name: classifier_metrics
# iterates over it as a global.
# NOTE(review): load_data_complete_s4 keeps every class in the CSV, yet the
# XGBClassifier below asks for objective='binary:logistic' -- confirm how
# the installed xgboost handles this when there are more than two classes.
# NOTE(review): both nthread=4 and n_jobs=-1 are passed; presumably only one
# of these thread settings is needed -- verify which one takes effect.
classifiers=[
    XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=4,
                  min_child_weight=4, gamma=0, subsample=0.8, colsample_bytree=0.8,
                  objective= 'binary:logistic', nthread=4, seed=27, eval_metric='mlogloss', n_jobs=-1),
    GradientBoostingClassifier(),
    LogisticRegression(solver='liblinear',n_jobs=-1),
    SVC(probability=True,kernel='linear')    
    ] 

# Print the hold-out metrics for every model (cross-validation disabled).
classifier_metrics(X_train,X_test,y_train,y_test,HO=True,CV=False) 

In [None]:
# ML models for Scenario 4 with over-sampling. `classifiers` is read as a
# global by classifier_metrics, which class_balance_over_sampling calls.
# NOTE(review): load_data_complete_s4 keeps every class in the CSV, yet the
# XGBClassifier below asks for objective='binary:logistic' -- confirm how
# the installed xgboost handles this when there are more than two classes.
classifiers=[
    XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=4,
                  min_child_weight=4, gamma=0, subsample=0.8, colsample_bytree=0.8,
                  objective= 'binary:logistic', nthread=4, seed=27, eval_metric='mlogloss', n_jobs=-1),
    GradientBoostingClassifier(),
    LogisticRegression(solver='liblinear',n_jobs=-1),
    SVC(probability=True,kernel='linear')  
    ] 

# The string literal below is not executed as code; it is kept as a
# reference list of the over-sampling method names.
'''
methods_list=["RandomOverSampler",  
              "SMOTE",
              "SMOTEN",
              "ADASYN",
              "BorderlineSMOTE",
              #"KMeansSMOTE", #No in scenario 1,2, and 4, only use if you have a lot of data
              "SVMSMOTE"
             ]
'''

# Hold-out evaluation (no cross-validation) once per over-sampling method,
# in the order listed.
class_balance_over_sampling(featuress4, labelss4, HO=True, CV=False, 
                            methods_list=["ADASYN","SVMSMOTE","SMOTE","RandomOverSampler",
                                          "SMOTEN","BorderlineSMOTE"])

In [None]:
# Audible signal that the Scenario 4 cells have finished running.
winsound.Beep(freq, duration)