# LIBRARIES

In [None]:
#Classification Methods
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

#Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from yellowbrick.classifier import ClassificationReport 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score

#Tools
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection
import string 
import time as tm
import os
from scipy.sparse import csr_matrix 
from yellowbrick.model_selection import FeatureImportances

#Class balance
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTEN
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import KMeansSMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.under_sampling import ClusterCentroids 
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks

import warnings
warnings.filterwarnings('ignore')

In [None]:
# winsound is a Windows-only standard-library module. Guard the import so the
# notebook can still run top-to-bottom on Linux/macOS; `winsound` is None there.
try:
    import winsound
except ImportError:
    winsound = None

# Tone settings; presumably meant for winsound.Beep(freq, duration) -- no use is
# visible in this part of the notebook.
duration = 1000  # milliseconds
freq = 440  # Hz

# FUNCTIONS

In [None]:
def load_data_complete_s1(path):
    """Load the dataset for Scenario 1 (Tumor_Core vs Tumor_Periphery).

    Rows whose ``classes`` value is ``"NP"`` are discarded; every other row
    keeps its original index label.

    Args:
        path: CSV location (anything ``pandas.read_csv`` accepts) containing
            a ``classes`` column.

    Returns:
        A ``(features, labels)`` tuple: the remaining rows without the
        ``classes`` column (DataFrame) and their ``classes`` values (array).
    """
    dataset = pd.read_csv(path)

    # Scenario 1: keep everything except the N_Periphery samples.
    kept = dataset[dataset['classes'] != "NP"]

    # Features are all columns but the label; labels are the label column.
    features = kept.drop(columns=['classes'])
    labels = kept['classes'].values

    return features, labels

def load_data_complete_s2(path):
    """Load the dataset for Scenario 2 (Normal_Periphery vs Tumor_Periphery).

    Rows whose ``classes`` value is ``"TC"`` are discarded; every other row
    keeps its original index label.

    Args:
        path: CSV location (anything ``pandas.read_csv`` accepts) containing
            a ``classes`` column.

    Returns:
        A ``(features, labels)`` tuple: the remaining rows without the
        ``classes`` column (DataFrame) and their ``classes`` values (array).
    """
    dataset = pd.read_csv(path)

    # Scenario 2: keep everything except the T_Core samples.
    kept = dataset[dataset['classes'] != "TC"]

    # Features are all columns but the label; labels are the label column.
    features = kept.drop(columns=['classes'])
    labels = kept['classes'].values

    return features, labels

def load_data_complete_s3(path):
    """Load the dataset for Scenario 3 (Tumor_Periphery&Core vs Normal_Periphery).

    The result is built from two parts that are concatenated and re-indexed
    from zero:

    1. every row that is neither ``"TP"`` nor ``"TC"`` (the N_Periphery part),
       with its label untouched;
    2. every row that is not ``"NP"``, relabelled as the merged class
       ``"TPC"``.

    Args:
        path: CSV location (anything ``pandas.read_csv`` accepts) containing
            a ``classes`` column.

    Returns:
        A ``(features, labels)`` tuple: the combined rows without the
        ``classes`` column (DataFrame) and their ``classes`` values (array).
    """
    dataset = pd.read_csv(path)

    # Part 1: isolate N_Periphery by filtering out both tumor classes.
    is_tumor_class = dataset['classes'].isin(["TP", "TC"])
    normal_part = dataset[~is_tumor_class]

    # Part 2: remove N_Periphery and rename what is left to the merged
    # tumor class, since it is the union of periphery and core.
    is_normal = dataset['classes'] == "NP"
    tumor_part = dataset[~is_normal].assign(classes="TPC")

    # Scenario 3 frame: N_Periphery rows first, merged tumor rows after.
    combined = pd.concat([normal_part, tumor_part]).reset_index(drop=True)

    # Features are all columns but the label; labels are the label column.
    features = combined.drop(columns=['classes'])
    labels = combined['classes'].values

    return features, labels

def load_data_complete_s4(path):
    """Load the dataset for Scenario 4 (Tumor_Core, Tumor_Periphery, N_Periphery).

    No rows are filtered: the whole file is used.

    Args:
        path: CSV location (anything ``pandas.read_csv`` accepts) containing
            a ``classes`` column.

    Returns:
        A ``(features, labels)`` tuple: all rows without the ``classes``
        column (DataFrame) and their ``classes`` values (array).
    """
    dataset = pd.read_csv(path)

    # Features are all columns but the label; labels are the label column.
    features = dataset.drop(columns=['classes'])
    labels = dataset['classes'].values

    return features, labels

In [None]:
# Confusion Matrix
def CM_viz(y_test, y_pred, classes, name,
               path_img_base = './images',nrows=1,ncols=1,size_text_legend=25,size_text_title=25,title="",
           size_text_xy_labels=25,size_text_xy_tick=25,
          size_num_inter=25, ax=None):
    """Draw a confusion-matrix heatmap and return its axes.

    With the default ``nrows == ncols == 1`` the heatmap is a standalone
    plot: a new 20x20 figure is created (unless ``ax`` is given) and the
    result is saved as ``<path_img_base>/<name>_CM.pdf``. For any other grid
    shape the heatmap is only drawn (on ``ax``, or on the current matplotlib
    axes when ``ax`` is None) and saving is left to the caller.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        classes: Class labels; their sorted unique values define both the
            row/column order of the matrix and the tick labels.
        name: Base name of the PDF written in standalone mode.
        path_img_base: Output directory; created if it does not exist.
        nrows, ncols: Grid shape of the figure the heatmap belongs to.
        size_text_legend: Accepted for call compatibility; not used.
        size_text_title: Font size of the title.
        title: Title shown above the heatmap.
        size_text_xy_labels: Font size of the axis labels.
        size_text_xy_tick: Font size of tick and colorbar labels.
        size_num_inter: Font size of the counts written inside the cells.
        ax: Optional axes to draw on.

    Returns:
        The matplotlib axes holding the heatmap.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(path_img_base, exist_ok=True)

    standalone = (nrows == 1 and ncols == 1)
    if standalone and ax is None:
        # New current figure for the single-plot case.
        plt.figure(figsize=(20, 20))

    # Passing the same labels to confusion_matrix and to the tick labels
    # guarantees that rows/columns and their captions cannot get out of sync.
    tick_labels = np.unique(classes)
    conf = confusion_matrix(y_test, y_pred, labels=tick_labels)
    annot_kws = {'fontsize': size_num_inter, 'verticalalignment': 'center'}

    ax = sns.heatmap(conf, annot=True, cmap='Blues', fmt='d', annot_kws=annot_kws,
                     xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)

    cbar = ax.collections[0].colorbar  # matplotlib.colorbar.Colorbar object
    cbar.ax.tick_params(labelsize=size_text_xy_tick)
    ax.xaxis.set_tick_params(labelsize=size_text_xy_tick, rotation=90)
    ax.yaxis.set_tick_params(labelsize=size_text_xy_tick, rotation=0)
    ax.set_xlabel('Predicted Values', fontsize=size_text_xy_labels)
    ax.set_ylabel('Actual Values', fontsize=size_text_xy_labels)
    ax.set_title(title, fontsize=size_text_title)

    if standalone:
        ax.figure.subplots_adjust(right=0.8)
        ax.figure.savefig(path_img_base+"/"+name+"_CM"+".pdf", bbox_inches = "tight", format='pdf')
    return ax

# LOADING DATA

In [None]:
# One CSV file feeds all four classification scenarios.
path = '../Data/DATA_Complete_GBM.csv'

# Run the four scenario loaders in order and unpack each (features, labels) pair.
((featuress1, labelss1),
 (featuress2, labelss2),
 (featuress3, labelss3),
 (featuress4, labelss4)) = [
    loader(path)
    for loader in (load_data_complete_s1, load_data_complete_s2,
                   load_data_complete_s3, load_data_complete_s4)
]


# MACHINE LEARNING APPLICATION

## Scenario 1

In [None]:
# Scenario 1: stratified 80/20 split, then oversample the training part only
# so the test set keeps the original class distribution.
print("originals labels unique: ", np.unique(labelss1, return_counts=True))
X_trains1, X_tests1, y_trains1, y_tests1 = train_test_split(
    featuress1, labelss1, test_size=0.20, random_state=21, stratify=labelss1)

# n_jobs is intentionally not passed: imbalanced-learn deprecated it on the
# over-samplers (0.10) and dropped it in later releases. It only controlled
# the parallelism of the neighbour search, not the generated samples.
sampler = ADASYN(random_state=21)
X_trains1, y_trains1 = sampler.fit_resample(X_trains1, y_trains1)
print("y_train labels unique:   ", np.unique(y_trains1, return_counts=True))
print("y_test labels unique:    ", np.unique(y_tests1, return_counts=True))

In [None]:
# Scenario 1 model: XGBoost trained on CSR sparse versions of the splits.
X_trains1 = csr_matrix(X_trains1)
X_tests1 = csr_matrix(X_tests1)

models1 = XGBClassifier(
    eval_metric='mlogloss',
    n_jobs=-1,
)
models1.fit(X_trains1, y_trains1)

# Hold-out accuracy on the untouched (not resampled) test split.
y_preds1 = models1.predict(X_tests1)
accuracy_s1 = accuracy_score(y_tests1, y_preds1)
print('accuracy_score: {0:.4f}'.format(accuracy_s1))

In [None]:
# The training matrix is sparse at this point, so the column names are
# passed explicitly to label the bars of the top-20 importance plot.
feature_names = list(featuress1.columns)
viz = FeatureImportances(
    models1,
    labels=feature_names,
    topn=20,
)
viz.fit(X_trains1, y_trains1)
viz.show()

In [None]:
# Table pairing each input column with the importance reported by the fitted model.
feature_importances = pd.DataFrame({
    'features': featuress1.columns,
    'feature_importance': models1.feature_importances_,
})
# Show the 20 most important features.
feature_importances.sort_values(by='feature_importance', ascending=False).head(20)

In [None]:
# Names of the 20 most important features, printed one per line.
fi = list(feature_importances.sort_values('feature_importance', ascending=False)[:20]['features'])
for feature_name in fi:
    print(feature_name)

In [None]:
#features=featuress1.loc[:,fi]
#features

## Scenario 2

In [None]:
# Scenario 2: stratified 80/20 split, then oversample the training part only
# so the test set keeps the original class distribution.
print("originals labels unique: ", np.unique(labelss2, return_counts=True))
X_trains2, X_tests2, y_trains2, y_tests2 = train_test_split(
    featuress2, labelss2, test_size=0.20, random_state=21, stratify=labelss2)

# n_jobs is intentionally not passed: imbalanced-learn deprecated it on the
# over-samplers (0.10) and dropped it in later releases. It only controlled
# the parallelism of the neighbour search, not the generated samples.
sampler = SVMSMOTE(random_state=8)
X_trains2, y_trains2 = sampler.fit_resample(X_trains2, y_trains2)
print("y_train labels unique:   ", np.unique(y_trains2, return_counts=True))
print("y_test labels unique:    ", np.unique(y_tests2, return_counts=True))

In [None]:
# Scenario 2 model: logistic regression with the liblinear solver.
# n_jobs is not passed because scikit-learn ignores it for liblinear
# (it only produced a warning).
models2 = LogisticRegression(solver='liblinear')
models2.fit(X_trains2, y_trains2)

# Hold-out accuracy on the untouched (not resampled) test split.
y_preds2 = models2.predict(X_tests2)
accuracy_s2 = accuracy_score(y_tests2, y_preds2)
print('accuracy_score: {0:.4f}'.format(accuracy_s2))

In [None]:
# Plot the 20 highest-ranked features of the Scenario 2 model.
viz = FeatureImportances(
    models2,
    topn=20,
)
viz.fit(X_trains2, y_trains2)
viz.show()

In [None]:
# Reference: https://predictivehacks.com/feature-importance-in-python/
# Signed coefficients from the first row of coef_: the largest positive
# values are the features pointing to one of the classes.
feature_importance = pd.DataFrame({
    'feature': list(featuress2.columns),
    'feature_importance': list(models2.coef_[0]),
})
feature_importance.sort_values(by='feature_importance', ascending=False).head(8)

In [None]:
# Features pointing to the other class: negative coefficients only,
# most negative first.
cond1 = feature_importance["feature_importance"] < 0
feature_importance.loc[cond1].sort_values(by='feature_importance', ascending=True).head(12)

In [None]:
# Because model.coef_ is used, ranking by absolute value is recommended:
# positive coefficients count toward one class and negative ones toward the other.
feature_importance = pd.DataFrame({
    'feature': list(featuress2.columns),
    'feature_importance': np.abs(models2.coef_[0]),
})
feature_importance.sort_values(by='feature_importance', ascending=False).head(20)

In [None]:
# Names of the 20 features with the largest importance, printed one per line.
fi = list(feature_importance.sort_values('feature_importance', ascending=False)[:20]['feature'])
for feature_name in fi:
    print(feature_name)

In [None]:
#features=featuress2.loc[:,fi]
#features

## Scenario 3

In [None]:
# Scenario 3: stratified 80/20 split, then oversample the training part only
# so the test set keeps the original class distribution.
print("originals labels unique: ", np.unique(labelss3, return_counts=True))
X_trains3, X_tests3, y_trains3, y_tests3 = train_test_split(
    featuress3, labelss3, test_size=0.20, random_state=21, stratify=labelss3)

# n_jobs is intentionally not passed: imbalanced-learn deprecated it on the
# over-samplers (0.10) and dropped it in later releases. It only controlled
# the parallelism of the neighbour search, not the generated samples.
sampler = SMOTE(random_state=21)
X_trains3, y_trains3 = sampler.fit_resample(X_trains3, y_trains3)
print("y_train labels unique:   ", np.unique(y_trains3, return_counts=True))
print("y_test labels unique:    ", np.unique(y_tests3, return_counts=True))

In [None]:
# Scenario 3 model: gradient boosting with a fixed seed.
models3 = GradientBoostingClassifier(
    random_state=8,
)
models3.fit(X_trains3, y_trains3)

# Hold-out accuracy on the untouched (not resampled) test split.
y_preds3 = models3.predict(X_tests3)
accuracy_s3 = accuracy_score(y_tests3, y_preds3)
print('accuracy_score: {0:.4f}'.format(accuracy_s3))

In [None]:
# Plot the 20 highest-ranked features of the Scenario 3 model.
viz = FeatureImportances(
    models3,
    topn=20,
)
viz.fit(X_trains3, y_trains3)
viz.show()

In [None]:
# Table pairing each input column with the importance reported by the fitted model.
feature_importances = pd.DataFrame({
    'features': featuress3.columns,
    'feature_importance': models3.feature_importances_,
})
# Show the 20 most important features.
feature_importances.sort_values(by='feature_importance', ascending=False).head(20)

In [None]:
# Names of the 20 most important features, printed one per line.
fi = list(feature_importances.sort_values('feature_importance', ascending=False)[:20]['features'])
for feature_name in fi:
    print(feature_name)

In [None]:
#features=featuress3.loc[:,fi]
#features

## Scenario 4

In [None]:
# Scenario 4: stratified 80/20 split, then random oversampling of the
# training part only so the test set keeps the original class distribution.
print("originals labels unique: ", np.unique(labelss4, return_counts=True))

X_trains4, X_tests4, y_trains4, y_tests4 = train_test_split(
    featuress4,
    labelss4,
    test_size=0.20,
    random_state=21,
    stratify=labelss4,
)

sampler = RandomOverSampler(random_state=21)
X_trains4, y_trains4 = sampler.fit_resample(X_trains4, y_trains4)

print("y_train labels unique:   ", np.unique(y_trains4, return_counts=True))
print("y_test labels unique:    ", np.unique(y_tests4, return_counts=True))

In [None]:
# Scenario 4 model: XGBoost on the three-class problem (no class is filtered
# out by load_data_complete_s4).
# - objective: the previous 'binary:logistic' did not match a 3-class target;
#   the multiclass objective is now stated explicitly.
# - random_state replaces the legacy alias `seed`.
# - nthread=4 was dropped because it contradicted n_jobs=-1.
models4 = XGBClassifier(
    learning_rate=0.1, n_estimators=140, max_depth=4,
    min_child_weight=4, gamma=0, subsample=0.8, colsample_bytree=0.8,
    objective='multi:softprob', random_state=27, eval_metric='mlogloss', n_jobs=-1)
models4.fit(X_trains4, y_trains4)

# Hold-out accuracy on the untouched (not resampled) test split.
y_preds4 = models4.predict(X_tests4)
accuracy_s4 = accuracy_score(y_tests4, y_preds4)
print('accuracy_score: {0:.4f}'.format(accuracy_s4))

In [None]:
# Plot the 20 highest-ranked features of the Scenario 4 model.
viz = FeatureImportances(
    models4,
    topn=20,
)
viz.fit(X_trains4, y_trains4)
viz.show()

In [None]:
# Table pairing each input column with the importance reported by the fitted model.
feature_importances = pd.DataFrame({
    'features': featuress4.columns,
    'feature_importance': models4.feature_importances_,
})
# Show the 20 most important features.
feature_importances.sort_values(by='feature_importance', ascending=False).head(20)

In [None]:
# Names of the 20 most important features, printed one per line.
fi = list(feature_importances.sort_values('feature_importance', ascending=False)[:20]['features'])
for feature_name in fi:
    print(feature_name)

# CONFUSION MATRICES

In [None]:
# Font sizes shared by the four panels.
size_text_legend=60
size_text_title=95
size_text_xy_labels=60
size_text_xy_tick=60
size_num_inter=95

# One row with one panel per scenario.
nrows=1
ncols=4

fig = plt.figure(figsize=(20*ncols,15*nrows))
fig.subplots_adjust(hspace=0.2, wspace=0.2)

# (panel name, panel title, fitted model, test features, test labels)
scenarios = (
    ("Scenario 1", "A", models1, X_tests1, y_tests1),
    ("Scenario 2", "B", models2, X_tests2, y_tests2),
    ("Scenario 3", "C", models3, X_tests3, y_tests3),
    ("Scenario 4", "D", models4, X_tests4, y_tests4),
)

predictions = []
for i, (model_name, title, model_selected, X_test, y_test) in enumerate(scenarios, start=1):
    # CM_viz is called without an axes argument, so it draws on the current
    # axes; add_subplot is expected to make the new panel current.
    ax = fig.add_subplot(nrows, ncols, i)

    y_pred = model_selected.predict(X_test)
    predictions.append(y_pred)
    acc_score = accuracy_score(y_test, y_pred)
    print('accuracy_score: {0:.4f}'.format(acc_score))

    # Not used for the plot; kept so the name stays defined for later cells.
    y_pred_proba = model_selected.predict_proba(X_test)
    classes = np.unique(y_test)

    # ncols != 1, so CM_viz only draws the panel and does not save a file.
    CM_viz(y_test, y_pred, classes, name=model_name,
           path_img_base='./images', nrows=nrows, ncols=ncols,
           size_text_legend=size_text_legend, size_text_title=size_text_title, title=title,
           size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
           size_num_inter=size_num_inter)

# Keep the per-scenario prediction names refreshed, as before.
y_preds1, y_preds2, y_preds3, y_preds4 = predictions

# Save the combined four-panel figure.
model_name = "CM_GBM"
fig.savefig("./images"+"/"+model_name+"_CM"+".pdf", bbox_inches = "tight", format='pdf')