# Libraries

In [None]:
#Classification Methods
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

#Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from yellowbrick.classifier import ClassificationReport 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score

#Tools
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection
import string 
import time as tm
import os
from scipy.sparse import csr_matrix 
from yellowbrick.model_selection import FeatureImportances

#Class balance
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTEN
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import KMeansSMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.under_sampling import ClusterCentroids 
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks

import warnings
warnings.filterwarnings('ignore')

In [None]:
#AutoGluon
!pip install -U pip
!pip install -U setuptools wheel
!pip install autogluon  # autogluon==0.3.1

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor

## Functions

In [None]:
def load_data_complete_s1(path):
    """Load the Scenario 1 dataset: Tumor_Core vs. Tumor_Periphery.

    The CSV at ``path`` is read and every row whose ``classes`` value is
    ``"NP"`` (N_Periphery) is discarded.

    Returns:
        A ``(features, labels)`` tuple. ``features`` is the filtered
        DataFrame without the ``classes`` column (original row index kept);
        ``labels`` holds the matching ``classes`` values.
    """
    dataset = pd.read_csv(path)

    # Keep everything except the N_Periphery samples.
    selected = dataset.loc[dataset["classes"] != "NP"]

    # Split the label column away from the feature columns.
    features = selected.drop(columns="classes")
    labels = selected["classes"].values
    return features, labels

def load_data_complete_s2(path):
    """Load the Scenario 2 dataset: Normal_Periphery vs. Tumor_Periphery.

    The CSV at ``path`` is read and every row whose ``classes`` value is
    ``"TC"`` (T_Core) is discarded.

    Returns:
        A ``(features, labels)`` tuple. ``features`` is the filtered
        DataFrame without the ``classes`` column (original row index kept);
        ``labels`` holds the matching ``classes`` values.
    """
    dataset = pd.read_csv(path)

    # Keep everything except the T_Core samples.
    selected = dataset.loc[dataset["classes"] != "TC"]

    # Split the label column away from the feature columns.
    features = selected.drop(columns="classes")
    labels = selected["classes"].values
    return features, labels

def load_data_complete_s3(path):
    """Load the Scenario 3 dataset: Tumor_Periphery&Core vs. Normal_Periphery.

    The CSV at ``path`` is read and rebuilt from two parts, in this order:

    1. the rows that are neither ``"TP"`` nor ``"TC"``, labels unchanged;
    2. the rows that are not ``"NP"``, all relabelled to ``"TPC"``.

    The combined frame gets a fresh 0..n-1 index.

    Returns:
        A ``(features, labels)`` tuple. ``features`` is the combined
        DataFrame without the ``classes`` column; ``labels`` holds the
        matching ``classes`` values.
    """
    dataset = pd.read_csv(path)

    # Part 1: isolate N_Periphery by removing both tumor classes.
    is_tumor = dataset["classes"].isin(["TP", "TC"])
    normal_part = dataset.loc[~is_tumor]

    # Part 2: remove N_Periphery and merge what remains under one label.
    tumor_part = dataset.loc[dataset["classes"] != "NP"].assign(classes="TPC")

    # Stack both parts and renumber the rows.
    combined = pd.concat([normal_part, tumor_part], ignore_index=True)

    # Split the label column away from the feature columns.
    features = combined.drop(columns="classes")
    labels = combined["classes"].values
    return features, labels

def load_data_complete_s4(path):
    """Load the Scenario 4 dataset: Tumor_Core, Tumor_Periphery and N_Periphery.

    The CSV at ``path`` is read and used in full; no rows are removed.

    Returns:
        A ``(features, labels)`` tuple. ``features`` is the DataFrame
        without the ``classes`` column; ``labels`` holds the ``classes``
        values.
    """
    dataset = pd.read_csv(path)

    # Split the label column away from the feature columns.
    features = dataset.drop(columns="classes")
    labels = dataset["classes"].values
    return features, labels

In [None]:
def top_models_autogluon(leaderboard,top_value):
    """Print test metrics and timings for the first rows of a leaderboard.

    This function reads the notebook globals ``predictor``,
    ``X_test_classes`` (test features plus the label column), ``X_test``
    (test features only) and ``y_test`` (true test labels); they must be
    defined before it is called.

    For every selected model it calls ``predictor.evaluate`` and prints the
    MSE between true and predicted labels (string labels are integer-encoded
    first). When there are more than two classes it also prints weighted
    F1, recall, precision and a weighted one-vs-one ROC AUC. Finally it
    prints the model's fit time and test prediction time.

    Parameters:
        leaderboard: DataFrame with at least the columns ``"model"``,
            ``"fit_time"`` and ``"pred_time_test"``; rows are taken in
            their existing order.
        top_value: Number of leading rows to report. Values larger than
            the leaderboard length are clamped to the number of rows.
    """
    # These depend only on y_test, so compute them once for all models.
    y_true = np.array(y_test)
    class_values = np.unique(y_true)
    is_multiclass = len(class_values) > 2

    # isinstance also accepts numpy string scalars (np.str_), which the
    # previous type-name comparison against 'str' rejected.
    has_string_labels = isinstance(class_values[0], str)
    if has_string_labels:
        # MSE needs numeric input, so map class names to integer codes.
        # NOTE(review): transform() raises if a model predicts a class that
        # does not occur in y_test.
        encoder = LabelEncoder()
        encoder.fit(list(class_values))
        y_true_coded = encoder.transform(y_test)

    n_models = min(top_value, len(leaderboard))
    for model_number in range(n_models):
        # Positional access keeps working if the leaderboard index is not 0..n-1.
        model_name = leaderboard["model"].iloc[model_number]
        print("\n",model_name)

        # Return value is not used here.
        # NOTE(review): presumably kept for the metrics it reports while
        # running; confirm.
        predictor.evaluate(X_test_classes, model=model_name)

        # Calculate MSE
        y_pred = predictor.predict(X_test, model=model_name).to_numpy()
        if has_string_labels:
            mse_s = MSE(y_true_coded, encoder.transform(y_pred))
        else:
            mse_s = MSE(y_test, y_pred)
        print('MSE: {0:.4f}'.format(mse_s))

        if is_multiclass:  # For multiclass classification (more than 2 classes)
            f1_s = f1_score(y_test, y_pred, average='weighted')
            print('f1_score: {0:.4f}'.format(f1_s))
            recall_s = recall_score(y_test, y_pred, average='weighted')
            print('recall_score: {0:.4f}'.format(recall_s))
            precision_s = precision_score(y_test, y_pred, average='weighted')
            print('precision_score: {0:.4f}'.format(precision_s))
            y_pred_proba = predictor.predict_proba(X_test, model=model_name)
            roc_s = roc_auc_score(y_test, y_pred_proba, multi_class='ovo', average='weighted')
            print('ROC_AUC: {0:.4f}'.format(roc_s))

        # Times
        training_time = leaderboard["fit_time"].iloc[model_number]
        print('Training_time: {0:.4}'.format(training_time))
        testing_time = leaderboard["pred_time_test"].iloc[model_number]
        # This line used to repeat the 'Training_time' label for the test time.
        print('Testing_time: {0:.4}'.format(testing_time))

# Dataset

In [None]:
# Mount Google Drive at /content/drive; the dataset path used below points
# inside this mount. The google.colab module must be importable for this
# cell to run.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the complete GBM dataset (CSV) on the mounted Google Drive.
# Alternative relative location: '../Data/DATA_Complete_GBM.csv'
path = '/content/drive/MyDrive/Colab Notebooks/IA/GBM/1Data/DATA_Complete_GBM.csv'

# Build the (features, labels) pair of each scenario, in scenario order.
(
    (featuress1, labelss1),
    (featuress2, labelss2),
    (featuress3, labelss3),
    (featuress4, labelss4),
) = (
    loader(path)
    for loader in (
        load_data_complete_s1,
        load_data_complete_s2,
        load_data_complete_s3,
        load_data_complete_s4,
    )
)


# Machine learning application

## Scenario 1

In [None]:
# Scenario 1 data: stratified 80/20 train/test split
features, labels = featuress1, labelss1

X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=21,
    stratify=labels,
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

# Resample the training partition only; the test partition is left as split.
sampler = ADASYN(random_state=21, n_jobs=-1)
X_train, y_train = sampler.fit_resample(X_train, y_train)
print("y_train labels unique:   ", np.unique(y_train, return_counts=True))
print("y_test labels unique:    ", np.unique(y_test, return_counts=True))

# Training table: features and the 'classes' label column side by side,
# both on a fresh 0..n-1 index.
X_train_class = pd.DataFrame({'classes': y_train})
X_train_classes = pd.concat(
    [X_train.reset_index(drop=True), X_train_class],
    axis=1,
)

# Test table built the same way; X_test itself is renumbered so that it
# stays row-aligned with X_test_classes.
X_test = X_test.reset_index(drop=True)
X_test_class = pd.DataFrame({'classes': y_test})
X_test_classes = pd.concat([X_test, X_test_class], axis=1)

In [None]:
# Train AutoGluon on the labelled training table and time the whole fit.
start_time = tm.time()
predictor = TabularPredictor(label='classes')
predictor.fit(X_train_classes)  # pass time_limit=250 here to cap training
TIME = tm.time() - start_time
print("Time, Training: {0:.4f} [seconds]".format(TIME))

In [None]:
# Summary of the fit for the Scenario 1 predictor (all trained models)
predictor.fit_summary()

In [None]:
# Leaderboard of the trained models, computed on the labelled test table
leaderboard = predictor.leaderboard(X_test_classes) 


In [None]:
# Print metrics and timings for the first 4 leaderboard rows
top_models_autogluon(leaderboard,top_value=4)


## Scenario 2

In [None]:
# Scenario 2 data: stratified 80/20 train/test split
features, labels = featuress2, labelss2

X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=21,
    stratify=labels,
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

# Resample the training partition only; the test partition is left as split.
sampler = SVMSMOTE(random_state=8, n_jobs=-1)
X_train, y_train = sampler.fit_resample(X_train, y_train)
print("y_train labels unique:   ", np.unique(y_train, return_counts=True))
print("y_test labels unique:    ", np.unique(y_test, return_counts=True))

# Training table: features and the 'classes' label column side by side,
# both on a fresh 0..n-1 index.
X_train_class = pd.DataFrame({'classes': y_train})
X_train_classes = pd.concat(
    [X_train.reset_index(drop=True), X_train_class],
    axis=1,
)

# Test table built the same way; X_test itself is renumbered so that it
# stays row-aligned with X_test_classes.
X_test = X_test.reset_index(drop=True)
X_test_class = pd.DataFrame({'classes': y_test})
X_test_classes = pd.concat([X_test, X_test_class], axis=1)

In [None]:
# Train AutoGluon on the labelled training table and time the whole fit.
start_time = tm.time()
predictor = TabularPredictor(label='classes')
predictor.fit(X_train_classes)  # pass time_limit=250 here to cap training
TIME = tm.time() - start_time
print("Time, Training: {0:.4f} [seconds]".format(TIME))

In [None]:
# Summary of the fit for the Scenario 2 predictor (all trained models)
predictor.fit_summary()

In [None]:
# Leaderboard of the trained models, computed on the labelled test table
leaderboard = predictor.leaderboard(X_test_classes) 


In [None]:
# Print metrics and timings for the first 4 leaderboard rows
top_models_autogluon(leaderboard,top_value=4)


## Scenario 3

In [None]:
# Scenario 3 data: stratified 80/20 train/test split
features, labels = featuress3, labelss3

X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=21,
    stratify=labels,
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

# Resample the training partition only; the test partition is left as split.
sampler = SMOTE(random_state=21, n_jobs=-1)
X_train, y_train = sampler.fit_resample(X_train, y_train)
print("y_train labels unique:   ", np.unique(y_train, return_counts=True))
print("y_test labels unique:    ", np.unique(y_test, return_counts=True))

# Training table: features and the 'classes' label column side by side,
# both on a fresh 0..n-1 index.
X_train_class = pd.DataFrame({'classes': y_train})
X_train_classes = pd.concat(
    [X_train.reset_index(drop=True), X_train_class],
    axis=1,
)

# Test table built the same way; X_test itself is renumbered so that it
# stays row-aligned with X_test_classes.
X_test = X_test.reset_index(drop=True)
X_test_class = pd.DataFrame({'classes': y_test})
X_test_classes = pd.concat([X_test, X_test_class], axis=1)

In [None]:
# Train AutoGluon on the labelled training table and time the whole fit.
start_time = tm.time()
predictor = TabularPredictor(label='classes')
predictor.fit(X_train_classes)  # pass time_limit=250 here to cap training
TIME = tm.time() - start_time
print("Time, Training: {0:.4f} [seconds]".format(TIME))

In [None]:
# Summary of the fit for the Scenario 3 predictor (all trained models)
predictor.fit_summary()

In [None]:
# Leaderboard of the trained models, computed on the labelled test table
leaderboard = predictor.leaderboard(X_test_classes) 


In [None]:
# Print metrics and timings for the first 4 leaderboard rows
top_models_autogluon(leaderboard,top_value=4)


## Scenario 4

In [None]:
# Scenario 4 data: stratified 80/20 train/test split
features, labels = featuress4, labelss4

X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=21,
    stratify=labels,
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

# Resample the training partition only; the test partition is left as split.
sampler = RandomOverSampler(random_state=21)
X_train, y_train = sampler.fit_resample(X_train, y_train)
print("y_train labels unique:   ", np.unique(y_train, return_counts=True))
print("y_test labels unique:    ", np.unique(y_test, return_counts=True))

# Training table: features and the 'classes' label column side by side,
# both on a fresh 0..n-1 index.
X_train_class = pd.DataFrame({'classes': y_train})
X_train_classes = pd.concat(
    [X_train.reset_index(drop=True), X_train_class],
    axis=1,
)

# Test table built the same way; X_test itself is renumbered so that it
# stays row-aligned with X_test_classes.
X_test = X_test.reset_index(drop=True)
X_test_class = pd.DataFrame({'classes': y_test})
X_test_classes = pd.concat([X_test, X_test_class], axis=1)

In [None]:
# Train AutoGluon on the labelled training table and time the whole fit.
start_time = tm.time()
predictor = TabularPredictor(label='classes')
predictor.fit(X_train_classes)  # pass time_limit=250 here to cap training
TIME = tm.time() - start_time
print("Time, Training: {0:.4f} [seconds]".format(TIME))

In [None]:
# Summary of the fit for the Scenario 4 predictor (all trained models)
predictor.fit_summary()

In [None]:
# Leaderboard of the trained models, computed on the labelled test table
leaderboard = predictor.leaderboard(X_test_classes) 


In [None]:
# Print metrics and timings for the first 4 leaderboard rows
top_models_autogluon(leaderboard,top_value=4)
