# LIBRARIES

In [None]:
#Classification Methods
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

#Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from yellowbrick.classifier import ClassificationReport 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score

#Tools
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection
import string 
import time as tm
import os
from scipy.sparse import csr_matrix 
from yellowbrick.model_selection import FeatureImportances

#Class balance
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTEN
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import KMeansSMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.under_sampling import ClusterCentroids 
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Settings for an optional audible notification.
# ``winsound`` ships only with CPython on Windows; on any other platform the
# name is bound to ``None`` so the notebook keeps running. Code that wants to
# beep must check ``winsound is not None`` first.
try:
    import winsound
except ImportError:
    winsound = None
duration = 1000  # milliseconds
freq = 440  # Hz

# FUNCTIONS

In [None]:
def classifier_metrics(X_train,X_test,y_train,y_test,HO=True,CV=True,classifiers=None):
    """Fit and score each classifier with a hold-out split and/or 10-fold CV.

    All results are printed to stdout; the function returns ``None``.

    Args:
        X_train, X_test: Feature matrices; anything ``csr_matrix`` accepts.
        y_train, y_test: Class labels, numeric or string.
        HO: When truthy, fit on the training split and print training and
            prediction time, accuracy, weighted F1/recall/precision, MSE,
            ROC AUC and Cohen's kappa measured on the test split. The
            estimator must provide ``predict_proba`` for the ROC AUC step.
        CV: When truthy, print the mean and standard deviation of the
            10-fold cross-validated accuracy over train and test stacked
            together.
        classifiers: Iterable of estimators to evaluate. When ``None`` the
            module-level ``classifiers`` variable is used, which is how the
            notebook cells call this function.

    Raises:
        NameError: If ``classifiers`` is omitted and no module-level
            ``classifiers`` variable exists.
    """
    # Imported locally: the file only imports csr_matrix from scipy.sparse.
    from scipy.sparse import vstack as sparse_vstack

    if classifiers is None:
        # Backward compatibility with cells that set a global before calling.
        try:
            classifiers = globals()["classifiers"]
        except KeyError:
            raise NameError("name 'classifiers' is not defined") from None

    # Estimators whose repr contains one of these names get dense arrays.
    dense_only_models = ("GaussianNB", "LinearDiscriminantAnalysis", "QuadraticDiscriminantAnalysis")

    def needs_dense(model):
        """Return True when the estimator must be fed dense arrays."""
        description = str(model)
        return any(tag in description for tag in dense_only_models)

    def metrics(model, X_tr, X_te):
        """Print the hold-out and/or cross-validation report for one model."""
        if HO:
            print("\nHold-Out in process...")
            start_time = tm.time()
            model.fit(X_tr, y_train)
            TIME = tm.time() - start_time
            print("Time, Training: {0:.4f} [seconds]".format(TIME))
            start_time = tm.time()
            y_pred = model.predict(X_te)
            TIME = tm.time() - start_time
            print("Time, Prediction: {0:.4f} [seconds]".format(TIME))

            accuracy_s  = accuracy_score(y_test,y_pred)
            print('accuracy_score: {0:.4f}'.format(accuracy_s))
            f1_s        = f1_score(y_test,y_pred,average='weighted')
            print('f1_score: {0:.4f}'.format(f1_s))
            recall_s    = recall_score(y_test,y_pred,average='weighted')
            print('recall_score: {0:.4f}'.format(recall_s))
            precision_s = precision_score(y_test,y_pred,average='weighted')
            print('precision_score: {0:.4f}'.format(precision_s))

            train_classes = np.unique(np.asarray(y_train))

            # isinstance also recognises numpy string scalars (np.str_),
            # which a comparison of the type name against 'str' misses.
            if isinstance(train_classes[0], str):
                # MSE needs numbers, so string classes are mapped to integer codes.
                le           = LabelEncoder()
                le.fit(list(train_classes))
                y_test_coded = le.transform(y_test)
                y_pred_coded = le.transform(y_pred)
                mse_s        = MSE(y_test_coded,y_pred_coded)
            else:
                mse_s        = MSE(y_test,y_pred)
            print('MSE: {0:.4f}'.format(mse_s))

            if len(train_classes) > 2:
                # Multiclass: the full probability matrix is scored one-vs-one.
                y_pred_proba = model.predict_proba(X_te)[:]
            else:
                # Binary: only the second probability column is scored.
                y_pred_proba = model.predict_proba(X_te)[:,1]
            roc_s = roc_auc_score(y_test, y_pred_proba, multi_class='ovo', average='weighted')
            print('ROC_AUC: {0:.4f}'.format(roc_s))

            ck_s         = cohen_kappa_score(y_test,y_pred)
            print('CK: {0:.4f}'.format(ck_s))

        if CV:
            print('\nCross-Validation in process...')
            start_time = tm.time()
            # NOTE(review): the folds are unshuffled and are cut from the
            # training rows (which callers may have oversampled) stacked on
            # top of the test rows - confirm this is the intended protocol.
            kfold = model_selection.KFold(n_splits=10)
            y_CV = np.concatenate((y_train,y_test))
            if needs_dense(model):
                X_CV = np.concatenate((X_tr,X_te))
            else:
                # Stack in sparse form instead of densifying both matrices.
                X_CV = sparse_vstack([X_tr, X_te], format="csr")
            cv_results = np.array(model_selection.cross_val_score(model, X_CV, y_CV, cv=kfold, scoring='accuracy', n_jobs=-3))

            # Folds that failed to score come back as NaN and are dropped.
            cv_results = cv_results[np.logical_not(np.isnan(cv_results))]
            TIME = tm.time() - start_time
            print("Time, CV: {0:.4f} [seconds]".format(TIME))
            print('CV: {0:.4f} {1:.4f}'.format(cv_results.mean(),cv_results.std()))

    # Convert once; the caller's arguments are never rebound.
    X_train_sparse = csr_matrix(X_train)
    X_test_sparse = csr_matrix(X_test)

    for model in classifiers:
        print ("---------------------------------------------------------------------------------\n")
        print(str(model))
        if needs_dense(model):
            metrics(model, X_train_sparse.toarray(), X_test_sparse.toarray())
        else:
            metrics(model, X_train_sparse, X_test_sparse)
        print()


In [None]:
def class_balance_over_sampling(features, labels, HO=False, CV=True, methods_list=None):
    """Oversample the training split with each requested method and evaluate.

    For every recognised name in ``methods_list`` the data is split 80/20
    (stratified, ``random_state=21``), only the training part is resampled,
    the class counts are printed, and ``classifier_metrics`` is called on
    the result. Unrecognised names are skipped silently. Returns ``None``.

    Args:
        features: Feature table passed to ``train_test_split``.
        labels: Class labels aligned with ``features``.
        HO: Forwarded to ``classifier_metrics`` (hold-out evaluation).
        CV: Forwarded to ``classifier_metrics`` (cross-validation).
        methods_list: Names of the oversamplers to apply, chosen from
            "RandomOverSampler", "SMOTE", "SMOTEN", "ADASYN",
            "BorderlineSMOTE", "KMeansSMOTE" and "SVMSMOTE".
            Defaults to ``["RandomOverSampler"]``.
    """
    if methods_list is None:
        methods_list = ["RandomOverSampler"]

    # Each factory builds its sampler only when that method is requested.
    # It receives y_test because KMeansSMOTE sets k_neighbors to the number
    # of distinct test labels.
    # NOTE(review): recent imbalanced-learn releases deprecate ``n_jobs`` on
    # the SMOTE-family samplers - confirm against the installed version.
    sampler_factories = {
        "RandomOverSampler": lambda y_test: RandomOverSampler(random_state=21),
        "SMOTE": lambda y_test: SMOTE(random_state=21,n_jobs=-1),
        "SMOTEN": lambda y_test: SMOTEN(random_state=21,n_jobs=-1),
        "ADASYN": lambda y_test: ADASYN(random_state=21,n_jobs=-1),
        "BorderlineSMOTE": lambda y_test: BorderlineSMOTE(random_state=21,n_jobs=-1),
        "KMeansSMOTE": lambda y_test: KMeansSMOTE(random_state=21,n_jobs=-1, k_neighbors=np.unique(y_test).shape[0]),
        # SVMSMOTE uses seed 8 where every other sampler uses 21.
        "SVMSMOTE": lambda y_test: SVMSMOTE(random_state=8,n_jobs=-1),
    }

    for method in methods_list:
        factory = sampler_factories.get(method) if isinstance(method, str) else None
        if factory is None:
            continue

        print(method)
        print("originals labels unique: ",np.unique(labels, return_counts=True))
        X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                            test_size=0.20, random_state=21, stratify=labels)
        sampler = factory(y_test)
        # Only the training rows are resampled; the test split is untouched.
        X_train, y_train = sampler.fit_resample(X_train, y_train)
        print("y_train labels unique:   ",np.unique(y_train, return_counts=True))
        print("y_test labels unique:    ",np.unique(y_test, return_counts=True))
        classifier_metrics(X_train,X_test,y_train,y_test,HO=HO,CV=CV)

In [None]:
def load_data_complete_s1(path):
    """Load the data for Scenario 1: Tumor_Core vs Tumor_Periphery.

    Reads the CSV at ``path`` and discards every row whose ``classes`` value
    is "NP".

    Returns:
        tuple: ``(features, labels)``. ``features`` is a DataFrame holding
        every column except ``classes`` (original row index kept) and
        ``labels`` holds the matching ``classes`` values.
    """
    table = pd.read_csv(path)

    # Keep everything that is not labelled "NP".
    kept = table[table.classes != "NP"]

    features = kept.drop(columns=['classes'])
    labels = kept['classes'].values
    return features, labels

def load_data_complete_s2(path):
    """Load the data for Scenario 2: Normal_Periphery vs Tumor_Periphery.

    Reads the CSV at ``path`` and discards every row whose ``classes`` value
    is "TC".

    Returns:
        tuple: ``(features, labels)``. ``features`` is a DataFrame holding
        every column except ``classes`` (original row index kept) and
        ``labels`` holds the matching ``classes`` values.
    """
    table = pd.read_csv(path)

    # Keep everything that is not labelled "TC".
    kept = table[table.classes != "TC"]

    features = kept.drop(columns=['classes'])
    labels = kept['classes'].values
    return features, labels

def load_data_complete_s3(path):
    """Load the data for Scenario 3: Tumor (Periphery + Core) vs Normal_Periphery.

    Reads the CSV at ``path`` and builds two row groups. The first keeps the
    rows that are neither "TP" nor "TC", labels unchanged. The second keeps
    the rows that are not "NP" and relabels all of them "TPC". The two
    groups are concatenated in that order under a fresh 0..n-1 index.

    Returns:
        tuple: ``(features, labels)``. ``features`` is a DataFrame holding
        every column except ``classes`` and ``labels`` holds the matching
        ``classes`` values.
    """
    table = pd.read_csv(path)

    # Group 1: rows left once "TP" and "TC" are removed.
    normal_rows = table[~table.classes.isin(["TP", "TC"])]

    # Group 2: rows left once "NP" is removed, merged under one "TPC" label.
    tumor_rows = table[table.classes != "NP"].assign(classes="TPC")

    merged = pd.concat([normal_rows, tumor_rows], ignore_index=True)

    features = merged.drop(columns=['classes'])
    labels = merged['classes'].values
    return features, labels

def load_data_complete_s4(path):
    """Load the data for Scenario 4: Tumor_Core, Tumor_Periphery and N_Periphery.

    Reads the CSV at ``path`` and keeps every row.

    Returns:
        tuple: ``(features, labels)``. ``features`` is a DataFrame holding
        every column except ``classes`` and ``labels`` holds the ``classes``
        values.
    """
    table = pd.read_csv(path)
    features = table.drop(columns=['classes'])
    labels = table['classes'].values
    return features, labels

# LOADING DATA

In [None]:
# Location of the complete GBM table, relative to the notebook.
path = '../Data/DATA_Complete_GBM.csv'

# One (features, labels) pair per classification scenario.
featuress1, labelss1 = load_data_complete_s1(path)
featuress2, labelss2 = load_data_complete_s2(path)
featuress3, labelss3 = load_data_complete_s3(path)
featuress4, labelss4 = load_data_complete_s4(path)


# ML APPLICATION - OBTAINING THE 20 GENES PER MODEL

## Scenario 1

In [None]:
# Scenario 1: stratified 80/20 hold-out split.
print("originals labels unique: ", np.unique(labelss1, return_counts=True))
X_trains1, X_tests1, y_trains1, y_tests1 = train_test_split(
    featuress1,
    labelss1,
    test_size=0.20,
    random_state=21,
    stratify=labelss1,
)
print("y_train ORIGINAL labels unique:   ", np.unique(y_trains1, return_counts=True))
print("y_test ORIGINAL labels unique:   ", np.unique(y_tests1, return_counts=True))

# ADASYN resamples the training split only; the test split is left as drawn.
sampler = ADASYN(random_state=21, n_jobs=-1)
X_trains1, y_trains1 = sampler.fit_resample(X_trains1, y_trains1)
print("y_train labels unique:   ", np.unique(y_trains1, return_counts=True))
print("y_test labels unique:    ", np.unique(y_tests1, return_counts=True))

In [None]:
# Scenario 1 model: XGBoost trained on CSR versions of the two splits.
X_trains1 = csr_matrix(X_trains1)
X_tests1 = csr_matrix(X_tests1)

models1 = XGBClassifier(eval_metric='mlogloss', n_jobs=-1)
models1.fit(X_trains1, y_trains1)

# Hold-out accuracy on the untouched test split.
y_preds1 = models1.predict(X_tests1)
accuracy_s1 = accuracy_score(y_true=y_tests1, y_pred=y_preds1)
print('accuracy_score: {0:.4f}'.format(accuracy_s1))

In [None]:
# Plot the 20 most important features of the Scenario 1 model.
# Column names are passed explicitly because X_trains1 is a CSR matrix at
# this point and no longer carries them.
feature_names=list(featuress1.columns)
viz = FeatureImportances(models1,labels=feature_names,topn=20)
viz.fit(X_trains1, y_trains1)
viz.show()

In [None]:
# Pair every gene with its importance in the Scenario 1 model and show the top 20.
feature_importances = pd.DataFrame({
    'features': featuress1.columns,
    'feature_importance': models1.feature_importances_,
})
feature_importances.sort_values('feature_importance', ascending=False).head(20)

In [None]:
# Names of the 20 highest-ranked genes for Scenario 1, best first.
ranked_s1 = feature_importances.sort_values('feature_importance', ascending=False)
fi_s1 = list(ranked_s1['features'].head(20))
for gene in fi_s1:
    print(gene)

In [None]:
#features=featuress1.loc[:,fi]
#features

## Scenario 2

In [None]:
# Scenario 2: stratified 80/20 hold-out split.
print("originals labels unique: ", np.unique(labelss2, return_counts=True))
X_trains2, X_tests2, y_trains2, y_tests2 = train_test_split(
    featuress2,
    labelss2,
    test_size=0.20,
    random_state=21,
    stratify=labelss2,
)
print("y_train ORIGINAL labels unique:   ", np.unique(y_trains2, return_counts=True))
print("y_test ORIGINAL labels unique:   ", np.unique(y_tests2, return_counts=True))

# SVMSMOTE resamples the training split only; the test split is left as drawn.
sampler = SVMSMOTE(random_state=8, n_jobs=-1)
X_trains2, y_trains2 = sampler.fit_resample(X_trains2, y_trains2)
print("y_train labels unique:   ", np.unique(y_trains2, return_counts=True))
print("y_test labels unique:    ", np.unique(y_tests2, return_counts=True))

In [None]:
# Scenario 2 model: logistic regression with the liblinear solver.
models2 = LogisticRegression(solver='liblinear', n_jobs=-1)
models2.fit(X_trains2, y_trains2)

# Hold-out accuracy on the untouched test split.
y_preds2 = models2.predict(X_tests2)
accuracy_s2 = accuracy_score(y_true=y_tests2, y_pred=y_preds2)
print('accuracy_score: {0:.4f}'.format(accuracy_s2))

In [None]:
# Plot the 20 most important features of the Scenario 2 model.
viz = FeatureImportances(models2, topn=20)
viz.fit(X_trains2, y_trains2)
viz.show() 

In [None]:
# Reference: https://predictivehacks.com/feature-importance-in-python/
# Signed coefficients of the Scenario 2 model. The largest positive values
# (first eight shown) are the genes pushing towards one of the two classes.
feature_importance = pd.DataFrame({
    'feature': list(featuress2.columns),
    'feature_importance': list(models2.coef_[0]),
})
feature_importance.sort_values('feature_importance', ascending=False).head(8)

In [None]:
# Genes pushing towards the other class: the twelve most negative coefficients.
cond1 = feature_importance["feature_importance"] < 0
feature_importance[cond1].sort_values('feature_importance', ascending=True).head(12)

In [None]:
# Because the ranking comes from model.coef_, it is taken on absolute values:
# positive coefficients point to one class and negative ones to the other,
# so the magnitude is what measures a gene's influence.
feature_importance = pd.DataFrame({
    'feature': list(featuress2.columns),
    'feature_importance': np.abs(models2.coef_[0]),
})
feature_importance.sort_values('feature_importance', ascending=False).head(20)

In [None]:
# Names of the 20 genes with the largest absolute coefficient for Scenario 2.
ranked_s2 = feature_importance.sort_values('feature_importance', ascending=False)
fi_s2 = list(ranked_s2['feature'].head(20))
for gene in fi_s2:
    print(gene)

In [None]:
#features=featuress2.loc[:,fi]
#features

## Scenario 3

In [None]:
# Scenario 3: stratified 80/20 hold-out split.
print("originals labels unique: ", np.unique(labelss3, return_counts=True))
X_trains3, X_tests3, y_trains3, y_tests3 = train_test_split(
    featuress3,
    labelss3,
    test_size=0.20,
    random_state=21,
    stratify=labelss3,
)
print("y_train ORIGINAL labels unique:   ", np.unique(y_trains3, return_counts=True))
print("y_test ORIGINAL labels unique:   ", np.unique(y_tests3, return_counts=True))

# SMOTE resamples the training split only; the test split is left as drawn.
sampler = SMOTE(random_state=21, n_jobs=-1)
X_trains3, y_trains3 = sampler.fit_resample(X_trains3, y_trains3)
print("y_train labels unique:   ", np.unique(y_trains3, return_counts=True))
print("y_test labels unique:    ", np.unique(y_tests3, return_counts=True))

In [None]:
# Scenario 3 model: gradient boosting with a fixed seed.
models3 = GradientBoostingClassifier(random_state=8)
models3.fit(X_trains3, y_trains3)

# Hold-out accuracy on the untouched test split.
y_preds3 = models3.predict(X_tests3)
accuracy_s3 = accuracy_score(y_true=y_tests3, y_pred=y_preds3)
print('accuracy_score: {0:.4f}'.format(accuracy_s3))

In [None]:
# Plot the 20 most important features of the Scenario 3 model.
viz = FeatureImportances(models3, topn=20)
viz.fit(X_trains3, y_trains3)
viz.show() 

In [None]:
# Pair every gene with its importance in the Scenario 3 model and show the top 20.
feature_importances = pd.DataFrame({
    'features': featuress3.columns,
    'feature_importance': models3.feature_importances_,
})
feature_importances.sort_values('feature_importance', ascending=False).head(20)

In [None]:
# Names of the 20 highest-ranked genes for Scenario 3, best first.
ranked_s3 = feature_importances.sort_values('feature_importance', ascending=False)
fi_s3 = list(ranked_s3['features'].head(20))
for gene in fi_s3:
    print(gene)

In [None]:
#features=featuress3.loc[:,fi]
#features

## Scenario 4

In [None]:
# Scenario 4: stratified 80/20 hold-out split.
print("originals labels unique: ", np.unique(labelss4, return_counts=True))
X_trains4, X_tests4, y_trains4, y_tests4 = train_test_split(
    featuress4,
    labelss4,
    test_size=0.20,
    random_state=21,
    stratify=labelss4,
)
print("y_train ORIGINAL labels unique:   ", np.unique(y_trains4, return_counts=True))
print("y_test ORIGINAL labels unique:   ", np.unique(y_tests4, return_counts=True))

# Random oversampling of the training split only; the test split is left as drawn.
sampler = RandomOverSampler(random_state=21)
X_trains4, y_trains4 = sampler.fit_resample(X_trains4, y_trains4)
print("y_train labels unique:   ", np.unique(y_trains4, return_counts=True))
print("y_test labels unique:    ", np.unique(y_tests4, return_counts=True))

In [None]:
# Scenario 4 model: XGBoost with hand-set hyperparameters.
# NOTE(review): 'binary:logistic' is requested, but the Scenario 4 loader keeps
# every class in the file (presumably TC, TP and NP) - confirm which objective
# xgboost actually trains with.
# NOTE(review): nthread=4 and n_jobs=-1 are both given - confirm which applies.
models4 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=4,
    min_child_weight=4,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    seed=27,
    eval_metric='mlogloss',
    n_jobs=-1,
)
models4.fit(X_trains4, y_trains4)

# Hold-out accuracy on the untouched test split.
y_preds4 = models4.predict(X_tests4)
accuracy_s4 = accuracy_score(y_true=y_tests4, y_pred=y_preds4)
print('accuracy_score: {0:.4f}'.format(accuracy_s4))

In [None]:
# Plot the 20 most important features of the Scenario 4 model.
viz = FeatureImportances(models4, topn=20)
viz.fit(X_trains4, y_trains4)
viz.show() 

In [None]:
# Pair every gene with its importance in the Scenario 4 model and show the top 20.
feature_importances = pd.DataFrame({
    'features': featuress4.columns,
    'feature_importance': models4.feature_importances_,
})
feature_importances.sort_values('feature_importance', ascending=False).head(20)

In [None]:
# Names of the 20 highest-ranked genes for Scenario 4, best first.
ranked_s4 = feature_importances.sort_values('feature_importance', ascending=False)
fi_s4 = list(ranked_s4['features'].head(20))
for gene in fi_s4:
    print(gene)

# COMPARING PERFORMANCE WITH DIFFERENT NUMBER OF GENES

## 23,368 Genes

### Scenario 1

In [None]:
# Scenario 1, every gene: XGBoost on an ADASYN-balanced hold-out split.
# classifier_metrics reads the module-level ``classifiers`` list.
classifiers = [XGBClassifier(eval_metric='mlogloss', n_jobs=-1)]
class_balance_over_sampling(
    featuress1,
    labelss1,
    methods_list=["ADASYN"],
    HO=True,
    CV=False,
)

### Scenario 2

In [None]:
# Scenario 2, every gene: logistic regression on an SVMSMOTE-balanced hold-out split.
# classifier_metrics reads the module-level ``classifiers`` list.
classifiers = [LogisticRegression(solver='liblinear', n_jobs=-1)]
class_balance_over_sampling(
    featuress2,
    labelss2,
    methods_list=["SVMSMOTE"],
    HO=True,
    CV=False,
)

### Scenario 3

In [None]:
# Scenario 3, every gene: gradient boosting on a SMOTE-balanced hold-out split.
# classifier_metrics reads the module-level ``classifiers`` list.
classifiers = [GradientBoostingClassifier(random_state=8)]
class_balance_over_sampling(
    featuress3,
    labelss3,
    methods_list=["SMOTE"],
    HO=True,
    CV=False,
)

### Scenario 4

In [None]:
# Scenario 4, every gene: tuned XGBoost on a randomly oversampled hold-out split.
# classifier_metrics reads the module-level ``classifiers`` list.
# NOTE(review): 'binary:logistic' with what is presumably a three-class target,
# and nthread alongside n_jobs - confirm both against the xgboost version in use.
classifiers = [
    XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=4,
        min_child_weight=4,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        seed=27,
        eval_metric='mlogloss',
        n_jobs=-1,
    )
]
class_balance_over_sampling(
    featuress4,
    labelss4,
    methods_list=["RandomOverSampler"],
    HO=True,
    CV=False,
)

## 20 Genes per ML algorithm

In [None]:
# Size of each per-scenario gene list, in scenario order.
tuple(map(len, (fi_s1, fi_s2, fi_s3, fi_s4)))

### Scenario 1

In [None]:
# Scenario 1 restricted to its own top-20 genes: XGBoost + ADASYN.
classifiers = [XGBClassifier(eval_metric='mlogloss', n_jobs=-1)]
class_balance_over_sampling(
    featuress1[fi_s1],
    labelss1,
    methods_list=["ADASYN"],
    HO=True,
    CV=False,
)

### Scenario 2

In [None]:
# Scenario 2 restricted to its own top-20 genes: logistic regression + SVMSMOTE.
classifiers = [LogisticRegression(solver='liblinear', n_jobs=-1)]
class_balance_over_sampling(
    featuress2[fi_s2],
    labelss2,
    methods_list=["SVMSMOTE"],
    HO=True,
    CV=False,
)

### Scenario 3

In [None]:
# Scenario 3 restricted to its own top-20 genes: gradient boosting + SMOTE.
classifiers = [GradientBoostingClassifier(random_state=8)]
class_balance_over_sampling(
    featuress3[fi_s3],
    labelss3,
    methods_list=["SMOTE"],
    HO=True,
    CV=False,
)

### Scenario 4

In [None]:
# Scenario 4 restricted to its own top-20 genes: tuned XGBoost + random oversampling.
# NOTE(review): 'binary:logistic' with what is presumably a three-class target,
# and nthread alongside n_jobs - confirm both against the xgboost version in use.
classifiers = [
    XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=4,
        min_child_weight=4,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        seed=27,
        eval_metric='mlogloss',
        n_jobs=-1,
    )
]
class_balance_over_sampling(
    featuress4[fi_s4],
    labelss4,
    methods_list=["RandomOverSampler"],
    HO=True,
    CV=False,
)

## 65 Genes (extended set: union of the four per-scenario gene lists)

In [None]:
# Size of each per-scenario gene list, in scenario order.
tuple(map(len, (fi_s1, fi_s2, fi_s3, fi_s4)))

In [None]:
# Order-preserving union: start from the Scenario 1 genes, then append the
# genes of Scenarios 2-4 that are not in the list yet.
fi_s1_ext = list(fi_s1)
for ranked_genes in (fi_s2, fi_s3, fi_s4):
    fi_s1_ext += [element for element in ranked_genes if element not in fi_s1_ext]
len(fi_s1_ext)

In [None]:
# Display the extended gene list (Scenario 1 genes first, then new ones in order).
fi_s1_ext

### Scenario 1

In [None]:
# Scenario 1 on the extended gene list: XGBoost + ADASYN.
classifiers = [XGBClassifier(eval_metric='mlogloss', n_jobs=-1)]
class_balance_over_sampling(
    featuress1[fi_s1_ext],
    labelss1,
    methods_list=["ADASYN"],
    HO=True,
    CV=False,
)

### Scenario 2

In [None]:
# Scenario 2 on the extended gene list: logistic regression + SVMSMOTE.
classifiers = [LogisticRegression(solver='liblinear', n_jobs=-1)]
class_balance_over_sampling(
    featuress2[fi_s1_ext],
    labelss2,
    methods_list=["SVMSMOTE"],
    HO=True,
    CV=False,
)

### Scenario 3

In [None]:
# Scenario 3 on the extended gene list: gradient boosting + SMOTE.
classifiers = [GradientBoostingClassifier(random_state=8)]
class_balance_over_sampling(
    featuress3[fi_s1_ext],
    labelss3,
    methods_list=["SMOTE"],
    HO=True,
    CV=False,
)

### Scenario 4

In [None]:
# Scenario 4 on the extended gene list: tuned XGBoost + random oversampling.
# NOTE(review): 'binary:logistic' with what is presumably a three-class target,
# and nthread alongside n_jobs - confirm both against the xgboost version in use.
classifiers = [
    XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=4,
        min_child_weight=4,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        seed=27,
        eval_metric='mlogloss',
        n_jobs=-1,
    )
]
class_balance_over_sampling(
    featuress4[fi_s1_ext],
    labelss4,
    methods_list=["RandomOverSampler"],
    HO=True,
    CV=False,
)

## 12 Genes (pairwise intersections of the per-scenario gene lists)

In [None]:
# Total number of entries across the four gene lists (duplicates counted).
sum(map(len, (fi_s1, fi_s2, fi_s3, fi_s4)))

In [None]:
# Genes shared by at least two scenarios.
# One intersection per pair of lists, in the order
# (1,2), (1,3), (1,4), (2,3), (2,4), (3,4).
gene_rankings = (fi_s1, fi_s2, fi_s3, fi_s4)
list_gen_different_sets = [
    list(set(gene_rankings[first]) & set(gene_rankings[second]))
    for first in range(len(gene_rankings))
    for second in range(first + 1, len(gene_rankings))
]

# Flatten the six intersections into one list...
igen = [gen for list_gen in list_gen_different_sets for gen in list_gen]

# ...then drop repeats; np.unique also returns the names sorted.
igen = list(np.unique(np.array(igen)))
igen

In [None]:
# Number of distinct genes shared by at least two scenarios.
len(igen)

### Scenario 1

In [None]:
# Scenario 1 on the intersection genes: XGBoost + ADASYN.
classifiers = [XGBClassifier(eval_metric='mlogloss', n_jobs=-1)]
class_balance_over_sampling(
    featuress1[igen],
    labelss1,
    methods_list=["ADASYN"],
    HO=True,
    CV=False,
)

### Scenario 2

In [None]:
# Scenario 2 on the intersection genes: logistic regression + SVMSMOTE.
classifiers = [LogisticRegression(solver='liblinear', n_jobs=-1)]
class_balance_over_sampling(
    featuress2[igen],
    labelss2,
    methods_list=["SVMSMOTE"],
    HO=True,
    CV=False,
)

### Scenario 3

In [None]:
# Scenario 3 on the intersection genes: gradient boosting + SMOTE.
classifiers = [GradientBoostingClassifier(random_state=8)]
class_balance_over_sampling(
    featuress3[igen],
    labelss3,
    methods_list=["SMOTE"],
    HO=True,
    CV=False,
)

### Scenario 4

In [None]:
# Scenario 4 on the intersection genes: tuned XGBoost + random oversampling.
# NOTE(review): 'binary:logistic' with what is presumably a three-class target,
# and nthread alongside n_jobs - confirm both against the xgboost version in use.
classifiers = [
    XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=4,
        min_child_weight=4,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        seed=27,
        eval_metric='mlogloss',
        n_jobs=-1,
    )
]
class_balance_over_sampling(
    featuress4[igen],
    labelss4,
    methods_list=["RandomOverSampler"],
    HO=True,
    CV=False,
)

## 8 Genes, best selected for each scenario with the best 4 ML algorithms

In [None]:
# For each scenario, these are the genes in the intersection of the best 4 ML algorithms.
# Scenario 1: ATP1A2, SPARCL1, FTL
# Scenario 2: EGFR, SPOCK1, ANXA1
# Scenario 3: EGFR, APOD
# Scenario 4: ATP1A2, APOD, TMSB4X
# gen8 is the union of the four lists above with repeats removed.
gen8 = ["ATP1A2", "SPARCL1", "FTL", "EGFR", "SPOCK1", "ANXA1", "APOD", "TMSB4X"]


### Scenario 1

In [None]:
# Scenario 1 on the 8 selected genes: XGBoost + ADASYN.
classifiers = [XGBClassifier(eval_metric='mlogloss', n_jobs=-1)]
class_balance_over_sampling(
    featuress1[gen8],
    labelss1,
    methods_list=["ADASYN"],
    HO=True,
    CV=False,
)

### Scenario 2

In [None]:
# Scenario 2 on the 8 selected genes: logistic regression + SVMSMOTE.
classifiers = [LogisticRegression(solver='liblinear', n_jobs=-1)]
class_balance_over_sampling(
    featuress2[gen8],
    labelss2,
    methods_list=["SVMSMOTE"],
    HO=True,
    CV=False,
)

### Scenario 3

In [None]:
# Scenario 3 on the 8 selected genes: gradient boosting + SMOTE.
classifiers = [GradientBoostingClassifier(random_state=8)]
class_balance_over_sampling(
    featuress3[gen8],
    labelss3,
    methods_list=["SMOTE"],
    HO=True,
    CV=False,
)

### Scenario 4

In [None]:
# Scenario 4 on the 8 selected genes: tuned XGBoost + random oversampling.
# NOTE(review): 'binary:logistic' with what is presumably a three-class target,
# and nthread alongside n_jobs - confirm both against the xgboost version in use.
classifiers = [
    XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=4,
        min_child_weight=4,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        seed=27,
        eval_metric='mlogloss',
        n_jobs=-1,
    )
]
class_balance_over_sampling(
    featuress4[gen8],
    labelss4,
    methods_list=["RandomOverSampler"],
    HO=True,
    CV=False,
)