## **This notebook brings together all the combinations of models, datasets and preprocessing methods tried so far. The focus is on scalability, flexibility and correct saving of the results.**







USEFUL LINKS:

https://towardsdatascience.com/feature-selection-using-regularisation-a3678b71e499

https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html#sklearn.feature_selection.SelectFromModel

### Imports section: 


In [None]:
# Mount Google Drive (Colab only). The data and results paths used by this
# notebook are built as <cwd>/Drive/My Drive/..., so they resolve through this mount.
from google.colab import drive
drive.mount('/content/Drive')

Drive already mounted at /content/Drive; to attempt to forcibly remount, call drive.mount("/content/Drive", force_remount=True).


In [None]:
# uncomment if needed: libraries to install on google colab
# ! pip install mrmr_selection
# ! pip install scikit-learn==0.24.2 # Downgrading the scikit learn library to obtain same results of previous experiments and Convergence

In [None]:
# Imports
import os
from datetime import datetime

import matplotlib.pyplot as plt
import mrmr
import numpy as np
import pandas as pd
from mrmr import mrmr_classif
from pandas_profiling import ProfileReport
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.metrics import precision_score, recall_score, accuracy_score, balanced_accuracy_score, f1_score, matthews_corrcoef, classification_report, make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.utils.multiclass import unique_labels
from xlwt import Workbook

In [None]:
import warnings
# Silences every warning for the rest of the session.
# NOTE(review): this presumably also hides convergence warnings from the
# max_iter-capped solvers configured in the grid-search cells - confirm this is intended.
warnings.filterwarnings('ignore')

In [None]:
! pip freeze | grep scikit # check scikit-learn version for conversion in grid search

scikit-image==0.18.3
scikit-learn==0.24.2


### Download of all datasets with different preprocessing strategies and feature spaces

In [None]:
# Current working directory and derived data/results folders.
# No `!cd` is issued here: a shell escape runs in a throw-away subshell and cannot
# change the kernel's working directory; every path below is absolute, built from `cwd`.
cwd = os.getcwd()
print(cwd)
path = cwd + "/Drive/My Drive/magistrale/BioinformaticsProject/data/"
results_path = cwd + "/Drive/My Drive/magistrale/BioinformaticsProject/results/"

# Count-per-million (CPM) matrix
cpm_dataset = pd.read_csv(path+"CPM.csv",index_col=0) # main CPM dataset as stored (67k x 719)
cpm_dataset = cpm_dataset.transpose() # transposed to (719 x 67k)
# Training and testing datasets
training_ds =  pd.read_excel(path+"train_test_new.xlsx", sheet_name="train_new")
testing_ds = pd.read_excel(path+"train_test_new.xlsx", sheet_name="test_new")

# Workbook holding the feature-space sheets
base_feature_space =path+"FEATURE_SPACES(RAW +CPM).xlsx"
# Sheet names of the available feature spaces
feature_space_files =["FEATURE_SPACE6(MAIN)", "FEATURE_SPACE1(PAM)", "FEATURE_SPACE2(PAM)","FEATURE_SPACE1(LIMMA)","FEATURE_SPACE2(LIMMA)", "FEATURE_SPACE7(pamsimilarity)","FEATURE_SPACE8(limmasimilarity)"]

/content


#### Additional step: Selecting only the samples that appear also in the gene expression dataset for a fair comparison

*This cell was commented out once the selection had been done and 10 samples had been moved from train to test to keep a 30-70 split.*

In [None]:

# Disabled one-off step, kept as a string literal so that none of it executes.
# The code inside keeps only the train/test samples whose id also appears in the
# gene-expression CSV, prints the class proportions, and appends the filtered
# sheets ("train_new", "test_new") to train_test_new.xlsx.
'''training_ds_pre = training_ds
testing_ds_pre = testing_ds
training_ds_pre['sample_label'].unique()
print('Len train and test befor removal: ', training_ds_pre.shape, testing_ds_pre.shape)
values = training_ds_pre['sample_label'].unique()
for val in values:
  print(val+' class : ', len(training_ds_pre[training_ds_pre['sample_label']== val])/550)

print('\n')
for val in values:
  print(val+' class : ', len(testing_ds_pre[testing_ds_pre['sample_label']== val])/137)

full_ds = pd.concat([training_ds_pre, testing_ds_pre], ignore_index=True)
print('Full samples len with isoform expression data', full_ds.shape)

gene_ds =  pd.read_csv(path+ 'gene_expression_data_and_metadata'+'/'+ '817_Patients_with_Subtype_from_TCGA_BRCA_hg19.csv', sep=';')
print('Len samples with gene expression features: ', len(gene_ds))
print('Samples with gene expression data: \n\n', gene_ds.iloc[:,0], '\n')

samples_in_common = np.intersect1d(full_ds.iloc[:,0],gene_ds.iloc[:,0])

training_ds = training_ds_pre[training_ds_pre.sample_id.isin(samples_in_common)]
testing_ds = testing_ds_pre[testing_ds_pre.sample_id.isin(samples_in_common)]


f"Training df only common len: {training_ds.shape}, testing df only common len: {testing_ds.shape} "

print('Len train and test befor removal: ', training_ds.shape, testing_ds.shape)
values = training_ds['sample_id.1'].unique()
for val in values:
  print(val+' class : ', len(training_ds[training_ds['sample_id.1']== val])/403, len(training_ds[training_ds['sample_id.1']== val]))

print('\n')
for val in values:
  print(val+' class : ', len(testing_ds[testing_ds['sample_id.1']== val])/134, len(testing_ds[testing_ds['sample_id.1']== val]))

with pd.ExcelWriter(path+"train_test_new.xlsx",engine='openpyxl', mode='a') as writer: 
  training_ds.to_excel( writer, sheet_name="train_new", index=False)
  testing_ds.to_excel(writer, sheet_name="test_new", index=False)'''

### Continue with extraction of data

In [None]:
def extract_and_reduce_by_columns(path, sheet_name, name, nofeats_ds, preproc_strategy: str= "none"):
  """
     Build a labelled samples-by-isoforms table for one feature space.

     The isoform names listed in the `isoform` column of the requested Excel
     sheet are intersected with the columns of the module-level `cpm_dataset`,
     the chosen preprocessing is applied to those columns, and the result is
     merged with `nofeats_ds` on `sample_id`.

     path: path of the Excel workbook holding the feature-space sheets
     sheet_name: sheet whose `isoform` column lists the features to keep
     name: 'trainingset' or 'testingset'; only meaningful if the optional CSV
           export at the end is re-enabled, otherwise unused
     nofeats_ds: dataframe with a `sample_id` column and the label column(s), no features
     preproc_strategy: 'none' (raw values), 'loge' (log1p), 'log2' (log2(x + 1))
                       or 'normperrow' (each row divided by its sum)

     Returns the merged dataframe, with `sample_label` renamed to `subtype`.
     Raises ValueError when `preproc_strategy` is not one of the values above.
  """
  known_strategies = ('none', 'loge', 'log2', 'normperrow')
  # Fail fast, before the Excel read, instead of hitting an unbound variable later.
  if preproc_strategy not in known_strategies:
    raise ValueError(
        "Unknown preproc_strategy %r; expected one of %s" % (preproc_strategy, known_strategies))

  full_df = pd.read_excel(path, sheet_name=sheet_name) # sheet describing the feature space
  full_list = full_df['isoform'].values.tolist()  # isoform names as a plain list

  # Keep only the isoforms present in both the CPM matrix and the feature space.
  # Selecting first means the transforms below touch only the needed columns
  # (they are element-wise / row-wise on the selection, so the values are unchanged).
  shared_isoforms = np.intersect1d(cpm_dataset.columns, full_list)
  data = cpm_dataset[shared_isoforms]

  if preproc_strategy == 'loge':
    # https://stackoverflow.com/questions/49538185/purpose-of-numpy-log1p
    data = np.log1p(data)
  elif preproc_strategy == 'log2':
    data = np.log2(data + 1) # constant added so that zero counts stay finite
  elif preproc_strategy == 'normperrow':
    # NOTE(review): flagged as "not working" by the original author; a row whose
    # selected columns sum to zero produces NaN here.
    data = data.div(data.sum(axis=1), axis=0)

  # Turn the index into a regular first column and call it sample_id.
  data = data.reset_index()
  data = data.rename(columns={ data.columns[0]: "sample_id" })

  wanted_samples = nofeats_ds['sample_id'].values.tolist()
  data1 = data.loc[data['sample_id'].isin(wanted_samples)]
  result = pd.merge(data1, nofeats_ds, on='sample_id')
  result = result.rename(columns={'sample_label':'subtype'})

  # result.to_csv(name +".csv", index=False) # save as csv file
  return result

 DATASET 1 WITH FS PAM50 (FS = feature space)
 

In [None]:
# Sheet names of the feature spaces available in the feature-space workbook.
feature_space_files = [
    "FEATURE_SPACE6(MAIN)",
    "FEATURE_SPACE1(PAM)",
    "FEATURE_SPACE2(PAM)",
    "FEATURE_SPACE1(LIMMA)",
    "FEATURE_SPACE2(LIMMA)",
    "FEATURE_SPACE7(pamsimilarity)",
    "FEATURE_SPACE8(limmasimilarity)",
]

In [None]:
# Dataset 1: sheet FEATURE_SPACE7(pamsimilarity), no preprocessing.
# Training split: labels and feature matrix are separated from the merged table.
train = extract_and_reduce_by_columns(
    base_feature_space, "FEATURE_SPACE7(pamsimilarity)", 'trainingset', training_ds, 'none'
)
Y_train_pam = train.subtype
X_train_pam = train.drop(columns=["sample_id", "subtype"])
print("X_train size:", X_train_pam.shape)

# Testing split: same sheet and preprocessing.
test = extract_and_reduce_by_columns(
    base_feature_space, "FEATURE_SPACE7(pamsimilarity)", 'testingset', testing_ds, 'none'
)
Y_test_pam = test.subtype
X_test_pam = test.drop(columns=["sample_id", "subtype"])
print("X_test size:", X_test_pam.shape)

X_train size: (410, 131)
X_test size: (127, 131)


DATASET 2 WITH FS LIMMA50

In [None]:
# Dataset 2: sheet FEATURE_SPACE8(limmasimilarity), no preprocessing.
# Training split: labels and feature matrix are separated from the merged table.
train = extract_and_reduce_by_columns(
    base_feature_space, "FEATURE_SPACE8(limmasimilarity)", 'trainingset', training_ds, 'none'
)
Y_train_limma = train.subtype
X_train_limma = train.drop(columns=["sample_id", "subtype"])
print("X_train size:", X_train_limma.shape)

# Testing split: same sheet and preprocessing.
test = extract_and_reduce_by_columns(
    base_feature_space, "FEATURE_SPACE8(limmasimilarity)", 'testingset', testing_ds, 'none'
)
Y_test_limma = test.subtype
X_test_limma = test.drop(columns=["sample_id", "subtype"])
print("X_test size:", X_test_limma.shape)

X_train size: (410, 557)
X_test size: (127, 557)


DATASET 3 WITH FS PAM50 AND LOGE PREPROCESSING




In [None]:
# Dataset 3: sheet FEATURE_SPACE7(pamsimilarity), 'loge' preprocessing.
# Training split: labels and feature matrix are separated from the merged table.
train = extract_and_reduce_by_columns(
    base_feature_space, "FEATURE_SPACE7(pamsimilarity)", 'trainingset', training_ds, 'loge'
)
Y_train_pam_loge = train.subtype
X_train_pam_loge = train.drop(columns=["sample_id", "subtype"])
print("X_train size:", X_train_pam_loge.shape)

# Testing split: same sheet and preprocessing.
test = extract_and_reduce_by_columns(
    base_feature_space, "FEATURE_SPACE7(pamsimilarity)", 'testingset', testing_ds, 'loge'
)
Y_test_pam_loge = test.subtype
X_test_pam_loge = test.drop(columns=["sample_id", "subtype"])
print("X_test size:", X_test_pam_loge.shape)

X_train size: (410, 131)
X_test size: (127, 131)


DATASET 4 WITH FS LIMMA50 AND LOGE PREPROCESSING

In [None]:
# Dataset 4: sheet FEATURE_SPACE8(limmasimilarity), 'loge' preprocessing.
# Training split: labels and feature matrix are separated from the merged table.
train = extract_and_reduce_by_columns(
    base_feature_space, "FEATURE_SPACE8(limmasimilarity)", 'trainingset', training_ds, 'loge'
)
Y_train_limma_loge = train.subtype
X_train_limma_loge = train.drop(columns=["sample_id", "subtype"])
print("X_train size:", X_train_limma_loge.shape)

# Testing split: same sheet and preprocessing.
test = extract_and_reduce_by_columns(
    base_feature_space, "FEATURE_SPACE8(limmasimilarity)", 'testingset', testing_ds, 'loge'
)
Y_test_limma_loge = test.subtype
X_test_limma_loge = test.drop(columns=["sample_id", "subtype"])
print("X_test size:", X_test_limma_loge.shape)

X_train size: (410, 557)
X_test size: (127, 557)


DATASET 5 WITH FS PAM50 AND LOG2 PREPROCESSING

In [None]:
# Dataset 5: sheet FEATURE_SPACE7(pamsimilarity), 'log2' preprocessing.
# Training split: labels and feature matrix are separated from the merged table.
train = extract_and_reduce_by_columns(
    base_feature_space, "FEATURE_SPACE7(pamsimilarity)", 'trainingset', training_ds, 'log2'
)
Y_train_pam_log2 = train.subtype
X_train_pam_log2 = train.drop(columns=["sample_id", "subtype"])
print("X_train size:", X_train_pam_log2.shape)

# Testing split: same sheet and preprocessing.
test = extract_and_reduce_by_columns(
    base_feature_space, "FEATURE_SPACE7(pamsimilarity)", 'testingset', testing_ds, 'log2'
)
Y_test_pam_log2 = test.subtype
X_test_pam_log2 = test.drop(columns=["sample_id", "subtype"])
print("X_test size:", X_test_pam_log2.shape)

X_train size: (410, 131)
X_test size: (127, 131)


DATASET 6 WITH FS LIMMA50 AND LOG2 PREPROCESSING

In [None]:
# Dataset 6: sheet FEATURE_SPACE8(limmasimilarity), 'log2' preprocessing.
# Training split: labels and feature matrix are separated from the merged table.
train = extract_and_reduce_by_columns(
    base_feature_space, "FEATURE_SPACE8(limmasimilarity)", 'trainingset', training_ds, 'log2'
)
Y_train_limma_log2 = train.subtype
X_train_limma_log2 = train.drop(columns=["sample_id", "subtype"])
print("X_train size:", X_train_limma_log2.shape)

# Testing split: same sheet and preprocessing.
test = extract_and_reduce_by_columns(
    base_feature_space, "FEATURE_SPACE8(limmasimilarity)", 'testingset', testing_ds, 'log2'
)
Y_test_limma_log2 = test.subtype
X_test_limma_log2 = test.drop(columns=["sample_id", "subtype"])
print("X_test size:", X_test_limma_log2.shape)

X_train size: (410, 557)
X_test size: (127, 557)


DATASET 7 WITH FS REDUCED FROM PAM50 AND LOG PREPROCESSING (log2, since it is the more standard procedure and gives similar results)

In [None]:
# L1-penalised logistic regression (liblinear) wrapped in SelectFromModel and
# fitted on the PAM / log2 training data.
sel_ = SelectFromModel(
    estimator=LogisticRegression(penalty='l1', C=1, solver='liblinear')
)
sel_.fit(X=X_train_pam_log2, y=Y_train_pam_log2)

SelectFromModel(estimator=LogisticRegression(C=1, penalty='l1',
                                             solver='liblinear'))

In [None]:
# Column names retained by the fitted selector (boolean support mask over the columns).
selected_feat = X_train_pam_log2.columns[sel_.get_support()]

In [None]:
# Summary of the L1-based selection on the PAM / log2 training data.
print('total features: {}'.format((X_train_pam_log2.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
# Share of zeroed entries over the whole coefficient matrix, as a percentage.
# np.mean over the boolean mask divides by coef_.size, so neither the number of
# features nor the number of classes is hard-coded (the former `/131*5` was
# evaluated as `(count / 131) * 5`, not as `count / (131 * 5)`).
print('Percentage features with coefficients shrank to zero: {}'.format(100 * np.mean(sel_.estimator_.coef_ == 0)))

total features: 131
selected features: 118
Percentage features with coefficients shrank to zero: 15.801526717557252


In [None]:
# Dataset 7: PAM / log2 data restricted to the selected columns; labels are unchanged.
X_train_pam_log2_sel = X_train_pam_log2.loc[:, selected_feat].copy()
X_test_pam_log2_sel = X_test_pam_log2.loc[:, selected_feat].copy()

Y_train_pam_log2_sel = Y_train_pam_log2
Y_test_pam_log2_sel = Y_test_pam_log2

DATASET 8 WITH FS REDUCED FROM LIMMA50 AND LOG PREPROCESSING (log2 is the chosen base)

In [None]:
# L1-penalised logistic regression (liblinear) wrapped in SelectFromModel and
# fitted on the LIMMA / log2 training data.
sel_ = SelectFromModel(
    estimator=LogisticRegression(penalty='l1', C=1, solver='liblinear')
)
sel_.fit(X=X_train_limma_log2, y=Y_train_limma_log2)

SelectFromModel(estimator=LogisticRegression(C=1, penalty='l1',
                                             solver='liblinear'))

In [None]:
# Column names retained by the fitted selector. The mask is applied to the frame
# the selector was fitted on (X_train_limma_log2) so that mask and columns are
# guaranteed to line up, rather than to the separately built X_train_limma.
selected_feat = X_train_limma_log2.columns[sel_.get_support()]

In [None]:
# Summary of the L1-based selection on the LIMMA / log2 training data.
print('total features: {}'.format((X_train_limma_log2.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
# Share of zeroed entries over the whole coefficient matrix, as a percentage.
# np.mean over the boolean mask divides by coef_.size, so neither the number of
# features nor the number of classes is hard-coded (the former `/557*5` was
# evaluated as `(count / 557) * 5`, not as `count / (557 * 5)`).
print('Percentage features with coefficients shrank to zero: {}'.format(100 * np.mean(sel_.estimator_.coef_ == 0)))

total features: 557
selected features: 226
Percentage features with coefficients shrank to zero: 21.974865350089768


In [None]:
# Dataset 8: LIMMA / log2 data restricted to the selected columns; labels are unchanged.
X_train_limma_log2_sel = X_train_limma_log2.loc[:, selected_feat].copy()
X_test_limma_log2_sel = X_test_limma_log2.loc[:, selected_feat].copy()

Y_train_limma_log2_sel = Y_train_limma_log2
Y_test_limma_log2_sel = Y_test_limma_log2

DATASET 9 WITH NEW FS AND LOG PREPROCESSING (the best log base is still to be chosen; log2 is tried for now)

#### feature selection on all features

In [None]:
# Full feature space: sheet FEATURE_SPACE6(MAIN), 'log2' preprocessing.
# Training split: sample ids, labels and feature matrix are pulled out of the merged table.
train = extract_and_reduce_by_columns(
    base_feature_space, "FEATURE_SPACE6(MAIN)", 'trainingset', training_ds, 'log2'
)
samples_train = train.sample_id
Y_train_pre_lasso = train.subtype
X_train_pre_lasso = train.drop(columns=["sample_id", "subtype"])
print("X_train size:", X_train_pre_lasso.shape)

# Testing split: same sheet and preprocessing.
test = extract_and_reduce_by_columns(
    base_feature_space, "FEATURE_SPACE6(MAIN)", 'testingset', testing_ds, 'log2'
)
Y_test_pre_lasso = test.subtype
X_test_pre_lasso = test.drop(columns=["sample_id", "subtype"])
print("X_test size:", X_test_pre_lasso.shape)

X_train size: (410, 49740)
X_test size: (127, 49740)


In [None]:
# L1-penalised logistic regression (liblinear) wrapped in SelectFromModel and
# fitted on the full log2 training matrix.
sel_ = SelectFromModel(
    estimator=LogisticRegression(penalty='l1', C=1, solver='liblinear')
)
sel_.fit(X=X_train_pre_lasso, y=Y_train_pre_lasso)

SelectFromModel(estimator=LogisticRegression(C=1, penalty='l1',
                                             solver='liblinear'))

In [None]:
# Column names retained by the fitted selector, then the raw coefficient matrix for inspection.
selected_feat = X_train_pre_lasso.columns[sel_.get_support()]
print(sel_.estimator_.coef_)

In [None]:
# Summary of the L1-based selection on the full log2 training matrix.
zeroed_coefs = sel_.estimator_.coef_ == 0
print('total features: {}'.format((X_train_pre_lasso.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('coefficients shrank to zero: {}'.format(
      np.sum(zeroed_coefs)))

# Despite its name this is the FRACTION (0-1) of coefficient-matrix entries that
# were shrunk to zero. The denominator is the matrix size itself instead of the
# previously hard-coded 49740*5, so it stays right if the data or classes change.
perc_feat_selected = np.sum(zeroed_coefs) / zeroed_coefs.size
print(perc_feat_selected)

total features: 49740
selected features: 1563
coefficients shrank to zero: 246722
0.9920466425412143


In [None]:
# Dataset 9: full log2 matrix restricted to the selected columns; labels are unchanged.
X_train_lasso = X_train_pre_lasso.loc[:, selected_feat].copy()
X_test_lasso = X_test_pre_lasso.loc[:, selected_feat].copy()

Y_train_lasso = Y_train_pre_lasso
Y_test_lasso = Y_test_pre_lasso

#### Feature Selection done with MRMR method; the choice of number of features to keep is based on an average between the other feature selection methods

In [None]:
# mRMR is run on the lasso-selected columns rather than on the full matrix: it is
# faster, and that subset already includes a large part of the more restrictive
# LIMMA50 and PAM50 features.
mrmr_features = mrmr_classif(
    X=X_train_lasso,
    y=Y_train_lasso,
    K=750,
)

# Next steps:
#  - check how many of the chosen features overlap with the lasso, PAM50 and LIMMA50 selections
#  - in the other training notebook: check how the models perform on this feature space

100%|██████████| 750/750 [05:17<00:00,  2.36it/s]


In [None]:
# Show the names returned by the mRMR selection (long output: K=750 names were requested).
print(mrmr_features)

['uc010tpz.1', 'uc004ahm.2', 'uc003fdb.1', 'uc002hst.2', 'uc001wuf.2', 'uc003ojq.1', 'uc011eet.1', 'uc003mtp.2', 'uc002vwt.2', 'uc003sts.2', 'uc001slf.2', 'uc002amc.2', 'uc010jti.2', 'uc003qoo.3', 'uc003str.2', 'uc002hsl.2', 'uc010fyt.1', 'uc010fjf.2', 'uc003aed.2', 'uc011akl.1', 'uc003qol.2', 'uc002hsq.2', 'uc003qnt.2', 'uc002vwu.2', 'uc003yhc.2', 'uc001ygx.2', 'uc011eey.1', 'uc001cbj.2', 'uc002hsd.2', 'uc001cix.2', 'uc002bnh.3', 'uc002zav.2', 'uc001gtv.2', 'uc003ovb.2', 'uc011dev.1', 'uc002bsq.1', 'uc002hbq.2', 'uc002opb.3', 'uc001gxx.3', 'uc002ijx.3', 'uc010wek.1', 'uc002hxg.3', 'uc003hxc.1', 'uc002rbp.1', 'uc001mhh.1', 'uc011lwk.1', 'uc003qon.3', 'uc004ays.2', 'uc001qrg.2', 'uc002jmj.3', 'uc003pih.1', 'uc002hsi.1', 'uc004atb.2', 'uc003oon.2', 'uc002tlm.2', 'uc001sck.2', 'uc003flk.2', 'uc002huj.1', 'uc011mjf.1', 'uc002uid.1', 'uc010wei.1', 'uc003etn.2', 'uc003mau.2', 'uc003ybr.1', 'uc003gpp.2', 'uc003bhw.1', 'uc004bqi.2', 'uc003xnt.2', 'uc001mhi.1', 'uc010cwc.2', 'uc001hir.1', 'uc00

In [None]:
# Dataset 10: lasso-selected data restricted to the mRMR-chosen columns; labels are unchanged.
X_train_mrmr = X_train_lasso.loc[:, mrmr_features].copy()
X_test_mrmr = X_test_lasso.loc[:, mrmr_features].copy()

Y_train_mrmr = Y_train_lasso
Y_test_mrmr = Y_test_lasso

In [None]:
# Sanity check: train and test must have the same number of columns after the mRMR subsetting.
print(X_train_mrmr.shape, X_test_mrmr.shape)

(410, 750) (127, 750)


The next step is to tune K (the number of features kept by the mRMR method) to find the optimal number of features.

### Model training, evaluation and saving of results


In [None]:
def write_results(results, final_path, name_file):
      '''
      Write result metrics and configuration to a timestamped CSV file.

      The file is saved as
      <results_path><final_path><ddmmYYYY>/<name_file><ddmmYYYYHHMMSS>.csv,
      where `results_path` is the module-level results folder.

      results: anything accepted by pd.DataFrame (here a one-row dataframe)
      final_path: sub-folder of `results_path`, with a trailing '/' (e.g. 'SVC/')
      name_file: file-name prefix placed before the timestamp
      '''

      # datetime object containing current date and time
      now = datetime.now()
      dt_string = now.strftime("%d%m%Y%H%M%S")
      dt = now.strftime("%d%m%Y")

      # makedirs also creates the missing parent folder (e.g. the per-model one),
      # which os.mkdir would refuse to do; exist_ok makes repeated calls safe.
      out_dir = results_path+final_path+dt+'/'
      os.makedirs(out_dir, exist_ok=True)

      df = pd.DataFrame(results)
      df.to_csv(out_dir+name_file+dt_string+".csv")

In [None]:
def _test_metrics_frame(which_ds, model_name, best_params, y_true, y_pred):
  """
  Build the one-row dataframe of held-out test metrics for one configuration.

  which_ds: label describing the dataset, stored as-is
  model_name: name of the estimator class
  best_params: hyper-parameters of the evaluated configuration (stored as a string)
  y_true, y_pred: true and predicted test labels
  """
  # evaltype datasetdetails modelname parameters balacc acc prec rec f1
  dic_result = {}
  dic_result['eval_type']= ['TEST GRID SEARCH RESULTS']
  dic_result['dataset_details']= [which_ds]
  dic_result['model_name']=  [model_name]
  dic_result['top_parameters']= [str(best_params)]
  dic_result['balanced_accuracy'] = [round(balanced_accuracy_score(y_true, y_pred), 3)]
  dic_result['accuracy'] = [round(accuracy_score(y_true, y_pred), 3)]
  dic_result['precision'] = [round(precision_score(y_true, y_pred, average="macro"), 3)]
  dic_result['recall'] = [round(recall_score(y_true, y_pred, average="macro"), 3)]
  dic_result['f1_macro'] = [round(f1_score(y_true, y_pred, average="macro"),3)]
  dic_result['f1_micro'] = [round(f1_score(y_true, y_pred, average="micro"),3)]
  return pd.DataFrame.from_dict(dic_result)


def training_and_saving_after_cv_and_single_split(which_ds, X_train, Y_train,X_test, Y_test, scores, param_values, model):
  """
  Grid-search a model with 10-fold CV, evaluate it on the test split and save the results.

  For every metric in `scores` this:
    1. runs GridSearchCV(model(), param_values, scoring=score, cv=10) on the training data
       and saves the best configuration with its mean/std CV score ('cv_on_<score>' file);
    2. evaluates the refitted best estimator on the test split (printed only);
    3. refits a fresh `model(**best_params)` on the training data, evaluates it on the
       test split and saves those metrics ('test_on_<score>' file).

  which_ds: label describing the dataset, stored in the result files
  X_train, Y_train, X_test, Y_test: training / testing features and labels
  scores: iterable of scoring names (strings; they are also used in the file names)
  param_values: parameter grid handed to GridSearchCV
  model: estimator CLASS (not an instance), e.g. LogisticRegression

  Returns None; results are printed and written through write_results.
  """
  model_name = model.__name__

  for score in scores:
    # -------RESULTS in CROSS_VALIDATION-----------
    print("Tuning hyper-parameters for %s" % score)
    # Fit and hyperparameter search
    selected_model = GridSearchCV(model(), param_values, scoring=score, cv=10)
    selected_model.fit(X_train, Y_train)
    print("Parameter setting that gave the best results on the hold out data: ",  selected_model.best_params_)
    print("Mean cross-validated score of the best_estimator found, mean, std dev: ",  selected_model.best_score_, selected_model.cv_results_['mean_test_score'], selected_model.cv_results_['std_test_score'] )

    # Row of cv_results_ belonging to the winning configuration. best_index_ is what
    # GridSearchCV itself uses, so no float-equality lookup on the mean scores is needed.
    best_row = selected_model.best_index_

    # save top config and score from grid search
    # evaltype datasetdetails modelname parameters score
    dic_result = {}
    dic_result['eval_type']= ['GRID SEARCH RESULTS']
    dic_result['dataset_details']= [which_ds]
    dic_result['model_name']=  [model_name]
    dic_result['top_parameters']= [str(selected_model.best_params_)]
    dic_result['name_score']= [score]
    dic_result['best_score'] = [selected_model.best_score_]
    # kept for backward compatibility of the CSV layout: same value as best_score
    dic_result['mean_test_score'] = [selected_model.best_score_]
    dic_result['std_test_score'] = [selected_model.cv_results_['std_test_score'][best_row]]

    df_result = pd.DataFrame.from_dict(dic_result)
    print('Grid search results: ', df_result)
    write_results(df_result,model_name +'/', 'cv_on_'+score)

    # use top config and the estimator refitted by the grid search for evaluation on test
    df_result = _test_metrics_frame(
        which_ds, model_name, selected_model.best_params_, Y_test, selected_model.predict(X_test))
    print('Grid search results on test eval: ', df_result)
    # Not saved: it was introduced only to verify that it matches the fresh model below.
    # write_results(df_result,model_name +'/', 'testcv_on_'+score)

    # create a new model with the top config and evaluate it for verification
    # NOTE(review): no random_state is set, so for randomised estimators this refit
    # may not reproduce the grid-search estimator exactly.
    check_model = model(**selected_model.best_params_)
    check_model.fit(X_train, Y_train)

    df_result = _test_metrics_frame(
        which_ds, model_name, selected_model.best_params_, Y_test, check_model.predict(X_test))
    print('Results on test eval: ', df_result)
    write_results(df_result,model_name +'/', 'test_on_'+score)

### Running section

In [None]:
# for each dataset
# for each model
# create all parameters and other details to pass to the fun
# run training and saving function 

Parameters definition

In [None]:
# Hyper-parameter grids explored by the grid search, and the metrics to optimise.

# Logistic regression: one-vs-rest, elastic-net penalty, saga solver.
logreg_tuned_parameters = [dict(
    multi_class=['ovr'],
    penalty=['elasticnet'],
    solver=['saga'],
    max_iter=[2000],
    C=[0.1, 0.01],           # wider alternative: [10 ** i for i in range(-2,1)]
    l1_ratio=[0.01, 0.001],  # wider alternative: [10 ** i for i in range(-2,1)], or a fixed [0.5]
)]

# SVC: polynomial kernel only (degree 1 is the linear case).
svc_tuned_parameters = [dict(
    kernel=['poly'],
    degree=[1, 2, 3],
    gamma=[10 ** exponent for exponent in range(-3, 3)],
    max_iter=[1000],
    C=[10 ** exponent for exponent in range(-3, 3)],
)]

# Scoring names; one grid search is run per entry.
scores = ["accuracy", "balanced_accuracy"]

print(logreg_tuned_parameters)
print(svc_tuned_parameters)

[{'multi_class': ['ovr'], 'penalty': ['elasticnet'], 'solver': ['saga'], 'max_iter': [2000], 'C': [0.1, 0.01], 'l1_ratio': [0.01, 0.001]}]
[{'kernel': ['poly'], 'degree': [2, 3], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100], 'max_iter': [1000], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}]


dataset 1 with fs pam50

In [None]:
# Dataset 1 ('pam_fs'): grid search + test evaluation, first logistic regression, then SVC.
training_and_saving_after_cv_and_single_split(
    which_ds='pam_fs',
    X_train=X_train_pam, Y_train=Y_train_pam,
    X_test=X_test_pam, Y_test=Y_test_pam,
    scores=scores, param_values=logreg_tuned_parameters, model=LogisticRegression,
)

training_and_saving_after_cv_and_single_split(
    which_ds='pam_fs',
    X_train=X_train_pam, Y_train=Y_train_pam,
    X_test=X_test_pam, Y_test=Y_test_pam,
    scores=scores, param_values=svc_tuned_parameters, model=SVC,
)

dataset 2 with fs limma50

In [None]:
# Dataset 2 ('limma_fs'): grid search + test evaluation, first logistic regression, then SVC.
training_and_saving_after_cv_and_single_split(
    which_ds='limma_fs',
    X_train=X_train_limma, Y_train=Y_train_limma,
    X_test=X_test_limma, Y_test=Y_test_limma,
    scores=scores, param_values=logreg_tuned_parameters, model=LogisticRegression,
)

training_and_saving_after_cv_and_single_split(
    which_ds='limma_fs',
    X_train=X_train_limma, Y_train=Y_train_limma,
    X_test=X_test_limma, Y_test=Y_test_limma,
    scores=scores, param_values=svc_tuned_parameters, model=SVC,
)

dataset 3 with fs pam50 and loge preprocessing

In [None]:
# Dataset 3 ('pam_fs_loge'): grid search + test evaluation, first logistic regression, then SVC.
training_and_saving_after_cv_and_single_split(
    which_ds='pam_fs_loge',
    X_train=X_train_pam_loge, Y_train=Y_train_pam_loge,
    X_test=X_test_pam_loge, Y_test=Y_test_pam_loge,
    scores=scores, param_values=logreg_tuned_parameters, model=LogisticRegression,
)

training_and_saving_after_cv_and_single_split(
    which_ds='pam_fs_loge',
    X_train=X_train_pam_loge, Y_train=Y_train_pam_loge,
    X_test=X_test_pam_loge, Y_test=Y_test_pam_loge,
    scores=scores, param_values=svc_tuned_parameters, model=SVC,
)

dataset 4 with fs limma50 and loge preprocessing

In [None]:
# Dataset 4 ('limma_fs_loge'): grid search + test evaluation, first logistic regression, then SVC.
training_and_saving_after_cv_and_single_split(
    which_ds='limma_fs_loge',
    X_train=X_train_limma_loge, Y_train=Y_train_limma_loge,
    X_test=X_test_limma_loge, Y_test=Y_test_limma_loge,
    scores=scores, param_values=logreg_tuned_parameters, model=LogisticRegression,
)

training_and_saving_after_cv_and_single_split(
    which_ds='limma_fs_loge',
    X_train=X_train_limma_loge, Y_train=Y_train_limma_loge,
    X_test=X_test_limma_loge, Y_test=Y_test_limma_loge,
    scores=scores, param_values=svc_tuned_parameters, model=SVC,
)

dataset 5 with fs pam50 and log2 preprocessing

In [None]:
# Dataset 5 ('pam_fs_log2'): grid search + test evaluation, first logistic regression, then SVC.
training_and_saving_after_cv_and_single_split(
    which_ds='pam_fs_log2',
    X_train=X_train_pam_log2, Y_train=Y_train_pam_log2,
    X_test=X_test_pam_log2, Y_test=Y_test_pam_log2,
    scores=scores, param_values=logreg_tuned_parameters, model=LogisticRegression,
)

training_and_saving_after_cv_and_single_split(
    which_ds='pam_fs_log2',
    X_train=X_train_pam_log2, Y_train=Y_train_pam_log2,
    X_test=X_test_pam_log2, Y_test=Y_test_pam_log2,
    scores=scores, param_values=svc_tuned_parameters, model=SVC,
)

dataset 6 with fs limma50 and log2 preprocessing

In [None]:
# Dataset 6 ('limma_fs_log2'): grid search + test evaluation, first logistic regression, then SVC.
training_and_saving_after_cv_and_single_split(
    which_ds='limma_fs_log2',
    X_train=X_train_limma_log2, Y_train=Y_train_limma_log2,
    X_test=X_test_limma_log2, Y_test=Y_test_limma_log2,
    scores=scores, param_values=logreg_tuned_parameters, model=LogisticRegression,
)

training_and_saving_after_cv_and_single_split(
    which_ds='limma_fs_log2',
    X_train=X_train_limma_log2, Y_train=Y_train_limma_log2,
    X_test=X_test_limma_log2, Y_test=Y_test_limma_log2,
    scores=scores, param_values=svc_tuned_parameters, model=SVC,
)

DATASET 7 WITH FS REDUCED FROM PAM50 AND LOG PREPROCESSING (choose the best between log2 and loge)



In [None]:
# Dataset 7 ('pam_fs_log2_sel'): grid search + test evaluation, first logistic regression, then SVC.
training_and_saving_after_cv_and_single_split(
    which_ds='pam_fs_log2_sel',
    X_train=X_train_pam_log2_sel, Y_train=Y_train_pam_log2_sel,
    X_test=X_test_pam_log2_sel, Y_test=Y_test_pam_log2_sel,
    scores=scores, param_values=logreg_tuned_parameters, model=LogisticRegression,
)

training_and_saving_after_cv_and_single_split(
    which_ds='pam_fs_log2_sel',
    X_train=X_train_pam_log2_sel, Y_train=Y_train_pam_log2_sel,
    X_test=X_test_pam_log2_sel, Y_test=Y_test_pam_log2_sel,
    scores=scores, param_values=svc_tuned_parameters, model=SVC,
)

DATASET 8 WITH FS REDUCED FROM LIMMA50 AND LOG PREPROCESSING (choose the best between log2 and loge)

In [None]:
# Dataset 8 ('limma_fs_log2_sel'): grid search + test evaluation, first logistic regression, then SVC.
training_and_saving_after_cv_and_single_split(
    which_ds='limma_fs_log2_sel',
    X_train=X_train_limma_log2_sel, Y_train=Y_train_limma_log2_sel,
    X_test=X_test_limma_log2_sel, Y_test=Y_test_limma_log2_sel,
    scores=scores, param_values=logreg_tuned_parameters, model=LogisticRegression,
)

training_and_saving_after_cv_and_single_split(
    which_ds='limma_fs_log2_sel',
    X_train=X_train_limma_log2_sel, Y_train=Y_train_limma_log2_sel,
    X_test=X_test_limma_log2_sel, Y_test=Y_test_limma_log2_sel,
    scores=scores, param_values=svc_tuned_parameters, model=SVC,
)

DATASET 9 WITH NEW FS (lasso-selected feature space) AND LOG PREPROCESSING (the best log base is still to be chosen; note that the dataset built earlier in this notebook uses log2)

In [None]:
# Dataset 9: lasso-selected feature space.
# The label written to the result files is 'lasso_fs_log2_sel': X_*_lasso is derived
# from the log2-preprocessed matrix, and the random-forest run on the same data uses
# this label too (the former 'lasso_fs_loge_sel' mislabelled the saved results).

# EVALUATION with LOGISTIC REGRESSION
training_and_saving_after_cv_and_single_split('lasso_fs_log2_sel', X_train_lasso, Y_train_lasso, X_test_lasso, Y_test_lasso, scores, logreg_tuned_parameters, LogisticRegression)

# EVALUATION with SVC
training_and_saving_after_cv_and_single_split('lasso_fs_log2_sel', X_train_lasso, Y_train_lasso, X_test_lasso, Y_test_lasso, scores, svc_tuned_parameters, SVC)

##### DATASET 10 WITH NEW FS (mRMR-selected feature space) AND LOG PREPROCESSING (the best log base is still to be chosen; note that the dataset built earlier in this notebook uses log2)

In [None]:
# Dataset 10 ('mrmr_fs_log2_sel'): grid search + test evaluation, first logistic regression, then SVC.
training_and_saving_after_cv_and_single_split(
    which_ds='mrmr_fs_log2_sel',
    X_train=X_train_mrmr, Y_train=Y_train_mrmr,
    X_test=X_test_mrmr, Y_test=Y_test_mrmr,
    scores=scores, param_values=logreg_tuned_parameters, model=LogisticRegression,
)

training_and_saving_after_cv_and_single_split(
    which_ds='mrmr_fs_log2_sel',
    X_train=X_train_mrmr, Y_train=Y_train_mrmr,
    X_test=X_test_mrmr, Y_test=Y_test_mrmr,
    scores=scores, param_values=svc_tuned_parameters, model=SVC,
)

### Random Forest Classifier exploration

In [None]:
# Random forest: hyper-parameter grid for the grid search.
# (RandomForestClassifier itself is imported in the notebook's import cell.)
rf_values= [{
    'min_samples_split': [2,5,10],
    'max_depth': [ 100],
    'max_features': [ 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'n_estimators': [200, 500, 750]
    }]

In [None]:
# Random forest on dataset 1 ('pam_fs').
training_and_saving_after_cv_and_single_split(
    which_ds='pam_fs',
    X_train=X_train_pam, Y_train=Y_train_pam,
    X_test=X_test_pam, Y_test=Y_test_pam,
    scores=scores, param_values=rf_values, model=RandomForestClassifier,
)

In [None]:
# Random forest on dataset 2 ('limma_fs').
training_and_saving_after_cv_and_single_split(
    which_ds='limma_fs',
    X_train=X_train_limma, Y_train=Y_train_limma,
    X_test=X_test_limma, Y_test=Y_test_limma,
    scores=scores, param_values=rf_values, model=RandomForestClassifier,
)

In [None]:
# Random forest on dataset 3 ('pam_fs_loge').
training_and_saving_after_cv_and_single_split(
    which_ds='pam_fs_loge',
    X_train=X_train_pam_loge, Y_train=Y_train_pam_loge,
    X_test=X_test_pam_loge, Y_test=Y_test_pam_loge,
    scores=scores, param_values=rf_values, model=RandomForestClassifier,
)

In [None]:
# Random forest on dataset 4 ('limma_fs_loge').
training_and_saving_after_cv_and_single_split(
    which_ds='limma_fs_loge',
    X_train=X_train_limma_loge, Y_train=Y_train_limma_loge,
    X_test=X_test_limma_loge, Y_test=Y_test_limma_loge,
    scores=scores, param_values=rf_values, model=RandomForestClassifier,
)

In [None]:
# Random forest on dataset 5 ('pam_fs_log2').
training_and_saving_after_cv_and_single_split(
    which_ds='pam_fs_log2',
    X_train=X_train_pam_log2, Y_train=Y_train_pam_log2,
    X_test=X_test_pam_log2, Y_test=Y_test_pam_log2,
    scores=scores, param_values=rf_values, model=RandomForestClassifier,
)

In [None]:
# Random forest on dataset 6 ('limma_fs_log2').
training_and_saving_after_cv_and_single_split(
    which_ds='limma_fs_log2',
    X_train=X_train_limma_log2, Y_train=Y_train_limma_log2,
    X_test=X_test_limma_log2, Y_test=Y_test_limma_log2,
    scores=scores, param_values=rf_values, model=RandomForestClassifier,
)

In [None]:
# Random forest on dataset 7 ('pam_fs_log2_sel').
training_and_saving_after_cv_and_single_split(
    which_ds='pam_fs_log2_sel',
    X_train=X_train_pam_log2_sel, Y_train=Y_train_pam_log2_sel,
    X_test=X_test_pam_log2_sel, Y_test=Y_test_pam_log2_sel,
    scores=scores, param_values=rf_values, model=RandomForestClassifier,
)

In [None]:
# Random forest on dataset 8 ('limma_fs_log2_sel').
training_and_saving_after_cv_and_single_split(
    which_ds='limma_fs_log2_sel',
    X_train=X_train_limma_log2_sel, Y_train=Y_train_limma_log2_sel,
    X_test=X_test_limma_log2_sel, Y_test=Y_test_limma_log2_sel,
    scores=scores, param_values=rf_values, model=RandomForestClassifier,
)

In [None]:
# Random forest on dataset 9 ('lasso_fs_log2_sel').
training_and_saving_after_cv_and_single_split(
    which_ds='lasso_fs_log2_sel',
    X_train=X_train_lasso, Y_train=Y_train_lasso,
    X_test=X_test_lasso, Y_test=Y_test_lasso,
    scores=scores, param_values=rf_values, model=RandomForestClassifier,
)

In [None]:
# Random forest on dataset 10 ('mrmr_fs_log2_sel').
training_and_saving_after_cv_and_single_split(
    which_ds='mrmr_fs_log2_sel',
    X_train=X_train_mrmr, Y_train=Y_train_mrmr,
    X_test=X_test_mrmr, Y_test=Y_test_mrmr,
    scores=scores, param_values=rf_values, model=RandomForestClassifier,
)