## **In this notebook all the combinations of models, datasets, preprocessing methods done until now will be joined together. The focus will be on scalability, dinamicity and correct saving of the results.**







USEFUL LINKS:

https://towardsdatascience.com/feature-selection-using-regularisation-a3678b71e499

https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html#sklearn.feature_selection.SelectFromModel

### Imports section: 


In [1]:
# mount Google Drive
from google.colab import drive
drive.mount('/content/Drive')

Drive already mounted at /content/Drive; to attempt to forcibly remount, call drive.mount("/content/Drive", force_remount=True).


In [2]:
# Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score, balanced_accuracy_score, f1_score, matthews_corrcoef, classification_report, make_scorer
from sklearn.linear_model import LogisticRegression, Lasso
import matplotlib.pyplot as plt
from xlwt import Workbook
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
import os
from pandas_profiling import ProfileReport
from sklearn import svm
from sklearn.svm import SVC
from datetime import datetime
from sklearn.feature_selection import SelectFromModel

In [None]:
import warnings
warnings.filterwarnings('ignore')
!pip freeze
# ! pip install scikit-learn==0.24.2 # Downgrading the scikit learn library to obtain same results of previous experiments and Convergence

### Download of all datasets with different preprocessing strategies and feature spaces

In [4]:
# Current working directory and other paths
cwd = os.getcwd()
print(cwd)
!cd Drive/
path = cwd + "/Drive/My Drive/magistrale/BioinformaticsProject/data/"
results_path = cwd + "/Drive/My Drive/magistrale/BioinformaticsProject/results/"

# Count per Million matrix
cpm_dataset = pd.read_csv(path+"CPM.csv",index_col=0) #read the main CPM dataset(67k × 719)
cpm_dataset = cpm_dataset.transpose() # (719 × 67k)
# Training and Testing datasets
training_ds =  pd.read_excel(path+"train.test.xlsx", sheet_name="train")
testing_ds = pd.read_excel(path+"train.test.xlsx", sheet_name="test")

# Feature space datesets
base_feature_space =path+"FEATURE_SPACES(RAW +CPM).xlsx"
# List of feature space name 
feature_space_files =["FEATURE_SPACE6(MAIN)", "FEATURE_SPACE1(PAM)", "FEATURE_SPACE2(PAM)","FEATURE_SPACE1(LIMMA)","FEATURE_SPACE2(LIMMA)", "FEATURE_SPACE7(pamsimilarity)","FEATURE_SPACE8(limmasimilarity)"]

/content


In [5]:
def extract_and_reduce_by_columns(path, sheet_name, name, nofeats_ds, preproc_strategy: str= "none"): 
  """
     Function to extract dataset and a specific group of its columns.

     path: the path where to get the data values (isoforms)
     sheet_name: the excel sheet were to get the columns to select for the data (isoforms)
     nofeats_ds: the dataset without the additional columns
     name: 'trainingset' or 'testingset' for the excel 
     preproc_strategy: which preprocessing strategy to apply to the ds

  """
  full_df = pd.read_excel(path, sheet_name=sheet_name) # path of subdatset 
  full_list= full_df['isoform'].values.tolist()  #exatrct the list of isoforms names as list
  if preproc_strategy == 'loge':
    # https://stackoverflow.com/questions/49538185/purpose-of-numpy-log1p
    log_cpm_dataset = np.log1p(cpm_dataset)
    data = log_cpm_dataset[np.intersect1d(log_cpm_dataset.columns, full_list)]
  elif preproc_strategy == 'log2':
    log_cpm_dataset = np.log2(cpm_dataset + 1) # constant added to avoid reaching zero
    data = log_cpm_dataset[np.intersect1d(log_cpm_dataset.columns, full_list)]
  elif preproc_strategy == 'normperrow':
    # normalize per rows
    data = cpm_dataset[np.intersect1d(cpm_dataset.columns, full_list)]
    data = data.div(data.sum(axis=1), axis=0) # ----> preprocessing scaling step to try, not working
  elif preproc_strategy == 'none':
    data = cpm_dataset[np.intersect1d(cpm_dataset.columns, full_list)]  # find the mutual isoform between main datset and subdatset 
  
  data.reset_index(inplace=True)
  data.rename(columns={ data.columns[0]: "sample_id" }, inplace = True)

  x = nofeats_ds['sample_id'].values.tolist()
  data1= data.loc[data['sample_id'].isin(x)]
  result = pd.merge(data1, nofeats_ds, on='sample_id')
  result
  result.rename(columns={'sample_id.1':'subtype'}, inplace=True )
 
  # result.to_csv(name +".csv", index=False) # save as csv file 
  return result

 DATASET 1 CON FS PAM50
 

In [6]:
# List of feature space name 
feature_space_files =["FEATURE_SPACE6(MAIN)", "FEATURE_SPACE1(PAM)", "FEATURE_SPACE2(PAM)","FEATURE_SPACE1(LIMMA)","FEATURE_SPACE2(LIMMA)", "FEATURE_SPACE7(pamsimilarity)","FEATURE_SPACE8(limmasimilarity)"]

In [None]:
# Training Data import:
train = extract_and_reduce_by_columns(base_feature_space, "FEATURE_SPACE7(pamsimilarity)", 'trainingset', training_ds, 'none') 
X_train_pam = train.drop(["sample_id","subtype"],  axis = 1)
Y_train_pam =train.subtype
print("X_train size:", X_train_pam.shape)

# Testing Data import:
test = extract_and_reduce_by_columns(base_feature_space, "FEATURE_SPACE7(pamsimilarity)",'testingset', testing_ds,'none' ) 
X_test_pam = test.drop(["sample_id","subtype"], axis = 1)
Y_test_pam = test.subtype
print("X_test size:", X_test_pam.shape)

X_train size: (550, 131)
X_test size: (137, 131)


DATASET 2 CON FS LIMMA50

In [None]:
# Training Data import:
train = extract_and_reduce_by_columns(base_feature_space, "FEATURE_SPACE8(limmasimilarity)", 'trainingset', training_ds, 'none' ) 
X_train_limma = train.drop(["sample_id","subtype"],  axis = 1)
Y_train_limma =train.subtype
print("X_train size:", X_train_limma.shape)

# Testing Data import:
test = extract_and_reduce_by_columns(base_feature_space, "FEATURE_SPACE8(limmasimilarity)",'testingset', testing_ds, 'none') 
X_test_limma = test.drop(["sample_id","subtype"], axis = 1)
Y_test_limma = test.subtype
print("X_test size:", X_test_limma.shape)

X_train size: (550, 557)
X_test size: (137, 557)


DATASET 3 CON FS PAM50 E LOGE PREPROC




In [None]:
# Training Data import:
train = extract_and_reduce_by_columns(base_feature_space, "FEATURE_SPACE7(pamsimilarity)", 'trainingset', training_ds, 'loge') 
X_train_pam_loge = train.drop(["sample_id","subtype"],  axis = 1)
Y_train_pam_loge=train.subtype
print("X_train size:", X_train_pam_loge.shape)

# Testing Data import:
test = extract_and_reduce_by_columns(base_feature_space, "FEATURE_SPACE7(pamsimilarity)",'testingset', testing_ds,'loge' ) 
X_test_pam_loge = test.drop(["sample_id","subtype"], axis = 1)
Y_test_pam_loge = test.subtype
print("X_test size:", X_test_pam_loge.shape)

X_train size: (550, 131)
X_test size: (137, 131)


DATASET 4 CON FS LIMMA50 E LOGE PREPROC

In [None]:
# Training Data import:
train = extract_and_reduce_by_columns(base_feature_space, "FEATURE_SPACE8(limmasimilarity)", 'trainingset', training_ds, 'loge' ) 
X_train_limma_loge = train.drop(["sample_id","subtype"],  axis = 1)
Y_train_limma_loge =train.subtype
print("X_train size:", X_train_limma_loge.shape)

# Testing Data import:
test = extract_and_reduce_by_columns(base_feature_space, "FEATURE_SPACE8(limmasimilarity)",'testingset', testing_ds, 'loge') 
X_test_limma_loge = test.drop(["sample_id","subtype"], axis = 1)
Y_test_limma_loge = test.subtype
print("X_test size:", X_test_limma_loge.shape)

X_train size: (550, 557)
X_test size: (137, 557)


DATASET 5 CON FS PAM50 E LOG2 PREPROC

In [None]:
# Training Data import:
train = extract_and_reduce_by_columns(base_feature_space, "FEATURE_SPACE7(pamsimilarity)", 'trainingset', training_ds, 'log2') 
X_train_pam_log2 = train.drop(["sample_id","subtype"],  axis = 1)
Y_train_pam_log2 =train.subtype
print("X_train size:", X_train_pam_log2.shape)

# Testing Data import:
test = extract_and_reduce_by_columns(base_feature_space, "FEATURE_SPACE7(pamsimilarity)",'testingset', testing_ds,'log2' ) 
X_test_pam_log2 = test.drop(["sample_id","subtype"], axis = 1)
Y_test_pam_log2 = test.subtype
print("X_test size:", X_test_pam_log2.shape)

X_train size: (550, 131)
X_test size: (137, 131)


DATASET 6 CON FS LIMMA50 E LOG2 PREPROC

In [None]:
# Training Data import:
train = extract_and_reduce_by_columns(base_feature_space, "FEATURE_SPACE8(limmasimilarity)", 'trainingset', training_ds, 'log2' ) 
X_train_limma_log2 = train.drop(["sample_id","subtype"],  axis = 1)
Y_train_limma_log2 =train.subtype
print("X_train size:", X_train_limma_log2.shape)

# Testing Data import:
test = extract_and_reduce_by_columns(base_feature_space, "FEATURE_SPACE8(limmasimilarity)",'testingset', testing_ds, 'log2') 
X_test_limma_log2 = test.drop(["sample_id","subtype"], axis = 1)
Y_test_limma_log2 = test.subtype
print("X_test size:", X_test_limma_log2.shape)

X_train size: (550, 557)
X_test size: (137, 557)


DATASET 7 CON FS REDUCED FROM PAM50 E LOG PREPROCESSING (choose the best between 2 and e)

In [None]:
sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l1', solver='liblinear'))
sel_.fit(X_train_pam_loge,Y_train_pam_loge)

SelectFromModel(estimator=LogisticRegression(C=1, penalty='l1',
                                             solver='liblinear'))

In [None]:
sel_.get_support()
selected_feat = X_train_pam_loge.columns[(sel_.get_support())]

In [None]:
print('total features: {}'.format((X_train_pam_loge.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('Percentage features with coefficients shrank to zero: {}'.format(np.sum(sel_.estimator_.coef_ == 0)/131*5))

total features: 131
selected features: 118
Percentage features with coefficients shrank to zero: 13.931297709923665


In [None]:
X_train_pam_loge_sel = X_train_pam_loge[selected_feat].copy()
Y_train_pam_loge_sel = Y_train_pam_loge

X_test_pam_loge_sel = X_test_pam_loge[selected_feat].copy()
Y_test_pam_loge_sel = Y_test_pam_loge

DATASET 8 CON FS REDUCED FROM LIMMA50 E LOG PREPROCESSING (choose the best between 2 and e)

In [None]:
sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l1', solver='liblinear'))
sel_.fit(X_train_limma_loge,Y_train_limma_loge)

SelectFromModel(estimator=LogisticRegression(C=1, penalty='l1',
                                             solver='liblinear'))

In [None]:
sel_.get_support()
selected_feat = X_train_limma.columns[(sel_.get_support())]

In [None]:
print('total features: {}'.format((X_train_limma_loge.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('Percentage features with coefficients shrank to zero: {}'.format(np.sum(sel_.estimator_.coef_ == 0)/557*5))

total features: 557
selected features: 248
Percentage features with coefficients shrank to zero: 21.481149012567325


In [None]:
X_train_limma_loge_sel = X_train_limma_loge[selected_feat].copy()
Y_train_limma_loge_sel = Y_train_limma_loge

X_test_limma_loge_sel = X_test_limma_loge[selected_feat].copy()
Y_test_limma_loge_sel = Y_test_limma_loge

DATASET 9 CON FS NEW E LOG PREPROCESSING (choose best try 2 for now) 

#### feature selection on all features

In [7]:
# Training Data import:
train = extract_and_reduce_by_columns(base_feature_space, "FEATURE_SPACE6(MAIN)", 'trainingset', training_ds, 'loge') 
X_train_lasso = train.drop(["sample_id","subtype"],  axis = 1)
samples_train = train.sample_id
Y_train_lasso=train.subtype
print("X_train size:", X_train_lasso.shape)

# Testing Data import:
test = extract_and_reduce_by_columns(base_feature_space,"FEATURE_SPACE6(MAIN)", 'testingset', testing_ds,  'loge') 
X_test_lasso = test.drop(["sample_id","subtype"], axis = 1)
Y_test_lasso = test.subtype
print("X_test size:", X_test_lasso.shape)

X_train size: (550, 49740)
X_test size: (137, 49740)


In [8]:
sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l1', solver='liblinear'))
sel_.fit(X_train_lasso,Y_train_lasso)

SelectFromModel(estimator=LogisticRegression(C=1, penalty='l1',
                                             solver='liblinear'))

In [9]:
sel_.get_support()
selected_feat = X_train_lasso.columns[(sel_.get_support())]
print(sel_.estimator_.coef_)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [10]:
print('total features: {}'.format((X_train_lasso.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('coefficients shrank to zero: {}'.format(
      np.sum(sel_.estimator_.coef_ == 0)))

perc_feat_selected = np.sum(sel_.estimator_.coef_ == 0)/ (49740*5)
print(perc_feat_selected)

total features: 49740
selected features: 1962
coefficients shrank to zero: 246260
0.9901889827100925


In [11]:
X_train_lasso = X_train_lasso[selected_feat].copy()
Y_train_lasso = Y_train_lasso

X_test_lasso = X_test_lasso[selected_feat].copy()
Y_test_lasso = Y_test_lasso

### Model training, evaluation and saving of results


In [12]:
def write_results(results, final_path, name_file):
      '''
      Function to write results metrics and confing into a csv file with as name the current date
      '''
      if not os.path.exists(results_path+final_path):
        os.mkdir(results_path+final_path)

      # datetime object containing current date and time
      now = datetime.now()
      dt_string = now.strftime("%d%m%Y%H%M%S")
      
      df = pd.DataFrame(results)
      df.to_csv(results_path+final_path+name_file+dt_string+".csv")

In [13]:
def training_and_saving_after_cv_and_single_split(which_ds, X_train, Y_train,X_test, Y_test, scores, param_values, model):

  C = []
  l1_ratio = []
  cv_best = []
  score_test_balanced_accuracy = []
  score_test_accuracy = []
  precision = []
  recall = []
  f1=[]

  for index, score in enumerate(scores):
      # -------RESULTS in CROSS_VALIDATION-----------
      print("Tuning hyper-parameters for %s" % score)
      # Fit and hyperparameter search
      selected_model = GridSearchCV(model(), param_values, scoring=score, cv=10)
      selected_model.fit(X_train, Y_train)
      # found best model and fit on training
      print("Parameter setting that gave the best results on the hold out data: ",  selected_model.best_params_)
      print("Mean cross-validated score of the best_estimator found: ",  selected_model.best_score_)

      # save top config and score from grid search (only accuracy or balanced accuracy)
      # evaltype datasetdetails modelname parameters balancedaccuracy
      dic_result = {}
      dic_result['eval_type']= ['GRID SEARCH RESULTS']
      dic_result['dataset_details']= [which_ds]
      dic_result['model_name']=  [model.__name__ ]
      dic_result['top_parameters']= [str(selected_model.best_params_)]
      dic_result['name_score']= [score]
      dic_result['score_value'] = [selected_model.best_score_]
      df_result = pd.DataFrame.from_dict(dic_result)
      print('Grid search results: ', df_result)
      write_results(df_result,model.__name__ +'/', 'cv_on_'+score)

      # use top config and trained model for evaluation on test
      y_true, y_pred = Y_test, selected_model.predict(X_test)

      # save results from test
      # evaltype datasetdetails modelname parameters balacc accc prec rec f1
      dic_result = {}
      dic_result['eval_type']= ['TEST GRID SEARCH RESULTS']
      dic_result['dataset_details']= [which_ds]
      dic_result['model_name']=  [model.__name__ ]
      dic_result['top_parameters']= [str(selected_model.best_params_)]
      dic_result['balanced_accuracy'] = [round(balanced_accuracy_score(y_true, y_pred), 3)]
      dic_result['accuracy'] = [round(accuracy_score(y_true, y_pred), 3)]
      dic_result['precision'] = [round(precision_score(y_true, y_pred, average="macro"), 3)]
      dic_result['recall'] = [round(recall_score(y_true, y_pred, average="macro"), 3)]
      dic_result['f1'] = [round(f1_score(y_true, y_pred, average="macro"),3)]

      df_result = pd.DataFrame.from_dict(dic_result)
      print('Grid search results on test eval: ', df_result)
      # not saved anymore because the results are the same as creating new model and performing eval on test set
      # it was initiially introduced for verification
      # write_results(df_result,model.__name__ +'/', 'testcv_on_'+score) 

      # create new model with top convig and evaluate for verification
      check_model = model(**selected_model.best_params_)
      check_model.fit(X_train, Y_train)
      
      y_true, y_pred = Y_test, check_model.predict(X_test)

      # save again the scores
      dic_result = {}
      dic_result['eval_type']= ['TEST GRID SEARCH RESULTS']
      dic_result['dataset_details']= [which_ds]
      dic_result['model_name']=  [model.__name__ ]
      dic_result['top_parameters']= [str(selected_model.best_params_)]
      dic_result['balanced_accuracy'] = [round(balanced_accuracy_score(y_true, y_pred), 3)]
      dic_result['accuracy'] = [round(accuracy_score(y_true, y_pred), 3)]
      dic_result['precision'] = [round(precision_score(y_true, y_pred, average="macro"), 3)]
      dic_result['recall'] = [round(recall_score(y_true, y_pred, average="macro"), 3)]
      dic_result['f1'] = [round(f1_score(y_true, y_pred, average="macro"),3)]
      
      df_result = pd.DataFrame.from_dict(dic_result)
      print('Results on test eval: ', df_result)
      write_results(df_result,model.__name__ +'/', 'test_on_'+score)

### Running section

In [None]:
# for each dataset
# for each model
# create all parameters and other details to pass to the fun
# run training and saving function 

Parameters definition

In [15]:
# Configuration of parameters and name

#GridSearch attributes
# Set the parameters by cross-validation
#'l1_ratio':[0.5], 'solver': ['saga'], 'penalty':['elasticnet']
logreg_tuned_parameters = [{
    'multi_class':  ['ovr'],
    'penalty':['elasticnet'],
    'solver': ['saga'], 
    'max_iter':[2000], 
    'C':  [ 0.1, 0.01], #[10 ** i for i in range(-2,1)],
    'l1_ratio': [ 0.01, 0.001] #[10 ** i for i in range(-2,1)] #'l1_ratio':[0.5]}]
    }]

svc_tuned_parameters = [{
    'kernel':['poly'], 
    'degree': [2, 3], 
    'gamma': [10 ** i for i in range(-3,3)],
    'max_iter':[1000], 
    'C': [10 ** i for i in range(-3,3)]}]

scores = [ "accuracy", "balanced_accuracy"]

print(logreg_tuned_parameters)
print(svc_tuned_parameters)

[{'multi_class': ['ovr'], 'penalty': ['elasticnet'], 'solver': ['saga'], 'max_iter': [2000], 'C': [0.1, 0.01], 'l1_ratio': [0.01, 0.001]}]
[{'kernel': ['poly'], 'degree': [2, 3], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100], 'max_iter': [1000], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}]


dataset 1 with fs pam50

In [None]:
# EVALUATION with LOGISTIC REGRESSION
training_and_saving_after_cv_and_single_split('pam_fs', X_train_pam, Y_train_pam, X_test_pam, Y_test_pam, scores, logreg_tuned_parameters, LogisticRegression)

# EVALUATION with SVC
training_and_saving_after_cv_and_single_split('pam_fs', X_train_pam, Y_train_pam, X_test_pam, Y_test_pam, scores, svc_tuned_parameters, SVC)

Tuning hyper-parameters for accuracy
Parameter setting that gave the best results on the hold out data:  {'C': 0.1, 'l1_ratio': 0.01, 'max_iter': 2000, 'multi_class': 'ovr', 'penalty': 'elasticnet', 'solver': 'saga'}
Mean cross-validated score of the best_estimator found:  0.8454545454545455
Grid search results:               eval_type dataset_details          model_name  \
0  GRID SEARCH RESULTS          pam_fs  LogisticRegression   

                                      top_parameters name_score  score_value  
0  {'C': 0.1, 'l1_ratio': 0.01, 'max_iter': 2000,...   accuracy     0.845455  
Grid search results on test eval:                    eval_type dataset_details          model_name  \
0  TEST GRID SEARCH RESULTS          pam_fs  LogisticRegression   

                                      top_parameters  balanced_accuracy  \
0  {'C': 0.1, 'l1_ratio': 0.01, 'max_iter': 2000,...              0.685   

   accuracy  precision  recall     f1  
0     0.825      0.837   0.685  0.701  
R

dataset 2 with fs limma50

In [None]:
# EVALUATION with LOGISTIC REGRESSION
training_and_saving_after_cv_and_single_split('limma_fs', X_train_limma, Y_train_limma, X_test_limma, Y_test_limma, scores, logreg_tuned_parameters, LogisticRegression)

# EVALUATION with SVC
training_and_saving_after_cv_and_single_split('limma_fs', X_train_limma, Y_train_limma, X_test_limma, Y_test_limma, scores, svc_tuned_parameters, SVC)

Tuning hyper-parameters for accuracy


KeyboardInterrupt: ignored

dataset 3 with fs pam50 and loge preprocessing

In [None]:
# EVALUATION with LOGISTIC REGRESSION
training_and_saving_after_cv_and_single_split('pam_fs_loge', X_train_pam_loge, Y_train_pam_loge, X_test_pam_loge, Y_test_pam_loge, scores, logreg_tuned_parameters, LogisticRegression)

# EVALUATION with SVC
training_and_saving_after_cv_and_single_split('pam_fs_loge', X_train_pam_loge, Y_train_pam_loge, X_test_pam_loge, Y_test_pam_loge, scores, svc_tuned_parameters, SVC)

Tuning hyper-parameters for accuracy
Parameter setting that gave the best results on the hold out data:  {'C': 0.01, 'l1_ratio': 0.01, 'max_iter': 2000, 'multi_class': 'ovr', 'penalty': 'elasticnet', 'solver': 'saga'}
Mean cross-validated score of the best_estimator found:  0.8690909090909091
Grid search results:               eval_type dataset_details          model_name  \
0  GRID SEARCH RESULTS     pam_fs_loge  LogisticRegression   

                                      top_parameters name_score  score_value  
0  {'C': 0.01, 'l1_ratio': 0.01, 'max_iter': 2000...   accuracy     0.869091  
Grid search results on test eval:                    eval_type dataset_details          model_name  \
0  TEST GRID SEARCH RESULTS     pam_fs_loge  LogisticRegression   

                                      top_parameters  balanced_accuracy  \
0  {'C': 0.01, 'l1_ratio': 0.01, 'max_iter': 2000...              0.609   

   accuracy  precision  recall     f1  
0     0.825      0.627   0.609  0.613  


dataset 4 with fs limma50 and loge preprocessing

In [None]:
# EVALUATION with LOGISTIC REGRESSION
training_and_saving_after_cv_and_single_split('limma_fs_loge', X_train_limma_loge, Y_train_limma_loge, X_test_limma_loge, Y_test_limma_loge, scores, logreg_tuned_parameters, LogisticRegression)

# EVALUATION with SVC
training_and_saving_after_cv_and_single_split('limma_fs_loge', X_train_limma_loge, Y_train_limma_loge, X_test_limma_loge, Y_test_limma_loge, scores, svc_tuned_parameters, SVC)

Tuning hyper-parameters for accuracy
Parameter setting that gave the best results on the hold out data:  {'C': 0.1, 'l1_ratio': 0.001, 'max_iter': 2000, 'multi_class': 'ovr', 'penalty': 'elasticnet', 'solver': 'saga'}
Mean cross-validated score of the best_estimator found:  0.8727272727272728
Grid search results:               eval_type dataset_details          model_name  \
0  GRID SEARCH RESULTS   limma_fs_loge  LogisticRegression   

                                      top_parameters name_score  score_value  
0  {'C': 0.1, 'l1_ratio': 0.001, 'max_iter': 2000...   accuracy     0.872727  
Grid search results on test eval:                    eval_type dataset_details          model_name  \
0  TEST GRID SEARCH RESULTS   limma_fs_loge  LogisticRegression   

                                      top_parameters  balanced_accuracy  \
0  {'C': 0.1, 'l1_ratio': 0.001, 'max_iter': 2000...              0.744   

   accuracy  precision  recall     f1  
0     0.861        0.8   0.744  0.761  


dataset 5 with fs pam50 and log2 preprocessing

In [None]:
# EVALUATION with LOGISTIC REGRESSION
training_and_saving_after_cv_and_single_split('pam_fs_log2', X_train_pam_log2, Y_train_pam_log2, X_test_pam_log2, Y_test_pam_log2, scores, logreg_tuned_parameters, LogisticRegression)

# EVALUATION with SVC
training_and_saving_after_cv_and_single_split('pam_fs_log2', X_train_pam_log2, Y_train_pam_log2, X_test_pam_log2, Y_test_pam_log2, scores, svc_tuned_parameters, SVC)

Tuning hyper-parameters for accuracy
Parameter setting that gave the best results on the hold out data:  {'C': 0.01, 'l1_ratio': 0.01, 'max_iter': 2000, 'multi_class': 'ovr', 'penalty': 'elasticnet', 'solver': 'saga'}
Mean cross-validated score of the best_estimator found:  0.8781818181818182
Grid search results:               eval_type dataset_details          model_name  \
0  GRID SEARCH RESULTS     pam_fs_log2  LogisticRegression   

                                      top_parameters name_score  score_value  
0  {'C': 0.01, 'l1_ratio': 0.01, 'max_iter': 2000...   accuracy     0.878182  
Grid search results on test eval:                    eval_type dataset_details          model_name  \
0  TEST GRID SEARCH RESULTS     pam_fs_log2  LogisticRegression   

                                      top_parameters  balanced_accuracy  \
0  {'C': 0.01, 'l1_ratio': 0.01, 'max_iter': 2000...              0.609   

   accuracy  precision  recall     f1  
0     0.825      0.615   0.609  0.609  


dataset 6 with fs limma50 and log2 preprocessing

In [None]:
# EVALUATION with LOGISTIC REGRESSION
training_and_saving_after_cv_and_single_split('limma_fs_log2', X_train_limma_log2, Y_train_limma_log2, X_test_limma_log2, Y_test_limma_log2, scores, logreg_tuned_parameters, LogisticRegression)

# EVALUATION with SVC
training_and_saving_after_cv_and_single_split('limma_fs_log2', X_train_limma_log2, Y_train_limma_log2, X_test_limma_log2, Y_test_limma_log2, scores, svc_tuned_parameters, SVC)

Tuning hyper-parameters for accuracy
Parameter setting that gave the best results on the hold out data:  {'C': 0.01, 'l1_ratio': 0.001, 'max_iter': 2000, 'multi_class': 'ovr', 'penalty': 'elasticnet', 'solver': 'saga'}
Mean cross-validated score of the best_estimator found:  0.8727272727272728
Grid search results:               eval_type dataset_details          model_name  \
0  GRID SEARCH RESULTS   limma_fs_log2  LogisticRegression   

                                      top_parameters name_score  score_value  
0  {'C': 0.01, 'l1_ratio': 0.001, 'max_iter': 200...   accuracy     0.872727  
Grid search results on test eval:                    eval_type dataset_details          model_name  \
0  TEST GRID SEARCH RESULTS   limma_fs_log2  LogisticRegression   

                                      top_parameters  balanced_accuracy  \
0  {'C': 0.01, 'l1_ratio': 0.001, 'max_iter': 200...              0.729   

   accuracy  precision  recall     f1  
0     0.861        0.8   0.729  0.752  

DATASET 7 CON FS REDUCED FROM PAM50 E LOG PREPROCESSING (choose the best between 2 and e)



In [None]:
# EVALUATION with LOGISTIC REGRESSION
training_and_saving_after_cv_and_single_split('pam_fs_loge_sel', X_train_pam_loge_sel, Y_train_pam_loge_sel, X_test_pam_loge_sel, Y_test_pam_loge_sel, scores, logreg_tuned_parameters, LogisticRegression)

# EVALUATION with SVC
training_and_saving_after_cv_and_single_split('pam_fs_loge_sel', X_train_pam_loge_sel, Y_train_pam_loge_sel, X_test_pam_loge_sel, Y_test_pam_loge_sel, scores, svc_tuned_parameters, SVC)

Tuning hyper-parameters for accuracy
Parameter setting that gave the best results on the hold out data:  {'C': 0.01, 'l1_ratio': 0.001, 'max_iter': 2000, 'multi_class': 'ovr', 'penalty': 'elasticnet', 'solver': 'saga'}
Mean cross-validated score of the best_estimator found:  0.8690909090909089
Grid search results:               eval_type  dataset_details          model_name  \
0  GRID SEARCH RESULTS  pam_fs_loge_sel  LogisticRegression   

                                      top_parameters name_score  score_value  
0  {'C': 0.01, 'l1_ratio': 0.001, 'max_iter': 200...   accuracy     0.869091  
Grid search results on test eval:                    eval_type  dataset_details          model_name  \
0  TEST GRID SEARCH RESULTS  pam_fs_loge_sel  LogisticRegression   

                                      top_parameters  balanced_accuracy  \
0  {'C': 0.01, 'l1_ratio': 0.001, 'max_iter': 200...              0.612   

   accuracy  precision  recall     f1  
0     0.832      0.633   0.612  0.6

DATASET 8 CON FS REDUCED FROM LIMMA50 E LOG PREPROCESSING (choose the best between 2 and e)

In [None]:
# EVALUATION with LOGISTIC REGRESSION
training_and_saving_after_cv_and_single_split('limma_fs_loge_sel', X_train_limma_loge_sel, Y_train_limma_loge_sel, X_test_limma_loge_sel, Y_test_limma_loge_sel, scores, logreg_tuned_parameters, LogisticRegression)

# EVALUATION with SVC
training_and_saving_after_cv_and_single_split('limma_fs_loge_sel', X_train_limma_loge_sel, Y_train_limma_loge_sel, X_test_limma_loge_sel, Y_test_limma_loge_sel, scores, svc_tuned_parameters, SVC)

Tuning hyper-parameters for accuracy
Parameter setting that gave the best results on the hold out data:  {'C': 0.1, 'l1_ratio': 0.01, 'max_iter': 2000, 'multi_class': 'ovr', 'penalty': 'elasticnet', 'solver': 'saga'}
Mean cross-validated score of the best_estimator found:  0.890909090909091
Grid search results:               eval_type    dataset_details          model_name  \
0  GRID SEARCH RESULTS  limma_fs_loge_sel  LogisticRegression   

                                      top_parameters name_score  score_value  
0  {'C': 0.1, 'l1_ratio': 0.01, 'max_iter': 2000,...   accuracy     0.890909  
Grid search results on test eval:                    eval_type    dataset_details          model_name  \
0  TEST GRID SEARCH RESULTS  limma_fs_loge_sel  LogisticRegression   

                                      top_parameters  balanced_accuracy  \
0  {'C': 0.1, 'l1_ratio': 0.01, 'max_iter': 2000,...              0.768   

   accuracy  precision  recall     f1  
0     0.883      0.831   0.768

DATASET 9 CON FS NEW E LOG PREPROCESSING (choose best try e for now)

In [16]:
# EVALUATION with LOGISTIC REGRESSION
training_and_saving_after_cv_and_single_split('lasso_fs_loge_sel', X_train_lasso, Y_train_lasso, X_test_lasso, Y_test_lasso, scores, logreg_tuned_parameters, LogisticRegression)

# EVALUATION with SVC
training_and_saving_after_cv_and_single_split('lasso_fs_loge_sel', X_train_lasso, Y_train_lasso, X_test_lasso, Y_test_lasso, scores, svc_tuned_parameters, SVC)

Tuning hyper-parameters for accuracy
Parameter setting that gave the best results on the hold out data:  {'C': 0.1, 'l1_ratio': 0.001, 'max_iter': 2000, 'multi_class': 'ovr', 'penalty': 'elasticnet', 'solver': 'saga'}
Mean cross-validated score of the best_estimator found:  0.9527272727272725
Grid search results:               eval_type    dataset_details          model_name  \
0  GRID SEARCH RESULTS  lasso_fs_loge_sel  LogisticRegression   

                                      top_parameters name_score  score_value  
0  {'C': 0.1, 'l1_ratio': 0.001, 'max_iter': 2000...   accuracy     0.952727  
Grid search results on test eval:                    eval_type    dataset_details          model_name  \
0  TEST GRID SEARCH RESULTS  lasso_fs_loge_sel  LogisticRegression   

                                      top_parameters  balanced_accuracy  \
0  {'C': 0.1, 'l1_ratio': 0.001, 'max_iter': 2000...              0.704   

   accuracy  precision  recall     f1  
0     0.876      0.885   0.7

### Random Forest Classifier exploration

In [21]:
# EVALUATION with RandomForrest
from sklearn.ensemble import RandomForestClassifier
rf_values=logreg_tuned_parameters = [{
    }]

training_and_saving_after_cv_and_single_split('lasso_fs_loge_sel', X_train_lasso, Y_train_lasso, X_test_lasso, Y_test_lasso, scores, rf_values,model=RandomForestClassifier)

Tuning hyper-parameters for accuracy
Parameter setting that gave the best results on the hold out data:  {}
Mean cross-validated score of the best_estimator found:  0.8818181818181818
Grid search results:               eval_type    dataset_details              model_name  \
0  GRID SEARCH RESULTS  lasso_fs_loge_sel  RandomForestClassifier   

  top_parameters name_score  score_value  
0             {}   accuracy     0.881818  
Grid search results on test eval:                    eval_type    dataset_details              model_name  \
0  TEST GRID SEARCH RESULTS  lasso_fs_loge_sel  RandomForestClassifier   

  top_parameters  balanced_accuracy  accuracy  precision  recall     f1  
0             {}              0.658     0.876      0.707   0.658  0.671  
Results on test eval:                    eval_type    dataset_details              model_name  \
0  TEST GRID SEARCH RESULTS  lasso_fs_loge_sel  RandomForestClassifier   

  top_parameters  balanced_accuracy  accuracy  precision  recall 