# In this notebook, we show the steps taken to preprocess the **isoforms** expression data, generate and select features through feature selection strategies which include lasso, mrmr, fisher score thresholds, mutual information score thresholds and the new proposed method **ReRa**. After this, different models are tuned and evaluated in the classification task using training and test dataset.



Related links:

*correlation pearson:*
https://towardsdatascience.com/what-it-takes-to-be-correlated-ce41ad0d8d7f

https://towardsdatascience.com/pearson-coefficient-of-correlation-explained-369991d93404

https://medium.com/@joseph.magiya/pearson-coefficient-of-correlation-using-pandas-ca68ce678c04


*fisher score:*
https://jundongl.github.io/scikit-feature/tutorial.html

https://ranasinghiitkgp.medium.com/implementing-feature-selection-methods-for-machine-learning-bfa2e4b4e02

*mutual information:*
https://towardsdatascience.com/select-features-for-machine-learning-model-with-mutual-information-534fe387d5c8

In [None]:
# Mount Google Drive (Colab only) at '/content/Drive'.
# The data/result paths built later are of the form <cwd>/Drive/My Drive/..., so this
# presumably assumes the kernel's working directory is /content — TODO confirm.
from google.colab import drive
drive.mount('/content/Drive')

Mounted at /content/Drive


In [None]:
# uncomment if needed: libraries to install on google colab
! pip install mrmr_selection
! pip install scikit-learn==0.24.2 # Downgrading the scikit learn library to obtain same results of previous experiments and Convergence

In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score, balanced_accuracy_score, f1_score, matthews_corrcoef, classification_report, make_scorer
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from xlwt import Workbook
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
import os
from pandas_profiling import ProfileReport
from sklearn import svm
from sklearn.svm import LinearSVC, SVC
from datetime import datetime
from sklearn.feature_selection import SelectFromModel
import seaborn as sb
from collections import OrderedDict
import warnings
warnings.filterwarnings('ignore')

In [None]:
! pip freeze | grep scikit # confirm which scikit-learn version is active (it is pinned above to reproduce earlier grid-search results)

scikit-image==0.18.3
scikit-learn==0.24.2


### Download of all datasets with different preprocessing strategies and feature spaces

In [None]:
# Working directory and data/result locations.
# All paths are derived from the directory the kernel is in when this cell runs.
cwd = os.getcwd()
print(cwd)
# (a former `!cd Drive/` line was removed: `!` commands run in a separate subshell,
#  so it never changed the kernel's working directory)
path = cwd + "/Drive/My Drive/magistrale/BioinformaticsProject/data/" ### <- insert here path to retrieve data
results_path = cwd + "/Drive/My Drive/magistrale/BioinformaticsProject/results/" ### <- insert here path to save results

# Count per Million matrix: stored as (67k × 719) on disk, transposed to (719 × 67k)
# so that rows are samples and columns are isoforms
cpm_dataset = pd.read_csv(path+"CPM.csv", index_col=0).transpose()

# Training and testing sample sheets (read from two sheets of the same workbook)
training_ds = pd.read_excel(path+"train_test_new.xlsx", sheet_name="train_new")
testing_ds = pd.read_excel(path+"train_test_new.xlsx", sheet_name="test_new")

# Workbook holding one sheet per feature space
base_feature_space =path+"FEATURE_SPACES(RAW +CPM).xlsx"
# List of feature space (sheet) names
feature_space_files =["FEATURE_SPACE6(MAIN)", "FEATURE_SPACE1(PAM)", "FEATURE_SPACE2(PAM)","FEATURE_SPACE1(LIMMA)","FEATURE_SPACE2(LIMMA)", "FEATURE_SPACE7(pamsimilarity)","FEATURE_SPACE8(limmasimilarity)"]

### Data extraction and preprocessing

In [None]:
def extract_and_reduce_by_columns(path, sheet_name, name, nofeats_ds, preproc_strategy: str= "none"): 
  """
     Build a labelled dataset restricted to one feature space.

     The isoforms listed in the given excel sheet are selected from the global
     `cpm_dataset` (samples x isoforms), optionally transformed, and merged with
     the sample sheet `nofeats_ds` on 'sample_id'.

     path: path of the excel workbook holding the feature spaces
     sheet_name: the excel sheet listing (column 'isoform') the isoforms to keep
     name: 'trainingset' or 'testingset'; currently unused (only needed by the
           disabled csv export)
     nofeats_ds: the sample sheet, with a 'sample_id' column and the label column
                 ('sample_label', renamed to 'subtype' in the output)
     preproc_strategy: one of
         'none'       - raw CPM values
         'loge'       - natural log of (1 + CPM)
         'log2'       - log2 of (CPM + 1)
         'normperrow' - each sample divided by the sum of its selected isoforms
                        (flagged as "not working" by the original author)

     Returns the merged DataFrame: 'sample_id', the selected isoform columns and
     the columns of `nofeats_ds`.
     Raises ValueError for an unknown `preproc_strategy` (previously this surfaced
     as an unbound-variable error further down).
  """
  known_strategies = ('none', 'loge', 'log2', 'normperrow')
  if preproc_strategy not in known_strategies:
    raise ValueError("Unknown preproc_strategy %r, expected one of %s" % (preproc_strategy, known_strategies))

  feature_sheet = pd.read_excel(path, sheet_name=sheet_name)
  wanted_isoforms = feature_sheet['isoform'].values.tolist()

  # Select the shared isoforms first, then transform: the transforms are element-wise,
  # so the values are the same as transforming the full matrix, at a fraction of the cost.
  shared_isoforms = np.intersect1d(cpm_dataset.columns, wanted_isoforms)
  data = cpm_dataset[shared_isoforms]

  if preproc_strategy == 'loge':
    data = np.log1p(data)
  elif preproc_strategy == 'log2':
    data = np.log2(data + 1) # constant added so that zero counts stay finite
  elif preproc_strategy == 'normperrow':
    data = data.div(data.sum(axis=1), axis=0)

  # The sample identifiers live in the index: expose them as the 'sample_id' column
  data = data.reset_index()
  data = data.rename(columns={ data.columns[0]: "sample_id" })

  # Inner merge keeps only the samples present in the sample sheet
  result = pd.merge(data, nofeats_ds, on='sample_id')
  result = result.rename(columns={'sample_label':'subtype'})

  # result.to_csv(name +".csv", index=False) # save as csv file 
  return result

In [None]:
# This functions shows the evaluation results for the entire test set and also the metrics for each single class

def show_single_class_evaluation(y_pred, y_test, labels):
  """
  Print the evaluation of a set of predictions, overall and per class, and
  return the collected metrics.

  y_pred: predicted labels
  y_test: true labels
  labels: class names used to build the per-class keys; assumed to be in the
          same order as the per-class arrays returned by scikit-learn with
          average=None — TODO confirm against the callers

  Returns a dict with 'balanced_accuracy' and 'accuracy' (one-element lists) and,
  for every class, '<label>-precision', '<label>-recall', '<label>-f1_score'
  (rounded to 3 decimals). The dict used to be built and then discarded.
  """
  balanced_acc = round(balanced_accuracy_score(y_test, y_pred), 3)
  overall_acc = round(accuracy_score(y_test, y_pred), 3)
  # per-class scores are computed once and reused for printing and for the dict
  class_precision = precision_score(y_test, y_pred, average=None)
  class_recall = recall_score(y_test, y_pred, average=None)
  class_f1 = f1_score(y_test, y_pred, average=None)

  print("Balanced accuracy: ", balanced_acc) # not possible for single class
  print("Accuracy: ", overall_acc) # not possible for single class
  print('precision ', round(precision_score(y_test, y_pred, average="macro"), 3))
  print('recall ', round(recall_score(y_test, y_pred, average="macro"), 3))
  print('f1_macro ', round(f1_score(y_test, y_pred, average="macro"),3))
  print('f1_micro ', round(f1_score(y_test, y_pred, average="micro"),3))
  print("Precision: ", [round(i, 3) for i in class_precision ])
  print("Recall: ",  [round(i, 3) for i in class_recall ]) 
  print("F1 Score: ", [round(i, 3) for i in class_f1 ]) 
  print('--------------------------------------------')

  dic_result = {}
  dic_result['balanced_accuracy'] = [balanced_acc]
  dic_result['accuracy'] = [overall_acc]
  for i in range(len(labels)):
    dic_result[labels[i]+'-precision'] =  round(class_precision[i], 3)
  for i in range(len(labels)):
    dic_result[labels[i]+'-recall'] =  round(class_recall[i], 3)
  for i in range(len(labels)):   
    dic_result[labels[i]+'-f1_score'] =  round(class_f1[i], 3)
  return dic_result
  

In [None]:
# List of feature space names 
# One entry per sheet of the feature-space workbook
feature_space_files = [
    "FEATURE_SPACE6(MAIN)",
    "FEATURE_SPACE1(PAM)",
    "FEATURE_SPACE2(PAM)",
    "FEATURE_SPACE1(LIMMA)",
    "FEATURE_SPACE2(LIMMA)",
    "FEATURE_SPACE7(pamsimilarity)",
    "FEATURE_SPACE8(limmasimilarity)",
]

In [None]:
# Train and test sets on the main feature space (the ~49k isoforms left after the initial filtering), log2-transformed
# Training data:
train = extract_and_reduce_by_columns(base_feature_space, "FEATURE_SPACE6(MAIN)", 'trainingset', training_ds, preproc_strategy='log2')
samples_train = train["sample_id"]
Y_train_49kfs = train["subtype"]
X_train_49kfs = train.drop(columns=["sample_id", "subtype"])
print("X_train size:", X_train_49kfs.shape)

# Testing data:
test = extract_and_reduce_by_columns(base_feature_space, "FEATURE_SPACE6(MAIN)", 'testingset', testing_ds, preproc_strategy='log2')
Y_test_49kfs = test["subtype"]
X_test_49kfs = test.drop(columns=["sample_id", "subtype"])
print("X_test size:", X_test_49kfs.shape)

X_train size: (410, 49740)
X_test size: (127, 49740)


DATASET with PAM50 Feature Space and LOG2 PREPROCESSING

In [None]:
# PAM50-similarity feature space, log2-transformed
# Training data:
train = extract_and_reduce_by_columns(base_feature_space, "FEATURE_SPACE7(pamsimilarity)", 'trainingset', training_ds, preproc_strategy='log2')
Y_train_pam_log2 = train["subtype"]
X_train_pam_log2 = train.drop(columns=["sample_id", "subtype"])
print("X_train size:", X_train_pam_log2.shape)

# Testing data:
test = extract_and_reduce_by_columns(base_feature_space, "FEATURE_SPACE7(pamsimilarity)", 'testingset', testing_ds, preproc_strategy='log2')
Y_test_pam_log2 = test["subtype"]
X_test_pam_log2 = test.drop(columns=["sample_id", "subtype"])
print("X_test size:", X_test_pam_log2.shape)

X_train size: (410, 131)
X_test size: (127, 131)


DATASET with LIMMA50 Feature Space and LOG2 PREPROCESSING

In [None]:
# LIMMA-similarity feature space, log2-transformed
# Training data:
train = extract_and_reduce_by_columns(base_feature_space, "FEATURE_SPACE8(limmasimilarity)", 'trainingset', training_ds, preproc_strategy='log2')
Y_train_limma_log2 = train["subtype"]
X_train_limma_log2 = train.drop(columns=["sample_id", "subtype"])
print("X_train size:", X_train_limma_log2.shape)

# Testing data:
test = extract_and_reduce_by_columns(base_feature_space, "FEATURE_SPACE8(limmasimilarity)", 'testingset', testing_ds, preproc_strategy='log2')
Y_test_limma_log2 = test["subtype"]
X_test_limma_log2 = test.drop(columns=["sample_id", "subtype"])
print("X_test size:", X_test_limma_log2.shape)

X_train size: (410, 557)
X_test size: (127, 557)


DATASET with features selected through LASSO REGRESSION from PAM50 and LOG2 PREPROCESSING 

In [None]:
# Embedded selection: an L1-penalised logistic regression wrapped in SelectFromModel,
# fitted on the PAM50 log2 training set
l1_logreg = LogisticRegression(penalty='l1', C=1, solver='liblinear')
sel_ = SelectFromModel(estimator=l1_logreg)
sel_.fit(X_train_pam_log2, Y_train_pam_log2)

SelectFromModel(estimator=LogisticRegression(C=1, penalty='l1',
                                             solver='liblinear'))

In [None]:
# Boolean mask of the retained columns -> their names
support_mask = sel_.get_support()
selected_feat = X_train_pam_log2.columns[support_mask]

In [None]:
# Summary of the L1 selection on the PAM50 space.
# `coef_` holds one row of coefficients per class, so the share of zeroed coefficients is
# taken over all its entries (the old `/131*5` multiplied by 5 instead of dividing by 131*5
# and hard-coded the number of features).
lasso_coefficients = sel_.estimator_.coef_
print('total features: {}'.format((X_train_pam_log2.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('Percentage features with coefficients shrank to zero: {}'.format(100 * np.mean(lasso_coefficients == 0)))

total features: 131
selected features: 118
Percentage features with coefficients shrank to zero: 15.763358778625953


In [None]:
# Train/test matrices restricted to the L1-selected PAM50 features (independent copies)
X_train_pam_log2_sel = X_train_pam_log2.loc[:, selected_feat].copy()
X_test_pam_log2_sel = X_test_pam_log2.loc[:, selected_feat].copy()

# The labels are not affected by the feature selection
Y_train_pam_log2_sel = Y_train_pam_log2
Y_test_pam_log2_sel = Y_test_pam_log2

DATASET with features selected through LASSO REGRESSION from LIMMA50 and LOG2 PREPROCESSING 

In [None]:
# Same embedded selection as for PAM50, fitted on the LIMMA log2 training set
l1_logreg = LogisticRegression(penalty='l1', C=1, solver='liblinear')
sel_ = SelectFromModel(estimator=l1_logreg)
sel_.fit(X_train_limma_log2, Y_train_limma_log2)

SelectFromModel(estimator=LogisticRegression(C=1, penalty='l1',
                                             solver='liblinear'))

In [None]:
# Boolean mask of the retained columns -> their names
support_mask = sel_.get_support()
selected_feat = X_train_limma_log2.columns[support_mask]

In [None]:
# Summary of the L1 selection on the LIMMA space.
# `coef_` holds one row of coefficients per class, so the share of zeroed coefficients is
# taken over all its entries (the old `/557*5` multiplied by 5 instead of dividing by 557*5
# and hard-coded the number of features).
lasso_coefficients = sel_.estimator_.coef_
print('total features: {}'.format((X_train_limma_log2.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('Percentage features with coefficients shrank to zero: {}'.format(100 * np.mean(lasso_coefficients == 0)))

total features: 557
selected features: 225
Percentage features with coefficients shrank to zero: 21.974865350089768


In [None]:
# Train/test matrices restricted to the L1-selected LIMMA features (independent copies)
X_train_limma_log2_sel = X_train_limma_log2.loc[:, selected_feat].copy()
X_test_limma_log2_sel = X_test_limma_log2.loc[:, selected_feat].copy()

# The labels are not affected by the feature selection
Y_train_limma_log2_sel = Y_train_limma_log2
Y_test_limma_log2_sel = Y_test_limma_log2

## For the paper's methodological analysis, further feature spaces are included: 
1. selection of features through Fisher score
2. selection of features through mutual information
3. selection of features through ReRa, using as initial feature spaces PAM50, LIMMA50, Fisher score and mutual information with different thresholds


### Feature space with Fisher Score ( and log2 preprocessing)

In [None]:
# The Fisher score is computed with the scikit-feature project (which builds on scikit-learn);
# it is cloned from GitHub and installed from source.
! git clone https://github.com/jundongl/scikit-feature.git
# NOTE: `%cd` changes the kernel's working directory for every later cell, not only for this one.
# NOTE(review): the following `import skfeature` may rely on this directory being the cwd — confirm before changing.
%cd scikit-feature/
! python setup.py install

Cloning into 'scikit-feature'...
remote: Enumerating objects: 1089, done.[K
remote: Counting objects: 100% (132/132), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 1089 (delta 57), reused 132 (delta 57), pack-reused 957[K
Receiving objects: 100% (1089/1089), 194.81 MiB | 36.45 MiB/s, done.
Resolving deltas: 100% (643/643), done.


In [None]:
import skfeature
import skfeature.function.similarity_based.fisher_score as fisher_score
# Fisher score of each of the 49k features.
# Assumes the installed scikit-feature version returns one raw score per feature,
# in the column order of X_train_49kfs — TODO confirm.
score = fisher_score.fisher_score(X_train_49kfs.values, Y_train_49kfs.values)
# `score` is deliberately NOT sorted in place: sorting would break the score <-> column
# alignment that the threshold-based selection relies on, and the summary statistics
# computed from `score_dataframe` do not depend on the order.
score_dataframe = pd.DataFrame(score)

In [None]:
# Distribution of the Fisher scores, with the upper percentiles used to pick a threshold
score_dataframe.describe(percentiles=[0.7, 0.8, 0.85, 0.9, 0.95, 0.97, 0.98, 0.99])

Unnamed: 0,0
count,49740.0
mean,0.094051
std,0.132675
min,0.000198
50%,0.052348
70%,0.093724
80%,0.133039
85%,0.165264
90%,0.216286
95%,0.320291


In [None]:
# Rule used to generate the saved feature space (kept for reference, not re-run):
# keep the features whose Fisher score is above the 97th percentile of all scores.
# fisher_threshold = np.percentile(score, 97)
# selected_col_names = X_train_49kfs.columns[np.where(score > fisher_threshold)[0]]

# Load the previously saved Fisher-score feature space.
# This cell used to read "mrmr_from12k.csv", i.e. an mRMR feature space; the file read
# here is the one written by the (commented) save cell of this section.
fisher_features = pd.read_csv(path+"isoformsfs/fisher_th97.csv", index_col=0, skiprows=1, names=['isoform_id'])
selected_col_names = fisher_features['isoform_id'].tolist()
len(selected_col_names)

0.4018561997292888

In [None]:
# Restrict the 49k matrices to the Fisher-score feature space
X_train_fisher = X_train_49kfs.loc[:, selected_col_names]
print(X_train_fisher.shape)

X_test_fisher = X_test_49kfs.loc[:, selected_col_names]
print(X_test_fisher.shape)

# Labels are shared with the 49k datasets
Y_train_fisher, Y_test_fisher = Y_train_49kfs, Y_test_49kfs

(410, 750)
(127, 750)


In [None]:
# Uncomment to save the current feature space
# pd.DataFrame(selected_col_names).to_csv(path+"/isoformsfs/fisher_th97.csv")

### Feature spaces from scores with mutual information

In [None]:
from sklearn.feature_selection import mutual_info_classif as MIC

In [None]:
# Mutual information between each of the 49k features and the class label.
# The estimator is stochastic; `random_state` is fixed so that re-running the
# notebook yields the same scores (and therefore the same threshold).
mi_score = MIC(X_train_49kfs, Y_train_49kfs, random_state=42)

In [None]:
# Wrap the scores in a DataFrame; describe() shows their distribution,
# including the upper percentiles used to pick a threshold
mi_score_dataframe = pd.DataFrame(data=mi_score)
mi_score_dataframe.describe(percentiles=[0.7, 0.8, 0.85, 0.9, 0.95, 0.97, 0.98, 0.99])

Unnamed: 0,0
count,49740.0
mean,0.050626
std,0.05527
min,0.0
50%,0.036075
70%,0.063142
80%,0.083432
85%,0.098878
90%,0.121053
95%,0.160291


In [None]:
# Value of the 97th percentile of the mutual-information scores
score_97 = mi_score_dataframe.describe(percentiles=[0.97])
score_97.loc['97%'].iloc[0]

0.1887791224592662

In [None]:
# Rule used to generate the saved feature space (kept for reference, not re-run):
# keep the features whose mutual-information score is above the 97th percentile.
# mi_score_selected_index = np.where(mi_score >score_97[score_97.index == '97%'].values[0][0])[0]
# selected_col_names = X_train_49kfs.columns[mi_score_selected_index ]


# Read back the feature list saved for this method.
# NOTE(review): the save cell of this section writes to path+"/isoformsfs/...", while this
# file is read from `path` directly — confirm which copy is the intended one.
mi_features = pd.read_csv(path+"mutualinformation_th97.csv", index_col=0, skiprows=1, names=['isoform_id'])
selected_col_names = mi_features['isoform_id'].tolist()
len(selected_col_names)

In [None]:
# Restrict the 49k matrices to the mutual-information feature space
X_train_mi = X_train_49kfs.loc[:, selected_col_names]
print(X_train_mi.shape)

X_test_mi = X_test_49kfs.loc[:, selected_col_names]
print(X_test_mi.shape)

# Labels are shared with the 49k datasets
Y_train_mi, Y_test_mi = Y_train_49kfs, Y_test_49kfs

(410, 1493)
(127, 1493)


In [None]:
# Uncomment to save the current feature space
# pd.DataFrame(selected_col_names).to_csv(path+"/isoformsfs/mutualinformation_th97.csv")

### 10 Random feature spaces

In [None]:
# Ten previously saved random feature spaces: for each one, collect the reduced
# train/test matrices and a name for the space
X_train_randoms, X_test_randoms, random_spaces_names = [], [], []

for i in range(10):
  random_fs = pd.read_csv(path+"isoformsfs/random_"+str(i)+".csv", index_col=0, skiprows=1, names=['isoform_id'])
  rand_selected_feat = random_fs['isoform_id'].tolist()
  selected_col_names = rand_selected_feat

  X_train_random = X_train_49kfs.loc[:, selected_col_names]
  print("Shape of the new train dataset", X_train_random.shape)
  X_test_random = X_test_49kfs.loc[:, selected_col_names]
  print("Shape of the new test dataset",X_test_random.shape)

  random_spaces_names.append("random"+str(i))
  X_train_randoms.append(X_train_random)
  X_test_randoms.append(X_test_random)

Shape of the new train dataset (410, 887)
Shape of the new test dataset (127, 887)
Shape of the new train dataset (410, 887)
Shape of the new test dataset (127, 887)
Shape of the new train dataset (410, 887)
Shape of the new test dataset (127, 887)
Shape of the new train dataset (410, 887)
Shape of the new test dataset (127, 887)
Shape of the new train dataset (410, 887)
Shape of the new test dataset (127, 887)
Shape of the new train dataset (410, 887)
Shape of the new test dataset (127, 887)
Shape of the new train dataset (410, 887)
Shape of the new test dataset (127, 887)
Shape of the new train dataset (410, 887)
Shape of the new test dataset (127, 887)
Shape of the new train dataset (410, 887)
Shape of the new test dataset (127, 887)
Shape of the new train dataset (410, 887)
Shape of the new test dataset (127, 887)


In [None]:
# Labels for the random feature spaces are those of the 49k datasets
# (the test labels were previously read from the undefined name `T_test_49kfs`)
Y_train_random = Y_train_49kfs
Y_test_random = Y_test_49kfs

## ReRa method: using the similarity based filter -> analysis and selection

In [None]:
# support function to find the median of the values in a column
def find_median_less_than_one(df, col_name):
    """
    Median of the entries of column `col_name` of `df` that are strictly below 1.

    Returns NaN when no entry of the column is below 1.
    """
    column = df[col_name]
    below_one = column[column < 1]
    return below_one.median()

In [None]:
# support function to find the waste (difference between max median and min median local)
def find_deviation_value(dfs, col_name):
  """
  Spread of the per-matrix medians of one column: for each DataFrame in `dfs`
  the median of the entries of `col_name` below 1 is taken, and the difference
  between the largest and the smallest of those medians is returned.
  """
  medians = [find_median_less_than_one(frame, col_name) for frame in dfs]
  return max(medians) - min(medians)

In [None]:
def sbf_analysis(X, Y, additional_link: bool=False, link_mask= None ):
  """
  ReRa similarity-based filter: select features using global and per-class
  Pearson correlations.

  Steps:
    1. Compute the global correlation matrix of X and its median over the
       entries below the diagonal that are < 1.
    2. For every class in Y, compute the same matrix and median on the samples
       of that class only.
    3. Scan the features: a feature whose global correlations are all below the
       global median is kept. Otherwise, for every partner feature at or above
       the global median, both are kept if in at least one class their local
       correlation is below that class's median; if not, only the one with the
       larger deviation (see find_deviation_value) is kept and the other one is
       removed from the selection built so far.

  X: DataFrame, samples x features
  Y: Series of class labels whose index matches the index of X
  additional_link, link_mask: not used by this implementation; kept so that
      existing calls remain valid

  Returns the list of selected feature names, without duplicates, in order of
  first insertion.

  Progress and intermediate statistics are printed.
  """

  def _median_below_one(corr_matrix):
    # Median of the strictly-lower-triangle entries that are < 1.
    # Boolean indexing on the ndarray really removes the excluded entries
    # (values >= 1 and NaN); masking a DataFrame instead would turn them into
    # NaN and make the median NaN.
    entries = corr_matrix.values
    below_diagonal = entries[np.tril_indices(entries.shape[0], -1)]
    return np.median(below_diagonal[below_diagonal < 1])

  # global Pearson correlation matrix of the features
  pc_global = X.corr()
  # summary of the correlation scores
  print(pc_global.describe())

  global_median = _median_below_one(pc_global)
  print("Global median:", global_median)

  # entries equal to 1 (the diagonal in particular) are neutralised
  pc_global[ pc_global ==1] =  0

  print("Per class correlation with pearson: \n\n")

  local_medians_dic = {}
  local_pcs = {}
  for name_class in Y.unique():
    print(name_class)
    # samples of the training data assigned to the current class
    sample_per_class = Y[Y==name_class]
    print("Samples with label:", len(sample_per_class))
    X_class = X.loc[sample_per_class.index]
    print(X_class.shape)
    # correlation matrix and median restricted to those samples
    class_corr = X_class.corr()
    median_local = _median_below_one(class_corr)
    print("Local median:", median_local)

    class_corr[ class_corr ==1] =  0
    # keep, per class, the full local correlation matrix and its median
    local_pcs[name_class] = class_corr
    local_medians_dic[name_class]= median_local

  print(local_medians_dic)

  # The deviation of a feature depends only on the local matrices, which no longer
  # change: compute it at most once per feature instead of once per compared pair.
  class_matrices = list(local_pcs.values())
  deviation_cache = {}

  def _deviation(feature):
    if feature not in deviation_cache:
      deviation_cache[feature] = find_deviation_value(class_matrices, feature)
    return deviation_cache[feature]

  temp = []
  for i, col in enumerate(pc_global.columns):
    print("current feature: ", col, " we are at the feature n.", i)

    if all(pc_global[col] < global_median):
      # not globally similar to any other feature
      print(col, "added to temp array")
      temp.append(col)
      continue

    # partners that are globally similar to `col`: they are checked class by class
    row_feats = pc_global[col][pc_global[col] >=  global_median].index.tolist()

    for row_feat in row_feats:
      # the pair is "not locally similar" as soon as one class puts its local
      # correlation below that class's median
      not_locally_similar = any(
          local_pcs[c_name][col][row_feat] < local_medians_dic[c_name]
          for c_name in local_pcs
      )

      if not_locally_similar:
        temp.append(col)
        temp.append(row_feat)
      elif _deviation(col) > _deviation(row_feat):
        # similar everywhere: keep the feature with the wider deviation and
        # drop every occurrence of the other one from the selection so far
        temp.append(col)
        temp = [feat for feat in temp if feat != row_feat]
      else:
        temp.append(row_feat)
        temp = [feat for feat in temp if feat != col]

  print(temp)
  # removing duplicates while preserving the insertion order
  temp_nodup = list(OrderedDict.fromkeys(temp))
  return temp_nodup

### Feature Space with ReRa applied on PAM50-log2

In [None]:
# ReRa on the PAM50 log2 training set; the result is the list of retained features
sbf_pam = sbf_analysis(X=X_train_pam_log2, Y=Y_train_pam_log2)

In [None]:
# Sanity check: every feature returned by ReRa must be a column of the PAM50 training matrix.
# (OrderedDict comes from the import cell at the top of the notebook.)
sbf_nodup = list(OrderedDict.fromkeys(sbf_pam))

missing_feats = [feat for feat in sbf_nodup if feat not in X_train_pam_log2.columns]
for feat in missing_feats:
  print(feat, "Not in ---> problem")

# number of distinct selected features (shown as the cell output)
len(sbf_nodup)

In [None]:
# Feature list produced by ReRa on PAM50 (the same list is used for train and test)
selected_col_names = sbf_pam
test_selected_col_names = sbf_pam

# Train matrix with only the selected features
X_train_pam_sbf = X_train_49kfs.loc[:, selected_col_names]
print("Shape of the new train dataset", X_train_pam_sbf.shape)

# Test matrix with only the selected features
X_test_pam_sbf = X_test_49kfs.loc[:, selected_col_names]
print("Shape of the new test dataset",X_test_pam_sbf.shape)

Shape of the new train dataset (410, 129)
Shape of the new test dataset (127, 129)


In [None]:
# Uncomment to save the current feature space
# pd.DataFrame(selected_col_names).to_csv(path+"/isoformsfs/sbf_pam.csv")

### Feature Space with ReRa applied on LIMMA50 log2

In [None]:
# ReRa on the LIMMA log2 training set; the result is the list of retained features
sbf_limma = sbf_analysis(X=X_train_limma_log2, Y=Y_train_limma_log2)

In [None]:
# Feature list produced by ReRa on LIMMA (the same list is used for train and test)
selected_col_names = sbf_limma
test_selected_col_names = sbf_limma

X_train_limma_sbf = X_train_49kfs.loc[:, selected_col_names]
print("Shape of the new train dataset", X_train_limma_sbf.shape)

X_test_limma_sbf = X_test_49kfs.loc[:, selected_col_names]
print("Shape of the new test dataset",X_test_limma_sbf.shape)

Shape of the new train dataset (410, 533)
Shape of the new test dataset (127, 533)


In [None]:
# Uncomment to save the current feature space
# pd.DataFrame(selected_col_names).to_csv(path+"/isoformsfs/sbf_limma.csv")

### FS with ReRa applied on Fisher Score log2

In [None]:
# The ReRa run on the Fisher-score space is not repeated here (kept for reference):
# sbf_fisher = sbf_analysis(X_train_fisher,  Y_train_fisher)

# Read back the feature list that was saved for this method
sbf_fisher_table = pd.read_csv(path+"/isoformsfs/sbf_fisher_th97.csv", index_col=0, skiprows=1, names=['isoform_id'])
sbf_fisher = sbf_fisher_table['isoform_id'].tolist()

In [None]:
# Feature list produced by ReRa on the Fisher-score space (same list for train and test)
selected_col_names = sbf_fisher
test_selected_col_names = sbf_fisher

X_train_fisher_sbf = X_train_49kfs.loc[:, selected_col_names]
print(X_train_fisher_sbf.shape)

X_test_fisher_sbf = X_test_49kfs.loc[:, selected_col_names]
print(X_test_fisher_sbf.shape)

(410, 1208)
(127, 1208)


In [None]:
# Uncomment to save the current feature space
# pd.DataFrame(sbf_fisher).to_csv(path+"/isoformsfs/sbf_fisher_th97.csv")

### Feature Space with SBF on Mutual Info log2

In [None]:
# The ReRa run on the mutual-information space is not repeated here (kept for reference);
# its output is the list of selected features:
# sbf_mi =  sbf_analysis(X_train_mi,  Y_train_mi)

# Read back the feature list that was saved for this method
sbf_mi_table = pd.read_csv(path+"/isoformsfs/sbf_mi_th97.csv", index_col=0, skiprows=1, names=['isoform_id'])
sbf_mi = sbf_mi_table['isoform_id'].tolist()
len(sbf_mi)

In [None]:
# Feature list produced by ReRa on the mutual-information space (same list for train and test)
selected_col_names = sbf_mi
test_selected_col_names = sbf_mi

X_train_mi_sbf = X_train_49kfs.loc[:, selected_col_names]
print("Shape of the new train dataset", X_train_mi_sbf.shape)

X_test_mi_sbf = X_test_49kfs.loc[:, selected_col_names]
print("Shape of the new test dataset",X_test_mi_sbf.shape)

Shape of the new train dataset (410, 1264)
Shape of the new test dataset (127, 1264)


In [None]:
# Uncomment to save the current feature space
# pd.DataFrame(sbf_mi).to_csv(path+"/isoformsfs/sbf_mi_th97.csv")

### MRMR Feature spaces

mrmr 500 size

In [None]:
# Read back the saved mRMR feature space of size 500
mrmr500_table = pd.read_csv(path+"/isoformsfs/mrmr500_from12k.csv", index_col=0, skiprows=1, names=['isoform_id'])
selected_feat = mrmr500_table['isoform_id'].tolist()

In [None]:
# Independent copies of the 49k matrices restricted to the mRMR-500 features
X_train_mrmr_500 = X_train_49kfs.loc[:, selected_feat].copy()
X_test_mrmr_500 = X_test_49kfs.loc[:, selected_feat].copy()

# Labels are shared with the 49k datasets
Y_train_mrmr, Y_test_mrmr = Y_train_49kfs, Y_test_49kfs
print(X_train_mrmr_500.shape, X_test_mrmr_500.shape)

(410, 500) (127, 500)


mrmr 750 size

In [None]:
# Load the previously saved mRMR feature space of size 750.
# The column is labelled 'isoform_id' like in every other loader of this notebook
# (it was 'gene_id' here only); the label is local to this cell.
# NOTE(review): the recorded output of the following cell shows 250 columns, not 750 —
# confirm that this file really holds the 750-feature space.
mrmr750_table = pd.read_csv(path+"/isoformsfs/mrmr750_from12kfs.csv", index_col=0, skiprows=1, names=['isoform_id'])
selected_feat = mrmr750_table['isoform_id'].tolist()

In [None]:
# Independent copies of the 49k matrices restricted to the loaded mRMR features
X_train_mrmr_750 = X_train_49kfs.loc[:, selected_feat].copy()
X_test_mrmr_750 = X_test_49kfs.loc[:, selected_feat].copy()

# Labels are shared with the 49k datasets
Y_train_mrmr, Y_test_mrmr = Y_train_49kfs, Y_test_49kfs
print(X_train_mrmr_750.shape, X_test_mrmr_750.shape)

(410, 250) (127, 250)


mrmr 1000 size

In [None]:
# Read back the saved mRMR feature space of size 1000
mrmr1000_table = pd.read_csv(path+"/isoformsfs/mrmr1000_from12k.csv", index_col=0, skiprows=1, names=['isoform_id'])
selected_feat = mrmr1000_table['isoform_id'].tolist()

In [None]:
# Independent copies of the 49k matrices restricted to the mRMR-1000 features
X_train_mrmr_1000 = X_train_49kfs.loc[:, selected_feat].copy()
X_test_mrmr_1000 = X_test_49kfs.loc[:, selected_feat].copy()

# Labels are shared with the 49k datasets
Y_train_mrmr, Y_test_mrmr = Y_train_49kfs, Y_test_49kfs
print(X_train_mrmr_1000.shape, X_test_mrmr_1000.shape)

(410, 1000) (127, 1000)


### Model training, evaluation and saving of results


In [None]:
def write_results(results, final_path, name_file):
      '''
      Write results (metrics and configuration) to a csv file.

      The file is saved as
          results_path + final_path + <ddmmYYYY>/ + name_file + <ddmmYYYYHHMMSS>.csv
      where `results_path` is the notebook-level results folder.

      results: data accepted by pd.DataFrame (e.g. a dict of columns)
      final_path: sub-folder of `results_path` (concatenated as is, so it is
                  expected to carry its own trailing separator)
      name_file: prefix of the csv file name
      '''
      # a single timestamp is used for both the folder (date) and the file (date + time)
      now = datetime.now()
      dt_string = now.strftime("%d%m%Y%H%M%S")
      dt = now.strftime("%d%m%Y")

      # makedirs also creates missing parent folders and does not fail
      # when the folder of the day already exists
      output_dir = results_path+final_path+dt+'/'
      os.makedirs(output_dir, exist_ok=True)

      df = pd.DataFrame(results)
      df.to_csv(output_dir+name_file+dt_string+".csv")

In [None]:
def _test_scores_frame(which_ds, model_name, best_params, y_true, y_pred):
  '''
  Build the one-row DataFrame describing an evaluation on the test set: dataset label,
  model name, selected parameters and the metrics rounded to 3 decimals.
  '''
  dic_result = {}
  dic_result['eval_type']= ['TEST GRID SEARCH RESULTS']
  dic_result['dataset_details']= [which_ds]
  dic_result['model_name']=  [model_name]
  dic_result['top_parameters']= [str(best_params)]
  dic_result['balanced_accuracy'] = [round(balanced_accuracy_score(y_true, y_pred), 3)]
  dic_result['accuracy'] = [round(accuracy_score(y_true, y_pred), 3)]
  dic_result['precision'] = [round(precision_score(y_true, y_pred, average="macro"), 3)]
  dic_result['recall'] = [round(recall_score(y_true, y_pred, average="macro"), 3)]
  dic_result['f1_macro'] = [round(f1_score(y_true, y_pred, average="macro"),3)]
  dic_result['f1_micro'] = [round(f1_score(y_true, y_pred, average="micro"),3)]
  return pd.DataFrame.from_dict(dic_result)


def training_and_saving_after_cv_and_single_split(which_ds, X_train, Y_train,X_test, Y_test, scores, param_values, model):
  '''
  Run a 10-fold grid search on the given parameter space, model and dataset; then train
  again with the top configuration found, evaluate on the test set and save the results.

  which_ds: label of the dataset / feature space, stored in the saved results
  X_train, Y_train: data used for the grid search and for the final fit
  X_test, Y_test: data used for the final evaluation
  scores: iterable of scoring names; one grid search is run per entry
  param_values: parameter grid passed to GridSearchCV
  model: estimator class (not an instance); it is instantiated inside the function

  Nothing is returned: results are printed and written through write_results
  ('cv_on_<score>' for the grid search, 'test_on_<score>' for the test evaluation).
  '''
  model_name = model.__name__

  for score in scores:
      # -------RESULTS in CROSS_VALIDATION-----------
      print("Tuning hyper-parameters for %s" % score)
      # Fit and hyperparameter search
      selected_model = GridSearchCV(model(), param_values, scoring=score, cv=10)
      selected_model.fit(X_train, Y_train)
      print("Parameter setting that gave the best results on the hold out data: ",  selected_model.best_params_)
      print("Mean cross-validated score of the best_estimator found, mean, std dev: ",  selected_model.best_score_, selected_model.cv_results_['mean_test_score'], selected_model.cv_results_['std_test_score'] )

      # save top config and score from grid search
      # best_index_ is the row of cv_results_ of the selected configuration; using it
      # avoids searching the scores by float equality
      dic_result = {}
      dic_result['eval_type']= ['GRID SEARCH RESULTS']
      dic_result['dataset_details']= [which_ds]
      dic_result['model_name']=  [model_name]
      dic_result['top_parameters']= [str(selected_model.best_params_)]
      dic_result['name_score']= [score]
      dic_result['best_score'] = [selected_model.best_score_]
      dic_result['mean_test_score'] = [selected_model.best_score_]
      dic_result['std_test_score'] = [selected_model.cv_results_['std_test_score'][selected_model.best_index_]]

      df_result = pd.DataFrame.from_dict(dic_result)
      print('Grid search results: ', df_result)
      write_results(df_result,model.__name__ +'/', 'cv_on_'+score)

      # evaluate on the test set the model refitted by the grid search; these scores are
      # only printed (kept for verification), the saved ones come from the model below
      df_result = _test_scores_frame(which_ds, model_name, selected_model.best_params_,
                                     Y_test, selected_model.predict(X_test))
      print('Grid search results on test eval: ', df_result)

      # create a new model with the top configuration, train it and evaluate on the test set
      # NOTE(review): no random_state is set here, so estimators with random components
      # may not reproduce the refitted grid-search model exactly
      check_model = model(**selected_model.best_params_)
      check_model.fit(X_train, Y_train)

      df_result = _test_scores_frame(which_ds, model_name, selected_model.best_params_,
                                     Y_test, check_model.predict(X_test))
      print('Results on test eval: ', df_result)
      write_results(df_result,model.__name__ +'/', 'test_on_'+score)

### Training, Tuning and Evaluating Models section

In [None]:
# Plan for this section:
# for each dataset and for each model,
# define the parameter grid and the other arguments to pass to the function,
# then run the training-and-saving function

Parameters spaces definition

In [None]:
# Scoring names used for model selection; one grid search is run per entry
scores = ["balanced_accuracy"]

In [None]:
# Parameter grids explored by GridSearchCV, one per model family

# Logistic regression: elastic-net penalty with the saga solver, one-vs-rest
logreg_tuned_parameters = [{
    'multi_class':  ['ovr'],
    'penalty':['elasticnet'],
    'solver': ['saga'],
    'max_iter':[1000],
    'C':  [ 0.1, 0.01, 0.001],
    'l1_ratio': [ 0.1, 0.01, 0.001, 0.005]
    }]

# Random forest
rf_parameters= [{
    'min_samples_split': [2,5,10],
    'max_depth': [ 100],
    'max_features': [ 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'n_estimators': [150, 200, 500, 750]
    }]

# Linear SVC: the l1 penalty is not supported together with the default dual=True
# (every l1 candidate would fail to fit), so l1 is searched with dual=False while
# l2 keeps the default formulation
lin_svc_tuned_parameters = [
    {
    'penalty' : ['l1'],
    'dual': [False],
    'max_iter':[2000],
    'C': [10 ** i for i in range(-3,3)]
    },
    {
    'penalty' : ['l2'],
    'max_iter':[2000],
    'C': [10 ** i for i in range(-3,3)]
    }]

# SVC with polynomial kernel
svc_tuned_parameters = [{
    'kernel':['poly'],
    'degree': [1, 2, 3], # 1 for linear, 2 for polynomial
    'gamma': [10 ** i for i in range(-3,3)],
    'max_iter':[2000],
    'C': [10 ** i for i in range(-3,3)]}]

print(logreg_tuned_parameters)
print(svc_tuned_parameters)
print(rf_parameters)
print(lin_svc_tuned_parameters)

[{'multi_class': ['ovr'], 'penalty': ['elasticnet'], 'solver': ['saga'], 'max_iter': [1000], 'C': [0.1, 0.01, 0.001], 'l1_ratio': [0.1, 0.01, 0.001, 0.005]}]
[{'kernel': ['poly'], 'degree': [1, 2, 3], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100], 'max_iter': [1000], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}]
[{'min_samples_split': [2, 5, 10], 'max_depth': [100], 'max_features': ['sqrt'], 'min_samples_leaf': [1, 2, 4], 'n_estimators': [150, 200, 500, 750]}]
[{'penalty': ['l1', 'l2'], 'max_iter': [1000], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}]


### Running experiments section

In [None]:
# Grid / estimator pairs evaluated on every random feature space, in this order:
# logistic regression, SVC, linear SVC, random forest
model_grids = [
    (logreg_tuned_parameters, LogisticRegression),
    (svc_tuned_parameters, SVC),
    (lin_svc_tuned_parameters, LinearSVC),
    (rf_parameters, RandomForestClassifier),
]

# NOTE(review): range(1,10) visits entries 1..9 only - confirm that entry 0 of
# X_train_randoms / X_test_randoms is excluded on purpose
for i in range(1,10):
    for param_grid, estimator_cls in model_grids:
        training_and_saving_after_cv_and_single_split('random_'+str(i), X_train_randoms[i], Y_train_random, X_test_randoms[i], Y_test_random, scores, param_grid, estimator_cls)


Fisher feature space

In [None]:
# Tune and evaluate every classifier on the Fisher-score feature space, in this order:
# logistic regression, SVC, linear SVC, random forest.
# The first argument is the dataset label written to the saved results, so it must
# identify this feature space.
for param_grid, estimator_cls in [
    (logreg_tuned_parameters, LogisticRegression),
    (svc_tuned_parameters, SVC),
    (lin_svc_tuned_parameters, LinearSVC),
    (rf_parameters, RandomForestClassifier),
]:
    training_and_saving_after_cv_and_single_split('fisher_fs', X_train_fisher, Y_train_fisher, X_test_fisher, Y_test_fisher, scores, param_grid, estimator_cls)

Mutual information space


In [None]:
# Tune and evaluate every classifier on the mutual-information feature space, in this order:
# logistic regression, SVC, linear SVC, random forest
for param_grid, estimator_cls in [
    (logreg_tuned_parameters, LogisticRegression),
    (svc_tuned_parameters, SVC),
    (lin_svc_tuned_parameters, LinearSVC),
    (rf_parameters, RandomForestClassifier),
]:
    training_and_saving_after_cv_and_single_split('mi_fs_97', X_train_mi, Y_train_mi, X_test_mi, Y_test_mi, scores, param_grid, estimator_cls)

## ReRa tuning and testing

pam

In [None]:
# Tune and evaluate every classifier on the ReRa (pam) feature space, in this order:
# logistic regression, SVC, random forest, linear SVC
for param_grid, estimator_cls in [
    (logreg_tuned_parameters, LogisticRegression),
    (svc_tuned_parameters, SVC),
    (rf_parameters, RandomForestClassifier),
    (lin_svc_tuned_parameters, LinearSVC),
]:
    training_and_saving_after_cv_and_single_split('sbf_pam_fs', X_train_pam_sbf, Y_train_mi, X_test_pam_sbf, Y_test_mi, scores, param_grid, estimator_cls)


limma

In [None]:
# Tune and evaluate every classifier on the ReRa (limma) feature space, in this order:
# logistic regression, SVC, random forest, linear SVC
for param_grid, estimator_cls in [
    (logreg_tuned_parameters, LogisticRegression),
    (svc_tuned_parameters, SVC),
    (rf_parameters, RandomForestClassifier),
    (lin_svc_tuned_parameters, LinearSVC),
]:
    training_and_saving_after_cv_and_single_split('sbf_limma_fs', X_train_limma_sbf, Y_train_mi, X_test_limma_sbf, Y_test_mi, scores, param_grid, estimator_cls)

fisher

In [None]:
# Tune and evaluate every classifier on the ReRa (fisher) feature space, in this order:
# logistic regression, SVC, random forest, linear SVC
for param_grid, estimator_cls in [
    (logreg_tuned_parameters, LogisticRegression),
    (svc_tuned_parameters, SVC),
    (rf_parameters, RandomForestClassifier),
    (lin_svc_tuned_parameters, LinearSVC),
]:
    training_and_saving_after_cv_and_single_split('sbf_fisher_fs_97', X_train_fisher_sbf, Y_train_mi, X_test_fisher_sbf, Y_test_mi, scores, param_grid, estimator_cls)


mutual info

In [None]:
# Tune and evaluate every classifier on the ReRa (mutual info) feature space, in this order:
# logistic regression, SVC, random forest, linear SVC
for param_grid, estimator_cls in [
    (logreg_tuned_parameters, LogisticRegression),
    (svc_tuned_parameters, SVC),
    (rf_parameters, RandomForestClassifier),
    (lin_svc_tuned_parameters, LinearSVC),
]:
    training_and_saving_after_cv_and_single_split('sbf_mutualinfo_fs_97', X_train_mi_sbf, Y_train_mi, X_test_mi_sbf, Y_test_mi, scores, param_grid, estimator_cls)

# mrmr



In [None]:
# Tune and evaluate every classifier on the mRMR 500 feature space, in this order:
# logistic regression, SVC, linear SVC, random forest
for param_grid, estimator_cls in [
    (logreg_tuned_parameters, LogisticRegression),
    (svc_tuned_parameters, SVC),
    (lin_svc_tuned_parameters, LinearSVC),
    (rf_parameters, RandomForestClassifier),
]:
    training_and_saving_after_cv_and_single_split('mrmr_500_from12k', X_train_mrmr_500, Y_train_mrmr, X_test_mrmr_500, Y_test_mrmr, scores, param_grid, estimator_cls)

In [None]:
# Tune and evaluate every classifier on the mRMR 750 feature space, in this order:
# logistic regression, SVC, linear SVC, random forest
for param_grid, estimator_cls in [
    (logreg_tuned_parameters, LogisticRegression),
    (svc_tuned_parameters, SVC),
    (lin_svc_tuned_parameters, LinearSVC),
    (rf_parameters, RandomForestClassifier),
]:
    training_and_saving_after_cv_and_single_split('mrmr_750_from12k', X_train_mrmr_750, Y_train_mrmr, X_test_mrmr_750, Y_test_mrmr, scores, param_grid, estimator_cls)

In [None]:
# Tune and evaluate every classifier on the mRMR 1000 feature space, in this order:
# logistic regression, SVC, linear SVC, random forest
for param_grid, estimator_cls in [
    (logreg_tuned_parameters, LogisticRegression),
    (svc_tuned_parameters, SVC),
    (lin_svc_tuned_parameters, LinearSVC),
    (rf_parameters, RandomForestClassifier),
]:
    training_and_saving_after_cv_and_single_split('mrmr_1000_from12k', X_train_mrmr_1000, Y_train_mrmr, X_test_mrmr_1000, Y_test_mrmr, scores, param_grid, estimator_cls)