# **TASK: Classification using Logistic Regression, with hyperparameters selected by Grid Search, on the preprocessed dataset with PAM50/LIMMA50 filters**

Useful links:

https://stackoverflow.com/questions/54608088/what-is-gridsearch-cv-results-could-any-explain-all-the-things-in-that-i-e-me

https://python.plainenglish.io/how-to-use-pandas-profiling-on-google-colab-e34f34ff1c9f

https://towardsdatascience.com/tuning-the-hyperparameters-of-your-machine-learning-model-using-gridsearchcv-7fc2bb76ff27

https://rpubs.com/cliex159/884981

https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
https://scikit-learn.org/stable/modules/linear_model.html#lasso


In [None]:
# Mount Google Drive at /content/Drive; the data and results paths used by the
# following cells are built below this mount point.
from google.colab import drive
drive.mount('/content/Drive')

Drive already mounted at /content/Drive; to attempt to forcibly remount, call drive.mount("/content/Drive", force_remount=True).


In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score, balanced_accuracy_score, f1_score, matthews_corrcoef, classification_report, make_scorer
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from xlwt import Workbook
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
import os
from pandas_profiling import ProfileReport

In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this presumably also hides scikit-learn convergence warnings — confirm that is intended.
warnings.filterwarnings('ignore')
# List the packages currently installed in the runtime.
!pip freeze
# NOTE(review): scikit-learn is already imported by the imports cell that precedes this one,
# so the pinned version presumably only takes effect after a runtime restart — verify.
! pip install scikit-learn==0.24.2 # Downgrading the scikit learn library to obtain same results of previous experiments and Convergence

In [None]:
# Current working directory and project paths on the mounted Drive
cwd = os.getcwd()
print(cwd)
# The former `!cd Drive/` line was dropped: `!` commands run in a throw-away
# subshell, so it never changed the kernel's working directory, and every path
# below is built from `cwd` anyway.
project_root = cwd + "/Drive/My Drive/magistrale/BioinformaticsProject/"
path = project_root + "data/"  # trailing slash kept: file names are appended by plain concatenation
results_path = project_root + "results/"

# Counts-per-million matrix
cpm_dataset = pd.read_csv(path + "CPM.csv", index_col=0)  # read the main CPM dataset (67k × 719)
cpm_dataset = cpm_dataset.transpose()  # (719 × 67k)

# Training and testing datasets: open the workbook once and pull both sheets
# (a list of sheet names makes read_excel return a dict keyed by sheet name).
split_sheets = pd.read_excel(path + "train.test.xlsx", sheet_name=["train", "test"])
training_ds = split_sheets["train"]
testing_ds = split_sheets["test"]

# Feature space datasets
base_feature_space = path + "FEATURE_SPACES(RAW +CPM).xlsx"
# List of feature space sheet names
feature_space_files = ["FEATURE_SPACE1(PAM)", "FEATURE_SPACE2", "FEATURE_SPACE3(LIMMA)", "FEATURE_SPACE4(LIMMA)",
                       "FEATURE_SPACE7(pamsimilarity)", "FEATURE_SPACE8(limmasimilarity)"]

/content


In [None]:
def extract_and_reduce_by_columns(path, sheet_name, columns_ds, name):
  '''
      Build the labelled expression table for one feature space.

      Reads the isoform list from sheet `sheet_name` of the Excel workbook at
      `path`, keeps the columns of the global `cpm_dataset` whose names appear
      in that list, keeps the rows whose sample id appears in `columns_ds`, and
      merges the result with `columns_ds` on `sample_id`.

      Parameters
      ----------
      path : location of the Excel workbook holding the feature-space sheets.
      sheet_name : sheet to read; it must contain an `isoform` column.
      columns_ds : DataFrame with a `sample_id` column. A column named
          `sample_id.1`, if present after the merge, is renamed to `subtype`.
      name : unused; kept so that existing calls keep working.

      Returns
      -------
      DataFrame with `sample_id`, the selected isoform columns and the
      remaining columns of `columns_ds`.
  '''
  feature_sheet = pd.read_excel(path, sheet_name=sheet_name)  # sheet describing the feature space
  isoforms = feature_sheet['isoform'].values.tolist()  # isoform names as a plain list

  # Isoforms present both in the CPM matrix and in the feature space
  # (np.intersect1d returns them sorted and de-duplicated).
  shared_isoforms = np.intersect1d(cpm_dataset.columns, isoforms)

  # Build new frames instead of mutating a selection of `cpm_dataset` in place.
  data = cpm_dataset[shared_isoforms].reset_index()
  data = data.rename(columns={data.columns[0]: "sample_id"})

  wanted_samples = columns_ds['sample_id'].values.tolist()
  data = data.loc[data['sample_id'].isin(wanted_samples)]

  result = pd.merge(data, columns_ds, on='sample_id')
  return result.rename(columns={'sample_id.1': 'subtype'})

In [None]:
# Sheet of the feature space used throughout this notebook
feature_space_sheet = feature_space_files[4]

#---- Training data: features, sample ids and subtype labels
train = extract_and_reduce_by_columns(base_feature_space, feature_space_sheet, training_ds, 'trainingset')
samples_train = train["sample_id"]
Y_train = train["subtype"]
X_train = train.drop(columns=["sample_id", "subtype"])
print("X_train size:", X_train.shape)

#---- Testing data: features and subtype labels
test = extract_and_reduce_by_columns(base_feature_space, feature_space_sheet, testing_ds, 'testingset')
Y_test = test["subtype"]
X_test = test.drop(columns=["sample_id", "subtype"])
print("X_test size:", X_test.shape)

X_train size: (550, 131)
X_test size: (137, 131)


In [None]:
# profile = ProfileReport(train, title='Train Dataset', html={'style':{'full_width':True}})
# profile.to_notebook_iframe()

In [None]:
# print("X_train contains the samples in the train dataset: ", X_train)
# print("Y_train contains the labels in the train dataset: ", Y_train)
# Print the summary statistics that DataFrame.describe() computes for the training table.
print(train.describe())

In [None]:
def write_results(titles, results, name_of_file):
      '''
      Write result metrics and configuration to `<results_path><name_of_file>.csv`.

      titles       : optional sequence of row labels, one per row of `results`.
                     They become the CSV index when their number matches the
                     number of rows; otherwise the default integer index is kept.
      results      : any data accepted by `pd.DataFrame`.
      name_of_file : output file name without the ".csv" extension; it is
                     appended to the global `results_path`.
      '''
      df = pd.DataFrame(results)
      # `titles` used to be ignored, which left the rows of the CSV unlabelled.
      if titles is not None and not isinstance(titles, str):
          labels = list(titles)
          if len(labels) == len(df):
              df.index = labels
      df.to_csv(results_path+name_of_file + ".csv")

In [None]:
def plot_confusion_matrix(y_true, y_pred, le, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Plot the confusion matrix of `y_pred` against `y_true` and return the Axes.

    Parameters
    ----------
    y_true, y_pred : true and predicted labels, in a form accepted by
        `confusion_matrix` and by `le.transform`.
    le : object with a `transform` method mapping labels to integer codes
        (a fitted LabelEncoder in this notebook).
    classes : array of class names, indexed with the codes produced by `le`
        to obtain the tick labels.
    normalize : if True, every row is divided by its sum. Rows whose sum is
        zero (a class that is predicted but never true) are left at zero
        instead of becoming NaN.
    title : figure title; a default depending on `normalize` is used when empty.
    cmap : colormap passed to `imshow`.

    Returns
    -------
    The matplotlib Axes holding the plot.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(le.transform(y_true), le.transform(y_pred))]
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Dividing an all-zero row by its (zero) total would produce NaN cells and
        # a NaN colour threshold below; such rows are kept at 0 instead.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell; the text is white on cells above half of the maximum.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
# Configuration of parameters and names

# Result file names
fs = "pam_similarity"
cv_results_file_name = "10-CV_results_Log_Reg"
test_results_file_name = "10-CV_TEST_Log_Reg"

# Row titles of the cross-validation report (10 folds)
column_titles_cv = (
    ["metric", "C", "l1_ratio", "mean_CVtest_score", "std_CVtest_score"]
    + ["fold_%d_test_score" % fold for fold in range(10)]
    + ["mean_fit_time"]
)

# Row titles of the held-out test report
column_titles_test = ["metric", "C", "l1_ratio", "10-f_CV", "TEST_Acc", "TEST_P", "TEST_R", "TEST_Ba"]
metrics = ["Balanced_accuracy", "Accuracy"]

# Accumulators filled by the tuning loop, one entry per metric
C, l1_ratio, cv_best = [], [], []
score_test_balanced_accuracy, score_test_accuracy = [], []
precision, recall, f1 = [], [], []

# Grid explored by GridSearchCV.
# Wider alternative for C and l1_ratio: [10 ** i for i in range(-2, 1)]
tuned_parameters = [dict(
    multi_class=['ovr'],
    penalty=['elasticnet'],
    solver=['saga'],
    max_iter=[2000],
    C=[0.1],
    l1_ratio=[0.1, 0.01],
)]

scores = ["balanced_accuracy", "accuracy"]
print(tuned_parameters)

In [None]:


# The accumulators are created in the configuration cell; empty them first so
# that re-running this cell does not append a second set of results to them.
for _accumulator in (C, l1_ratio, cv_best, score_test_balanced_accuracy,
                     score_test_accuracy, precision, recall, f1):
    _accumulator.clear()

N_FOLDS = 10  # matches the ten fold_*_test_score titles in column_titles_cv

# Class names and their encoder do not depend on the metric: build them once.
np.set_printoptions(precision=2)
class_names = np.array(["Basal", "Her2", "LumA", "LumB", "Normal"])
le = LabelEncoder()
le.fit(class_names)


def _save_confusion_matrices(y_true, y_pred, split_name, metric_name):
    """Plot and save the raw and the normalized confusion matrix of one data split."""
    for normalize in (False, True):
        title_prefix = "Normalized confusion matrix" if normalize else "Confusion matrix"
        file_prefix = "confusion_matrix_normalized_" if normalize else "confusion_matrix_"
        plot_confusion_matrix(y_true, y_pred, le, classes=class_names, normalize=normalize,
                              title=title_prefix + " on " + split_name + " set- from best " + metric_name)
        # Figures are stored with the other results rather than in the filesystem root "/".
        plt.savefig(results_path + file_prefix + split_name + "-" + metric_name + ".png")


for index, score in enumerate(scores):
    # -------RESULTS in CROSS_VALIDATION-----------
    print("\n\n# Tuning hyper-parameters for %s" % score)

    # Fit and hyperparameter search. random_state fixes the data shuffling of
    # the 'saga' solver so that repeated runs give the same scores.
    clf = GridSearchCV(LogisticRegression(random_state=0), tuned_parameters, scoring=score, cv=N_FOLDS)
    clf.fit(X_train, Y_train)

    print("Best parameters set found on development set: ", clf.best_params_)
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    candidates = clf.cv_results_['params']

    for mean, std, params in zip(means, stds, candidates):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

    # saving CV_test_scores: one row per title in column_titles_cv and one column
    # per candidate. Plain equal-length lists replace the former ragged np.array
    # (a string next to arrays), which recent NumPy versions refuse to build.
    fold_scores = [list(clf.cv_results_["split%d_test_score" % fold]) for fold in range(N_FOLDS)]
    cv_results = (
        [[metrics[index]] * len(candidates),
         [candidate.get("C") for candidate in candidates],
         [candidate.get("l1_ratio") for candidate in candidates],
         list(means),
         list(stds)]
        + fold_scores
        + [list(clf.cv_results_["mean_fit_time"])]
    )

    # The titles are passed on their own: prefixing them with results_path
    # (str + list) raised a TypeError before anything was written.
    write_results(column_titles_cv, cv_results, cv_results_file_name + fs + "-" + metrics[index])
    print("printed")

    #--------TEST results-------
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = Y_test, clf.predict(X_test)
    # The report announced above was never printed.
    print(classification_report(y_true, y_pred))
    print()

    # bestCV_model results appending:
    C.append(clf.best_params_["C"])
    l1_ratio.append(clf.best_params_["l1_ratio"])
    cv_best.append(clf.best_score_)

    score_test_balanced_accuracy.append(round(balanced_accuracy_score(y_true, y_pred), 3))
    score_test_accuracy.append(round(accuracy_score(y_true, y_pred), 3))
    precision.append(round(precision_score(y_true, y_pred, average="macro"), 3))
    recall.append(round(recall_score(y_true, y_pred, average="macro"), 3))
    f1.append(round(f1_score(y_true, y_pred, average="macro"), 3))

    ## CONFUSION_MATRIX on the training and on the testing set
    y_pred_train = clf.predict(X_train)
    _save_confusion_matrices(Y_train, y_pred_train, "training", metrics[index])
    _save_confusion_matrices(Y_test, y_pred, "testing", metrics[index])

    # Training predictions: sample id, predicted subtype and original subtype.
    pretraining = {'sample id' : train['sample_id'], 'predicted subtype' : y_pred_train, 'original subtypes' :Y_train}
    pretraining_dataframe = pd.DataFrame(pretraining)
    pretrainingcsv = pretraining_dataframe.to_csv('Predictions on training- from best' + metrics[index] + fs +'.csv')

    # Testing predictions: sample id, predicted subtype and original subtype.
    pretesting = {'sample id' : test['sample_id'], 'predicted subtype' : y_pred, 'original subtypes' :Y_test}
    pretesting_dataframe = pd.DataFrame(pretesting)
    pretestingcsv = pretesting_dataframe.to_csv('Predictions on testing- from best' + metrics[index] + fs +'.csv')

In [None]:
# One row per entry of column_titles_test, one value per tuned metric.
test_rows = [metrics, C, l1_ratio, cv_best, score_test_accuracy, precision, recall, score_test_balanced_accuracy]

# All rows are expected to have the same length. They differ when the tuning
# cell appended to the accumulators more than once; report it instead of
# relying on NumPy's ragged-array handling.
row_lengths = sorted({len(row) for row in test_rows})
if len(row_lengths) > 1:
    print("WARNING: result lists have different lengths %s; "
          "re-run the configuration and tuning cells before trusting this file." % row_lengths)

# dtype=object keeps the scores numeric (mixing them with the metric names
# coerced every value to a string) and avoids the implicit ragged-array
# construction that recent NumPy versions reject.
test_results = np.array(test_rows, dtype=object)
write_results(column_titles_test, test_results,test_results_file_name )
print("saved ")

saved
saved 


  """Entry point for launching an IPython kernel.


In [None]:
np.set_printoptions(precision=2)
class_names = np.array(["Basal", "Her2", "LumA", "LumB", "Normal"])

le = LabelEncoder()
le.fit(class_names)

# `clf` is the search fitted last by the tuning loop. Label the figures with the
# metric that search was scored with, instead of always using metrics[0].
fitted_metric = metrics[scores.index(clf.scoring)] if clf.scoring in scores else metrics[0]

y_pred_train=clf.predict(X_train)
# on train set
# Plot non-normalized confusion matrix
plot_confusion_matrix(Y_train, y_pred_train, le, classes=class_names,
                          title='Confusion matrix - from best' + fitted_metric)
# Figures are stored with the other results rather than in the filesystem root "/".
plt.savefig(results_path + "confusion_matrix-" + fitted_metric + ".png")
# Plot normalized confusion matrix
plot_confusion_matrix(Y_train, y_pred_train, le, classes=class_names, normalize=True,
                          title='Normalized confusion matrix- from best' + fitted_metric)
plt.savefig(results_path + "confusion_matrix_normalized-" + fitted_metric + ".png")

In [None]:
# Training predictions: one row per training sample with its id, the predicted
# subtype and the original subtype, saved as 'Predictions on training- from best.csv'.
pretraining = dict([
    ('sample id', train['sample_id']),
    ('predicted subtype', y_pred_train),
    ('original subtypes', Y_train),
])
pretraining_dataframe = pd.DataFrame(data=pretraining)
pretrainingcsv = pretraining_dataframe.to_csv('Predictions on training- from best.csv')

# Testing predictions: same three columns for the testing samples, saved as
# 'Predictions on testing- from best.csv'.
pretesting = dict([
    ('sample id', test['sample_id']),
    ('predicted subtype', y_pred),
    ('original subtypes', Y_test),
])
pretesting_dataframe = pd.DataFrame(data=pretesting)
pretestingcsv = pretesting_dataframe.to_csv('Predictions on testing- from best.csv')

print(pretesting_dataframe, pretraining_dataframe)

        sample id predicted subtype original subtypes
0    TCGA-3C-AALJ              LumB              LumB
1    TCGA-5T-A9QA              LumA              LumB
2    TCGA-A1-A0SF              LumA              LumA
3    TCGA-A1-A0SJ              LumA              LumA
4    TCGA-A1-A0SK             Basal             Basal
..            ...               ...               ...
132  TCGA-BH-A204              LumB              LumB
133  TCGA-BH-A208              LumA            Normal
134  TCGA-BH-A209              LumB              LumB
135  TCGA-BH-A42T              LumB              LumB
136  TCGA-C8-A3M7              LumA              LumA

[137 rows x 3 columns]         sample id predicted subtype original subtypes
0    TCGA-3C-AAAU              LumA              LumA
1    TCGA-3C-AALI              Her2              Her2
2    TCGA-3C-AALK              LumA              LumA
3    TCGA-4H-AAAK              LumA              LumA
4    TCGA-5L-AAT0              LumA              LumA
..  

In [None]:
# Single train/test evaluation of a logistic regression with fixed hyper-parameters.
# NOTE(review): C=0.001 and max_iter=1000 differ from the values in `tuned_parameters`
# (C=0.1, max_iter=2000) — confirm they are the intended "tuned" settings.
logreg_settings = dict(random_state=0, C=0.001, l1_ratio=0.1, multi_class='ovr',
                       penalty='elasticnet', solver='saga', max_iter=1000)
LogReg_trained = LogisticRegression(**logreg_settings)
LogReg_trained.fit(X_train, Y_train)

y_pred = LogReg_trained.predict(X_test)

# Overall scores first, then the macro-averaged per-class scores.
print("Balanced accuracy: ", round(balanced_accuracy_score(Y_test, y_pred), 3))
print("Accuracy: ", round(accuracy_score(Y_test, y_pred), 3))
for label, metric_fn in (("Precision: ", precision_score),
                         ("Recall: ", recall_score),
                         ("F1 Score: ", f1_score)):
    print(label, round(metric_fn(Y_test, y_pred, average="macro"), 3))

Balanced accuracy:  0.628
Accuracy:  0.81
Precision:  0.795
Recall:  0.628
F1 Score:  0.65


In [None]:
# Fitted coefficients, with the feature names of X_train as column names.
# NOTE(review): the file names say "limma" while this notebook loads
# feature_space_files[4] ("FEATURE_SPACE7(pamsimilarity)") — confirm the naming.
coefficients = pd.DataFrame(LogReg_trained.coef_, columns=X_train.columns)
coefficients.to_csv(results_path + "coef_limma_similairty.csv")

# Odds ratios = exp(coefficient). They get their own file: writing them to the
# coefficient file name, as before, overwrote the coefficients saved just above.
odds = np.exp(LogReg_trained.coef_)
b = pd.DataFrame(odds, columns=X_train.columns)
b.to_csv(results_path + "odds_ratios_limma_similairty.csv")