Classification__cervix_ADENO_SQUAMOUS.Rmd


---
title: "Clasificación de muestras cancerígenas de cérvix: ADENO - SQUAMOUS"
author: "Lucía Almorox Antón"
date: "2023-01-24"
output: html_document
---

This code was created for the study "Uterine Cervix and Corpus Cancers Characterization through Gene Expression Analysis Using the Knowseq Tool".

Department of Computer Engineering, Automatics and Robotics, University of Granada. C.I.T.I.C., Periodista Rafael Gómez Montero, 2, 18014. Granada, Spain.

luciaalmorox@correo.ugr.es


```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the document fail later with an obscure message.
library(KnowSeq)
library(caret)
# Start from an empty workspace (only matters when run interactively).
rm(list = ls())
# memory.limit() only has an effect on Windows and is no longer supported from
# R 4.2.0 onwards, so it is called only where it still does something.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.limit(size = 10000)
}
set.seed(111)
```

In this document, we start with the corrected expression matrix and the sample labels that appear on it. These objects were created with the "Preprocessing_cervix_ADENO_SQUAMOUS.Rmd" document.

```{r 1}
# Input objects created with the document: Preprocessing_cervix_ADENO_SQUAMOUS.Rmd
# NOTE(review): the chunks below read `batchMatrix` and `qualityLabels`;
# presumably these are the objects stored in the two files -- confirm.
load("CERVIX_batchMatrix_ADENO_SQUAMOUS.RData")
load("CERVIX_quality_labels_ADENO_SQUAMOUS.RData")
```

```{r}
# Number of samples per class label.
table(qualityLabels)
```
Modification of the source code of the knn_test function to perform joint normalization of the train and test sets:

```{r}
# Evaluate a KNN classifier on a held-out set, adding one variable at a time.
#
# Variant of the knn_test function in which the train and test sets are
# min-max normalised together (each selected variable is rescaled to [-1, 1]
# using the minimum and maximum over train + test rows).
#
# Arguments:
#   train, test     data.frame or matrix, samples in rows, variables in columns.
#   labelsTrain,
#   labelsTest      character vector or factor, one label per row of
#                   train / test (character input is converted to factor).
#   vars_selected   column names to use, in ranking order; model i uses the
#                   first i of them.
#   bestK           number of neighbours passed to knn3().
#
# Returns (invisibly) a list with elements "cfMats", "accVector",
# "sensVector", "specVector", "f1Vector" and "predictions"; each has one entry
# per element of vars_selected. An NA F1 score is stored as 0.
#
# Stops with an error when the inputs have the wrong class, when the number of
# rows and labels disagree, or when vars_selected is empty.
knn_test <- function(train, labelsTrain, test, labelsTest, vars_selected, bestK) {

  if (!is.data.frame(train) && !is.matrix(train)) {
    stop("The train argument must be a dataframe or a matrix.")
  }

  if (dim(train)[1] != length(labelsTrain)) {
    stop("The length of the rows of the argument train must be the same than the length of the lablesTrain. Please, ensures that the rows are the samples and the columns are the variables.")
  }

  if (!is.character(labelsTrain) && !is.factor(labelsTrain)) {
    stop("The class of the labelsTrain parameter must be character vector or factor.")
  }
  if (is.character(labelsTrain)) {
    labelsTrain <- as.factor(labelsTrain)
  }

  if (!is.character(labelsTest) && !is.factor(labelsTest)) {
    stop("The class of the labelsTest parameter must be character vector or factor.")
  }
  if (is.character(labelsTest)) {
    labelsTest <- as.factor(labelsTest)
  }

  if (!is.data.frame(test) && !is.matrix(test)) {
    stop("The test argument must be a dataframe or a matrix.")
  }

  if (dim(test)[1] != length(labelsTest)) {
    stop("The length of the rows of the argument test must be the same than the length of the lablesTest. Please, ensures that the rows are the samples and the columns are the variables.")
  }

  if (length(vars_selected) == 0) {
    stop("The vars_selected argument must contain at least one variable.")
  }

  ntrain <- nrow(train)
  ntest <- nrow(test)

  # drop = FALSE keeps a two-dimensional object when only one variable is
  # selected; without it the subset collapses to a plain vector.
  train <- train[, vars_selected, drop = FALSE]
  test <- test[, vars_selected, drop = FALSE]

  # Stack train on top of test so both are normalised with the same range.
  train_plus_test <- rbind(train, test)
  train_plus_test <- as.data.frame(apply(train_plus_test, 2, as.double))

  # Min-max scaling to [-1, 1]; constant columns are left untouched to avoid
  # dividing by zero.
  train_plus_test <- vapply(train_plus_test, function(x) {
    hi <- max(x)
    lo <- min(x)
    if (hi > lo) {
      ((x - lo) / (hi - lo)) * 2 - 1
    } else {
      x
    }
  }, double(nrow(train_plus_test)))

  train_plus_test <- as.data.frame(train_plus_test)

  # Split back into the original train / test rows.
  train <- train_plus_test[seq_len(ntrain), , drop = FALSE]
  test <- train_plus_test[ntrain + seq_len(ntest), , drop = FALSE]
  rownames(test) <- NULL

  nvars <- ncol(test)
  accVector <- double(nvars)
  sensVector <- double(nvars)
  specVector <- double(nvars)
  f1Vector <- double(nvars)
  cfMatList <- vector("list", nvars)
  predictsVector <- vector("list", nvars)

  twoClasses <- length(levels(labelsTrain)) == 2

  # Model i is trained and tested with the first i selected variables.
  for (i in seq_len(nvars)) {

    cat(paste("Testing with ", i, " variables...\n", sep = ""))
    firstVars <- seq_len(i)
    knn_mod <- knn3(x = train[, firstVars, drop = FALSE], y = labelsTrain, k = bestK)
    predicts <- predict(knn_mod, test[, firstVars, drop = FALSE], type = "class")

    cfMat <- confusionMatrix(predicts, labelsTest)

    # Entries 1, 2 and 7 of byClass are read as sensitivity, specificity and
    # F1. With two classes byClass is a vector; otherwise it is a matrix with
    # one row per class and the per-class values are averaged.
    if (twoClasses) {
      sens <- cfMat$byClass[[1]]
      spec <- cfMat$byClass[[2]]
      f1 <- cfMat$byClass[[7]]
    } else {
      sens <- mean(cfMat$byClass[, 1])
      spec <- mean(cfMat$byClass[, 2])
      f1 <- mean(cfMat$byClass[, 7])
    }

    # An undefined F1 is reported as 0.
    if (is.na(f1)) {
      f1 <- 0
    }

    cfMatList[[i]] <- cfMat
    accVector[i] <- cfMat$overall[[1]]
    sensVector[i] <- sens
    specVector[i] <- spec
    f1Vector[i] <- f1
    predictsVector[[i]] <- predicts
  }

  cat("Classification done successfully!\n")
  names(accVector) <- vars_selected
  names(sensVector) <- vars_selected
  names(specVector) <- vars_selected
  names(f1Vector) <- vars_selected

  results <- list(cfMatList, accVector, sensVector, specVector, f1Vector, predictsVector)
  names(results) <- c("cfMats", "accVector", "sensVector", "specVector", "f1Vector", "predictions")
  invisible(results)

}
```

# 1. 5-fold CV ADENO-SQUAMOUS classification, using the top 10 MRMR ranking genes for each fold.



Next, we use only the "MRMR" method to rank the genes by their importance in predicting the output variable (we only use the top 10 genes of the ranking). In this case, we perform a 5-fold cross-validation. With each fold, we train a KNN classifier (obtaining the mean accuracy, sensitivity and specificity of the training), and we use the validation subset to predict the class of each sample and obtain the accuracy of that validation.

For each fold, we also obtain the four graphs (heatmap, confusion matrix, boxplot, and graphs with quality measures as a function of the number of genes) related to the training.




```{r}
library(class)

set.seed(21)

# Accumulators filled with one row / entry per outer fold.
FOLDS_ACC_TRN <- data.frame()   # training accuracy for 1..10 genes
FOLDS_ACC_TST <- data.frame()   # validation accuracy for 1..10 genes
TABLA_10_GENES <- data.frame()  # top 10 MRMR genes

nDEGS <- c()
expressionMatrixCorrected <- batchMatrix
cv.Index <- createFolds(qualityLabels, 5, returnTrain = TRUE)

for (oneFOLD in 1:5) {
  # Samples are in the columns of the expression matrix.
  train_ind <- cv.Index[[oneFOLD]]
  XTRN <- expressionMatrixCorrected[, train_ind]
  XTEST <- expressionMatrixCorrected[, -train_ind]

  YTRN <- qualityLabels[train_ind]
  YTEST <- qualityLabels[-train_ind]

  # Extract differentially expressed genes (DEGs) from the training samples only.
  DEGsInfo <- DEGsExtraction(XTRN, YTRN, lfc = 2, pvalue = 0.001, cov = 2)

  # Table of statistics of the DEGs and expression matrix filtered to them.
  topTable <- DEGsInfo$DEG_Results$MulticlassLFC
  DEGsMatrix <- DEGsInfo$DEG_Results$DEGs_Matrix
  nDEGS <- c(nDEGS, nrow(DEGsMatrix))

  # The rest of the fold needs a ranking of at least 10 genes; fail with a
  # clear message instead of an out-of-bounds subscript or NA gene names.
  if (is.null(DEGsMatrix) || nrow(DEGsMatrix) < 10) {
    stop("FOLD ", oneFOLD, ": fewer than 10 DEGs were extracted, ",
         "so the top 10 ranking cannot be built.")
  }

  # Boxplots and heatmap of the first DEGs (up to 12).
  firstDEGs <- seq_len(min(12, nrow(DEGsMatrix)))
  dataPlot(DEGsMatrix[firstDEGs, ], YTRN, mode = "genesBoxplot", toPNG = FALSE,
           toPDF = FALSE, main = paste("FOLD", oneFOLD))

  dataPlot(DEGsMatrix[firstDEGs, ], YTRN, mode = "heatmap", toPNG = FALSE,
           toPDF = FALSE, main = paste("FOLD", oneFOLD))

  # Prepare both the matrix and the labels.
  MLMatrix <- t(DEGsMatrix)  # genes in the columns and samples in the rows
  MLLabels <- YTRN

  # Feature selection: MRMR ranking only.
  FSRankingMRMR <- featureSelection(MLMatrix, MLLabels, mode = "mrmr",
                                    vars_selected = colnames(MLMatrix))
  top10Genes <- names(FSRankingMRMR)[1:10]
  top3Genes <- names(FSRankingMRMR)[1:3]
  TABLA_10_GENES <- rbind(TABLA_10_GENES, top10Genes)

  # The result is stored under its own name so that the knn_trn() function is
  # not hidden behind a list of the same name.
  knn_trn_res <- knn_trn(MLMatrix, MLLabels,
                         vars_selected = top10Genes, numFold = 5)

  knn_results <- rbind(knn_trn_res$accuracyInfo$meanAccuracy,
                       knn_trn_res$sensitivityInfo$meanSensitivity,
                       knn_trn_res$specificityInfo$meanSpecificity)

  FOLDS_ACC_TRN <- rbind(FOLDS_ACC_TRN,
                         unname(knn_trn_res$accuracyInfo$meanAccuracy))

  dataPlot(knn_results, MLLabels,
           legend = c("Mean Accuracy", "Mean Sensitivity", "Mean Specificity"),
           mode = "classResults", main = paste("FOLD", oneFOLD),
           xlab = "# Genes", ylab = "Prediction Score")
  # dataPlot(knn_trn_res, MLLabels, mode = "heatmapResults")
  # dataPlot(knn_results[, 1:4], MLLabels, legend = c("Mean Accuracy",
  #   "Mean Sensitivity", "Mean Specificity"), mode = "classResults")

  # Training plots restricted to the top 3 genes of the ranking.
  dataPlot(t(MLMatrix[, top3Genes]), MLLabels, mode = "heatmap",
           main = paste("FOLD", oneFOLD))
  dataPlot(knn_trn_res$cfMats[[3]]$table, MLLabels, mode = "confusionMatrix",
           main = paste("FOLD", oneFOLD))
  dataPlot(t(MLMatrix[, top3Genes]), MLLabels, mode = "genesBoxplot",
           main = paste("FOLD", oneFOLD))

  # TEST: classify the held-out samples of this fold.
  results_test_knn <- knn_test(MLMatrix, MLLabels, t(XTEST), YTEST,
                               top10Genes, bestK = knn_trn_res$bestK)

  FOLDS_ACC_TST <- rbind(FOLDS_ACC_TST, unname(results_test_knn$accVector))

  # Sum the validation confusion matrices obtained with the first 2 genes.
  if (oneFOLD == 1) {
    AllMats <- results_test_knn$cfMats[[2]]$table
  } else {
    AllMats <- AllMats + results_test_knn$cfMats[[2]]$table
  }
}

```


```{r}
# Total number of genes (rows) in the corrected expression matrix.
cat("Number of genes in the corrected expression matrix: ",nrow(expressionMatrixCorrected),'\n')
# nDEGS holds one value per fold, appended inside the cross-validation loop.
cat("Number of extracted DEGs from each fold: ",nDEGS)

```


Training accuracy plot for each fold based on the number of genes used.


```{r}
# One line per fold: training accuracy against the number of genes used.
dataPlot(
  as.matrix(FOLDS_ACC_TRN),
  MLLabels,
  legend = paste("FOLD", 1:5),
  mode = "classResults",
  main = "KNN Accuracy",
  xlab = "# Genes",
  ylab = "Prediction Score"
)

```

Sum of validation confusion matrices using 2 genes.

```{r}
# AllMats is the element-wise sum, over the 5 folds, of the validation
# confusion matrices stored in the cross-validation loop.
dataPlot(AllMats, MLLabels, mode = "confusionMatrix")
```

Table with the top 10 MRMR ranking genes obtained for each fold.

```{r}
# Label the table (one row per fold, one column per ranking position),
# display it and save it to disk.
names(TABLA_10_GENES) <- paste0("Gene", 1:10)
rownames(TABLA_10_GENES) <- paste0("Fold", 1:5)
TABLA_10_GENES
write.csv(TABLA_10_GENES, file = "TABLA10.csv")
```
- Possible gene signature for classification: ICA1L.



Training and validation accuracy plot for each fold based on the number of genes used.

```{r}
library("ggplot2")

num_genes <- 1:10
folds_label <- paste0("Fold", 1:5)

# Label the accuracy tables (rows = folds, columns = number of genes), then
# transpose them so each fold becomes a column next to the gene count.
colnames(FOLDS_ACC_TRN) <- num_genes
rownames(FOLDS_ACC_TRN) <- folds_label
trn_dat <- as.data.frame(cbind(t(FOLDS_ACC_TRN), NGENES = num_genes))

colnames(FOLDS_ACC_TST) <- num_genes
rownames(FOLDS_ACC_TST) <- folds_label
tst_dat <- as.data.frame(cbind(t(FOLDS_ACC_TST), NGENES = num_genes))

# One colour per fold.
fold_colours <- c(
  Fold1 = rgb(9 / 255, 137 / 255, 134 / 255, maxColorValue = 1),
  Fold2 = rgb(255 / 255, 198 / 255, 51 / 255, maxColorValue = 1),
  Fold3 = rgb(253 / 255, 128 / 255, 174 / 255, maxColorValue = 1),
  Fold4 = rgb(51 / 255, 243 / 255, 25 / 255, maxColorValue = 1),
  Fold5 = rgb(14 / 255, 14 / 255, 15 / 255, maxColorValue = 1)
)

# Build one accuracy line per fold for the given data set; the fold name
# drives the colour and the set label ("TRAIN" / "TEST") the line type.
accuracy_lines <- function(dat, set_label) {
  lapply(folds_label, function(fold) {
    geom_line(
      data = dat,
      aes(x = NGENES, y = .data[[fold]], colour = fold, linetype = set_label),
      linewidth = 0.3
    )
  })
}

plot1 <- ggplot(trn_dat, mapping = aes(x = NGENES, y = Fold1)) +
  accuracy_lines(trn_dat, "TRAIN") +
  accuracy_lines(tst_dat, "TEST") +
  theme_bw() +
  scale_y_continuous(
    name = "Accuracy",
    limits = c(0.99, 1.01),
    breaks = round(seq(0.99, 1, by = 0.005), 2)
  ) +
  scale_x_continuous(
    name = "# Genes",
    limits = c(1, 10),
    breaks = seq(1, 20, by = 1)
  ) +
  scale_color_manual(
    name = "Color",
    breaks = folds_label,
    values = fold_colours
  ) +
  scale_linetype_manual(
    name = "Line type",
    breaks = c("TRAIN", "TEST"),
    values = c("TRAIN" = "solid", "TEST" = "longdash")
  ) +
  theme(legend.key.size = unit(1, "cm"))

plot1
# +theme(plot.title = element_text(size=12,face='bold',hjust = 0.5))
```
# 2. 5-fold CV ADENO-SQUAMOUS classification, using the gene signature.

ICA1L as uni-gene signature.

Repetition of the 5-fold cross-validation 20 times (varying the seed).

```{r}
# The signature gene is listed twice, so every model below uses two identical
# columns. NOTE(review): presumably this keeps the column subsets
# two-dimensional for the KNN helpers -- confirm before reducing it to one.
selecc <- c('ICA1L', 'ICA1L')
expressionMatrixCorrected <- batchMatrix

# Repeat the 5-fold cross-validation with 20 different seeds.
for (seed in 1:20) {
  set.seed(seed)

  # Train / validation accuracies of the current repetition (one row per fold).
  FOLDS_ACC_TRN <- data.frame()
  FOLDS_ACC_TST <- data.frame()

  # Train / test split: each fold is held out once.
  cv.Index <- createFolds(qualityLabels, 5, returnTrain = TRUE)

  for (oneFOLD in 1:5) {
    train_ind <- cv.Index[[oneFOLD]]
    # Samples are in the columns, so samples are selected here while all
    # genes (rows) are always kept.
    XTRN <- expressionMatrixCorrected[, train_ind]
    XTEST <- expressionMatrixCorrected[, -train_ind]

    YTRN <- qualityLabels[train_ind]
    YTEST <- qualityLabels[-train_ind]

    # Prepare both the matrix and the labels.
    MLMatrix <- t(XTRN)  # genes in columns and samples in rows
    MLLabels <- YTRN

    # The result is stored under its own name so that the knn_trn() function
    # is not hidden behind a list of the same name.
    knn_trn_res <- knn_trn(MLMatrix, MLLabels,
                           vars_selected = selecc, numFold = 5)

    knn_results <- rbind(knn_trn_res$accuracyInfo$meanAccuracy,
                         knn_trn_res$sensitivityInfo$meanSensitivity,
                         knn_trn_res$specificityInfo$meanSpecificity)

    FOLDS_ACC_TRN <- rbind(FOLDS_ACC_TRN,
                           unname(knn_trn_res$accuracyInfo$meanAccuracy))

    # dataPlot(knn_results, MLLabels, legend = c("Mean Accuracy",
    #   "Mean Sensitivity", "Mean Specificity"), mode = "classResults",
    #   main = paste("FOLD", oneFOLD), xlab = "# Genes",
    #   ylab = "Prediction Score")

    # TEST: classify the held-out samples of this fold.
    results_test_knn <- knn_test(MLMatrix, MLLabels, t(XTEST), YTEST,
                                 selecc, bestK = knn_trn_res$bestK)

    FOLDS_ACC_TST <- rbind(FOLDS_ACC_TST, unname(results_test_knn$accVector))

    # Sum the validation confusion matrices of the first model over the folds.
    if (oneFOLD == 1) {
      AllMats <- results_test_knn$cfMats[[1]]$table
    } else {
      AllMats <- AllMats + results_test_knn$cfMats[[1]]$table
    }
  }

  # One summed confusion matrix per seed.
  dataPlot(AllMats, MLLabels, mode = "confusionMatrix")
}

```


```{r, fig.height=5,fig.width=5}
# Alternative with several candidate genes (disabled):
#dataPlot(expressionMatrixCorrected[c('ARSI',	'ICA1L', 'GPR171',	'THPO',	'AC091563.1'),], qualityLabels, mode = "genesBoxplot")
# Boxplot of ICA1L expression across all samples, by class label.
# NOTE(review): the gene is listed twice, presumably so that the row subset
# stays a matrix instead of collapsing to a vector -- confirm.
dataPlot(expressionMatrixCorrected[c('ICA1L','ICA1L'),], qualityLabels, mode = "genesBoxplot")
```