VALID_Preprocessing_cervix_ADENO_SQUAMOUS.Rmd

---
title: "Creación de la matriz limpia (clases: ADENO, SQUAMOUS)"
author: "Lucía Almorox Antón"
date: "2023-01-24"
output: html_document
---

This code was created for the study "Uterine Cervix and Corpus Cancers Characterization through Gene Expression Analysis Using the Knowseq Tool".

Department of Computer Engineering, Automatics and Robotics,University of Granada. C.I.T.I.C., Periodista Rafael Gómez Montero, 2, 18014. Granada, Spain.

luciaalmorox@correo.ugr.es


```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
require(KnowSeq)
rm(list = ls())
memory.limit(size=10000)
set.seed(111)
```

Creating count files:

```{r}
samples <- read.table(file = 'CESC_CGCI_samplesheet.tsv', sep = '\t', header = TRUE)
convert_tsv_to_counts <- function(file) {
  # Read the tsv file into a data frame
  df <- read.table(file = file, sep = '\t', header = TRUE)
  # Select columns 1 and 4 from the data frame
  df <- df[, c(1, 4)]
  # Take only rows from 5th to the last
  df <- df[5:nrow(df), ]
  # Save the data frame in csv format, but with counts extension
  write.table(df, file = paste0(file, ".counts"), sep = "\t", row.names = FALSE,
              col.names =FALSE,quote = FALSE)
}
# Iterate over each directory, according to the samples sheet.
n <- nrow(samples)
for (i in 1:n) {
  # Change to the directory
  print(samples$File.ID[i])
  setwd(samples$File.ID[i])
  # Apply the convert_tsv_to_csv function to each tsv file
  convert_tsv_to_counts(samples$File.Name[i])
  # Go back to the previous directory
  setwd("..")
}



```

Creating the classes of interest.

Samples with the value 'Solid Tissue Normal' correspond to healthy samples.

```{r}
samples <- read.table(file = 'CESC_CGCI_samplesheet.tsv', sep = '\t', header = TRUE)

```

```{r}
library(readr)
clinical <- read_delim(file = 'CESC_CGCI_clinical.tsv')
clinical <- clinical %>% dplyr::select(case_submitter_id,primary_diagnosis)
clinical <- unique(clinical)
clinical
```

We observe the number of ADENO and SQUAMOUS samples in each project.

CGCI samples.


```{r}
library("dplyr")
CGCI <- samples %>% filter(Project.ID == 'CGCI-HTMCP-CC') 

nrow(CGCI)

Class <- CGCI$Sample.Type
table(Class)
Class[Class=='Solid Tissue Normal'] <- 'HEALTHY'

for (i in 1:nrow(CGCI)){
  if (Class[i] != 'HEALTHY'){
    CASE_ID_i <- CGCI$Case.ID[i]
    clinical_i <- clinical[clinical$case_submitter_id==CASE_ID_i,]
    ENF <- clinical_i$primary_diagnosis[1]
    Class[i] <- ENF
  }
}

table(Class)

Class[Class=='Adenocarcinoma, endocervical type'|Class=='Adenocarcinoma, NOS'|Class=='Endometrioid adenocarcinoma, NOS' | Class == 'Mucinous adenocarcinoma, endocervical type'] <- 'ADENO'
Class[Class=='Adenosquamous carcinoma' | Class=='Warty carcinoma' | Class=='Tumor, NOS' | Class=='Not Reported' | Class == 'Lymphoepithelial carcinoma'] <- 'DELETE'
Class[Class != 'DELETE' & Class != 'ADENO' & Class !='HEALTHY'] <- 'SQUAMOUS'

table(Class)



```
```{r}

Run <- paste(CGCI$File.Name,".counts",sep = "")
Path <- paste("./",CGCI$File.ID,sep = "")

data.info_CGCI <- data.frame(Run = Run, Path = Path, Class = Class)
data.info_CGCI <- data.info_CGCI[!(data.info_CGCI$Class=='HEALTHY'|data.info_CGCI$Class=='DELETE'),]
write.csv(file = "data_info_CGCI.csv", x = data.info_CGCI)
```


CESC samples. 

```{r}
CESC <- samples %>% filter(Project.ID == 'TCGA-CESC') 
# We ensure that we only use samples from CESC that were present in the previous download.
samples_PREV  <- read.table(file = 'samplesheet.tsv', sep = '\t', header = TRUE, check.names = F)
samples_PREV_CESC <- samples_PREV %>% dplyr::filter(`Project ID`=='TCGA-CESC')
CESC <- CESC[CESC$File.ID %in% samples_PREV_CESC$`File ID`,]

nrow(CESC)
```


```{r}

Class <- CESC$Sample.Type
table(Class)
Class[Class=='Solid Tissue Normal'] <- 'HEALTHY'

for (i in 1:nrow(CESC)){
  if (Class[i] != 'HEALTHY'){
    CASE_ID_i <- CESC$Case.ID[i]
    clinical_i <- clinical[clinical$case_submitter_id==CASE_ID_i,]
    ENF <- clinical_i$primary_diagnosis[1]
    Class[i] <- ENF
  }
}

table(Class)

Class[Class=='Adenocarcinoma, endocervical type'|Class=='Adenocarcinoma, NOS'|Class=='Endometrioid adenocarcinoma, NOS' | Class == 'Mucinous adenocarcinoma, endocervical type'] <- 'ADENO'
Class[Class=='Adenosquamous carcinoma' | Class=='Warty carcinoma' | Class=='Tumor, NOS' | Class=='Not Reported' | Class == 'Lymphoepithelial carcinoma'] <- 'DELETE'
Class[Class != 'DELETE' & Class != 'ADENO' & Class !='HEALTHY'] <- 'SQUAMOUS'

table(Class)


```

```{r}

Run <- paste(CESC$File.Name,".counts",sep = "")
Path <- paste("./",CESC$File.ID,sep = "")

data.info_CESC <- data.frame(Run = Run, Path = Path, Class = Class)

```

We remove the samples whose class is not of interest and save the object as a csv file.

```{r}
data.info_CESC <- data.info_CESC[!(data.info_CESC$Class=='HEALTHY'|data.info_CESC$Class=='DELETE'),]

table(data.info_CESC$Class)
#write.csv(file = "data_infoCESC.csv", x = data.info_CESC)
```
Undersampling of the SQUAMOUS class (only in CESC PROJECT):

```{r}
set.seed(22)
adeno <- data.info_CESC[data.info_CESC$Class=='ADENO',]
squamous <- data.info_CESC[data.info_CESC$Class=='SQUAMOUS',]

index_adeno <- sample(seq(1:nrow(adeno)),48)
index_squamous <- sample(seq(1:nrow(squamous)),150)

adeno64 <- adeno[index_adeno,]
squamous64 <- squamous[index_squamous,]

min_data_info_CESC <- rbind(adeno64,squamous64)
nrow(min_data_info_CESC)
head(min_data_info_CESC)
write.csv(file = "min_data_info_CESC.csv", x = min_data_info_CESC)
```

CGCI and CESC samples together (after undersampling of CESC samples).

```{r}
TOT_INFO <- rbind(data.info_CGCI,min_data_info_CESC)
write.csv(file = "TOT_INFO_unders_only_cesc.csv", x = TOT_INFO)
```

We carry out the necessary preprocessing to obtain the corrected expression matrix of each gene in each sample. We save this matrix and the corresponding labels for the samples that passed the quality filters. These objects will be used in the file "VALID_Classification_cervix_ADENO_SQUAMOUS.Rmd".

```{r}

# Load and merge the count files (genes are in rows and samples are in columns).
countsInfo <- countsToMatrix("TOT_INFO_unders_only_cesc.csv", extension = "")

# Export both the data matrix and the labels to new variables
countsMatrix <- countsInfo$countsMatrix
labels <- countsInfo$labels

# Get the Gene Symbols and GC content for each gene
# (if the GC value is too high it is indicative that the sequencing did not go well, 
# but we will not pay attention to it in this case).
myAnnotation <- getGenesAnnotation(rownames(countsMatrix))

# Calculate the expression values using the count matrix and the previously acquired annotation
geneExprMatrix <- calculateGeneExpressionValues(countsMatrix, annotation = myAnnotation)
# A first normalization is already being done, although it is not the complete normalization. 
# (the count values are not "raw" anymore).
save(geneExprMatrix,file="geneEMAT.RData")


# Keep only the genes that have a name, that is, those that are known.
geneExprMatrix <- geneExprMatrix[!is.na(rownames(geneExprMatrix)),]
# Perform the RNAseq quality analysis: remove those samples 
# that may have worse quality or whose expression distribution deviates 
# too much from the distribution of the rest of samples (outliers).
# If any samples have been removed, the number of columns will be reduced.
QAResults <- RNAseqQA(geneExprMatrix, toRemoval = TRUE, toPNG=FALSE, toPDF=FALSE)

qualityMatrix <- QAResults$matrix

# Update the Labels object to keep only those that correspond to the samples that 
# remain after removing outliers.
qualityLabels <- labels[-which(colnames(geneExprMatrix) %in% QAResults$outliers)]

table(qualityLabels) # We see the number of samples removed from each class.

# We create the SVA model of surrogate variables to address the batch effect.
  # We observe the plot before removing the batch effect.
dataPlot(qualityMatrix, qualityLabels, mode = "orderedBoxplot")
batchMatrix <- batchEffectRemoval(qualityMatrix, qualityLabels, method = "sva")
 # We observe the plot after removing the batch effect.
#dataPlot(batchMatrix, qualityLabels, mode = "orderedBoxplot")
save(batchMatrix,file="batchMatrix_ADENO_SQUAMOUS.RData")
save(qualityLabels,file="quality_labels_ADENO_SQUAMOUS.RData")

```
Check the final number of CESC samples in the quality matrix.

```{r}
load("batchMatrix_ADENO_SQUAMOUS.RData")
table(colnames(batchMatrix)%in% paste0(CESC$File.Name,'.counts'))
```