-
Notifications
You must be signed in to change notification settings - Fork 0
/
VALID_Preprocessing_cervix_ADENO_SQUAMOUS.Rmd
254 lines (183 loc) · 8.46 KB
/
VALID_Preprocessing_cervix_ADENO_SQUAMOUS.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
---
title: "Creación de la matriz limpia (clases: ADENO, SQUAMOUS)"
author: "Lucía Almorox Antón"
date: "2023-01-24"
output: html_document
---
This code was created for the study "Uterine Cervix and Corpus Cancers Characterization through Gene Expression Analysis Using the Knowseq Tool".
Department of Computer Engineering, Automatics and Robotics,University of Granada. C.I.T.I.C., Periodista Rafael Gómez Montero, 2, 18014. Granada, Spain.
luciaalmorox@correo.ugr.es
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
require(KnowSeq)
rm(list = ls())
memory.limit(size=10000)
set.seed(111)
```
Creating count files:
```{r}
samples <- read.table(file = 'CESC_CGCI_samplesheet.tsv', sep = '\t', header = TRUE)
convert_tsv_to_counts <- function(file) {
# Read the tsv file into a data frame
df <- read.table(file = file, sep = '\t', header = TRUE)
# Select columns 1 and 4 from the data frame
df <- df[, c(1, 4)]
# Take only rows from 5th to the last
df <- df[5:nrow(df), ]
# Save the data frame in csv format, but with counts extension
write.table(df, file = paste0(file, ".counts"), sep = "\t", row.names = FALSE,
col.names =FALSE,quote = FALSE)
}
# Iterate over each directory, according to the samples sheet.
n <- nrow(samples)
for (i in 1:n) {
# Change to the directory
print(samples$File.ID[i])
setwd(samples$File.ID[i])
# Apply the convert_tsv_to_csv function to each tsv file
convert_tsv_to_counts(samples$File.Name[i])
# Go back to the previous directory
setwd("..")
}
```
Creating the classes of interest.
Samples with the value 'Solid Tissue Normal' correspond to healthy samples.
```{r}
samples <- read.table(file = 'CESC_CGCI_samplesheet.tsv', sep = '\t', header = TRUE)
```
```{r}
library(readr)
clinical <- read_delim(file = 'CESC_CGCI_clinical.tsv')
clinical <- clinical %>% dplyr::select(case_submitter_id,primary_diagnosis)
clinical <- unique(clinical)
clinical
```
We observe the number of ADENO and SQUAMOUS samples in each project.
CGCI samples.
```{r}
library("dplyr")
CGCI <- samples %>% filter(Project.ID == 'CGCI-HTMCP-CC')
nrow(CGCI)
Class <- CGCI$Sample.Type
table(Class)
Class[Class=='Solid Tissue Normal'] <- 'HEALTHY'
for (i in 1:nrow(CGCI)){
if (Class[i] != 'HEALTHY'){
CASE_ID_i <- CGCI$Case.ID[i]
clinical_i <- clinical[clinical$case_submitter_id==CASE_ID_i,]
ENF <- clinical_i$primary_diagnosis[1]
Class[i] <- ENF
}
}
table(Class)
Class[Class=='Adenocarcinoma, endocervical type'|Class=='Adenocarcinoma, NOS'|Class=='Endometrioid adenocarcinoma, NOS' | Class == 'Mucinous adenocarcinoma, endocervical type'] <- 'ADENO'
Class[Class=='Adenosquamous carcinoma' | Class=='Warty carcinoma' | Class=='Tumor, NOS' | Class=='Not Reported' | Class == 'Lymphoepithelial carcinoma'] <- 'DELETE'
Class[Class != 'DELETE' & Class != 'ADENO' & Class !='HEALTHY'] <- 'SQUAMOUS'
table(Class)
```
```{r}
Run <- paste(CGCI$File.Name,".counts",sep = "")
Path <- paste("./",CGCI$File.ID,sep = "")
data.info_CGCI <- data.frame(Run = Run, Path = Path, Class = Class)
data.info_CGCI <- data.info_CGCI[!(data.info_CGCI$Class=='HEALTHY'|data.info_CGCI$Class=='DELETE'),]
write.csv(file = "data_info_CGCI.csv", x = data.info_CGCI)
```
CESC samples.
```{r}
CESC <- samples %>% filter(Project.ID == 'TCGA-CESC')
# We ensure that we only use samples from CESC that were present in the previous download.
samples_PREV <- read.table(file = 'samplesheet.tsv', sep = '\t', header = TRUE, check.names = F)
samples_PREV_CESC <- samples_PREV %>% dplyr::filter(`Project ID`=='TCGA-CESC')
CESC <- CESC[CESC$File.ID %in% samples_PREV_CESC$`File ID`,]
nrow(CESC)
```
```{r}
Class <- CESC$Sample.Type
table(Class)
Class[Class=='Solid Tissue Normal'] <- 'HEALTHY'
for (i in 1:nrow(CESC)){
if (Class[i] != 'HEALTHY'){
CASE_ID_i <- CESC$Case.ID[i]
clinical_i <- clinical[clinical$case_submitter_id==CASE_ID_i,]
ENF <- clinical_i$primary_diagnosis[1]
Class[i] <- ENF
}
}
table(Class)
Class[Class=='Adenocarcinoma, endocervical type'|Class=='Adenocarcinoma, NOS'|Class=='Endometrioid adenocarcinoma, NOS' | Class == 'Mucinous adenocarcinoma, endocervical type'] <- 'ADENO'
Class[Class=='Adenosquamous carcinoma' | Class=='Warty carcinoma' | Class=='Tumor, NOS' | Class=='Not Reported' | Class == 'Lymphoepithelial carcinoma'] <- 'DELETE'
Class[Class != 'DELETE' & Class != 'ADENO' & Class !='HEALTHY'] <- 'SQUAMOUS'
table(Class)
```
```{r}
Run <- paste(CESC$File.Name,".counts",sep = "")
Path <- paste("./",CESC$File.ID,sep = "")
data.info_CESC <- data.frame(Run = Run, Path = Path, Class = Class)
```
We remove the samples whose class is not of interest and save the object as a csv file.
```{r}
data.info_CESC <- data.info_CESC[!(data.info_CESC$Class=='HEALTHY'|data.info_CESC$Class=='DELETE'),]
table(data.info_CESC$Class)
#write.csv(file = "data_infoCESC.csv", x = data.info_CESC)
```
Undersampling of the SQUAMOUS class (only in CESC PROJECT):
```{r}
set.seed(22)
adeno <- data.info_CESC[data.info_CESC$Class=='ADENO',]
squamous <- data.info_CESC[data.info_CESC$Class=='SQUAMOUS',]
index_adeno <- sample(seq(1:nrow(adeno)),48)
index_squamous <- sample(seq(1:nrow(squamous)),150)
adeno64 <- adeno[index_adeno,]
squamous64 <- squamous[index_squamous,]
min_data_info_CESC <- rbind(adeno64,squamous64)
nrow(min_data_info_CESC)
head(min_data_info_CESC)
write.csv(file = "min_data_info_CESC.csv", x = min_data_info_CESC)
```
CGCI and CESC samples together (after undersampling of CESC samples).
```{r}
TOT_INFO <- rbind(data.info_CGCI,min_data_info_CESC)
write.csv(file = "TOT_INFO_unders_only_cesc.csv", x = TOT_INFO)
```
We carry out the necessary preprocessing to obtain the corrected expression matrix of each gene in each sample. We save this matrix and the corresponding labels for the samples that passed the quality filters. These objects will be used in the file "VALID_Classification_cervix_ADENO_SQUAMOUS.Rmd".
```{r}
# Load and merge the count files (genes are in rows and samples are in columns).
countsInfo <- countsToMatrix("TOT_INFO_unders_only_cesc.csv", extension = "")
# Export both the data matrix and the labels to new variables
countsMatrix <- countsInfo$countsMatrix
labels <- countsInfo$labels
# Get the Gene Symbols and GC content for each gene
# (if the GC value is too high it is indicative that the sequencing did not go well,
# but we will not pay attention to it in this case).
myAnnotation <- getGenesAnnotation(rownames(countsMatrix))
# Calculate the expression values using the count matrix and the previously acquired annotation
geneExprMatrix <- calculateGeneExpressionValues(countsMatrix, annotation = myAnnotation)
# A first normalization is already being done, although it is not the complete normalization.
# (the count values are not "raw" anymore).
save(geneExprMatrix,file="geneEMAT.RData")
# Keep only the genes that have a name, that is, those that are known.
geneExprMatrix <- geneExprMatrix[!is.na(rownames(geneExprMatrix)),]
# Perform the RNAseq quality analysis: remove those samples
# that may have worse quality or whose expression distribution deviates
# too much from the distribution of the rest of samples (outliers).
# If any samples have been removed, the number of columns will be reduced.
QAResults <- RNAseqQA(geneExprMatrix, toRemoval = TRUE, toPNG=FALSE, toPDF=FALSE)
qualityMatrix <- QAResults$matrix
# Update the Labels object to keep only those that correspond to the samples that
# remain after removing outliers.
qualityLabels <- labels[-which(colnames(geneExprMatrix) %in% QAResults$outliers)]
table(qualityLabels) # We see the number of samples removed from each class.
# We create the SVA model of surrogate variables to address the batch effect.
# We observe the plot before removing the batch effect.
dataPlot(qualityMatrix, qualityLabels, mode = "orderedBoxplot")
batchMatrix <- batchEffectRemoval(qualityMatrix, qualityLabels, method = "sva")
# We observe the plot after removing the batch effect.
#dataPlot(batchMatrix, qualityLabels, mode = "orderedBoxplot")
save(batchMatrix,file="batchMatrix_ADENO_SQUAMOUS.RData")
save(qualityLabels,file="quality_labels_ADENO_SQUAMOUS.RData")
```
Check the final number of CESC samples in the quality matrix.
```{r}
load("batchMatrix_ADENO_SQUAMOUS.RData")
table(colnames(batchMatrix)%in% paste0(CESC$File.Name,'.counts'))
```