/
DNM-Finder_code.R
316 lines (202 loc) · 9.98 KB
/
DNM-Finder_code.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
#### De Novo Variant Analysis Pipeline generated by Zeynep Hande Coban Akdemir and Tomasz Gambin
# DNM-Finder_code.R
#
#
# Created by zeynep hande coban akdemir on 10/18/16.
#
# for libraries
library(data.table)
library (stringr)
library(gdata)
# output directory
mainDir='/mnt/bigData/cmg/DENOVO_VARIANT_ANALYSIS'
# the directory where we keep all filtered vcf files
xlsDir='/mnt/bigData/cmg/DENOVO_VARIANT_ANALYSIS/ALL_LL_XLS'
# bam directory that we keep all proband and parent bam and vcf files. In the INPUT directory, we have a folder for each sample. The folder is named with each sample's flowcell ID. In that folder, we keep each sample's bam and vcf files
PR_PATH= PAR_PATH= '/mnt/bigData/cmg/INPUT'
### to read parental vcf files and generate the key of each variants that consists of variants' chrom number, position,reference and alternative allele information
readParentData <- function(sampleName)
{
vcfDir=paste(PAR_PATH,'/',sampleName,sep='')
vcffiles=list.files(vcfDir)
files <- vcffiles[grep(paste(sampleName),vcffiles)]
snp <- files[grep('SNP',files)]
indel <- files[grep('INDEL',files)]
file.copy(paste(vcfDir, '/', snp,sep=""),mainDir)
file.copy(paste(vcfDir, '/', indel,sep=""),mainDir)
snp <- paste(mainDir, '/', snp,sep="")
indel<- paste(mainDir,'/', indel,sep="")
system(paste('bunzip2', paste(snp, sep=""), sep=" "))
snp <- strsplit(snp,'.bz2')[[1]][1]
system(paste('bunzip2', paste(indel, sep=""), sep=" "))
indel<- strsplit(indel,'.bz2')[[1]][1]
system (paste( "grep -v '##'", paste(snp), ">", "temp.vcf", sep=" "))
system (paste( "grep -v '##'", paste(indel), ">", "temp.indel", sep=" "))
snp <- fread("temp.vcf")
indel <- fread("temp.indel")
vars <- rbind(snp, indel)
vars$key <- paste(vars$'#CHROM',":", vars$POS, "_", vars$REF, ">",vars$ALT, sep="")
vars
}
### to read proband SNP Annotated vcf files and generate the key of each variants that consists of variants' chrom number, position,reference and alternative allele information
readProbandData <- function(dd)
{
fileSNPp1 <- system(paste("find", paste(dd),"-name *SNPs_Annotated*",sep=' '),intern=TRUE)
system (paste( "grep -v '##'", paste(fileSNPp1), ">", "temp.vcf", sep=" "))
snp <- fread("temp.vcf",header=TRUE)
vars <- snp
vars$key <- paste(vars$'#CHROM',":", vars$POS, "_", vars$REF, ">",vars$ALT, sep="")
vars
}
### to retrieve the pileup information at the indicated variant information
checkSide2 <- function(fl, rr)
{
outFile <- paste( fl,".pileup.pos_",rr$POS, sep="")
lines <- system (paste("samtools mpileup '",fl,"' -r ",rr$CHROM, ":", rr$POS, "-",rr$POS , sep=""), intern=T)
tt <- strsplit(lines[1], "\t")[[1]]
pil <- tt[5]
kk <- strsplit(pil,"")[[1]]
aa <- length(which(tolower(kk)== "a"))
cc <- length(which(tolower(kk)== "c"))
tt <- length(which(tolower(kk)== "t"))
gg <- length(which(tolower(kk)== "g"))
dels <- length(which(tolower(kk)== "-"))
dups <- length(which(tolower(kk)== "+"))
res <- c(aa, cc,tt,gg, dels, dups)
names(res) <- c("A", "C", "T", "G", "dels","dups")
res
}
### retrieve the pileup information for all of the variants in the data object. This function utilizes the above checkSide2 function.
addPileups <- function(prBam, p1Bam, p2Bam, data)
{
library(parallel)
pileups <- mclapply(1:nrow(data), function(i){
rr <- data[i,]
if (rr$potDN == FALSE){
tmp <- rep(NA,3*8)
names(tmp)<- do.call(c, lapply(c("pr","p1","p2"), function(x){t(paste(x,c("_REF","_ALT","_A","_C","_T","_G","_dels","_dups"),sep=""))}))
return(tmp);
}
ref <- rr$REF
alt <- rr$ALT
if (nchar(rr$REF)>1)
{
ref <- alt
alt <- "dels"
}else if (nchar(rr$ALT)>1)
{
alt <- "dups"
}
tmp <-do.call(cbind, lapply(1:3, function(j)
{
tryCatch({
bam <- prBam
if (j ==2){bam <- p1Bam}
if (j == 3){bam<- p2Bam}
pileup <- checkSide2(bam, rr)
pRef <- pileup[ref]
pAlt <- pileup[alt]
return(t(c(pRef,pAlt, pileup)))
},
error=function(e){
print(e)
return(c(NA,NA))
})
}))
colnames(tmp)<- do.call(c, lapply(c("pr","p1","p2"), function(x){t(paste(x,c("_REF","_ALT","_A","_C","_T","_G","_dels","_dups"),sep=""))}))
return(tmp[])
}, mc.cores=6)
return(cbind(data,do.call(rbind, pileups)))
}
count=0
### list all the files in PARENTAL PATH
parent_files=list.files(PAR_PATH)
### list all the files in PROBAND PATH
pr_files=list.files(PR_PATH)
#### read the file that we keep all the information regarding proband and parent sample IDs and flowcell IDs.
trios <- read.csv('/mnt/bigData/cmg/trios.csv')
selected <- c(1:nrow(trios))
for ( i in selected)
{
PR_ID=sampleName=key=trios$PR_ID[i]
P1_ID=trios$P1_ID[i]
P2_ID=trios$P2_ID[i]
print (paste(key,'processing',sep=' '))
PR_FID <- trios$PR_FID[i]
P1_FID <- trios$P1_FID[i]
P2_FID <- trios$P2_FID[i]
#### check if we have folders existing for proband, parent1 and parent2 in INPUT folder
if(length(P1_FID)>0 && length(P2_FID)>0 && length(PR_FID)>0 && P1_FID!='NA' && P2_FID!='NA' && PR_FID %in% pr_files && P1_FID %in% parent_files && P2_FID %in% parent_files)
{
count=count+1
print(count)
# the path for proband files
prPATH=paste(PR_PATH,'/',PR_FID,sep='')
# the path for parent1 files
p1PATH=paste(PAR_PATH,'/',P1_FID,sep='')
# the path for parent2 files
p2PATH=paste(PAR_PATH,'/',P2_FID,sep='')
### find the bam file for proband
prBam=system(paste("find", paste(prPATH),"-name *.bam",sep=' '),intern=TRUE)
print(prBam)
### find the bam file for parent1
p1Bam=system(paste("find", paste(p1PATH),"-name *.bam",sep=' '),intern=TRUE)
priority <- c("realigned.recal.bam", "realigned.bam", "marked.bam","recal.bam", ".bam")
if (length(p1Bam)>0)
{
for (p in priority){
bams <- p1Bam[grep(paste(p,"$",sep=""), p1Bam)]
if (length(bams)>0){
bam1 <- bams[length(bams)]
p1Bam <- bam1
break;
}
}
}
print (p1Bam)
### find the bam file for parent2
p2Bam=system(paste("find", paste(p2PATH),"-name *.bam",sep=' '),intern=TRUE)
if (length(p2Bam)>0)
{
for (p in priority){
bams <- p2Bam[grep(paste(p,"$",sep=""), p2Bam)]
if (length(bams)>0){
bam2 <- bams[length(bams)]
p2Bam <- bam2
break;
}
}
}
print (p2Bam)
### to check if the bam files exist for both proband,parent1 and parent2
if (length(prBam) >0 & length(p1Bam) >0 & length(p2Bam) >0 )
{
### list all of the filtered vcf files in xlsDir
csvfiles=list.files(xlsDir,'*.csv')
### find the specific filtered vcf file that has its names starting with PR_ID
probandCsv <- csvfiles[grep(paste('^',PR_ID,'_ll.csv', sep=''),csvfiles)]
if(length(probandCsv)!=0)
{
### read the proband filtered vcf file
probandCsv <- paste(xlsDir,'/',probandCsv,sep="")
proband <- read.csv(probandCsv, stringsAsFactors=F)
# read parent data
p1d <- readParentData(P1_FID)
p2d <- readParentData(P2_FID)
proband$key <- paste(proband$CHROM, ":",proband$POS, "_",proband$REF, ">", proband$ALT,sep="")
# subtract the variants in parents from the variants in proband and identify the proband specific variants
proband$potDN <- proband$key %in% setdiff(proband$key, union(p1d$key, p2d$key))
### retrieve the pileup information for all of variants in the proband
system.time(pileups <- addPileups(prBam,p1Bam,p2Bam, proband))
dn <- pileups
### stringent criteria for identification of de novo variants in the proband
dn$likelyDN <- pileups$"p1_ALT" == 0 & pileups$"p2_ALT" ==0 & pileups$"pr_ALT" > 5 & pileups$"p1_REF">10 & pileups$"p2_REF">10 & pileups$"pr_ALT"/(pileups$"pr_REF" + 0.00001)>0.2
### loose criteria for identification of de novo variants in the proband
dn$likelyDN.Loose <- pileups$"p1_ALT"/(pileups$"p1_REF" + 0.00001)<=0.05 & pileups$"p2_ALT"/(pileups$"p2_REF" + 0.00001)<=0.05 & pileups$"pr_ALT" > 5 & pileups$"p1_REF">10 & pileups$"p2_REF">10 & pileups$"pr_ALT"/(pileups$"pr_REF" + 0.00001)>0.2
#### write the all of the proband variants with the annotation of de novo variants in the last columns named likelyDN and likelyDN.Loose
print (paste(key,'file writing',sep=' '))
write.csv(dn,paste(mainDir,'/', key,'_denovovariantresults.csv',sep=''))
}
}
}
}