DNM-Finder_code.R

#### De Novo Variant Analysis Pipeline generated by  Zeynep Hande Coban Akdemir and Tomasz Gambin


#  DNM-Finder_code.R
#
#
#  Created by zeynep hande coban akdemir on 10/18/16.
#

# for libraries

library(data.table)
library (stringr)
library(gdata)


# output directory

mainDir='/mnt/bigData/cmg/DENOVO_VARIANT_ANALYSIS'

# the directory where we keep all filtered vcf files

xlsDir='/mnt/bigData/cmg/DENOVO_VARIANT_ANALYSIS/ALL_LL_XLS'

# bam directory that we keep all proband and parent bam and vcf files. In the INPUT directory, we have a folder for each sample. The folder is named with each sample's flowcell ID. In that folder, we keep each sample's bam and vcf files


PR_PATH= PAR_PATH= '/mnt/bigData/cmg/INPUT'

### to read parental vcf files and generate the key of each variants that consists of variants' chrom number, position,reference and alternative allele information

readParentData <- function(sampleName)
{
    vcfDir=paste(PAR_PATH,'/',sampleName,sep='')
    vcffiles=list.files(vcfDir)
    files <- vcffiles[grep(paste(sampleName),vcffiles)]
    snp <- files[grep('SNP',files)]
    indel <- files[grep('INDEL',files)]
    file.copy(paste(vcfDir, '/', snp,sep=""),mainDir)
    file.copy(paste(vcfDir, '/', indel,sep=""),mainDir)
    
    snp <- paste(mainDir, '/', snp,sep="")
    indel<- paste(mainDir,'/', indel,sep="")
    
    
    system(paste('bunzip2', paste(snp, sep=""), sep=" "))
    snp <- strsplit(snp,'.bz2')[[1]][1]
    system(paste('bunzip2', paste(indel, sep=""), sep=" "))
    indel<- strsplit(indel,'.bz2')[[1]][1]
    
    system (paste( "grep -v '##'", paste(snp), ">", "temp.vcf", sep=" "))
    system (paste( "grep -v '##'", paste(indel), ">", "temp.indel", sep=" "))
    snp <- fread("temp.vcf")
    indel <- fread("temp.indel")
    
    vars <- rbind(snp, indel)
    vars$key <- paste(vars$'#CHROM',":",  vars$POS, "_", vars$REF, ">",vars$ALT, sep="")
    vars
}


### to read proband SNP Annotated vcf files and generate the key of each variants that consists of variants' chrom number, position,reference and alternative allele information


readProbandData <- function(dd)
{
    fileSNPp1 <- system(paste("find", paste(dd),"-name *SNPs_Annotated*",sep=' '),intern=TRUE)
    
    
    system (paste( "grep -v '##'", paste(fileSNPp1), ">", "temp.vcf", sep=" "))
    snp <- fread("temp.vcf",header=TRUE)
    
    vars <- snp
    vars$key <- paste(vars$'#CHROM',":",  vars$POS, "_", vars$REF, ">",vars$ALT, sep="")
    vars
}


### to retrieve the pileup information at the indicated variant information


checkSide2 <- function(fl, rr)
{
    outFile <- paste( fl,".pileup.pos_",rr$POS, sep="")
    
    lines <- system (paste("samtools mpileup '",fl,"' -r ",rr$CHROM, ":", rr$POS, "-",rr$POS , sep=""), intern=T)
    tt <- strsplit(lines[1], "\t")[[1]]
    pil <- tt[5]
    kk <- strsplit(pil,"")[[1]]
    aa <- length(which(tolower(kk)== "a"))
    cc <- length(which(tolower(kk)== "c"))
    tt <- length(which(tolower(kk)== "t"))
    gg <- length(which(tolower(kk)== "g"))
    dels <- length(which(tolower(kk)== "-"))
    dups <- length(which(tolower(kk)== "+"))
    res <- c(aa, cc,tt,gg, dels, dups)
    names(res) <- c("A", "C", "T", "G", "dels","dups")
    res
    
}

### retrieve the pileup information for all of the variants in the data object. This function utilizes the above checkSide2 function.

addPileups <- function(prBam, p1Bam, p2Bam,	data)
{
    library(parallel)
    pileups <- mclapply(1:nrow(data), function(i){
        
        rr <- data[i,]
        
        if (rr$potDN == FALSE){
            tmp <- rep(NA,3*8)
            names(tmp)<- do.call(c, lapply(c("pr","p1","p2"), function(x){t(paste(x,c("_REF","_ALT","_A","_C","_T","_G","_dels","_dups"),sep=""))}))
            return(tmp);
        }
        ref <- rr$REF
        alt <- rr$ALT
        if (nchar(rr$REF)>1)
        {
            ref <- alt
            alt <- "dels"
        }else if (nchar(rr$ALT)>1)
        {
            alt <- "dups"
        }
        tmp <-do.call(cbind, lapply(1:3, function(j)
								{
                                    tryCatch({
                                        bam <- prBam
                                        if (j ==2){bam <- p1Bam}
                                        if (j == 3){bam<- p2Bam}
                                        pileup <- checkSide2(bam, rr)
                                        pRef <- pileup[ref]
                                        pAlt <- pileup[alt]
                                        return(t(c(pRef,pAlt, pileup)))
                                        
                                    },
                                    error=function(e){
                                        print(e)
                                        return(c(NA,NA))
                                    })
                                    
                                }))
                                
                                colnames(tmp)<- do.call(c, lapply(c("pr","p1","p2"), function(x){t(paste(x,c("_REF","_ALT","_A","_C","_T","_G","_dels","_dups"),sep=""))}))
                                return(tmp[])
    }, mc.cores=6)
    return(cbind(data,do.call(rbind, pileups)))
    
    
}


count=0


### list all the files in PARENTAL PATH

parent_files=list.files(PAR_PATH)

### list all the files in PROBAND PATH

pr_files=list.files(PR_PATH)


#### read the file that we keep all the information regarding proband and parent sample IDs and flowcell IDs.

trios <- read.csv('/mnt/bigData/cmg/trios.csv')

selected <- c(1:nrow(trios))
for ( i in selected)
{


    PR_ID=sampleName=key=trios$PR_ID[i]
    P1_ID=trios$P1_ID[i]
    P2_ID=trios$P2_ID[i]
    
    print (paste(key,'processing',sep=' '))


    PR_FID <- trios$PR_FID[i]
    P1_FID <- trios$P1_FID[i]
    P2_FID <- trios$P2_FID[i]


#### check if we have folders existing for proband, parent1 and parent2 in INPUT folder
 if(length(P1_FID)>0 && length(P2_FID)>0 && length(PR_FID)>0 && P1_FID!='NA' && P2_FID!='NA' && PR_FID %in% pr_files && P1_FID %in% parent_files && P2_FID %in% parent_files)
 {
    

    count=count+1
    print(count)

    # the path for proband files
    prPATH=paste(PR_PATH,'/',PR_FID,sep='')
    
    # the path for parent1 files

    p1PATH=paste(PAR_PATH,'/',P1_FID,sep='')
    
    # the path for parent2 files

    p2PATH=paste(PAR_PATH,'/',P2_FID,sep='')
    
    ### find the bam file for proband
    prBam=system(paste("find", paste(prPATH),"-name *.bam",sep=' '),intern=TRUE)

    print(prBam)
    
    ### find the bam file for parent1

    p1Bam=system(paste("find", paste(p1PATH),"-name *.bam",sep=' '),intern=TRUE)
    priority <- c("realigned.recal.bam", "realigned.bam", "marked.bam","recal.bam", ".bam")
    if (length(p1Bam)>0)
    {
        for (p in priority){
            bams <- p1Bam[grep(paste(p,"$",sep=""), p1Bam)]
            if (length(bams)>0){
                bam1 <- bams[length(bams)]
                p1Bam <- bam1
                break;
            }
        }
    }

    print (p1Bam)
    
    ### find the bam file for parent2

    p2Bam=system(paste("find", paste(p2PATH),"-name *.bam",sep=' '),intern=TRUE)
    if (length(p2Bam)>0)
    {
        for (p in priority){
            bams <- p2Bam[grep(paste(p,"$",sep=""), p2Bam)]
            if (length(bams)>0){
                bam2 <- bams[length(bams)]
                p2Bam <- bam2
                
                break;
            }
        }
    }


    print (p2Bam)

    ### to check if the bam files exist for both proband,parent1 and parent2
        if (length(prBam) >0 & length(p1Bam) >0 & length(p2Bam) >0 )

        {
            
            
            ### list all of the filtered vcf files in xlsDir
            csvfiles=list.files(xlsDir,'*.csv')
            
            ### find the specific filtered vcf file that has its names starting with PR_ID
            
            probandCsv <- csvfiles[grep(paste('^',PR_ID,'_ll.csv', sep=''),csvfiles)]
            if(length(probandCsv)!=0)
            {
                
                ### read the proband filtered vcf file
                probandCsv <- paste(xlsDir,'/',probandCsv,sep="")
                proband <- read.csv(probandCsv, stringsAsFactors=F)
                
                # read parent data
                p1d <- readParentData(P1_FID)
                p2d <- readParentData(P2_FID)
                
                
                proband$key <- paste(proband$CHROM, ":",proband$POS, "_",proband$REF, ">", proband$ALT,sep="")
                # subtract the variants in parents from the variants in proband and identify the proband specific variants
                proband$potDN <- proband$key %in% setdiff(proband$key, union(p1d$key, p2d$key))
                
                
                ### retrieve the pileup information for all of variants in the proband
                
                system.time(pileups <- addPileups(prBam,p1Bam,p2Bam, proband))
                
                dn <- pileups
                ### stringent criteria for identification of de novo variants in the proband
                dn$likelyDN <- pileups$"p1_ALT" == 0 & pileups$"p2_ALT" ==0 & pileups$"pr_ALT" > 5 & pileups$"p1_REF">10 & pileups$"p2_REF">10 & pileups$"pr_ALT"/(pileups$"pr_REF" + 0.00001)>0.2
                ### loose criteria for identification of de novo variants in the proband

                dn$likelyDN.Loose <- pileups$"p1_ALT"/(pileups$"p1_REF" + 0.00001)<=0.05 &  pileups$"p2_ALT"/(pileups$"p2_REF" + 0.00001)<=0.05 & pileups$"pr_ALT" > 5 & pileups$"p1_REF">10 & pileups$"p2_REF">10 & pileups$"pr_ALT"/(pileups$"pr_REF" + 0.00001)>0.2
                

                #### write the all of the proband variants with the annotation of de novo variants in the last columns named likelyDN and likelyDN.Loose
                
                print (paste(key,'file writing',sep=' '))
                write.csv(dn,paste(mainDir,'/', key,'_denovovariantresults.csv',sep=''))
                
                
            }
            
        }
    }

}