In [None]:
# We need these packages: 'maftools' (installed from Bioconductor) and 'readr' and 'NMF' (installed from CRAN)
library(readr)
library(NMF)
library(maftools)

# Folder that holds the per-sample MAF files. Change this path according to
# where you created the folder. It is defined once so that listing the files
# and reading them always refer to the same location.
#dataPath <- "C:/-=TRANSMED=-/course materials/teaching/BIOINF17/ex1/bioinf/" 
mafDir <- "gdac.broadinstitute.org_OV.Mutation_Packager_Oncotated_Calls.Level_3.2016012800.0.0"
fileList <- list.files(mafDir)

# Remove the manifest file from the list
fileList <- fileList[fileList != "MANIFEST.txt"]

# Read every remaining file into one list element each; lines starting with
# "#" are treated as comments. file.path() yields a zero-length vector when
# fileList is empty, so an empty folder gives an empty list.
mafList <- lapply(
  file.path(mafDir, fileList),
  read_tsv,
  comment = "#",
  progress = FALSE
)

# Now aggregate the list into one data frame
maf.df <- as.data.frame(do.call(rbind, mafList))
head(maf.df)

In [42]:
# The full table carries a very large number of annotation columns.
# For ease of use, keep only the handful listed below.
selectedColumns <- c(
  "Hugo_Symbol",
  "Center",
  "NCBI_Build",
  "Chromosome",
  "Start_position",
  "End_position",
  "Variant_Classification",
  "Variant_Type",
  "Tumor_Sample_Barcode",
  "Mutation_Status",
  "Reference_Allele",
  "Tumor_Seq_Allele1",
  "Tumor_Seq_Allele2",
  "Protein_Change",
  "i_dbNSFP_CADD_phred"
)

# Restrict the data frame to the chosen columns, in the order given above
maf.df <- maf.df[, selectedColumns, drop = FALSE]


In [43]:
# Split the table by Mutation_Status: rows marked "Somatic" or "Unknown" stay
# in maf.df, every other row goes to a separate data frame.
isSomaticOrUnknown <- maf.df$Mutation_Status %in% c("Somatic", "Unknown")
maf.germline.df <- maf.df[!isSomaticOrUnknown, ]
maf.df <- maf.df[isSomaticOrUnknown, ]

# Inspect the structure of the remaining data frame
str(maf.df)

'data.frame':	20160 obs. of  15 variables:
 $ Hugo_Symbol           : chr  "BAI2" "LRRC41" "ERICH3" "SSX2IP" ...
 $ Center                : chr  "broad.mit.edu" "broad.mit.edu" "broad.mit.edu" "broad.mit.edu" ...
 $ NCBI_Build            : int  37 37 37 37 37 37 37 37 37 37 ...
 $ Chromosome            : chr  "1" "1" "1" "1" ...
 $ Start_position        : int  32202221 46751997 75055494 85128152 107866918 153274927 157516843 224321795 248343719 28023684 ...
 $ End_position          : int  32202221 46751997 75055494 85128152 107866918 153274927 157516843 224321795 248343719 28023684 ...
 $ Variant_Classification: chr  "Missense_Mutation" "Missense_Mutation" "Missense_Mutation" "Silent" ...
 $ Variant_Type          : chr  "SNP" "SNP" "SNP" "SNP" ...
 $ Tumor_Sample_Barcode  : chr  "TCGA-04-1331-01" "TCGA-04-1331-01" "TCGA-04-1331-01" "TCGA-04-1331-01" ...
 $ Mutation_Status       : chr  "Unknown" "Somatic" "Unknown" "Somatic" ...
 $ Reference_Allele      : chr  "C" "C" "T" "A" ...
 $ Tum

In [44]:
# Show the first six rows of the filtered table
head(maf.df, n = 6L)

Hugo_Symbol,Center,NCBI_Build,Chromosome,Start_position,End_position,Variant_Classification,Variant_Type,Tumor_Sample_Barcode,Mutation_Status,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,Protein_Change,i_dbNSFP_CADD_phred
BAI2,broad.mit.edu,37,1,32202221,32202221,Missense_Mutation,SNP,TCGA-04-1331-01,Unknown,C,C,G,p.R1028P,17.93
LRRC41,broad.mit.edu,37,1,46751997,46751997,Missense_Mutation,SNP,TCGA-04-1331-01,Somatic,C,C,T,p.E178K,28.5
ERICH3,broad.mit.edu,37,1,75055494,75055494,Missense_Mutation,SNP,TCGA-04-1331-01,Unknown,T,T,G,p.E666A,12.41
SSX2IP,broad.mit.edu,37,1,85128152,85128152,Silent,SNP,TCGA-04-1331-01,Somatic,A,A,G,p.G245G,
NTNG1,broad.mit.edu,37,1,107866918,107866918,Missense_Mutation,SNP,TCGA-04-1331-01,Somatic,G,G,A,p.M87I,19.4
PGLYRP3,broad.mit.edu,37,1,153274927,153274927,Missense_Mutation,SNP,TCGA-04-1331-01,Unknown,G,G,T,p.S229Y,14.61


In [45]:
# Show the last six rows of the filtered table
tail(maf.df, n = 6L)


Unnamed: 0,Hugo_Symbol,Center,NCBI_Build,Chromosome,Start_position,End_position,Variant_Classification,Variant_Type,Tumor_Sample_Barcode,Mutation_Status,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,Protein_Change,i_dbNSFP_CADD_phred
20214,C9orf78,broad.mit.edu,37,9,132597484,132597484,Missense_Mutation,SNP,TCGA-61-2113-01,Unknown,T,T,G,p.K6T,13.48
20215,CAMSAP1,broad.mit.edu,37,9,138714002,138714002,Missense_Mutation,SNP,TCGA-61-2113-01,Unknown,G,G,C,p.S835R,22.6
20216,NOTCH1,broad.mit.edu,37,9,139391081,139391081,Missense_Mutation,SNP,TCGA-61-2113-01,Unknown,G,G,T,p.S2370R,12.84
20217,DPP7,broad.mit.edu,37,9,140007856,140007856,Missense_Mutation,SNP,TCGA-61-2113-01,Unknown,G,G,A,p.A193V,17.85
20218,FAM47A,broad.mit.edu,37,X,34149721,34149721,Silent,SNP,TCGA-61-2113-01,Unknown,C,C,T,p.P225P,
20219,B3GNT7,broad.mit.edu,37,2,232263379,232263380,In_Frame_Ins,INS,TCGA-61-2113-01,Unknown,-,-,GCG,p.317_318insG,


In [13]:
# Write the filtered mutation table to a tab-separated file (header row, no
# row names, no quoting), then load that same file with maftools. The file
# name is defined once so the writer and the reader cannot drift apart.
mafFile <- "allSamplesMaf.maf"
write.table(maf.df, file = mafFile,
            col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")


# NOTE(review): removeSilent and useAll look like arguments from an older
# maftools release -- confirm they are accepted by the installed version.
maf <- read.maf(maf = mafFile,
                removeSilent = TRUE, useAll = TRUE, verbose = FALSE)

reading maf..
Done !
