In [86]:
source('parseCosmic.R')

In [87]:
# Load the IMPACT mutation table and its companion hg19 multianno annotation
# table. Both files are tab-separated with a header row, and string columns are
# kept as character vectors (TRUE/FALSE are spelled out because T/F are ordinary
# variables that can be reassigned).
impact.df <- read.table("../../../data/all_IMPACT_mutations_180508.txt", sep = "\t", stringsAsFactors = FALSE, header = TRUE)
annotation.df <- read.table("../../../data/all_IMPACT_mutations_180508.simple.hg19_multianno.txt", sep = "\t", stringsAsFactors = FALSE, header = TRUE)

In [88]:
# Join mutations and annotations column-wise. The two tables are paired purely
# by row position, so require equal row counts first: cbind() on data frames can
# recycle the shorter table when its row count divides the longer one, which
# would silently misalign annotations instead of failing.
stopifnot(nrow(impact.df) == nrow(annotation.df))
impact.ann.df <- cbind(impact.df, annotation.df)
nrow(impact.ann.df)

In [89]:
# Annotate each mutation with its gene type according to OncoKB:
#   'Oncogene'  - gene is flagged 'Yes' in the OncoKB.Oncogene column
#   'Recessive' - gene is listed but not flagged 'Yes'
#   'Unknown'   - gene is absent from the list (or its flag is missing)
cg <- read.table('../../../data/CancerGenesList.txt', sep='\t', header=TRUE, comment.char = '', row.names = 1)

# Look genes up with match() rather than cg[symbols, ]: character row indexing
# of a data frame partially matches row names, so a symbol that is merely a
# unique prefix of a listed gene would silently inherit that gene's flag.
gene.row <- match(impact.ann.df$Hugo_Symbol, rownames(cg))
impact.ann.df$isOncogene <- cg[['OncoKB.Oncogene']][gene.row]

# Build the label in one vectorised step with the NA case handled explicitly,
# instead of relying on NA-containing logical indices in sub-assignment.
is.onco <- impact.ann.df$isOncogene == 'Yes'
impact.ann.df$cgType <- factor(
  ifelse(is.na(is.onco), 'Unknown', ifelse(is.onco, 'Oncogene', 'Recessive'))
)

table(impact.ann.df$cgType)


 Oncogene Recessive   Unknown 
   176535    401325     10687 

In [90]:
# Load the OncoKB variant annotation table (tab-separated, quoting disabled)
# and key every row by "<Gene>.p.<Alteration>" so it can be looked up by name.
oncokb.annotation <- read.table(
  '../../../data/oncoKb/allAnnotatedVariants.txt',
  header = TRUE,
  sep = '\t',
  quote = ''
)
oncokb.annotation$mutID <- paste(oncokb.annotation$Gene, oncokb.annotation$Alteration, sep = '.p.')
rownames(oncokb.annotation) <- oncokb.annotation$mutID

In [91]:
# Non-coding mutations have an empty protein change; copy HGVSp_Short and
# replace those empty strings with the explicit label 'Non-coding'.
impact.ann.df$HGVSp_Modified <- replace(
  impact.ann.df$HGVSp_Short,
  impact.ann.df$HGVSp_Short == '',
  'Non-coding'
)

In [92]:
# Build the two keys used to look each IMPACT mutation up in OncoKB:
#   oncokb.id          "<gene>.<HGVSp_Short>" for exonic variants that carry a
#                      protein change, 'other' for everything else
#   oncokb.consequence "<gene>.p.Truncating Mutations" for truncating
#                      consequences, 'other' for everything else
impact.ann.df$oncokb.id <- paste0(impact.ann.df$Hugo_Symbol, '.', impact.ann.df$HGVSp_Short)
isExonic <- impact.ann.df$Consequence %in% c(
  'exonic', 'frameshift_deletion', 'frameshift_insertion',
  'nonframeshift_deletion', 'nonframeshift_insertion',
  'nonsynonymous_SNV', 'stopgain_SNV', 'stoploss_SNV', 'synonymous_SNV'
)
impact.ann.df$oncokb.id[!isExonic] <- 'other'
impact.ann.df$oncokb.id[impact.ann.df$HGVSp_Short == ''] <- 'other'

# The last label used to be written ' splicing_noncanonical' (leading space),
# which can never equal a Consequence value under %in%. The space is removed
# here on the assumption that 'splicing_noncanonical' is the intended label.
isTrunc <- impact.ann.df$Consequence %in% c(
  'frameshift_deletion', 'frameshift_insertion', 'stopgain_SNV', 'stoploss_SNV',
  'splicing', 'splicing_noncanonical'
)
impact.ann.df$oncokb.consequence <- 'other'
impact.ann.df$oncokb.consequence[isTrunc] <- paste0(impact.ann.df$Hugo_Symbol[isTrunc], '.p.Truncating Mutations')

In [93]:
# Look up OncoKB oncogenicity for every mutation: first by its exact protein
# change (oncokb.id), then, where that finds nothing, by its consequence class
# (oncokb.consequence). Mutations found by neither key get 'NotInOncoKb'.
#
# match() is used instead of oncokb.annotation[ids, 'Oncogenicity'] because
# character row indexing of a data frame partially matches row names: an id that
# is only a unique prefix of an annotated variant would otherwise pick up that
# variant's oncogenicity.
onco.values <- oncokb.annotation[['Oncogenicity']]
id.row <- match(as.character(impact.ann.df$oncokb.id), rownames(oncokb.annotation))
consq.row <- match(as.character(impact.ann.df$oncokb.consequence), rownames(oncokb.annotation))
impact.ann.df$Change.onco <- onco.values[id.row]
impact.ann.df$Consq.onco <- onco.values[consq.row]

# Combine on plain character vectors and build the factor explicitly, so the
# result does not depend on whether read.table() returned Oncogenicity as a
# factor or as character. as.factor() keeps every original level, including
# ones with zero counts.
total.onco <- as.character(impact.ann.df$Change.onco)
notFound <- is.na(total.onco)
total.onco[notFound] <- as.character(impact.ann.df$Consq.onco[notFound])
total.onco[is.na(total.onco)] <- 'NotInOncoKb'
impact.ann.df$Total.onco <- factor(
  total.onco,
  levels = c(levels(as.factor(onco.values)), 'NotInOncoKb')
)

In [94]:
# Cross-tabulate the combined OncoKB call against the confidence class.
table(impact.ann.df[['Total.onco']], impact.ann.df[['confidence_class']])

                  
                   AUTO_OK MANUAL_OK OK_NOT_SO UNKNOWN UNLIKELY
  Inconclusive         372        13         5       0       81
  Likely Neutral       162        10         0       0       76
  Likely Oncogenic   38168      2635        89      13     1575
  Oncogenic          13725       546         0       0      287
  null                   0         0         0       0        0
  NotInOncoKb       138801      3909       264  374518    13298

ERROR: Error in variantTrain.corr$occurence_in_normals.1 <- NULL: object 'variantTrain.corr' not found


In [95]:
# Pass the annotated table through parseCosmic() (sourced from parseCosmic.R;
# its implementation is not visible in this file) and persist the result.
impact.ann.df <- parseCosmic(impact.ann.df)
save(list = 'impact.ann.df', file = '../../../data/impactAnnotated.RData')

In [96]:
# Cross-tabulate the variant consequence against the confidence class.
table(impact.ann.df[['Consequence']], impact.ann.df[['confidence_class']])

                         
                          AUTO_OK MANUAL_OK OK_NOT_SO UNKNOWN UNLIKELY
  Translation_Start_Site       98         3         0       0        7
  UTR3                          0         1         0   13208       53
  UTR5                          0         0         0    8551        5
  Unknown                       0         0         0     411        0
  downstream                    0         0         0    2167        3
  exonic                        0         0         0    1771       12
  frameshift_deletion       15259      1446         3       0      951
  frameshift_insertion       5870       641         4       0      319
  intergenic                    0         0         0     114        0
  intronic                      1         0         0  287748     1222
  nonframeshift_deletion     3118       425         1       0     1013
  nonframeshift_insertion     613       113         1       0      273
  nonsynonymous_SNV        137145      2912        

In [80]:
# Columns displayed when inspecting individual mutations in the cells that follow.
cols <- c(
  'Consequence',
  'oncokb.consequence',
  'Change.onco',
  'Consq.onco',
  'Total.onco',
  'oncokb.id',
  'Hugo_Symbol',
  'HGVSp_Short',
  'confidence_class',
  'cosmicCount'
)

In [81]:
# First rows rated 'Likely Oncogenic' by OncoKB whose confidence class is 'UNKNOWN'.
head(impact.ann.df[which(impact.ann.df$Total.onco == 'Likely Oncogenic' & impact.ann.df$confidence_class == 'UNKNOWN'), cols])

Unnamed: 0,Consequence,oncokb.consequence,Change.onco,Consq.onco,Total.onco,oncokb.id,Hugo_Symbol,HGVSp_Short,confidence_class,cosmicCount
1199,stoploss_SNV,FAT1.p.Truncating Mutations,,Likely Oncogenic,Likely Oncogenic,FAT1.p.X4589R,FAT1,p.X4589R,UNKNOWN,0
2427,stoploss_SNV,SOX9.p.Truncating Mutations,,Likely Oncogenic,Likely Oncogenic,SOX9.p.*510Sext*49,SOX9,p.*510Sext*49,UNKNOWN,1
2858,stoploss_SNV,TP53.p.Truncating Mutations,,Likely Oncogenic,Likely Oncogenic,TP53.p.X394S,TP53,p.X394S,UNKNOWN,0
12092,stoploss_SNV,PTEN.p.Truncating Mutations,,Likely Oncogenic,Likely Oncogenic,PTEN.p.X404W,PTEN,p.X404W,UNKNOWN,0
12433,stoploss_SNV,SMAD3.p.Truncating Mutations,,Likely Oncogenic,Likely Oncogenic,SMAD3.p.X426L,SMAD3,p.X426L,UNKNOWN,0
12941,stoploss_SNV,CDH1.p.Truncating Mutations,,Likely Oncogenic,Likely Oncogenic,CDH1.p.X883L,CDH1,p.X883L,UNKNOWN,0


In [82]:
# First rows absent from OncoKB whose confidence class is 'AUTO_OK'.
head(impact.ann.df[which(impact.ann.df$Total.onco == 'NotInOncoKb' & impact.ann.df$confidence_class == 'AUTO_OK'), cols])

Unnamed: 0,Consequence,oncokb.consequence,Change.onco,Consq.onco,Total.onco,oncokb.id,Hugo_Symbol,HGVSp_Short,confidence_class,cosmicCount
1,nonsynonymous_SNV,other,,,NotInOncoKb,SPEN.p.I3661F,SPEN,p.I3661F,AUTO_OK,0
9,nonsynonymous_SNV,other,,,NotInOncoKb,MLL3.p.M812I,MLL3,p.M812I,AUTO_OK,0
34,nonsynonymous_SNV,other,,,NotInOncoKb,TP53.p.T256P,TP53,p.T256P,AUTO_OK,1
37,nonsynonymous_SNV,other,,,NotInOncoKb,PIK3R3.p.I298M,PIK3R3,p.I298M,AUTO_OK,0
38,nonsynonymous_SNV,other,,,NotInOncoKb,ATR.p.R2431M,ATR,p.R2431M,AUTO_OK,0
39,nonsynonymous_SNV,other,,,NotInOncoKb,PDGFRA.p.L465M,PDGFRA,p.L465M,AUTO_OK,0


In [84]:
# First rows rated 'Likely Oncogenic' by OncoKB that have a cosmicCount of zero.
head(impact.ann.df[which(impact.ann.df$Total.onco == 'Likely Oncogenic' & impact.ann.df$cosmicCount == 0), cols])

Unnamed: 0,Consequence,oncokb.consequence,Change.onco,Consq.onco,Total.onco,oncokb.id,Hugo_Symbol,HGVSp_Short,confidence_class,cosmicCount
40,splicing,FBXW7.p.Truncating Mutations,,Likely Oncogenic,Likely Oncogenic,other,FBXW7,,AUTO_OK,0
41,stopgain_SNV,FBXW7.p.Truncating Mutations,,Likely Oncogenic,Likely Oncogenic,FBXW7.p.G61*,FBXW7,p.G61*,AUTO_OK,0
43,stopgain_SNV,MAP3K1.p.Truncating Mutations,,Likely Oncogenic,Likely Oncogenic,MAP3K1.p.E302*,MAP3K1,p.E302*,AUTO_OK,0
44,splicing,MAP3K1.p.Truncating Mutations,,Likely Oncogenic,Likely Oncogenic,other,MAP3K1,,AUTO_OK,0
49,stopgain_SNV,PTPRD.p.Truncating Mutations,,Likely Oncogenic,Likely Oncogenic,PTPRD.p.E915*,PTPRD,p.E915*,AUTO_OK,0
59,stopgain_SNV,SMARCA4.p.Truncating Mutations,,Likely Oncogenic,Likely Oncogenic,SMARCA4.p.E1211*,SMARCA4,p.E1211*,AUTO_OK,0
