In [1]:
library('tidyverse')
library('SelectSim')
library('tictoc')
library('reshape2')

── [1mAttaching core tidyverse packages[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.4.4     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[3

In [2]:
# Record the R version, platform and attached package versions for reproducibility.
sessionInfo()

R version 4.3.2 (2023-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /mnt/ndata/arvind/envs/R_4/lib/libopenblasp-r0.3.25.so;  LAPACK version 3.11.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Zurich
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] reshape2_1.4.4    tictoc_1.2        SelectSim_0.0.1.3 lubridate_1.9.3  
 [5] forcats_1.0.0     stringr_1.5.1     dplyr_1.1.4       purrr_1.0.2      
 [9] readr_2.1.5       tidyr_1.3.0       tibble_3.2.1      ggplot2_3.4.4    

# Note
- Ensure the file paths in all read functions below point to the raw downloaded data.
- Ensure the output files are saved to the correct location, based on the path chosen by the user.
- Download the TCGA MAF file from <https://gdc.cancer.gov/about-data/publications/Pathways-2018>.

In [3]:
# Restore the pre-processed MC3 MAF objects from an .RData image.
# NOTE(review): load() creates its objects directly in the global environment;
# `maf`, used by the next cells, is never assigned in this notebook, so it is
# presumably restored here -- confirm.
load(file = '/mnt/ptemp/arvind/giovanni_share/data/maf/mc3.v0.2.8.PUBLIC.LAML_PATCH.RData')

In [4]:
# Restore the TCGA annotation objects from our previous paper.
# NOTE(review): `gam_collection`, used by later cells, is never assigned in this
# notebook, so it is presumably restored by this load() -- confirm.
load(file = '/mnt/ptemp/arvind/giovanni_share/data/gam/tcga_mut_only_marco.RData')

In [5]:
# Keep only the MAF columns needed to identify a variant, its gene, its type
# and the sample it was called in.
filter_maf <- maf[, c(
    'Entrez_Gene_Id',
    'Chromosome',
    'Start_Position',
    'End_Position',
    'Reference_Allele',
    'Tumor_Seq_Allele2',
    'Hugo_Symbol',
    'Variant_Classification',
    'Tumor_Sample_Barcode',
    'HGVSp_Short'
)]

In [6]:
# Sample ID = first 15 characters of the tumor barcode (e.g. "TCGA-AB-2803-03").
filter_maf[['sample']] <- substr(filter_maf[['Tumor_Sample_Barcode']], 1, 15)

In [7]:
# Restrict the MAF to samples that appear as rows of the annotated GAM.
input_maf <- filter(filter_maf, sample %in% rownames(gam_collection$gam))

In [8]:
# Variant_Classification values grouped by how they are treated downstream.
# NOTE(review): in-frame indels are grouped under 'truncating' here -- confirm
# that this is intended.
mutation_type <- list(
    ignore = c(
        "Silent", "Intron", "RNA", "3'UTR", "5'UTR", "5'Flank", "3'Flank", "IGR"
    ),
    truncating = c(
        'Frame_Shift_Del', 'Frame_Shift_Ins',
        'In_Frame_Del', 'In_Frame_Ins',
        'Nonsense_Mutation', 'Nonstop_Mutation',
        'Splice_Region', 'Splice_Site',
        'Translation_Start_Site'
    ),
    missense = c('Missense_Mutation')
)

# Schema mapping logical fields to the column names of this MAF, plus the
# mutation-type groups defined above.
custom_maf_schema <- list(
    name = 'custom_maf',
    column = list(
        gene = 'Hugo_Symbol',
        gene.name = 'Hugo_Symbol',
        sample = 'sample',
        sample.name = 'sample',
        mutation.type = 'Variant_Classification',
        mutation = 'HGVSp_Short'
    ),
    mutation.type = mutation_type
)

In [9]:
# Sanity check: number of rows (variants) and columns after the sample filter.
dim(input_maf)

In [10]:
# Peek at the first sample ID to confirm the 15-character barcode format.
input_maf[['sample']][1]

In [11]:
# Unique sample IDs present in the filtered MAF, in order of first appearance.
mut_samples <- unique(input_maf[[custom_maf_schema[['column']][['sample']]]])
print(length(mut_samples))

[1] 9082


In [12]:
# Show the first five sample IDs.
print(mut_samples[seq_len(5)])

[1] "TCGA-AB-2803-03" "TCGA-02-0047-01" "TCGA-02-0055-01" "TCGA-AB-2805-03"
[5] "TCGA-AB-2806-03"


In [13]:
# Gene lists saved earlier; `gene_to_consider` is the element read from it below.
gene_list <- readRDS('/mnt/ptemp/arvind/oncokb_annotator/oncokb_data/gene_list.rds')

In [14]:
# Trim the MAF to the columns used downstream, then rebuild the sample ID
# (first 15 characters of the tumor barcode), which the column subset drops.
input_maf <- input_maf[, c('Hugo_Symbol', 'Variant_Classification', 'Tumor_Sample_Barcode', 'HGVSp_Short')]
input_maf[['sample']] <- substr(input_maf[['Tumor_Sample_Barcode']], 1, 15)
print(paste('##### Number of lines ####', nrow(input_maf), sep = "->"))

# Genes of interest taken from the loaded gene list.
genes_to_consider <- gene_list$gene_to_consider
print(paste('##### Number of genes ####', length(genes_to_consider), sep = "->"))

# Extract the unique sample IDs present in the MAF.
mut_samples <- unique(input_maf[[custom_maf_schema$column$sample]])
print(paste('##### Number of samples ####', length(mut_samples), sep = "->"))

# Extract the mutations that fall in the genes of interest.
maf_genes <- filter_maf_gene.name(
    input_maf,
    genes = genes_to_consider,
    gene.col = custom_maf_schema$column$gene
)
print(paste('##### Number of lines ####', nrow(maf_genes), sep = "->"))

[1] "##### Number of lines ####->2971979"
[1] "##### Number of genes ####->396"
[1] "##### Number of samples ####->9082"
[1] "##### Number of lines ####->104962"


In [15]:
# Creating Silent GAM
# Builds `silent_data`: a binarized sample x gene alteration matrix of silent
# mutations plus a per-sample silent mutation count table.
tic('##### Creating Silent GAM ####')
    # Silent mutations among the genes of interest.
    maf_genes_silent <- filter(maf_genes, Variant_Classification %in% c('Silent'))

    # One row per sample, count initialised to 0 so samples without any silent
    # mutation keep a zero.
    # NOTE(review): these counts come from the gene-filtered MAF (`maf_genes`),
    # whereas the truncating and missense cells count over the full `input_maf`
    # -- confirm this difference is intended.
    silent_tmb <- data.frame('sample' = mut_samples,
                             'mutation' = rep(0, length(mut_samples)),
                             row.names = mut_samples)
    silent_counts <- count(maf_genes_silent, sample)
    count_row <- match(silent_tmb$sample, silent_counts$sample)
    has_count <- !is.na(count_row)
    silent_tmb$mutation[has_count] <- silent_counts$n[count_row[has_count]]

    # Sample x gene matrix over all samples and all genes of interest.
    tcga_silent_gam <- maf2gam(
        maf_genes_silent,
        sample.col = custom_maf_schema$column$sample,
        gene.col = custom_maf_schema$column$gene,
        value.var = 'Variant_Classification',
        samples = mut_samples,
        genes = genes_to_consider,
        fun.aggregate = length,
        binarize = TRUE,
        fill = 0
    )

    silent_data <- list('gam' = tcga_silent_gam, 'tmb' = silent_tmb)
toc()

##### Creating Silent GAM ####: 4.281 sec elapsed


In [16]:
	# Creating Truncating GAM
	# Builds `truncating_data`: a binarized sample x gene alteration matrix of
	# truncating mutations plus a per-sample truncating mutation count table.
	tic('##### Creating Truncating GAM ####')
	# Truncating mutations used for the GAM (gene-filtered MAF).
	# NOTE(review): `gene_catalogue` is not assigned anywhere in this notebook;
	# presumably it is provided by SelectSim or a loaded .RData file -- confirm.
	maf_trunc <- filter_maf_truncating(maf_genes, genes = gene_catalogue$gene_for_trun_mut, custom_maf_schema)
	print(paste('##### Number of lines ####', nrow(maf_trunc), sep = "->"))

	# Truncating mutations over the full MAF, used only for the per-sample counts.
	input_maf_trunc <- filter_maf_truncating(input_maf, custom_maf_schema)

	# One row per sample, count initialised to 0 so samples without any
	# truncating mutation keep a zero.
	truncating_tmb <- data.frame('sample' = mut_samples,
	                             'mutation' = rep(0, length(mut_samples)),
	                             row.names = mut_samples)
	truncating_counts <- count(input_maf_trunc, sample)
	count_row <- match(truncating_tmb$sample, truncating_counts$sample)
	has_count <- !is.na(count_row)
	truncating_tmb$mutation[has_count] <- truncating_counts$n[count_row[has_count]]

	# Sample x gene matrix over all samples and all genes of interest.
	tcga_truc_gam <- maf2gam(
	    maf_trunc,
	    sample.col = custom_maf_schema$column$sample,
	    gene.col = custom_maf_schema$column$gene,
	    value.var = 'Variant_Classification',
	    samples = mut_samples,
	    genes = genes_to_consider,
	    fun.aggregate = length,
	    binarize = TRUE,
	    fill = 0
	)

	truncating_data <- list('gam' = tcga_truc_gam, 'tmb' = truncating_tmb)
	toc()

[1] "##### Number of lines ####->19138"
##### Creating Truncating GAM ####: 3.65 sec elapsed


In [17]:
# Creating Missense GAM
# Builds `missesne_data` (sic -- the misspelled name is kept because later cells
# reference it): a binarized sample x gene matrix of hotspot mutations plus a
# per-sample missense mutation count table.
tic('##### Creating Missense GAM ####')
    # Filter on the 'ignore' mutation types with inclusive = FALSE.
    # NOTE(review): presumably this drops the ignored types -- confirm against
    # the SelectSim documentation of filter_maf_schema().
    maf_valid <- filter_maf_schema(
        input_maf,
        schema = custom_maf_schema,
        column = 'mutation.type',
        values = custom_maf_schema[['mutation.type']][['ignore']],
        inclusive = FALSE
    )

    # Missense mutations over the full MAF, used only for the per-sample counts.
    missense_maf <- filter_maf_mutation.type(
        input_maf,
        variants = 'Missense_Mutation',
        variant.col = custom_maf_schema$column$mutation.type
    )

    # One row per sample, count initialised to 0 so samples without any
    # missense mutation keep a zero.
    missense_tmb <- data.frame('sample' = mut_samples,
                               'mutation' = rep(0, length(mut_samples)),
                               row.names = mut_samples)
    missense_counts <- count(missense_maf, sample)
    count_row <- match(missense_tmb$sample, missense_counts$sample)
    has_count <- !is.na(count_row)
    missense_tmb$mutation[has_count] <- missense_counts$n[count_row[has_count]]

    # Normalise the protein change for hotspot matching: drop the first two
    # characters (presumably the "p." prefix -- confirm), then strip any
    # trailing run of uppercase letters.
    protein_change <- substr(maf_valid[[custom_maf_schema$column$mutation]], 3, 1000)
    maf_valid$HGVSp_Short_fixed <- gsub('[A-Z]*$', '', protein_change)

    # Keep only variants whose (gene, normalised change) pair is listed in
    # `variant_catalogue`.
    # NOTE(review): `variant_catalogue` is not assigned in this notebook;
    # presumably it ships with SelectSim -- confirm.
    maf_hotspot <- filter_maf_mutations(
        maf_valid,
        variant_catalogue,
        maf.col = c(custom_maf_schema$column$gene, 'HGVSp_Short_fixed'),
        values.col = c('gene', 'mut')
    )
    print(paste('##### Number of lines ####', nrow(maf_hotspot), sep = "->"))

    # Sample x gene matrix over all samples and all genes of interest.
    missense_tcga_gam <- maf2gam(
        maf_hotspot,
        sample.col = custom_maf_schema$column$sample,
        gene.col = custom_maf_schema$column$gene,
        value.var = 'Variant_Classification',
        samples = mut_samples,
        genes = genes_to_consider,
        fun.aggregate = length,
        binarize = TRUE,
        fill = 0
    )

    missesne_data <- list('gam' = missense_tcga_gam, 'tmb' = missense_tmb)
toc()

[1] "##### Number of lines ####->12784"
##### Creating Missense GAM ####: 22.409 sec elapsed


In [18]:
# Per-sample class labels taken from the loaded annotation object.
# NOTE(review): presumably a vector named by sample ID (it is indexed by GAM
# row names in the next cell) -- confirm.
global_sample_covariates <-gam_collection$sample.class

In [19]:
# Assemble the gene x sample alteration matrices and the TMB tables for the
# missense and truncating data in one shared gene and sample order.
gene_to_take <- colnames(missesne_data$gam)

# Sample order = rows of the missense GAM, looked up in the covariate vector.
# A sample without a covariate entry comes back as NA here.
# (`order` only shadows base::order as a variable; calls to order() still work.)
order <- names(global_sample_covariates[rownames(missesne_data$gam)])

# Fail early with a clear message instead of a cryptic "subscript out of
# bounds" error or, for the TMB data frames, silently created all-NA rows.
stopifnot(
    'some samples of the missense GAM have no entry in global_sample_covariates' =
        !anyNA(order),
    'some samples of the missense GAM are missing from the truncating GAM' =
        all(order %in% rownames(truncating_data$gam)),
    'some genes of the missense GAM are missing from the truncating GAM' =
        all(gene_to_take %in% colnames(truncating_data$gam)),
    'some samples of the missense GAM are missing from the missense TMB table' =
        all(order %in% rownames(missesne_data$tmb)),
    'some samples of the missense GAM are missing from the truncating TMB table' =
        all(order %in% rownames(truncating_data$tmb))
)

# GAMs are transposed to genes x samples; the TMB rows follow the same samples.
data <- list(
    'M' = list(
        'missense' = t(missesne_data$gam[order, gene_to_take]),
        'truncating' = t(truncating_data$gam[order, gene_to_take])
    ),
    'tmb' = list(
        'missense' = missesne_data$tmb[order, ],
        'truncating' = truncating_data$tmb[order, ]
    )
)

# Every alteration (gene) is labelled 'MUT'; samples keep their class label.
alteration_covariates <- rep('MUT', length(gene_to_take))
names(alteration_covariates) <- gene_to_take
sample_covariates <- global_sample_covariates[order]

In [20]:
# Bundle the matrices/TMB tables with the sample and alteration class labels.
run_data <- list(
    'M' = data,
    'sample.class' = sample_covariates,
    'alteration.class' = alteration_covariates
)

In [21]:
# Inspect the nested structure and dimensions of the assembled object.
str(run_data)

List of 3
 $ M               :List of 2
  ..$ M  :List of 2
  .. ..$ missense  : num [1:396, 1:9082] 0 0 0 0 0 0 0 0 0 0 ...
  .. .. ..- attr(*, "dimnames")=List of 2
  .. .. .. ..$ : chr [1:396] "AKT1" "ALK" "APC" "AR" ...
  .. .. .. ..$ : chr [1:9082] "TCGA-02-0047-01" "TCGA-02-0055-01" "TCGA-AB-2805-03" "TCGA-AB-2806-03" ...
  .. ..$ truncating: num [1:396, 1:9082] 0 0 0 0 0 0 0 0 0 0 ...
  .. .. ..- attr(*, "dimnames")=List of 2
  .. .. .. ..$ : chr [1:396] "AKT1" "ALK" "APC" "AR" ...
  .. .. .. ..$ : chr [1:9082] "TCGA-02-0047-01" "TCGA-02-0055-01" "TCGA-AB-2805-03" "TCGA-AB-2806-03" ...
  ..$ tmb:List of 2
  .. ..$ missense  :'data.frame':	9082 obs. of  2 variables:
  .. .. ..$ sample  : chr [1:9082] "TCGA-02-0047-01" "TCGA-02-0055-01" "TCGA-AB-2805-03" "TCGA-AB-2806-03" ...
  .. .. ..$ mutation: num [1:9082] 56 47 40 422 214 588 14 8 19 8 ...
  .. ..$ truncating:'data.frame':	9082 obs. of  2 variables:
  .. .. ..$ sample  : chr [1:9082] "TCGA-02-0047-01" "TCGA-02-0055-01" "TCGA-

In [22]:
# Persist the assembled object for downstream analysis.
# saveRDS() does not create missing directories, so make sure the target folder
# exists first; otherwise the write fails only after the whole pipeline has run.
output_file <- '../data/processed/gams/pan_can_tcga_run_data.rds'
dir.create(dirname(output_file), recursive = TRUE, showWarnings = FALSE)
saveRDS(run_data, file = output_file)