# Example notebook - Differential Gene Expression

This notebook demonstrates reading data from the API, performing analysis of differential gene expression, and plotting expression of genes of interest.
- title: "Differential Gene Expression"
- author: "Margaret Paiva"
- date: "21/10/2021"
- output: R notebook

## Install dependencies

### Install libraries. You'll only need to run this cell once

In [None]:
# Install the Bioconductor packages used below, but only the ones that are
# not already available, so re-running this cell does not trigger a reinstall.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}

bioc_pkgs <- c("limma", "fgsea")
# requireNamespace() returns FALSE (instead of erroring) for a missing package
bioc_available <- vapply(bioc_pkgs, requireNamespace, logical(1), quietly = TRUE)
if (any(!bioc_available)) {
    BiocManager::install(bioc_pkgs[!bioc_available])
}


### Load libraries we'll need below. Run every time you start this notebook

In [None]:
suppressPackageStartupMessages({
    library(jsonlite)
    library(httr)
    library(dplyr)
    library(data.table)
    library(parallel)
    library(DBI)
    library(DT)
    library(limma)
    library(fgsea)
    library(tidyr)
    library(tibble)
    library(ggplot2)
})

## Request Data
Request data from the API. You only need to run these cells once, unless you change the data being requested.

### 1. Read in list of desired models with their group assignment

In [None]:
# "dge_model_list.csv" lists the models to analyse together with the group
# each one is assigned to.
# NOTE(review): this file is described as using groups [a,b], but a later cell
# filters on groups 'B' and 'C' - confirm which labels the CSV really uses.
dge_model_list <- read.csv(file = "../../lists/dge_model_list.csv")
print(dge_model_list$model)

### 2. Save your API token (from the Lumin Workspaces home page)

In [None]:
# Read the API token from the environment rather than hard-coding it, so the
# credential is never saved with (or committed alongside) this notebook.
# Set it before starting R, e.g. in ~/.Renviron:  LUMIN_API_TOKEN=<your token>
tok <- Sys.getenv("LUMIN_API_TOKEN")
if (!nzchar(tok)) {
    stop("Set the LUMIN_API_TOKEN environment variable to your API token ",
         "(from the Lumin Workspaces home page).", call. = FALSE)
}

### 3. Request data from the Lumin API

In [None]:
# Define the data to query from the API.
# Note: `c()` evaluates to NULL, so "request_genes" and "request_agents" below
# carry no selection of their own.

data  <- list("request_data_type" = "expression",
  "request_cancer_type" = list("all"),
#   "request_genes" = c("ATM", "BRCA1", "BRCA2", "BRIP1", "CDK12", "FANCA", "HDAC2", "KRAS", 
#                       "PALB2", "SRY", "TP53", "NOTCH1", "CCND1", "BARD1", "FBLN2", "CDKN1B", 
#                       "RB1", "CHEK2", "APOBEC3B", "PALB2"),  # to define a list of genes here
  "request_genes" = c(),  # left empty; use the commented example above to name genes explicitly
  "request_models" = dge_model_list$model, #c("CTG-0012","CTG-0019"),
  "request_agents" = c(),
  "request_dataset" = "PDX",
  "request_workspace_id" = "5036-99-ca8c10",  # enter your workspace id here
  "request_client" = 99,
  "request_user" = 5036,  # enter your user id here
  "request_mode" = TRUE,
  "request_display_error"= FALSE,
  "preview"= TRUE
 ) 

# Submit the request; the token is sent as a bearer Authorization header.
request  <-  POST(
    url = 'https://stag.lumin-fast-api.championsoncology.com/workstation/', 
    body = data, 
    encode = 'json', 
    add_headers(authorization = paste("Bearer", tok))
)

# Fail here, with the server's own message, instead of carrying an error
# payload forward as `task` (the polling cell needs `task$task_id`).
if (http_error(request)) {
    stop("Lumin API request failed (HTTP ", status_code(request), "): ",
         httr::content(request, as = "text", encoding = "UTF-8"),
         call. = FALSE)
}

task <- httr::content(request)
task

### 4. Check on the status of our request

In [None]:

# Ask the API for the current state of the task submitted above - the task
# may take some time, so this cell may need to be run more than once.
# NOTE(review): verbose() prints the HTTP exchange; this may include the
# Authorization header - confirm before sharing saved notebook output.
poll_url <- paste0('https://stag.lumin-fast-api.championsoncology.com/tasks/poll/', task$task_id, "/")
bearer_header <- add_headers(authorization = paste("Bearer", tok))
request <- GET(url = poll_url, bearer_header, verbose())
request
poll_results <- content(request)

### 5. Get final results of our data request

In [None]:
# Show the polled payload, then decode its JSON `result` field into `res`
# (the cell that lists the delivered files reads `res$name`).
poll_results
if (is.null(poll_results$result)) {
    # Presumably the task has not finished yet; without this guard fromJSON()
    # fails with an unrelated-looking error about its `txt` argument.
    stop("No result is available for this task yet; re-run the status check ",
         "above and try again.", call. = FALSE)
}
res <- fromJSON(poll_results$result)

### 6. Read in delivered data and combine into a single data table

In [None]:
# Collect the full paths of every file delivered under ~/<res$name>,
# descending into sub-directories.
delivery_dir <- paste0("~/", res$name)
fnames <- list.files(path = delivery_dir, recursive = TRUE, full.names = TRUE)
print(fnames)

In [None]:
# Parse each delivered file as JSON, then preview the top of every result.
dat_l <- lapply(X = fnames, FUN = function(path) fromJSON(path))
lapply(X = dat_l, FUN = head)

In [None]:
# Convert every parsed file to a data.table and stack them into one table.
dat_l <- lapply(X = dat_l, FUN = data.table)
df <- rbindlist(l = dat_l)

In [None]:
# Give the columns used for plotting explicit atomic types (the original
# author notes that each column arrives as a list).
#df  <- as.data.frame(df)
df$z <- as.numeric(df$z)
df$tumor_type <- as.character(df$tumor_type)
head(df, n = 2)
print(dim(df))

In [None]:
# Add a `tpm` column computed as log2(2^log.tpm + 1), updating `df` in place.
# Original author's caveat: using log(TPM + 1) is not ideal, but it is what
# is available for now.
df[, tpm := log2(1 + 2^as.numeric(log.tpm))]

In [None]:
# Distinct (gene_id, gene) pairs, used later to attach gene names to counts.
gene_pairs <- df[, c("gene_id", "gene")]
geneids <- unique(gene_pairs)
head(geneids)

In [None]:
# Keep only the models assigned to the two groups being compared.
in_test_groups <- dge_model_list$group %in% c("B", "C")
test_models <- dge_model_list[in_test_groups, ]

### Load Counts data

In [None]:
# Restore the objects saved in the counts .RData file; verbose = TRUE prints
# the name of each object as it is loaded.
load(file = "../Data/RSEM_expected_counts_matrix_2021-03-11.RData", verbose = TRUE)

In [None]:
# Preview the loaded counts (at most 10 rows x 10 columns, so the preview
# also works on a narrow table).
preview_cols <- seq_len(min(10L, ncol(expected.count.matrix)))
head(expected.count.matrix[, preview_cols, with = FALSE], 10)

# Filter the data to models that match our focal group
keepcols <- c('gene_id', unique(test_models$model))
keepcols <- keepcols[keepcols %in% colnames(expected.count.matrix)]
count.matrix <- expected.count.matrix[, ..keepcols]

# Attach gene names. merge() takes its join key through `by` (the previous
# `on=` is not a merge() argument), so name the key explicitly.
count.matrix <- merge(count.matrix, geneids, by = "gene_id", sort = FALSE)

# Keep the first row for each gene name so the names can serve as row names.
count.matrix <- count.matrix[!duplicated(count.matrix[, gene]), ]

# Convert to a plain data.frame (row names are not supported on a data.table).
count.matrix <- as.data.frame(count.matrix)
rownames(count.matrix) <- count.matrix[, 'gene']

# Drop the identifier columns; drop = FALSE keeps a data.frame even when only
# a single model column remains.
count.matrix <- count.matrix[, !colnames(count.matrix) %in% c('gene', 'gene_id'), drop = FALSE]

In [None]:
# Round the expected counts to whole numbers.
counts <- round(count.matrix)
# Preview at most 10 genes x 10 models; indexing with a fixed 1:10 fails with
# "undefined columns selected" when fewer than 10 models were kept.
head(counts[, seq_len(min(10L, ncol(counts))), drop = FALSE], 10)

In [None]:
# Fit a per-gene linear model and test for differential expression between
# the two groups present in `test_models`.

# Align the sample annotation to the columns of `counts` explicitly: match()
# yields exactly one row per column, in column order, even if a model is
# listed more than once in `test_models`.
sample_idx <- match(colnames(counts), test_models$model)
stopifnot(!anyNA(sample_idx))
test_design <- test_models[sample_idx, , drop = FALSE]

# Re-create the factor so only the groups actually present become design
# columns, with syntactically valid names for makeContrasts().
test_design$group <- factor(make.names(as.character(test_design$group)))
group_levels <- levels(test_design$group)
stopifnot(length(group_levels) == 2L)

# One column per group (no intercept), named by the group label.
design <- model.matrix(~ 0 + group, data = test_design)
colnames(design) <- group_levels

# limma's linear models expect log-expression values; voom() converts the
# counts to log2-CPM with precision weights before fitting.
voom_data <- limma::voom(as.matrix(counts), design)
fit <- limma::lmFit(voom_data, design = design)

# Test the difference between the groups (second level minus first level).
# Without a contrast, each group mean would be tested against zero instead.
group_contrast <- limma::makeContrasts(
    contrasts = paste(group_levels[2], group_levels[1], sep = " - "),
    levels = design
)
fit <- limma::contrasts.fit(fit, group_contrast)
fit <- limma::eBayes(fit)

# NOTE: this reuses the name `res`, replacing the API result stored earlier.
res <- limma::decideTests(fit, p.value = 0.01)
summary(res)

# Full ranked table for the contrast; positive logFC = higher in the second level.
tmp <- limma::topTable(fit, coef = 1, number = Inf)
tmp

In [None]:
# Keep only the genes whose adjusted p-value is present and at most 0.01.
adj_p <- tmp$adj.P.Val
is_significant <- !is.na(adj_p) & adj_p <= 0.01
tmp <- tmp[is_significant, ]
tmp

In [None]:
# Choose the gene to plot (put your gene of interest here), then keep its
# rows from `df` after dropping every row that contains a missing value.
gene_choice <- "ACSM3"
df_complete <- na.omit(df)
df_gene <- filter(df_complete, gene == gene_choice)
head(df_gene, n = 2)

In [None]:
# Box plot of the chosen gene's z-scores, one box per cancer type.
# NOTE(review): the "BrBG" Brewer palette has a limited number of colours -
# confirm it covers every tumor type present in `df_gene`.
expression_plot <- ggplot(
    data = df_gene,
    mapping = aes(x = tumor_type, y = z, fill = tumor_type, color = tumor_type)
) +
    geom_boxplot(alpha = 0.6) +
    scale_fill_brewer(palette = "BrBG") +
    labs(
        x = "",
        y = "Z Score",
        title = paste0(gene_choice, " RNA expression by cancer type")
    ) +
    theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none",
        plot.margin = unit(c(0, 0, 0, 1) + 0.1, "cm")
    )
expression_plot