# *Data QC of bulk RNA-seq data in R* 
Info | Value
---- | ----
Implemented by | Elucidata
Docker | RNASeq Downstream:For transcriptomics analysis
Tag(s) | edgeR - pca - counts distribution - raw counts 

## Initiate and configure the notebook

In [None]:
# please do not modify
from IPython.display import display_html
def restartkernel():
    """Ask the Jupyter front end to restart this notebook's kernel.

    Emits a small <script> element as raw HTML output; the browser runs it
    and calls the notebook's kernel-restart API.
    """
    restart_snippet = "<script>Jupyter.notebook.kernel.restart()</script>"
    display_html(restart_snippet, raw=True)

In [None]:
!sudo pip3 install polly-python --quiet

In [None]:
restartkernel() #Pause for a few seconds before the kernel is refreshed

In [None]:
# please do not modify
# Runs a JavaScript snippet in the browser that asks the kernel to execute
# `url = '<window.location>'`, so the address of this notebook page becomes
# available to later cells as the Python variable `url`.
from IPython.display import HTML
HTML('''<script type="text/javascript"> Jupyter.notebook.kernel.execute("url = '" + window.location + "'", {}, {}); </script>''')

## Fetch OmixAtlas ID and Dataset ID

- **OmixAtlas ID**: Target repository identifier which is required for downloading the dataset
- **Dataset ID**: Identifier for the dataset on the atlas which is to be analysed 

In [None]:
import urllib.parse as urlparse
from urllib.parse import parse_qs

# `url` is set in the kernel by the JavaScript cell above; its query string
# carries the identifiers of the dataset this notebook was opened for.
parsed_url   = urlparse.urlparse(url)
query_params = parse_qs(parsed_url.query)

# Fail with a readable message instead of "'NoneType' object is not
# subscriptable" when the notebook is opened without the expected parameters.
missing_params = [key for key in ('repo_id', 'dataset_id') if not query_params.get(key)]
if missing_params:
    raise ValueError(
        "Notebook URL is missing required query parameter(s): "
        + ", ".join(missing_params)
    )

# 'repo_name' is kept in the list for backward compatibility but is not used
# below, so its absence is tolerated (None).
repo_vars_list = [query_params.get(query_url, [None])[0] for query_url in ['repo_id', 'repo_name', 'dataset_id']]
repo_id        = repo_vars_list[0]
dataset_id     = repo_vars_list[2]

## Download dataset from the OmixAtlas

In [None]:
from polly.omixatlas import OmixAtlas
import os

In [None]:
# Create the OmixAtlas client used by the download step below. The token is
# read from the POLLY_REFRESH_TOKEN environment variable; a KeyError is raised
# here if that variable is not set.
omix_atlas = OmixAtlas(os.environ['POLLY_REFRESH_TOKEN'])

In [None]:
def download_dataset(repo_id, dataset_id):
    """
    Downloads a single dataset with given repo_id and dataset_id.

    The download URL is requested through the module-level ``omix_atlas``
    client and the file is fetched with ``wget`` into the current working
    directory as ``<dataset_id>.gct``.

    Args:
        repo_id: identifier of the OmixAtlas repository holding the dataset.
        dataset_id: identifier of the dataset; also used as the file stem.

    Raises:
        Exception: if the response carries no download URL, if ``wget``
            cannot be started, or if it exits with a non-zero status.
    """
    import subprocess

    file_name = f"{dataset_id}.gct"
    data = omix_atlas.download_data(repo_id, dataset_id)

    # Walk the nested response defensively so a malformed payload produces a
    # clear error instead of an AttributeError on None.
    attributes = ((data or {}).get('data') or {}).get('attributes') or {}
    url = attributes.get('download_url')
    if not url:
        raise Exception("Download not successful: no download URL returned")

    # Pass an argument list (no shell) so quotes or shell metacharacters in
    # the URL or dataset id cannot break or inject into the command line.
    # "--" stops wget from treating the URL as an option.
    try:
        status = subprocess.run(["wget", "-O", file_name, "--", url]).returncode
    except OSError as err:
        raise Exception("Download not successful") from err

    if status == 0:
        print("Downloaded data successfully")
    else:
        raise Exception("Download not successful")

In [None]:
download_dataset(repo_id, dataset_id)

## Read dataset

In [None]:
%get dataset_id --from python3
dataset_id

In [None]:
library(mapGCT)

In [None]:
# The download step saves the dataset as "<dataset_id>.gct" in the working
# directory; stop early with a clear message if that file is not there.
gctFile <- paste0(dataset_id, '.gct')
if (!file.exists(gctFile)) {
    stop("GCT file not found: ", gctFile, ". Run the download step first.", call. = FALSE)
}
gctObj  <- parse_gct(gctFile)

In [None]:
# Pull the three components of the parsed GCT object out of its slots.
# `counts` is later passed to DGEList as the count matrix.
counts  <- gctObj@mat
# `coldata` is later used as the sample table; its row names are compared
# with colnames(counts) before the DGEList is built.
coldata <- gctObj@cdesc
# presumably the per-row (gene) annotations - TODO confirm against mapGCT docs
rowdata <- gctObj@rdesc

In [None]:
# Inspect the count matrix: its dimensions and first rows.
dim(counts)
head(counts)

In [None]:
# Inspect the column (sample) metadata: its dimensions and first rows.
dim(coldata)
head(coldata)

In [None]:
# Inspect the row metadata: its dimensions and first rows.
dim(rowdata)
head(rowdata)

## Distribution of reads mapped to each sample

In [None]:
# Total counts per sample: one row per column of `counts`, holding the
# column name and the column sum.
reads.data.frame <- data.frame(
    sample = colnames(counts),
    totalCounts = colSums(counts)
)
# Sort samples by ascending total counts. `drop = FALSE` is spelled out by
# name with the literal FALSE, because `F` is an ordinary variable that can
# be reassigned.
reads.data.frame <- reads.data.frame[order(reads.data.frame$totalCounts), , drop = FALSE]
dim(reads.data.frame)
head(reads.data.frame)

In [None]:
library(ggplot2)
library(RColorBrewer)
options(repr.plot.width=15, repr.plot.height=9)

# Bar plot of total counts per sample. No fill aesthetic is mapped, so the
# former scale_fill_brewer() call had no effect and has been removed. The two
# stacked theme() calls are merged into one; every setting of the first was
# overridden by the second.
p <- ggplot(data = reads.data.frame, aes(x = sample, y = totalCounts)) +
        geom_bar(stat = "identity") +
        labs(
            title = paste0(dataset_id, ": mapped reads distribution"),
            x = "Samples",
            y = "Reads mapped",
            caption = paste0("Source: ", dataset_id)
        ) +
        # keep the bars in the row order of reads.data.frame (sorted by total counts)
        scale_x_discrete(limits = reads.data.frame$sample) +
        theme(
          # legend settings are the same as in the other plots of this notebook
          legend.position = "right", legend.direction = "vertical", # legend on the right, stacked vertically
          axis.line = element_line(size = 1, colour = "black"), # black axis lines, size 1
          panel.grid.major = element_blank(), # no major grid lines
          panel.grid.minor = element_blank(), # no minor grid lines
          panel.border = element_blank(), panel.background = element_blank(), # no border, no background fill
          plot.title = element_text(color = "black", size = 25, face = "bold"),
          axis.title = element_text(colour = "black", size = 20, face = "bold"), # axis titles
          axis.text.x = element_text(colour = "black", size = 15, angle = 90, margin = unit(c(0.5, 0.5, 0.1, 0.1), "cm"), face = "bold"), # x-axis labels, size 15, rotated 90 degrees
          axis.text.y = element_text(colour = "black", size = 15, margin = unit(c(0.5, 0.5, 0.1, 0.1), "cm"), face = "bold"), # y-axis labels, size 15
          legend.text = element_text(size = 20, face = "bold"),
          legend.title = element_text(colour = "black", size = 22, face = "bold"),
          axis.ticks.length = unit(-0.25, "cm")) # negative tick length
p

## Create DGEList

In [None]:
library(edgeR)

In [None]:
COHORT_COL <- 'kw_curated_cell_type'

In [None]:
# The DGEList built next pairs count columns with metadata rows by position,
# so both must list the same samples in the same order. identical() also
# catches a length mismatch, which `==` would hide by recycling. Stop here
# rather than let misaligned group labels flow into the analysis.
samples_aligned <- identical(colnames(counts), rownames(coldata))
if (!samples_aligned) {
    stop("colnames(counts) and rownames(coldata) do not match; ",
         "reorder or subset the metadata before building the DGEList.",
         call. = FALSE)
}
samples_aligned

In [None]:
# Build the edgeR DGEList: raw counts, the full sample metadata table, and
# the cohort column (COHORT_COL) as the grouping variable.
y <- DGEList(counts=counts, samples = coldata, group = coldata[, COHORT_COL])

## Filter out low expression genes

In [None]:
# Flag genes with enough counts to be retained, using filterByExpr's defaults.
keep_genes <- filterByExpr(y)
# keep.lib.sizes = FALSE recomputes each sample's library size from the
# retained genes, as recommended in the edgeR documentation, instead of
# carrying over the pre-filter totals.
y <- y[keep_genes, , keep.lib.sizes = FALSE]
dim(y$counts)

## Normalisation

In [None]:
# Compute normalisation factors with edgeR's calcNormFactors (default
# settings) and inspect the updated sample table.
y <- calcNormFactors(y)
head(y$samples)

In [None]:
# Log-transformed counts-per-million from the normalised DGEList; this matrix
# feeds the expression-distribution plot and the PCA below.
norm.data <- cpm(y, log = TRUE)
head(norm.data)

## Expression distribution

In [None]:
# Reshape the logCPM matrix to long format and name the three resulting
# columns gene / sample / expression.
df <- reshape2::melt(norm.data)
colnames(df) <- c('gene','sample','expression')
head(df)

In [None]:
# Attach each sample's cohort label by joining on the sample name (the row
# names of coldata). `drop = FALSE` keeps the single cohort column as a
# data frame; it is written out by name with the literal FALSE because `F`
# is an ordinary, reassignable variable.
df <- merge(df, coldata[, COHORT_COL, drop = FALSE], by.x = 'sample', by.y = 'row.names')
# Order rows by cohort so samples of the same cohort sit next to each other
# in the plot that follows.
df <- df[order(df[, COHORT_COL]), ]
head(df)

In [None]:
options(repr.plot.width=15, repr.plot.height=9)

# Box plot of logCPM values per sample, filled by cohort. The theme is given
# in a single theme() call holding the settings that were effective before.
p <- ggplot(data = df, aes_string(x = "sample", y = "expression", fill = COHORT_COL)) +
        geom_boxplot() +
        labs(
            title = paste0(dataset_id, ": expression distribution"),
            x = "Samples",
            y = "logCPM",
            caption = paste0("Source: ", dataset_id)
        ) +
        scale_fill_brewer(palette = "Dark2") +
        # keep samples in the row order of df (sorted by cohort beforehand)
        scale_x_discrete(limits = unique(df$sample)) +
        theme(
          legend.position = "right", legend.direction = "vertical", # legend on the right, stacked vertically
          axis.line = element_line(size = 1, colour = "black"), # black axis lines, size 1
          panel.grid.major = element_blank(), # no major grid lines
          panel.grid.minor = element_blank(), # no minor grid lines
          panel.border = element_blank(), panel.background = element_blank(), # no border, no background fill
          plot.title = element_text(color = "black", size = 25, face = "bold"),
          axis.title = element_text(colour = "black", size = 20, face = "bold"), # axis titles
          axis.text.x = element_text(colour = "black", size = 15, angle = 90, margin = unit(c(0.5, 0.5, 0.1, 0.1), "cm"), face = "bold"), # x-axis labels, size 15, rotated 90 degrees
          axis.text.y = element_text(colour = "black", size = 15, margin = unit(c(0.5, 0.5, 0.1, 0.1), "cm"), face = "bold"), # y-axis labels, size 15
          legend.text = element_text(size = 20, face = "bold"),
          legend.title = element_text(colour = "black", size = 22, face = "bold"),
          axis.ticks.length = unit(-0.25, "cm")) # negative tick length
p

## PCA

In [None]:
compute_pca <- function(input_matrix, metadata, ntop_variable_genes = 100){
    # Principal component analysis on the most variable genes.
    #
    # Args:
    #   input_matrix: genes x samples numeric matrix (or data frame).
    #   metadata: per-sample annotations, one row per column of
    #     input_matrix, in the same order (rows are bound by position).
    #   ntop_variable_genes: number of genes, ranked by median absolute
    #     deviation across samples, to keep. Values larger than the number
    #     of genes are clamped to it.
    #
    # Returns:
    #   A list with 'scores' (sample scores on each PC bound column-wise
    #   with metadata) and 'summary' (the summary() of the prcomp fit).
    expr <- as.data.frame(input_matrix)
    if (nrow(expr) == 0) {
        stop("input_matrix has no rows (genes)", call. = FALSE)
    }
    # data.frame() would silently recycle a shorter metadata table, so
    # insist on exactly one metadata row per sample.
    if (NROW(metadata) != ncol(expr)) {
        stop("metadata must have one row per column (sample) of input_matrix",
             call. = FALSE)
    }

    # Rank genes by MAD. The statistic is kept in a separate vector so it
    # never collides with a sample column that happens to be called "mad".
    gene_mad <- apply(as.matrix(expr), 1, mad)
    # Clamp the request: indexing 1:n past the last row would append
    # all-NA rows and make prcomp() fail.
    n_keep <- min(ntop_variable_genes, nrow(expr))
    top_idx <- order(gene_mad, decreasing = TRUE)[seq_len(n_keep)]
    expr <- expr[top_idx, , drop = FALSE]

    # Unit-variance scaling is undefined for constant genes; prcomp() stops
    # on them, so they are removed before the fit.
    gene_var <- apply(as.matrix(expr), 1, var)
    expr <- expr[!is.na(gene_var) & gene_var > 0, , drop = FALSE]
    if (nrow(expr) == 0) {
        stop("no genes with non-zero variance left for PCA", call. = FALSE)
    }

    # Samples become rows; genes are centred and scaled to unit variance.
    PCAObj <- prcomp(as.data.frame(t(expr)), scale. = TRUE)
    PCAObj_Summary <- summary(PCAObj)
    PCA_scores <- data.frame(PCAObj$x, metadata)

    return(
        list(
            'scores' = PCA_scores,
            'summary' = PCAObj_Summary
        )
    )
}

In [None]:
pca_plot <- function(pca, cohortCol, pc_x='PC1', pc_y='PC2', title='PCA', subtitle=''){
    # Scatter plot of two principal components, points filled by cohort.
    #
    # Args:
    #   pca: list with 'scores' (data frame holding the PC columns and the
    #     cohort column) and 'summary' (must provide an `importance` matrix
    #     whose second row is read as the proportion of variance per PC).
    #   cohortCol: name of the column in pca$scores mapped to the fill colour.
    #   pc_x, pc_y: names of the components drawn on the x and y axes.
    #   title, subtitle: plot title and subtitle.
    #
    # Returns:
    #   The ggplot object.

    # ggplot2 is mandatory: library() fails loudly, whereas require() only
    # returns FALSE and lets the function die later with an obscure error.
    library(ggplot2)
    # ggsci is not used by this function; it is still attached when it is
    # installed so that the original side effect is preserved.
    if (requireNamespace("ggsci", quietly = TRUE)) {
        library(ggsci)
    }

    unknown_pcs <- setdiff(c(pc_x, pc_y), colnames(pca$summary$importance))
    if (length(unknown_pcs) > 0) {
        stop("Unknown principal component(s): ",
             paste(unknown_pcs, collapse = ", "), call. = FALSE)
    }

    # Axis label of the form "PC1 ( 42.1 %)" using the variance explained.
    axis_label <- function(pc) {
        paste(pc, '(', round(pca$summary$importance[2, pc] * 100, 2), '%)')
    }

    p <- ggplot(pca$scores, aes_string(x = pc_x, y = pc_y, fill = cohortCol)) +
      geom_point(shape = 21, size = 5, alpha = 0.7) +
      labs(title = title, subtitle = subtitle,
           x = axis_label(pc_x),
           y = axis_label(pc_y), fill = cohortCol) +
        scale_fill_brewer(palette = "Dark2") +
        # Single theme() call: the former first theme() was fully overridden
        # by the second, so only the effective settings are kept.
        theme(
          legend.position = "right", legend.direction = "vertical", # legend on the right, stacked vertically
          axis.line = element_line(size = 1, colour = "black"), # black axis lines, size 1
          panel.grid.major = element_blank(), # no major grid lines
          panel.grid.minor = element_blank(), # no minor grid lines
          panel.border = element_blank(), panel.background = element_blank(), # no border, no background fill
          plot.title = element_text(color = "black", size = 25, face = "bold"),
          axis.title = element_text(colour = "black", size = 20, face = "bold"), # axis titles
          axis.text.x = element_text(colour = "black", size = 15, angle = 90, margin = unit(c(0.5, 0.5, 0.1, 0.1), "cm"), face = "bold"), # x-axis labels, size 15, rotated 90 degrees
          axis.text.y = element_text(colour = "black", size = 15, margin = unit(c(0.5, 0.5, 0.1, 0.1), "cm"), face = "bold"), # y-axis labels, size 15
          legend.text = element_text(size = 20, face = "bold"),
          legend.title = element_text(colour = "black", size = 22, face = "bold"),
          axis.ticks.length = unit(-0.25, "cm")) # negative tick length
    p
}

In [None]:
# Run the PCA on the logCPM matrix. Passing nrow(norm.data) as the number of
# top variable genes means every gene that survived filtering is used.
pca <- compute_pca(norm.data, coldata, nrow(norm.data))
head(pca$scores)

In [None]:
# Set the notebook plot size, then draw PC1 against PC2 with points filled
# by the cohort column.
options(repr.plot.width=15, repr.plot.height=9)
pca_plot(pca, COHORT_COL, pc_x='PC1', pc_y='PC2', title='PCA')