# MS_Data preprocessing

##################################################################################################################
#####                                     PRE-PROCESSING Poly MeRIP-seq DATA                                       ####
##################################################################################################################

## By Mark Larance and Renhua Song

# Raw data are available at the ProteomeXchange Consortium (http://proteomecentral.proteomexchange.org) via the PRIDE partner repository with the dataset identifier PXD017391.

######## THIS PORTION IS RUN by Mark Larance ######
# MaxQuant version 1.6.3.4

# Peptide- and protein-level identifications were set to a false discovery rate (FDR) of 1% using the human UniProt database.

### THIS PORTION OF THIS SCRIPT IS RUN IN R USING A .R SCRIPT ###
#############################################################################

## LIBRARIES TO USE

library(ggfortify)
library(cluster)
library(DESeq2)
library(apeglm)
library(limma)
library(edgeR)
library(clusterProfiler)
library(org.Hs.eg.db)

# Input data-------------------------------------------

## Load the protein table; lines starting with "#" are treated as comments

df <- read.delim(
  "filtered_NP_EXP_log2FC_Pvalue_ND_M0_M1_v2.txt",
  comment.char = "#"
)

## Imputed LFQ intensity columns (per the original note), picked by position

count <- df[, c(58:66, 73:75)]

## Protein IDs become the row names

rownames(count) <- df$Protein.IDs

## Sample names: three replicates for each of mo, M0, M1 and M2

colnames(count) <- c(
  "mo1", "mo2", "mo3",
  "M01", "M02", "M03",
  "M11", "M12", "M13",
  "M21", "M22", "M23"
)

## Sample information: one row per sample, in the column order of `count`
## (note that this object shares its name with base::sample)
sample <- data.frame(
  sample = colnames(count),
  condition = rep(c("mo", "M0", "M1", "M2"), each = 3)
)


#### Select proteins by the number of missing raw values.
## NOTE(review): the original heading asked for "at least 6 samples' value",
## but `< 6` missing values out of 12 columns keeps proteins with at least 7
## observed values. The threshold is left unchanged; confirm which was meant.

count_raw <- df[, c(37:39, 22:24, 28:30, 34:36)] ## Raw LFQ intensity

## Base subsetting is used instead of filter(): this script does not attach
## dplyr itself, so filter() may resolve to stats::filter and fail on a
## data frame.
count_data <- count[rowSums(is.na(count_raw)) < 6, , drop = FALSE]

##### Quality Check : missing values #####

## Per-sample number and fraction of NA entries in the retained proteins
figa <- data.frame(
  Percentage = as.numeric(colSums(is.na(count_data)) / nrow(count_data)),
  Count = colSums(is.na(count_data))
)
figa$name <- c(
  "mo_r1", "mo_r2", "mo_r3", "M0_r1", "M0_r2", "M0_r3",
  "M1_r1", "M1_r2", "M1_r3", "M2_r1", "M2_r2", "M2_r3"
)
## Fix the factor levels so the bars keep the sample order above
figa$name <- factor(figa$name, levels = figa$name)
figa$condition <- factor(
  rep(c("mo", "M0", "M1", "M2"), each = 3),
  levels = c("mo", "M0", "M1", "M2")
)

## Plot the count of missing values per sample, labelled with the percentage.
## The label refers to the `Percentage` column directly (no `figa$` inside
## aes()), and the constant vjust is set outside aes().
ggplot(data = figa, aes(name, Count, group = condition, fill = condition)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = c("red", "orange", "blue", "green")) +
  geom_text(aes(label = paste0(round(100 * Percentage, 1), "%")), vjust = -0.2) +
  theme_classic(base_size = 15) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

################## Quality Check : imputed value distribution ##########

## Read the imputed LFQ intensities and log10-transform them
## (the +1 keeps zero intensities finite).
## Note: this overwrites `count`, so later steps see the log10 values.

count <- log10(df[, c(58:66, 73:75)] + 1)
rownames(count) <- df$Protein.IDs
colnames(count) <- c(
  "mo_r1", "mo_r2", "mo_r3", "M0_r1", "M0_r2", "M0_r3",
  "M1_r1", "M1_r2", "M1_r3", "M2_r1", "M2_r2", "M2_r3"
)

## Reshape to long format (columns `variable` and `value`). Base stack() is
## used because melt() comes from reshape2, which this script never loads.
count_melt <- stack(count)
count_melt <- data.frame(variable = count_melt$ind, value = count_melt$values)

## The condition is the part of the sample name before the first underscore
count_melt$condition <- sub("_.*$", "", as.character(count_melt$variable))
count_melt$condition <- factor(
  count_melt$condition,
  levels = c("mo", "M0", "M1", "M2")
)

## Plot the data distribution per sample; the legend is hidden with the
## documented value "none" ("NA" is not a valid legend position).

ggplot(count_melt, aes(x = factor(variable), y = value, fill = condition)) +
  ylab("log10(Impute Intensity Value)") +
  geom_boxplot() +
  scale_fill_manual(values = c("red", "orange", "blue", "green")) +
  theme_classic(base_size = 15) +
  xlab("") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "none"
  )

##### Quality Check : overlapping proteins #####

## Draw a "flower plot": one ellipse (petal) per sample arranged around a
## central circle, on a fresh 10 x 10 plotting area.
##
## Arguments:
##   sample           labels written outside the petals; its length sets the
##                    number of petals.
##   value            text written inside each petal, indexed like `sample`.
##   start            angle in degrees at which the first petal is placed.
##   a, b             ellipse semi-axes, passed to plotrix::draw.ellipse().
##   ellipse_col      fill and border colour of the petals; recycled to one
##                    colour per petal.
##   circle_col       fill and border colour of the central circle.
##   circle_text_cex  character expansion of the sample labels.
##   labels           text written in the central circle (default: none).
##
## Returns NULL invisibly. The graphical parameters changed for the drawing
## (bty, ann, xaxt, yaxt, mar) are restored on exit so later plots keep their
## axes and margins.
flower_plot2 <- function(sample, value, start, a, b,
                         ellipse_col = rgb(135, 206, 235, 150, maxColorValue = 255),
                         circle_col = rgb(0, 162, 214, maxColorValue = 255),
                         circle_text_cex = 1, labels = "") {
  old_par <- par(bty = "n", ann = FALSE, xaxt = "n", yaxt = "n",
                 mar = c(1, 1, 1, 1))
  on.exit(par(old_par), add = TRUE)
  plot(c(0, 10), c(0, 10), type = "n")

  n   <- length(sample)
  deg <- 360 / n
  # Recycle the colours so a single colour (the default) applies to every
  # petal instead of only the first one.
  petal_cols <- rep_len(ellipse_col, n)

  for (t in seq_len(n)) {
    rotation <- deg * (t - 1)
    theta    <- (start + rotation) * pi / 180

    plotrix::draw.ellipse(x = 5 + cos(theta),
                          y = 5 + sin(theta),
                          col = petal_cols[t],
                          border = petal_cols[t],
                          a = a, b = b, angle = rotation)

    # Value inside the petal
    text(x = 5 + 2.5 * cos(theta),
         y = 5 + 2.5 * sin(theta),
         value[t])

    # Sample label outside the petal; petals rotated strictly between 0 and
    # 180 degrees use the opposite text rotation offset and justification.
    flipped <- rotation < 180 && rotation > 0
    text(x = 5 + 3.3 * cos(theta),
         y = 5 + 3.3 * sin(theta),
         sample[t],
         srt = if (flipped) rotation - start else rotation + start,
         adj = if (flipped) 1 else 0,
         cex = circle_text_cex)
  }

  plotrix::draw.circle(x = 5, y = 5, r = 1.5, col = circle_col,
                       border = circle_col)

  # tune location by x and y.
  text(x = 4.7, y = 5, labels = labels)
  invisible(NULL)
}

## Remove proteins with any NA value.
## This is done before the flower plot because the plot needs `count1`
## (it was previously used before being created).

count1 <- count[complete.cases(count), ]

## Plot the overlapping protein numbers: each petal shows the number of
## non-missing proteins in that sample, the centre shows the number of
## proteins without any missing value.
## NOTE(review): the petal values were previously computed from `count1`;
## once `count1` holds complete cases only, every petal would show the same
## number, so they are computed from `count` here. Confirm the intended table.
## The core count is derived from `count1` instead of the hard-coded 3962.

flower_plot2(
  c("mo_r1", "mo_r2", "mo_r3", "M0_r1", "M0_r2", "M0_r3",
    "M1_r1", "M1_r2", "M1_r3", "M2_r1", "M2_r2", "M2_r3"),
  as.numeric(nrow(count) - colSums(is.na(count))),
  90, 0.5, 2,
  labels = paste0("Core items: ", nrow(count1)),
  ellipse_col = c(rep("red", 3), rep("orange", 3), rep("blue", 3), rep("green", 3)),
  circle_col = "white"
)

## Wrap the complete-case table in an edgeR DGEList

y <- DGEList(counts = count1)

## Normalization factors (calcNormFactors defaults)

y <- calcNormFactors(y)

## Cluster samples: multidimensional scaling on the top 100 features

mds <- plotMDS(y, top = 100)

## Collect the MDS coordinates (scaled by 100) together with the condition
## of each sample; the factor levels fix the legend and colour order

toplot <- data.frame(
  PC1 = mds$x * 100,
  PC2 = mds$y * 100,
  condition = factor(sample$condition, levels = c("mo", "M0", "M1", "M2"))
)

## Scatter plot of the two MDS dimensions.
## The variance percentages in the axis titles are hard-coded constants.

ggplot(toplot, aes(PC1, PC2, color = condition)) +
  geom_point(size = 6, shape = 0) +
  scale_color_manual(values = c("red", "orange", "blue", "green")) +
  xlab(paste0("PC1: ", 11, "% variance")) +
  ylab(paste0("PC2: ", 70, "% variance")) +
  theme_bw(base_size = 12) +
  theme(legend.position = "top")


### Create the design matrix: one column per condition, no intercept

condition <- sample$condition
design <- model.matrix(~ 0 + condition, data = y$samples)
## Strip the "condition" prefix so the columns are named after the groups
colnames(design) <- gsub("condition", "", colnames(design))

## Estimate dispersions

y <- estimateDisp(y, design, robust = TRUE)

## Test for DE proteins (quasi-likelihood F-test, M1 versus mo)

fit <- glmQLFit(y, design, robust = TRUE)
contr <- makeContrasts("M1 - mo", levels = design)
qlf <- glmQLFTest(fit, contrast = contr)
## NOTE(review): this summary uses lfc = 2 while `res1` below uses
## |logFC| > 1; confirm that the two thresholds are meant to differ.
is.de <- decideTests(qlf, p.value = 0.05, adjust.method = "BH", lfc = 2)
summary(is.de)

## Retrieve the full result table. n = Inf replaces the hard-coded n = 483 so
## the FDR/logFC filter below is applied to every protein rather than to a
## fixed-size head of the ranking.
tt <- topTags(qlf, n = Inf)
dep <- as.data.frame(tt$table)
res1 <- subset(dep, abs(logFC) > 1 & FDR < 0.05)

#################### Replicability of processed data ########

## Panel function for pairs(): draws a background grid, the points of one
## pair of variables (samples) in black, and the identity line y = x in red.
##   x, y  numeric vectors with the coordinates of the points.
## The grid is drawn by an explicit grid() call; `panel.first` is not a
## graphical parameter of points(). The density colours that used to be
## computed here were never used for drawing, so that computation is dropped.
plotFun <- function(x, y) {
  grid()
  points(x, y, col = "black", pch = ".", cex = 4)
  abline(a = 0, b = 1, col = "red")
}

## Counts-per-million computed with the normalized library sizes
nc <- as.data.frame(cpm(y, normalized.lib.sizes = TRUE))

## Keep the proteins listed in `res1` and the first three columns
## (the three mo replicates)
count.table2 <- nc[rownames(nc) %in% rownames(res1), 1:3]
colnames(count.table2) <- c("mo replicate1", "mo replicate2", "mo replicate3")

## Pairwise scatter plots of the replicates on a log2(x + 1) scale.
## The columns are drawn in a random order; the seed makes that order
## reproducible.
set.seed(123)
pairs(
  log2(count.table2[, base::sample(ncol(count.table2), 3)] + 1),
  panel = plotFun
)
      
#################### Gene Ontology analysis of processed data ########

## The background (universe) for every enrichment is rownames(vsd_count).
## NOTE(review): `vsd_count` is not created anywhere in this script; it is
## presumably produced by a separate analysis and must be loaded beforehand.
if (!exists("vsd_count")) {
  stop("`vsd_count` must be loaded before the GO analysis: ",
       "its row names are used as the enrichGO universe.", call. = FALSE)
}

## GO Biological Process enrichment for a vector of gene symbols.
##   genes     character vector of gene symbols to test.
##   universe  character vector of background gene symbols.
## Both cutoffs are 1, so every tested term is returned (BH-adjusted).
run_go_bp <- function(genes, universe = rownames(vsd_count)) {
  enrichGO(gene          = genes,
           OrgDb         = org.Hs.eg.db,
           universe      = universe,
           keyType       = "SYMBOL",
           ont           = "BP",
           pAdjustMethod = "BH",
           pvalueCutoff  = 1,
           qvalueCutoff  = 1)
}

## read the differential Mass spec between mo and M0 ##
MS_M0vsmo <- read.csv("MS_M0vsmo.csv")
rownames(MS_M0vsmo) <- MS_M0vsmo[, 1]
mo_M0_GO <- run_go_bp(MS_M0vsmo$Gene)

## read the differential Mass spec between M0 and M1 ##
MS_M1vsM0 <- read.csv("MS_M1vsM0.csv")
rownames(MS_M1vsM0) <- MS_M1vsM0[, 1]
M0_M1_GO <- run_go_bp(MS_M1vsM0$Gene)

## read the differential Mass spec between M0 and M2 ##
MS_M2vsM0 <- read.csv("MS_M2vsM0.csv")
rownames(MS_M2vsM0) <- MS_M2vsM0[, 1]
M0_M2_GO <- run_go_bp(MS_M2vsM0$Gene)

### Convert to DataFrame #####
GO_mo_M0_MS <- as.data.frame(mo_M0_GO)
GO_M0_M1_MS <- as.data.frame(M0_M1_GO)
GO_M0_M2_MS <- as.data.frame(M0_M2_GO)