# Assess differential gene expression among low-cell and bulk RNA-seq samples

#### Install (if needed) and load required packages

In [1]:
if (!require(edgeR)) {install.packages("BiocManager");BiocManager::install("edgeR")}
require(edgeR)
require(gplots)
require(feather)

Loading required package: edgeR
"package 'edgeR' was built under R version 3.6.2"Loading required package: limma
"package 'limma' was built under R version 3.6.2"Loading required package: gplots
"there is no package called 'gplots'"Loading required package: feather
"there is no package called 'feather'"

## Look at the low cell data

#### Load data from csv file (genes in rows, samples in columns)

In [2]:
# Load the low-cell expression table (genes in rows, samples in columns).
# file.path() keeps the path portable instead of hard-coding a Windows "\\".
dat <- read.csv(file.path("data", "lowCellSeq.csv"),
                as.is = TRUE, header = TRUE, row.names = 1)
dat[1:5, 1:5] # view first five rows+columns of data matrix to check that the input is correct

# The cell type of each sample is the part of its column name before the
# first "_" (e.g. "P.EN2_r1" -> "P.EN2").
celltyp <- sub("_.*$", "", colnames(dat))
print(unique(celltyp))

# Container for the differential-expression tables produced by later cells.
dge_table <- list()

Unnamed: 0,P.EN2_r1,P.EN2_r2,P.EN2_r3,P.EN2_r4,P.EN2_r5
ERCC-00002,12075.0372,14097.317,14240.2966,14735.2292,15290.5389
ERCC-00003,270.5944,411.7351,489.4273,802.7052,806.6205
ERCC-00004,4339.1067,3867.4418,4001.0686,4482.9562,5049.1541
ERCC-00009,497.0494,620.6752,279.3814,636.0677,792.1055
ERCC-00012,0.0,0.0,0.0,0.0,0.0


[1] "P.EN2"      "ring"       "wholebrain"


#### Run differential expression vs. whole brain data

In [3]:
# Differential expression: P.EN2 (class 1) vs. whole brain (class 2).
# Stores in dge_table[["PEN2_wholebrain"]] the edgeR result table plus, per
# gene, the mean CPM in each class (mean1/mean2) and the fraction of samples
# in each class with CPM > 0 (frac1/frac2).
celltype1_columns <- which(celltyp == "P.EN2")
celltype2_columns <- which(celltyp == "wholebrain")
if (length(celltype1_columns) == 0 || length(celltype2_columns) == 0) {
  stop("Both 'P.EN2' and 'wholebrain' samples are required for this comparison",
       call. = FALSE)
}

# Subset once: class-1 samples first, then class-2, so that classvec lines up
# with the columns by position.
de_columns <- c(celltype1_columns, celltype2_columns)
de_counts <- dat[, de_columns, drop = FALSE]
classvec <- as.factor(rep(c(1, 2),
                          times = c(length(celltype1_columns),
                                    length(celltype2_columns))))

# Counts per million based on each sample's column sum; used only for the
# mean/fraction summary columns, not for the statistical test.
startmat_cpm <- sweep(de_counts, 2, colSums(de_counts), "/") * 10^6

# edgeR quasi-likelihood F-test. Coefficient 2 of ~classvec is the effect of
# class 2 relative to class 1.
e_design <- model.matrix(~classvec)
y2 <- DGEList(counts = de_counts)
y2 <- estimateDisp(y2, e_design)
fit <- glmQLFit(y2, e_design)
qlf.2vs1 <- glmQLFTest(fit, coef = 2)
outval2 <- topTags(qlf.2vs1, n = nrow(startmat_cpm), p.value = 1)

# Per-class summaries in the same gene order as the result table.
# drop = FALSE keeps a data.frame even when a class has a single sample.
de_genes <- rownames(outval2$table)
cpm_class1 <- startmat_cpm[de_genes, classvec == 1, drop = FALSE]
cpm_class2 <- startmat_cpm[de_genes, classvec == 2, drop = FALSE]
mean1 <- rowMeans(cpm_class1)
mean2 <- rowMeans(cpm_class2)
frac1 <- rowSums(cpm_class1 > 0) / length(celltype1_columns)
frac2 <- rowSums(cpm_class2 > 0) / length(celltype2_columns)
dge_table[["PEN2_wholebrain"]] <- cbind(outval2$table, mean1, mean2, frac1, frac2)

In [4]:
# Differential expression: ring (class 1) vs. whole brain (class 2).
# Stores in dge_table[["ring_wholebrain"]] the edgeR result table plus, per
# gene, the mean CPM in each class (mean1/mean2) and the fraction of samples
# in each class with CPM > 0 (frac1/frac2).
celltype1_columns <- which(celltyp == "ring")
celltype2_columns <- which(celltyp == "wholebrain")
if (length(celltype1_columns) == 0 || length(celltype2_columns) == 0) {
  stop("Both 'ring' and 'wholebrain' samples are required for this comparison",
       call. = FALSE)
}

# Subset once: class-1 samples first, then class-2, so that classvec lines up
# with the columns by position.
de_columns <- c(celltype1_columns, celltype2_columns)
de_counts <- dat[, de_columns, drop = FALSE]
classvec <- as.factor(rep(c(1, 2),
                          times = c(length(celltype1_columns),
                                    length(celltype2_columns))))

# Counts per million based on each sample's column sum; used only for the
# mean/fraction summary columns, not for the statistical test.
startmat_cpm <- sweep(de_counts, 2, colSums(de_counts), "/") * 10^6

# edgeR quasi-likelihood F-test. Coefficient 2 of ~classvec is the effect of
# class 2 relative to class 1.
e_design <- model.matrix(~classvec)
y2 <- DGEList(counts = de_counts)
y2 <- estimateDisp(y2, e_design)
fit <- glmQLFit(y2, e_design)
qlf.2vs1 <- glmQLFTest(fit, coef = 2)
outval2 <- topTags(qlf.2vs1, n = nrow(startmat_cpm), p.value = 1)

# Per-class summaries in the same gene order as the result table.
# drop = FALSE keeps a data.frame even when a class has a single sample.
de_genes <- rownames(outval2$table)
cpm_class1 <- startmat_cpm[de_genes, classvec == 1, drop = FALSE]
cpm_class2 <- startmat_cpm[de_genes, classvec == 2, drop = FALSE]
mean1 <- rowMeans(cpm_class1)
mean2 <- rowMeans(cpm_class2)
frac1 <- rowSums(cpm_class1 > 0) / length(celltype1_columns)
frac2 <- rowSums(cpm_class2 > 0) / length(celltype2_columns)
dge_table[["ring_wholebrain"]] <- cbind(outval2$table, mean1, mean2, frac1, frac2)

#### Save the data

In [5]:
# Write every low-cell DE table to dge/dge_table_lowCell_<name>.csv.
# The "lowCell" tag keeps these files distinct from the bulk results written
# later in this notebook: both sections store a "PEN2_wholebrain" table, so an
# untagged name would be overwritten by the bulk save.
dir.create("dge", showWarnings = FALSE, recursive = TRUE)
for (tablename in names(dge_table)) {
  write.csv(dge_table[[tablename]],
            file = file.path("dge", paste0("dge_table_lowCell_", tablename, ".csv")))
}

## Look at the bulk data

#### Load data from csv file (genes in rows, samples in columns)

In [6]:
# Load the bulk expression table (genes in rows, samples in columns).
# file.path() keeps the path portable instead of hard-coding a Windows "\\".
dat <- read.csv(file.path("data", "bulkSeq.csv"),
                as.is = TRUE, header = TRUE, row.names = 1)
dat[1:5, 1:5] # view first five rows+columns of data matrix to check that the input is correct

# The cell type of each sample is the part of its column name before the
# first "_" (e.g. "E.PG_l1r1" -> "E.PG").
celltyp <- sub("_.*$", "", colnames(dat))
print(unique(celltyp))

# Start a fresh container for the bulk differential-expression tables.
dge_table <- list()

Unnamed: 0,E.PG_l1r1,E.PG_l1r2,E.PG_l2r1,E.PG_l2r2,P.EG_r1
ERCC-00002,1548.17,727.82,173.11,298.57,231.35
ERCC-00003,68.98,90.31,41.6,26.95,36.51
ERCC-00004,614.97,299.36,186.32,76.96,141.78
ERCC-00009,89.47,24.66,19.9,4.68,16.55
ERCC-00012,0.0,0.0,0.0,0.0,0.0


[1] "E.PG"       "P.EG"       "D7"         "P.EN2"      "wholebrain"


In [7]:
# Differential expression: E.PG (class 1) vs. whole brain (class 2).
# Stores in dge_table[["EPG_wholebrain"]] the edgeR result table plus, per
# gene, the mean CPM in each class (mean1/mean2) and the fraction of samples
# in each class with CPM > 0 (frac1/frac2).
celltype1_columns <- which(celltyp == "E.PG")
celltype2_columns <- which(celltyp == "wholebrain")
if (length(celltype1_columns) == 0 || length(celltype2_columns) == 0) {
  stop("Both 'E.PG' and 'wholebrain' samples are required for this comparison",
       call. = FALSE)
}

# Subset once: class-1 samples first, then class-2, so that classvec lines up
# with the columns by position.
de_columns <- c(celltype1_columns, celltype2_columns)
de_counts <- dat[, de_columns, drop = FALSE]
classvec <- as.factor(rep(c(1, 2),
                          times = c(length(celltype1_columns),
                                    length(celltype2_columns))))

# Counts per million based on each sample's column sum; used only for the
# mean/fraction summary columns, not for the statistical test.
startmat_cpm <- sweep(de_counts, 2, colSums(de_counts), "/") * 10^6

# edgeR quasi-likelihood F-test. Coefficient 2 of ~classvec is the effect of
# class 2 relative to class 1.
e_design <- model.matrix(~classvec)
y2 <- DGEList(counts = de_counts)
y2 <- estimateDisp(y2, e_design)
fit <- glmQLFit(y2, e_design)
qlf.2vs1 <- glmQLFTest(fit, coef = 2)
outval2 <- topTags(qlf.2vs1, n = nrow(startmat_cpm), p.value = 1)

# Per-class summaries in the same gene order as the result table.
# drop = FALSE keeps a data.frame even when a class has a single sample.
de_genes <- rownames(outval2$table)
cpm_class1 <- startmat_cpm[de_genes, classvec == 1, drop = FALSE]
cpm_class2 <- startmat_cpm[de_genes, classvec == 2, drop = FALSE]
mean1 <- rowMeans(cpm_class1)
mean2 <- rowMeans(cpm_class2)
frac1 <- rowSums(cpm_class1 > 0) / length(celltype1_columns)
frac2 <- rowSums(cpm_class2 > 0) / length(celltype2_columns)
dge_table[["EPG_wholebrain"]] <- cbind(outval2$table, mean1, mean2, frac1, frac2)

In [8]:
# Differential expression: P.EG (class 1) vs. whole brain (class 2).
# Stores in dge_table[["PEG_wholebrain"]] the edgeR result table plus, per
# gene, the mean CPM in each class (mean1/mean2) and the fraction of samples
# in each class with CPM > 0 (frac1/frac2).
celltype1_columns <- which(celltyp == "P.EG")
celltype2_columns <- which(celltyp == "wholebrain")
if (length(celltype1_columns) == 0 || length(celltype2_columns) == 0) {
  stop("Both 'P.EG' and 'wholebrain' samples are required for this comparison",
       call. = FALSE)
}

# Subset once: class-1 samples first, then class-2, so that classvec lines up
# with the columns by position.
de_columns <- c(celltype1_columns, celltype2_columns)
de_counts <- dat[, de_columns, drop = FALSE]
classvec <- as.factor(rep(c(1, 2),
                          times = c(length(celltype1_columns),
                                    length(celltype2_columns))))

# Counts per million based on each sample's column sum; used only for the
# mean/fraction summary columns, not for the statistical test.
startmat_cpm <- sweep(de_counts, 2, colSums(de_counts), "/") * 10^6

# edgeR quasi-likelihood F-test. Coefficient 2 of ~classvec is the effect of
# class 2 relative to class 1.
e_design <- model.matrix(~classvec)
y2 <- DGEList(counts = de_counts)
y2 <- estimateDisp(y2, e_design)
fit <- glmQLFit(y2, e_design)
qlf.2vs1 <- glmQLFTest(fit, coef = 2)
outval2 <- topTags(qlf.2vs1, n = nrow(startmat_cpm), p.value = 1)

# Per-class summaries in the same gene order as the result table.
# drop = FALSE keeps a data.frame even when a class has a single sample.
de_genes <- rownames(outval2$table)
cpm_class1 <- startmat_cpm[de_genes, classvec == 1, drop = FALSE]
cpm_class2 <- startmat_cpm[de_genes, classvec == 2, drop = FALSE]
mean1 <- rowMeans(cpm_class1)
mean2 <- rowMeans(cpm_class2)
frac1 <- rowSums(cpm_class1 > 0) / length(celltype1_columns)
frac2 <- rowSums(cpm_class2 > 0) / length(celltype2_columns)
dge_table[["PEG_wholebrain"]] <- cbind(outval2$table, mean1, mean2, frac1, frac2)

In [9]:
# Differential expression: D7 (class 1) vs. whole brain (class 2).
# Stores in dge_table[["D7_wholebrain"]] the edgeR result table plus, per
# gene, the mean CPM in each class (mean1/mean2) and the fraction of samples
# in each class with CPM > 0 (frac1/frac2).
celltype1_columns <- which(celltyp == "D7")
celltype2_columns <- which(celltyp == "wholebrain")
if (length(celltype1_columns) == 0 || length(celltype2_columns) == 0) {
  stop("Both 'D7' and 'wholebrain' samples are required for this comparison",
       call. = FALSE)
}

# Subset once: class-1 samples first, then class-2, so that classvec lines up
# with the columns by position.
de_columns <- c(celltype1_columns, celltype2_columns)
de_counts <- dat[, de_columns, drop = FALSE]
classvec <- as.factor(rep(c(1, 2),
                          times = c(length(celltype1_columns),
                                    length(celltype2_columns))))

# Counts per million based on each sample's column sum; used only for the
# mean/fraction summary columns, not for the statistical test.
startmat_cpm <- sweep(de_counts, 2, colSums(de_counts), "/") * 10^6

# edgeR quasi-likelihood F-test. Coefficient 2 of ~classvec is the effect of
# class 2 relative to class 1.
e_design <- model.matrix(~classvec)
y2 <- DGEList(counts = de_counts)
y2 <- estimateDisp(y2, e_design)
fit <- glmQLFit(y2, e_design)
qlf.2vs1 <- glmQLFTest(fit, coef = 2)
outval2 <- topTags(qlf.2vs1, n = nrow(startmat_cpm), p.value = 1)

# Per-class summaries in the same gene order as the result table.
# drop = FALSE keeps a data.frame even when a class has a single sample.
de_genes <- rownames(outval2$table)
cpm_class1 <- startmat_cpm[de_genes, classvec == 1, drop = FALSE]
cpm_class2 <- startmat_cpm[de_genes, classvec == 2, drop = FALSE]
mean1 <- rowMeans(cpm_class1)
mean2 <- rowMeans(cpm_class2)
frac1 <- rowSums(cpm_class1 > 0) / length(celltype1_columns)
frac2 <- rowSums(cpm_class2 > 0) / length(celltype2_columns)
dge_table[["D7_wholebrain"]] <- cbind(outval2$table, mean1, mean2, frac1, frac2)

In [10]:
# Differential expression: P.EN2 (class 1) vs. whole brain (class 2).
# Stores in dge_table[["PEN2_wholebrain"]] the edgeR result table plus, per
# gene, the mean CPM in each class (mean1/mean2) and the fraction of samples
# in each class with CPM > 0 (frac1/frac2).
celltype1_columns <- which(celltyp == "P.EN2")
celltype2_columns <- which(celltyp == "wholebrain")
if (length(celltype1_columns) == 0 || length(celltype2_columns) == 0) {
  stop("Both 'P.EN2' and 'wholebrain' samples are required for this comparison",
       call. = FALSE)
}

# Subset once: class-1 samples first, then class-2, so that classvec lines up
# with the columns by position.
de_columns <- c(celltype1_columns, celltype2_columns)
de_counts <- dat[, de_columns, drop = FALSE]
classvec <- as.factor(rep(c(1, 2),
                          times = c(length(celltype1_columns),
                                    length(celltype2_columns))))

# Counts per million based on each sample's column sum; used only for the
# mean/fraction summary columns, not for the statistical test.
startmat_cpm <- sweep(de_counts, 2, colSums(de_counts), "/") * 10^6

# edgeR quasi-likelihood F-test. Coefficient 2 of ~classvec is the effect of
# class 2 relative to class 1.
e_design <- model.matrix(~classvec)
y2 <- DGEList(counts = de_counts)
y2 <- estimateDisp(y2, e_design)
fit <- glmQLFit(y2, e_design)
qlf.2vs1 <- glmQLFTest(fit, coef = 2)
outval2 <- topTags(qlf.2vs1, n = nrow(startmat_cpm), p.value = 1)

# Per-class summaries in the same gene order as the result table.
# drop = FALSE keeps a data.frame even when a class has a single sample.
de_genes <- rownames(outval2$table)
cpm_class1 <- startmat_cpm[de_genes, classvec == 1, drop = FALSE]
cpm_class2 <- startmat_cpm[de_genes, classvec == 2, drop = FALSE]
mean1 <- rowMeans(cpm_class1)
mean2 <- rowMeans(cpm_class2)
frac1 <- rowSums(cpm_class1 > 0) / length(celltype1_columns)
frac2 <- rowSums(cpm_class2 > 0) / length(celltype2_columns)
dge_table[["PEN2_wholebrain"]] <- cbind(outval2$table, mean1, mean2, frac1, frac2)

In [11]:
# Write every DE table in dge_table to dge/dge_table_<name>.csv.
# file.path() builds a portable path, and the output directory is created
# first so write.csv() does not fail when it is missing.
dir.create("dge", showWarnings = FALSE, recursive = TRUE)
for (tablename in names(dge_table)) {
  write.csv(dge_table[[tablename]],
            file = file.path("dge", paste0("dge_table_", tablename, ".csv")))
}