/
compartments_genes.R
214 lines (174 loc) · 10.3 KB
/
compartments_genes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
library(diffHic)
library(reshape2)
library(BSgenome.Hsapiens.UCSC.hg19)
library(limma)
seg.frags <- segmentGenome(BSgenome.Hsapiens.UCSC.hg19, size=500000)
seg.frags = seg.frags[1:6087]
# read hg19 gtf file
hg19_gene_gtf <- read.table( file = "/lustre/user/liclab/publicData/igenomes/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.gtf", sep = "\t")
hg19_gene_gtf_gr <- GRanges(seqnames = hg19_gene_gtf[, 1],
ranges = IRanges( start = hg19_gene_gtf[, 4], end = hg19_gene_gtf[, 5] ),
strand = hg19_gene_gtf[, 7] )
mcols(hg19_gene_gtf_gr ) <- DataFrame( type = hg19_gene_gtf[, 3], gene_id = strsplit2( hg19_gene_gtf[, 9], " |;")[, 2])
head(hg19_gene_gtf_gr )
# RPMI8226_HindIII
RPMI8226_HindIII_compartment <- numeric()
for(i in 1:23){
tmp <- read.table(paste0("/lustre/user/liclab/lirf/Project/hic/mm/pca/500kb/RPMI8226_HindIII/compartment/chr",i,".txt",collapse = ""))
RPMI8226_HindIII_compartment = rbind(RPMI8226_HindIII_compartment, tmp)
}
dim(RPMI8226_HindIII_compartment)
RPMI8226_HindIII_compartment <- as.numeric(as.matrix(RPMI8226_HindIII_compartment))
length(RPMI8226_HindIII_compartment)
# U266_HindIII
U266_HindIII_compartment <- numeric()
for(i in 1:23){
tmp <- read.table(paste0("/lustre/user/liclab/lirf/Project/hic/mm/pca/500kb/U266_HindIII/compartment/chr",i,".txt",collapse = ""))
U266_HindIII_compartment = rbind(U266_HindIII_compartment, tmp)
}
dim(U266_HindIII_compartment)
U266_HindIII_compartment <- as.numeric(as.matrix(U266_HindIII_compartment) )
length(U266_HindIII_compartment)
# GM12878
GM12878_compartment <- numeric()
for(i in 1:23){
tmp <- read.table(paste0("/lustre/user/liclab/lirf/Project/hic/mm/pca/500kb/gm12878/compartment/chr",i,".txt",collapse = ""))
GM12878_compartment = rbind(GM12878_compartment, tmp)
}
dim(GM12878_compartment)
GM12878_compartment <- as.numeric(as.matrix(GM12878_compartment))
length(GM12878_compartment)
############################################################
mcols(seg.frags) <- DataFrame(RPMI8226_HindIII = RPMI8226_HindIII_compartment,
U266_HindIII = U266_HindIII_compartment, GM12878 = GM12878_compartment )
seg.frags = seg.frags[which(is.na(seg.frags$RPMI8226_HindIII) == F)]
seg.frags = seg.frags[which(is.na(seg.frags$U266_HindIII) == F)]
seg.frags = seg.frags[which(is.na(seg.frags$GM12878) == F)]
seg.frags_all_A <- seg.frags[ seg.frags$RPMI8226_HindIII > 0 & seg.frags$U266_HindIII > 0 & seg.frags$GM12878 < 0]
length(seg.frags_all_A) # [1] 262
B2A_genes <- hg19_gene_gtf_gr[ findOverlaps(query = seg.frags_all_A, subject = hg19_gene_gtf_gr )@subjectHits ]
head(B2A_genes)
write.table( x = unique(B2A_genes$gene_id), file = "Compartment_B2A_in_MM.txt", quote = F, row.names = F, col.names = F)
seg.frags_all_B <- seg.frags[ seg.frags$RPMI8226_HindIII < 0 & seg.frags$U266_HindIII < 0 & seg.frags$GM12878 > 0]
length(seg.frags_all_B)
A2B_genes <- hg19_gene_gtf_gr[ findOverlaps(query = seg.frags_all_B, subject = hg19_gene_gtf_gr )@subjectHits ]
head(A2B_genes)
write.table( x = unique(A2B_genes$gene_id), file = "Compartment_A2B_in_MM.txt", quote = F, row.names = F, col.names = F)
hg19_len <- read.table("/lustre/user/liclab/biotools/FREEC_Linux64/data/hg19.len")
# read expression of GM12878, RPMI-8226, U266
ALL_exp <- read.table("/lustre/user/liclab/wupz/dosageEffect/cuffdiff_GM12878_MM/20160706_cuffdiff_hg19/genes.fpkm_tracking", header = T)
head(ALL_exp)
rownames(ALL_exp) <- ALL_exp[,1]
plot(density(log2(as.matrix(ALL_exp[, "GM12878_FPKM"]))) )
lines( density(log2(as.matrix(ALL_exp[, "RMPI8226_FPKM"]))), col = "red")
lines( density(log2(as.matrix(ALL_exp[, "U266_FPKM"]))), col = "blue")
boxplot( log2(cbind( ALL_exp[, "GM12878_FPKM"],
ALL_exp[, "RMPI8226_FPKM"],
ALL_exp[, "U266_FPKM"] )) )
A2B_genes_exp <- ALL_exp[ unique(A2B_genes$gene_id),]
head(A2B_genes_exp)
boxplot(log2(A2B_genes_exp[, c(10,14,18)]) )
# filter the A2B genes:
# 1. GM12878 expression >= 1
# 2. log2 fold change > 2
A2B_genes_exp_filter <- A2B_genes_exp[, 10] >= 1 & log2(A2B_genes_exp[, 10]/A2B_genes_exp[, 14]) >= 1 & log2(A2B_genes_exp[, 10]/A2B_genes_exp[, 18]) >= 1
sum(A2B_genes_exp_filter)
write.table( rownames( A2B_genes_exp[ A2B_genes_exp_filter, ] ),
file = "Compartment_A2B_in_MM_diff_exp.txt", quote = F, row.names = F, col.names = F )
B2A_genes_exp <- ALL_exp[ intersect(unique(B2A_genes$gene_id), ALL_exp[,1]), ]
boxplot(log2(B2A_genes_exp[, c(10,14,18)]) )
head(B2A_genes_exp)
dim(B2A_genes_exp)
# filter the A2B genes:
# 1. GM12878 expression >= 1
# 2. log2 fold change > 2
B2A_genes_exp_filter <- B2A_genes_exp[, 14] >= 1 & B2A_genes_exp[, 18] >= 1 & log2(B2A_genes_exp[, 14]/B2A_genes_exp[, 10]) >= 1 & log2(B2A_genes_exp[, 18]/B2A_genes_exp[, 10]) >= 1
sum(B2A_genes_exp_filter)
B2A_genes_exp[ B2A_genes_exp_filter, ]
write.table( rownames( B2A_genes_exp[ B2A_genes_exp_filter, ] ),
file = "Compartment_B2A_in_MM_diff_exp.txt", quote = F, row.names = F, col.names = F )
ALL_exp[ALL_exp[, 1] == "MAP4K3", ]
# find the location of genes
protein_coding_gene <- read.csv("/lustre/user/liclab/wupz/database/human_genes/protein-coding_gene.csv")
a2b_enriched_genes <- c( "SPRY2", "CBLB", "PIK3R5", "IL7R",
"IL1R2", "IL1R1", "IL7R",
"IL18R1", "IL1R2", "IL1R1", "IL7R" )
a2b_enriched_genes_matrix <- numeric()
for (i in b2a_enriched_genes) {
a2b_enriched_genes_matrix <- rbind(a2b_enriched_genes_matrix, protein_coding_gene[protein_coding_gene[, 2] == i, ])
}
write.csv(x = a2b_enriched_genes_matrix, file = "a2b_enriched_genes_matrix.txt",
quote = F, row.names = F )
b2a_enriched_genes <- c( "MAP4K3",
"CACNG6",
"FGF13",
"DUSP6",
"NCAM1",
"CADM1",
"NLGN1",
"IRS2",
"PDE3B",
"PYGB" )
b2a_enriched_genes_matrix <- numeric()
for (i in b2a_enriched_genes) {
b2a_enriched_genes_matrix <- rbind(b2a_enriched_genes_matrix, protein_coding_gene[protein_coding_gene[, 2] == i, ])
}
write.csv(x = b2a_enriched_genes_matrix, file = "b2a_enriched_genes_matrix.txt",
quote = F, row.names = F )
MM_genes <- read.table("../MM_genes/mm_related_keygene_in_RNA_seq.txt", header = T)
MM_genes_a2b_overlap <- intersect(x = as.vector(MM_genes[, 6]), y = unique(A2B_genes$gene_id) )
MM_genes_b2a_overlap <- intersect(x = as.vector(MM_genes[, 6]), y = unique(B2A_genes$gene_id) )
ALL_exp[as.vector(MM_genes[, 6]), c(7, 10,14,18)]
MM_genes_gr <- GRanges(seqnames = MM_genes[, "chr"],
ranges = IRanges( start = MM_genes[, "start"],
end = MM_genes[, "end"]),
strand = MM_genes[, "strand"] )
mcols(MM_genes_gr ) <- DataFrame( gene = MM_genes[, "gene"], gene_id = MM_genes[, "gene_id"])
MM_genes_gr
mcols(MM_genes_gr)$RPMI8226 <- 0
mcols(MM_genes_gr)$U266 <- 0
mcols(MM_genes_gr)$GM12878 <- 0
mcols(MM_genes_gr)[findOverlaps(query = promoters(MM_genes_gr), subject = seg.frags)@queryHits, 3:5] <- mcols(seg.frags[findOverlaps(query = promoters(MM_genes_gr), subject = seg.frags)@subjectHits])
sum(MM_genes_gr$RPMI8226 > 0 & MM_genes_gr$U266 > 0 & MM_genes_gr$GM12878 > 0) # 27
sum(MM_genes_gr$RPMI8226 > 0 & MM_genes_gr$U266 < 0 & MM_genes_gr$GM12878 > 0) # 1
sum(MM_genes_gr$RPMI8226 < 0 & MM_genes_gr$U266 > 0 & MM_genes_gr$GM12878 > 0) # 5
sum(MM_genes_gr$RPMI8226 < 0 & MM_genes_gr$U266 < 0 & MM_genes_gr$GM12878 > 0) # 0
sum(MM_genes_gr$RPMI8226 < 0 & MM_genes_gr$U266 < 0 & MM_genes_gr$GM12878 < 0) # 8
sum(MM_genes_gr$RPMI8226 < 0 & MM_genes_gr$U266 > 0 & MM_genes_gr$GM12878 < 0) # 4
sum(MM_genes_gr$RPMI8226 > 0 & MM_genes_gr$U266 < 0 & MM_genes_gr$GM12878 < 0) # 0
sum(MM_genes_gr$RPMI8226 > 0 & MM_genes_gr$U266 > 0 & MM_genes_gr$GM12878 < 0) # 0
enrichment_results_of_all_genes_with_compartment_switching <- read.table(file = "../comparments_genes/enrichment_results_of_all_genes_with_compartment_switching.txt", sep = "\t", header = T)
head(enrichment_results_of_all_genes_with_compartment_switching)
dim(enrichment_results_of_all_genes_with_compartment_switching)
png(filename = "../comparments_genes/enrichment_results_of_all_genes_with_compartment_switching.png")
par(cex = 2)
barplot( -log10(enrichment_results_of_all_genes_with_compartment_switching[, "PValue"]),
horiz = T, col = "darkorange", xlab = "-log10 (p-value)")
dev.off()
#
enrichment_results_of_all_genes_with_compartment_switching_and_diff_exp <- read.table(file = "../comparments_genes/enrichment_results_of_all_genes_with_compartment_switching_and_diff_exp.txt", sep = "\t", header = T)
head(enrichment_results_of_all_genes_with_compartment_switching_and_diff_exp)
dim(enrichment_results_of_all_genes_with_compartment_switching_and_diff_exp)
png(filename = "../comparments_genes/enrichment_results_of_all_genes_with_compartment_switching_and_diff_exp.png")
par(cex = 2)
barplot( -log10(enrichment_results_of_all_genes_with_compartment_switching[1:4, "PValue"]),
horiz = T, col = "darkorange", xlab = "-log10 (p-value)")
dev.off()
enrichment_results_A2B_enrichr_KEGG <- read.table(file = "../comparments_genes/A2B_KEGG_2016_table.txt", sep = "\t", header = T)
enrichment_results_B2A_enrichr_KEGG <- read.table(file = "../comparments_genes/B2A_KEGG_2016_table.txt", sep = "\t", header = T)
sum(enrichment_results_A2B_enrichr_KEGG[, 3] <= 0.05) # 21
sum( enrichment_results_B2A_enrichr_KEGG[, 3] <= 0.05 ) # 7
enrichment_results_enrichr_KEGG <- rbind( enrichment_results_A2B_enrichr_KEGG[1:21, c(1,3)], enrichment_results_B2A_enrichr_KEGG[1:7, c(1,3)])
library(limma)
enrichment_results_enrichr_KEGG[,1] <- as.vector( strsplit2(enrichment_results_enrichr_KEGG[,1], "_")[,1])
barplot(height = enrichment_results_enrichr_KEGG[, 2], names.arg = T)
head(enrichment_results_B2A_enrichr_KEGG)
head(enrichment_results_A2B_enrichr_KEGG )
dim(enrichment_results_of_all_genes_with_compartment_switching)
png(filename = "../comparments_genes/enrichment_results_of_all_genes_with_compartment_switching.png", width = 2048, height = 1024)
par(cex = 2, mar= c(5, 30, 3, 3))
barplot( -log10(enrichment_results_enrichr_KEGG[, 2]),
names.arg = enrichment_results_enrichr_KEGG[,1],las=1,
horiz = T, xlab = "-log10 (p-value)",
col = c( rep("deepskyblue1", 21), rep("orangered1", 7)) )
dev.off()