/
exp8tpp.R
407 lines (368 loc) · 23.2 KB
/
exp8tpp.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
library(dplyr)
library(readr)
library(ggplot2)
library(reshape2)
library(tidyverse)
library(stringr)
library(httr)
library(xml2)
library(ggrepel)
library(fields)
library(VennDiagram)
library(gridExtra)
library(igraph)
library(KEGGREST)
source("functions.R")
exp8tpp.collect_data = function()
{
tpp = read.table("data/exp8tpp/tpp.tsv", na.strings="", sep="\t", quote="", stringsAsFactors=F, header=T)
concentrations = c(0, 0.5, 2, 10, 50)
measurements.regex = "^(signal_sum|rel_fc_protein|norm_rel_fc_protein)_([0-9.]+)_?(unmodified|transformed)?"
measurements.list = colnames(tpp)[grepl(measurements.regex, colnames(tpp))]
tpp_long = tpp %>% reshape2::melt(measure.vars=measurements.list)
vals = stringr::str_match(as.character(tpp_long$variable), measurements.regex); vals[is.na(vals)] = ""
tpp_long$variable = paste0(vals[,2], ifelse(vals[,4]=="", "", paste0("_", vals[,4])))
tpp_long$concentration = as.numeric(vals[,3])
tpp_long = tpp_long %>%
dplyr::filter(!grepl("^#",gene_name) & !grepl("^#",protein_id) & qupm>=2) %>%
dplyr::filter(variable=="norm_rel_fc_protein_unmodified") %>%
dplyr::rename(original_protein_id="protein_id") %>%
dplyr::arrange(organism, preparation, variable, original_protein_id, concentration, temperature) %>%
dplyr::distinct(organism, preparation, variable, original_protein_id, concentration, temperature, .keep_all=T) %>%
dplyr::mutate(f=paste(organism, preparation, variable, original_protein_id, concentration, compound_effect)) %>%
dplyr::mutate(f.lag1=dplyr::lag(f, 1, "NA"), f.lag2=dplyr::lag(f, 2, "NA"), f.lead1=dplyr::lead(f, 1, "NA"), f.lead2=dplyr::lead(f, 2, "NA")) %>%
dplyr::mutate(f=!grepl("NA$", f) & (f == f.lag1 | f == f.lead1)) %>% # | f == f.lead2 | f == f.lag2 %>%
dplyr::mutate(compound_effect.f=ifelse(f, compound_effect, NA)) %>%
dplyr::group_by(organism, preparation, variable, original_protein_id) %>%
dplyr::mutate(i=concentration==concentration[1]) %>%
dplyr::mutate(n_hits.f=sum(!is.na(compound_effect.f[i])), temp_hits.f=paste0(sort(temperature[i & !is.na(compound_effect.f)]), collapse=","), has_hits.f=paste0(sort(unique(na.omit(compound_effect.f[i]))), collapse=","), has_hits.f=ifelse(has_hits.f=="destabilized,stabilized", "mixed", has_hits.f)) %>%
dplyr::mutate(n_hits=sum(!is.na(compound_effect[i])), temp_hits=paste0(sort(temperature[i & !is.na(compound_effect)]), collapse=","), has_hits=paste0(sort(unique(na.omit(compound_effect[i]))), collapse=","), has_hits=ifelse(has_hits=="destabilized,stabilized", "mixed", has_hits)) %>%
dplyr::ungroup() %>%
dplyr::mutate(
compound_effect=tidyr::replace_na(compound_effect, "none"),
compound_effect.f=tidyr::replace_na(compound_effect.f, "none")) %>%
dplyr::select(-i) %>%
dplyr::select(preparation, organism, temperature, concentration, variable, gene_name, original_protein_id, compound_effect, value, n_hits, temp_hits, has_hits)
write.table(tpp_long, "data/exp8tpp/tpp_long.tsv", na="", sep="\t", quote=F, row.names=F, col.names=T)
# Mapping of single proteins to protein groups which can't be distinguished
tppsum_long.sep = tpp_long %>%
dplyr::mutate(protein_id=original_protein_id) %>%
tidyr::separate_rows(protein_id, sep="\\|") %>%
dplyr::distinct(organism, original_protein_id, protein_id) %>%
dplyr::group_by(organism, protein_id) %>%
dplyr::summarize(original_protein_id=strset(unlist(sapply(original_protein_id, strsplit, "\\|")))) %>%
data.frame()
write.table(tppsum_long.sep, "data/exp8tpp/tpp_protein_groups.tsv", na="", sep="\t", quote=F, row.names=F, col.names=T)
#
# Sum all the log2(fold-changes) per protein
#
tppsum_long = tpp_long %>%
dplyr::group_by(organism, preparation, variable, original_protein_id, n_hits, temp_hits) %>%
dplyr::summarise(
gene_name=paste(unique(na.omit(gene_name)), collapse="|"),
log2fc=sum(log2(value)),
has_hits=paste0(sort(unique(na.omit(has_hits))), collapse=","), has_hits=ifelse(has_hits!="" & !grepl("stabilized", has_hits), "mixed", has_hits)) %>%
dplyr::group_by(organism, preparation, variable) %>%
dplyr::mutate(log2fc_q=ecdf(log2fc)(log2fc)) %>%
dplyr::mutate(log2fc_z=scale(log2fc)) %>%
dplyr::ungroup()
write.table(tppsum_long, "data/exp8tpp/tppsum_long.tsv", na="", sep="\t", quote=F, row.names=F, col.names=T)
# create special (Wide) version of hits table and also split undestinguishable proteins to one protein per line
tppsum_wide = tppsum_long %>%
dplyr::filter(grepl("stabilized", has_hits)) %>%
dplyr::inner_join(tppsum_long.sep, by=c("organism", "original_protein_id")) %>%
reshape2::dcast(protein_id ~ preparation + organism, value.var="has_hits") %>%
dplyr::rename(
has_hits.lysate_Csaccharolyticum="lysate_C. saccharolyticum",
has_hits.lysate_IAI1="lysate_E. coli IAI1",
has_hits.whole_IAI1="whole_cell_E. coli IAI1",
has_hits.lysate_ED1a="lysate_E. coli ED1a",
has_hits.whole_ED1a="whole_cell_E. coli ED1a")
write.table(tppsum_wide, "data/exp8tpp/tppsum_wide.tsv", na="", sep="\t", quote=F, row.names=F, col.names=T)
}
exp8tpp.analyze = function()
{
# UNIPROT mappings
uniprot2kegg = readr::read_tsv("data/db/uniprot2kegg.tsv", col_names=T, na="")
gene2enzyme = readr::read_tsv("data/db/KEGG/kegg.gene2enzyme.tsv", col_names=T, na="") %>%
dplyr::filter(!is.na(KEGG_UNIPROT))
kegg.pathways = readr::read_delim("data/db/KEGG/kegg_pathways.tsv", "\t")
kegg.gene2enzyme = readr::read_tsv("data/db/KEGG/kegg.gene2enzyme.tsv", col_names=T, na="") %>%
dplyr::filter(!is.na(KEGG_UNIPROT))
kegg.pathway2ec = readr::read_delim("data/db/KEGG/kegg.pathway2gene.tsv", "\t") %>%
dplyr::distinct(KEGG_ORG, KEGG_PATHWAY, KEGG_EC) %>%
dplyr::mutate(KEGG_EC=gsub("ec:", "", KEGG_EC))
kegg.pathway2uniprot = readr::read_delim("data/db/KEGG/kegg.pathway2gene.tsv", "\t") %>%
dplyr::filter(!is.na(KEGG_UNIPROT))
# C. saccharolyticum/E. coli IAI1 blast
csh2refseq = readr::read_delim("data/db/UNIPROT/csh_proteinids2refseq.list", "\t")
ecr2refseq = readr::read_delim("data/db/UNIPROT/ecr_proteinids2refseq.list", "\t")
ecq2refseq = readr::read_delim("data/db/UNIPROT/ecq_proteinids2refseq.list", "\t")
csh2ecr = read.delim("data/db/UNIPROT/csh_ecr.m8", na="", header=F)
colnames(csh2ecr) = c("qseqid", "sseqid", "qlen", "slen", "score", "bitscore", "length")
csh2ecr = csh2ecr %>%
dplyr::inner_join(csh2refseq %>% dplyr::rename(protein_id.Csaccharolyticum="From"), by=c("qseqid"="To")) %>%
dplyr::inner_join(ecr2refseq %>% dplyr::rename(protein_id.IAI1="From"), by=c("sseqid"="To"))
csh2ecr.f = csh2ecr %>%
dplyr::filter(bitscore>50 & length>0.8*qlen) %>%
dplyr::distinct(protein_id.Csaccharolyticum, protein_id.IAI1, bitscore)
#
# Read data files
#
tpp = readr::read_delim("data/exp8tpp/tpp.tsv", "\t")
tpp_long = readr::read_delim("data/exp8tpp/tpp_long.tsv", "\t", col_types=c("temp_hits"="c", "has_hits"="c"))
tppsum_long = readr::read_delim("data/exp8tpp/tppsum_long.tsv", "\t", col_types=c("temp_hits"="c", "has_hits"="c"))
tppsum_long.sep = readr::read_delim("data/exp8tpp/tpp_protein_groups.tsv", "\t")
tppsum_wide = readr::read_delim("data/exp8tpp/tppsum_wide.tsv", "\t", col_types=c("has_hits.whole_IAI1"="c", "has_hits.lysate_IAI1"="c", "has_hits.whole_ED1a"="c", "has_hits.lysate_ED1a"="c", "has_hits.lysate_Csaccharolyticum"="c"))
#
# Display correlation between E. coli strains sum log2 values
#
tppsum_long.fit = tppsum_long %>%
dplyr::group_by(preparation, organism, variable) %>%
dplyr::summarise(total_hits=sum(!is.na(has_hits)))
tppsum_long.ecoli = tppsum_long %>%
dplyr::inner_join(tppsum_long, by=c("preparation", "variable", "gene_name")) %>%
dplyr::filter(grepl("ED1a", organism.x) & grepl("IAI1", organism.y) & !is.na(gene_name)) %>%
setNames(gsub("\\.x$",".ED1a",names(.))) %>%
setNames(gsub("\\.y$",".IAI1",names(.)))
tppsum_long.ecoli.cor = tppsum_long.ecoli %>%
dplyr::group_by(preparation, variable) %>%
dplyr::summarise(
max=max(c(log2fc.ED1a, log2fc.IAI1), na.rm=T),
min=min(c(log2fc.ED1a, log2fc.IAI1), na.rm=T),
x=min+(max-min)*0.8,
y=min+(max-min)*0.2,
slope=round(lm(log2fc.ED1a~log2fc.IAI1)$coefficients[2], 2), R=round(cor(log2fc.IAI1, log2fc.ED1a, use="pairwise.complete.obs"), 2))
pdf("reports/exp8tpp_ecoli.pdf", width=10, height=5)
ggplot(tppsum_long.ecoli, aes(log2fc.IAI1, log2fc.ED1a)) +
geom_abline(slope=1) +
geom_point(alpha=0.2, shape=16) +
geom_text(aes(x=x, y=y, label=paste0("slope: ", slope, "\nR: ", R)), size=6, data=tppsum_long.ecoli.cor) +
geom_smooth(method="lm", fullrange=T, alpha=0.2) +
facet_wrap(~preparation, scales="free") +
labs(x="Sum fold-changes (IAI1)", y="Sum fold-changes (ED1a)")+
theme_classic() +
theme(aspect.ratio=1, axis.title=element_text(size=15), axis.text=element_text(size=13),
legend.title=element_text(size=15), legend.text=element_text(size=13),
strip.text = element_text(size=20),
strip.background=element_blank(),
panel.grid=element_blank())
venn.data = with(tppsum_long.ecoli, list(
IAI1_whole=gene_name[!is.na(has_hits.IAI1) & preparation=="whole_cell"],
IAI1_lysate=gene_name[!is.na(has_hits.IAI1) & preparation=="lysate"],
ED1a_whole=gene_name[!is.na(has_hits.ED1a) & preparation=="whole_cell"],
ED1a_lysate=gene_name[!is.na(has_hits.ED1a) & preparation=="lysate"]
))
venn.colors = c(IAI1_whole="#FDC08699", IAI1_lysate="#386CB0FF", ED1a_whole="#FFD92F99", ED1a_lysate="#6DB5D2")[names(venn.data)]
grid.newpage()
grid.draw(VennDiagram::venn.diagram(
venn.data,
fill=venn.colors,
filename=NULL, force.unique=T))
dev.off()
tppenrichment.species = tppsum_long %>%
dplyr::filter(grepl("C. saccharolyticum|IAI1", organism)) %>%
dplyr::mutate(hit=ifelse(grepl("stabilized", has_hits), "Yes", "No")) %>%
dplyr::select(organism, preparation, original_protein_id, has_hits)
#
# Analyze TPP hits and overlap
#
tppenrichment.species = tppsum_long %>%
dplyr::filter(grepl("C. saccharolyticum|IAI1", organism)) %>%
dplyr::mutate(hit=ifelse(grepl("stabilized", has_hits), "Yes", "No")) %>%
dplyr::select(organism, preparation, original_protein_id, has_hits)
#
# Overlap between E. coli IAI1 and C. saccharolyticum WM1
#
# Cluster proteins based on BLAST
g = igraph::graph_from_data_frame(csh2ecr.f, directed=F)
gc = igraph::clusters(g)
gc = data.frame(gc.names=names(gc$membership), gc.membership=gc$membership)
# Use protein clusters of individual species to cluster cross-species proteins (significant blast hits between proteins of two species)
tpp_csh2ecr.f = csh2ecr.f %>%
dplyr::inner_join(gc, by=c("protein_id.Csaccharolyticum"="gc.names")) %>%
dplyr::inner_join(gc, by=c("protein_id.IAI1"="gc.names")) %>%
dplyr::mutate(cluster=ifelse(gc.membership.x==gc.membership.y, gc.membership.x, NA_integer_)) %>%
dplyr::inner_join(tppsum_long.sep %>% dplyr::rename(original_protein_id.Csaccharolyticum="original_protein_id"), by=c("protein_id.Csaccharolyticum"="protein_id")) %>%
dplyr::inner_join(tppsum_long.sep %>% dplyr::rename(original_protein_id.IAI1="original_protein_id"), by=c("protein_id.IAI1"="protein_id")) %>%
dplyr::filter(!is.na(cluster)) %>%
dplyr::distinct(original_protein_id.Csaccharolyticum, original_protein_id.IAI1, cluster)
# Overlap between E.coli IAI1 and C. saccharolyticum WM1 protein clusters in TPP
tppenrichment.multispecies = tpp_csh2ecr.f %>%
dplyr::group_by(cluster) %>%
dplyr::summarise(cluster_protein_id.Csaccharolyticum=strset(original_protein_id.Csaccharolyticum), cluster_protein_id.IAI1=strset(original_protein_id.IAI1)) %>%
dplyr::mutate(protein_id.Csaccharolyticum=cluster_protein_id.Csaccharolyticum, protein_id.IAI1=cluster_protein_id.IAI1) %>%
tidyr::separate_rows(protein_id.Csaccharolyticum, sep="\\|") %>%
tidyr::separate_rows(protein_id.IAI1, sep="\\|") %>%
dplyr::left_join(tppsum_wide %>% dplyr::filter(grepl("stabilized", has_hits.lysate_IAI1)) %>% dplyr::select(protein_id, has_hits.lysate_IAI1), by=c("protein_id.IAI1"="protein_id")) %>%
dplyr::left_join(tppsum_wide %>% dplyr::filter(grepl("stabilized", has_hits.whole_IAI1)) %>% dplyr::select(protein_id, has_hits.whole_IAI1), by=c("protein_id.IAI1"="protein_id")) %>%
dplyr::left_join(tppsum_wide %>% dplyr::filter(grepl("stabilized", has_hits.lysate_Csaccharolyticum)) %>% dplyr::select(protein_id, has_hits.lysate_Csaccharolyticum), by=c("protein_id.Csaccharolyticum"="protein_id")) %>%
dplyr::group_by(cluster, cluster_protein_id.IAI1, cluster_protein_id.Csaccharolyticum) %>%
dplyr::summarise(
protein_id.lysate_Csaccharolyticum=strset(protein_id.Csaccharolyticum, grepl("stabilized", has_hits.lysate_Csaccharolyticum)),
protein_id.lysate_IAI1=strset(protein_id.IAI1, grepl("stabilized", has_hits.lysate_IAI1)),
protein_id.whole_IAI1=strset(protein_id.IAI1, grepl("stabilized", has_hits.whole_IAI1)),
has_hits.lysate_Csaccharolyticum=strset(paste0(protein_id.Csaccharolyticum, ":", has_hits.lysate_Csaccharolyticum), grepl("stabilized", has_hits.lysate_Csaccharolyticum)),
has_hits.lysate_IAI1=strset(paste0(protein_id.IAI1, ":", has_hits.lysate_IAI1), grepl("stabilized", has_hits.lysate_IAI1)),
has_hits.whole_IAI1=strset(paste0(protein_id.IAI1, ":", has_hits.whole_IAI1), grepl("stabilized", has_hits.whole_IAI1))) %>%
dplyr::ungroup()
pdf("reports/exp8tpp_csh2ecoli_venn.pdf", width=10, height=5)
venn.data = with(tppenrichment.multispecies, list(IAI1_whole=cluster[!is.na(has_hits.whole_IAI1)], CSH_lysate=cluster[!is.na(has_hits.lysate_Csaccharolyticum)], IAI1_lysate=cluster[!is.na(protein_id.lysate_IAI1)]))
venn.colors = c(IAI1_whole="#FDC08699", CSH_lysate="#386CB0FF", IAI1_lysate="#FFD92F99")
grid.newpage()
grid.draw(VennDiagram::venn.diagram(
venn.data,
fill=venn.colors,
filename=NULL, force.unique=T))
tppenrichment.multispecies.export = tppenrichment.multispecies %>%
dplyr::mutate(hit=ifelse(!is.na(has_hits.lysate_Csaccharolyticum) & !is.na(has_hits.whole_IAI1) & !is.na(protein_id.lysate_IAI1), "Yes", "No")) %>%
dplyr::select(cluster_protein_id.Csaccharolyticum, cluster_protein_id.IAI1, hit)
tppenrichment.species_export = tppenrichment.species %>%
dplyr::mutate(hit=ifelse(grepl("stabilized", has_hits), "Yes", "No")) %>%
dplyr::select(organism, preparation, original_protein_id, hit)
readr::write_delim(tppenrichment.multispecies.export, "reports/exp8tpp_tppenrichment_multispecies.tsv", delim="\t", na="")
readr::write_delim(tppenrichment.species_export, "reports/exp8tpp_tppenrichment_species.tsv", delim="\t", na="")
dev.off()
#
# Export
#
tpp_export.a = tpp %>%
dplyr::mutate(cluster_progein_id=protein_id) %>%
tidyr::separate_rows(cluster_progein_id, sep="\\|")
tpp_export.b = tppenrichment.multispecies.export %>%
dplyr::mutate(cluster_id=1:n()) %>%
tidyr::separate_rows(cluster_protein_id.Csaccharolyticum, sep="\\|") %>%
tidyr::separate_rows(cluster_protein_id.IAI1, sep="\\|") %>%
dplyr::rename(`C. saccharolyticum`="cluster_protein_id.Csaccharolyticum", `E. coli IAI1`="cluster_protein_id.IAI1") %>%
reshape2::melt(id.vars=c("cluster_id"), variable.name="organism", value.name="cluster_progein_id") %>%
dplyr::distinct()
tpp_export = tpp_export.a %>%
dplyr::left_join(tpp_export.b, by=c("organism", "cluster_progein_id")) %>%
dplyr::select(-cluster_progein_id) %>%
dplyr::group_by(preparation, organism, temperature, protein_id) %>%
dplyr::mutate(
gene_name=paste0(unique(unlist(strsplit(gene_name, "\\|"))), collapse="|"),
cluster_id=ifelse(all(is.na(cluster_id)), NA_integer_, unique(na.omit(cluster_id)))
) %>%
dplyr::ungroup() %>%
unique()
tpp_rossman = readr::read_delim("data/exp8tpp/exp8tpp_rossman_data.tsv", "\t", col_types=c(rossman_family="c"))
tpp_export.rossman = tpp_export %>%
dplyr::left_join(tpp_rossman, by=c("organism", "preparation", "protein_id"="original_protein_id")) %>%
dplyr::left_join(kegg.gene2enzyme %>% dplyr::filter(!is.na(KEGG_UNIPROT)) %>% dplyr::select(KEGG_UNIPROT, KEGG_EC), by=c("protein_id"="KEGG_UNIPROT")) %>%
dplyr::select(
organism, preparation, temperature, protein_id, gene_name,
enzyme=KEGG_EC,
compound_effect,
description,
qupm, cluster_id, rossman_hit, rossman_family, experiment,
signal_sum_0, signal_sum_0.5, signal_sum_2, signal_sum_10, signal_sum_50,
norm_rel_fc_protein_0.5_unmodified, norm_rel_fc_protein_2_unmodified, norm_rel_fc_protein_10_unmodified, norm_rel_fc_protein_50_unmodified)
readr::write_delim(tpp_export.rossman, "reports/exp8tpp_export.tsv", delim="\t", na="")
#
# GO enrichment
#
tpp_enrichment.hits = tpp_long %>%
dplyr::filter(concentration==50) %>%
dplyr::group_by(organism, preparation, original_protein_id, variable) %>%
dplyr::summarise(is_hit=sum(compound_effect==has_hits, na.rm=T)>0)
tpp_enrichment.go = tppsum_long %>%
dplyr::mutate(protein_id=original_protein_id) %>%
tidyr::separate_rows(protein_id, sep="\\|") %>%
dplyr::inner_join(readr::read_delim("data/db/uniprot2go.tsv", "\t"), by=c("protein_id"="uniprot_id")) %>%
reshape2::dcast(organism+preparation+original_protein_id~go_term, value.var="go_term", fun.aggregate=length) %>%
reshape2::melt(id.vars=c("organism", "preparation", "original_protein_id"), variable.name="go_term", value.name="in_go", factorsAsStrings=F) %>%
dplyr::mutate(in_go=in_go>0)
tpp_enrichment = tpp_enrichment.go %>%
dplyr::inner_join(tpp_enrichment.hits, by=c("organism", "preparation", "original_protein_id")) %>%
dplyr::group_by(organism, preparation, go_term) %>%
dplyr::do((function(z){
zz<<-z
m11 = sum(z$is_hit & z$in_go)
m12 = sum(z$is_hit & !z$in_go)
m21 = sum(!z$is_hit & z$in_go)
m22 = sum(!z$is_hit & !z$in_go)
m = matrix(c(m11, m12, m21, m22), ncol=2, byrow=T)
z.test = fisher.test(m)
data.frame(odds=z.test$estimate, pvalue=z.test$p.value, hits_in_go=m11, hits_in_species=sum(z$is_hit), proteins_in_go=sum(z$in_go), proteins_in_species=nrow(z))
})(.)) %>%
dplyr::ungroup() %>%
dplyr::inner_join(readr::read_delim("data/db/GO/go_terms.tsv", "\t"), by="go_term") %>%
dplyr::mutate(is_significant=ifelse(pvalue<=0.1, "Yes", "No")) %>%
dplyr::select(organism, preparation, go_term, go_description, is_significant, odds, pvalue, hits_in_go, hits_in_species, proteins_in_go, proteins_in_species)
tpp_enrichment.f = tpp_enrichment %>%
dplyr::filter(is_significant=="Yes") %>%
dplyr::inner_join(readr::read_delim("data/db/GO/go_terms.tsv", "\t"), by="go_term") %>%
dplyr::arrange(organism, preparation, dplyr::desc(abs(log2(odds))), pvalue) %>%
dplyr::select(-is_significant)
readr::write_delim(tpp_enrichment.f, "reports/exp8tpp_go_enrichment.tsv", delim="\t", na="")
#
# TPP / Duloxetine pull-down proteomics
#
pdf("reports/exp8tpp_tpp2click_venn.pdf", width=10, height=5)
click.universe = readr::read_tsv("reports/exp3proteomics_volcano_plot_data.tsv", col_names=T, na="") %>%
dplyr::mutate(hit.click=is_significant=="Yes") %>%
dplyr::select(protein_id=proteinId, hit.click) %>%
dplyr::distinct(protein_id, hit.click)
tpp.universe = tppsum_long %>%
dplyr::filter(grepl("sacch", organism) & preparation=="lysate") %>%
dplyr::mutate(hit.tpp=!is.na(has_hits)) %>%
tidyr::separate_rows(original_protein_id, sep="\\|") %>%
dplyr::distinct(original_protein_id, hit.tpp)%>%
dplyr::select(protein_id=original_protein_id, hit.tpp)
tpp2click.export = click.universe %>%
dplyr::inner_join(tpp.universe, by="protein_id")
readr::write_delim(tpp2click.export, "reports/exp8tpp_tpp2click_overlap.tsv", delim="\t", na="")
tpp2click.venn = with(tpp2click.export, list(tpp=protein_id[hit.click], click=protein_id[hit.tpp]))
venn.colors = c(tpp="#386CB0FF", click="#38A0B0FF")
grid.newpage()
grid.draw(VennDiagram::venn.diagram(
tpp2click.venn,
fill=venn.colors,
filename=NULL,
force.unique=T))
dev.off()
}
analyze.enrichment = function() {
tppenrichment.multispecies.export = readr::read_delim("reports/exp8tpp_tppenrichment_multispecies.tsv", delim="\t")
tppenrichment.species_export = readr::read_delim("reports/exp8tpp_tppenrichment_species.tsv", delim="\t")
rossman_fold.IAI1 = data.frame(str=readLines("data/exp8tpp/enrichment/Ecoli/rosmann_matches.txt"), stringsAsFactors=F) %>%
dplyr::mutate(organism="E. coli IAI1", rossman_hit="Yes") %>%
tidyr::extract(str, c("some_id", "protein_id", "rossman_family"), regex='^( *\\d+ *)([^_]+)_.*"id":"([^"]+)".*')
rossman_fold.CSH = data.frame(str=readLines("data/exp8tpp/enrichment/Csaccharolyticum/all_rosmann_matches.txt"), stringsAsFactors=F) %>%
dplyr::mutate(organism="C. saccharolyticum", rossman_hit="Yes") %>%
tidyr::extract(str, c("some_id", "protein_id", "rossman_family"), regex='^( *\\d+ *)([^_]+)_.*"id":"([^"]+)".*')
rossman_fold = rbind(rossman_fold.IAI1, rossman_fold.CSH) %>%
dplyr::distinct(organism, protein_id, .keep_all=T)
tppenrichment = tppenrichment.multispecies.export %>%
dplyr::mutate(original_protein_id=paste0(cluster_protein_id.Csaccharolyticum, "|", cluster_protein_id.IAI1), preparation="lysate", organism="multispecies") %>%
select(organism, preparation, original_protein_id, hit)
tppenrichment = rbind(tppenrichment, tppenrichment.species_export)
tppenrichment.rossman = tppenrichment %>%
dplyr::mutate(protein_id=original_protein_id) %>%
tidyr::separate_rows(protein_id, sep="\\|") %>%
dplyr::left_join(rossman_fold, by=c("organism", "protein_id")) %>%
dplyr::mutate(rossman_hit=tidyr::replace_na(rossman_hit, "No")) %>%
dplyr::group_by(organism, preparation, original_protein_id, hit) %>%
dplyr::summarise(
rossman_hit=ifelse(any(rossman_hit=="Yes"), "Yes", "No"),
rossman_family=paste(unique(rossman_family), collapse="|"),
rossman_family=ifelse(rossman_family=="", NA_character_, rossman_family))
readr::write_delim(tppenrichment.rossman, "reports/exp8tpp_rossman_data.tsv", delim="\t", na="")
tppenrichment.rossman_sum = tppenrichment.rossman %>%
dplyr::group_by(organism, preparation) %>%
dplyr::do((function(z) {
m11 = sum(z$hit=="Yes" & z$rossman_hit=="Yes")
m12 = sum(z$hit=="Yes" & z$rossman_hit=="No")
m21 = sum(z$hit=="No" & z$rossman_hit=="Yes")
m22 = sum(z$hit=="No" & z$rossman_hit=="No")
t = fisher.test(matrix(c(m11,m12,m21,m22), ncol=2, byrow=T))
data.frame(pvalue=t$p.value, odds=t$estimate, hit1_rossman1=m11, hit1_rossman0=m12, hit0_rossman1=m21, hit0_rossman0=m22)
})(.))
readr::write_delim(tppenrichment.rossman_sum, "reports/exp8tpp_rossman_enrichment.tsv", delim="\t", na="")
# organism preparation pvalue odds hit1_rossman1 hit1_rossman0 hit0_rossman1 hit0_rossman0
# C. saccharolyticum lysate 0.631 1.10 42 114 469 1404
# E. coli IAI1 lysate 0.000000143 0.470 61 309 493 1173
# E. coli IAI1 whole_cell 0.236 0.844 92 271 386 960
# multispecies lysate 1 0 0 6 0 519
}