-
Notifications
You must be signed in to change notification settings - Fork 0
/
4. 5'UTRs of disease genes.Rmd
1038 lines (834 loc) · 35.3 KB
/
4. 5'UTRs of disease genes.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "4. 5'UTRs across LEOUF"
output: rmarkdown::github_document
---
#5'UTR Analysis for disease genes (DD, cancer, DS)
1. Bring in gene sets and clean data
2. 5UTR Length
3. uAUG type
4. PhyloP
5. 5UTR Intron
6. Recessive DD genes vs middle LEOUF deciles
Load Libraries
```{r}
library(splitstackshape)
library(stringr)
library(dplyr)
library(tidyverse)
library("readxl")
library(ggplot2)
```
#1. Bring all data in
```{r}
#plan
#1. Developmental Disorders (DDG2P) restrict to LoF and split dom+rec
#2. Cancer, split onco and TSG
#3. Dosage Sensitive, hap and trip
#4. all genes
################1. DD
#data: downloaded 18 February 2021 from DDG2P (McRae JF, Clayton S, Fitzgerald TW, Kaplanis J, Prigmore E, Rajan D, et al. Prevalence and architecture of de novo mutations in developmental disorders. Nature. 2017 Feb;542(7642):433–8)
#limited to confirmed or probable roles in developmental disorders and loss-of-function disease mechanism.
mst_ddG2P<-read.csv(file="DDG2P_18_2_2021.csv.gz")
ddG2P <- mst_ddG2P
#bring in MANE summary file and merge to get transcript/gene IDs
mane_summary <- read.delim(file = "MANE.GRCh38.v1.0.summary.txt.gz")
mane <- mane_summary
#remove "HGNC:" from HGNC column
manesplit <- cSplit(mane, "HGNC_ID", ":")
#subset
mane2 <- data.frame("gene_id" = manesplit$Ensembl_Gene, "transcript_id" = manesplit$Ensembl_nuc, "HGNC" = manesplit$HGNC_ID_2, "gene"=manesplit$symbol)
#filter DDD.category to confirmed or probable
ddconf <- ddG2P %>% filter(DDD.category == "confirmed")
ddprob <- ddG2P %>% filter(DDD.category == "probable")
dd1 <- rbind(ddconf, ddprob) #2037
#merge MANE with DD data to get transcript/gene ids
names(dd1)[13] <- "HGNC"
dd <- merge(dd1, mane2, by = "HGNC")#2071
#bring length data in
length_exon <- read.table("length_introns.txt")
#filter to just LoF, mono/bi
lof <- dd %>% filter(mutation.consequence == "loss of function")#1443
#merge with lengths to get info
loflen <- merge(lof, length_exon, by = "transcript_id")#1416
monolof <- loflen %>% filter(allelic.requirement == "monoallelic")#328
bilof <- loflen %>% filter(allelic.requirement == "biallelic")#948
########################2. Cancer, onco and TSG
#data: The COSMIC Cancer Gene Census was downloaded 22nd February 2021.
#Restricted to nonsense, frameshift and missense mutation types and then filtered to oncogene or TSG only as cancer gene type.
#bring in gene census data to extract if they are TSG/oncogenes
copy_census <- read.csv("Census_allMon Feb 22 14_01_37 2021.csv")
census_cancer_genes <- copy_census
#nonsense, frameshift, missense mutations
mutations <- dplyr::filter(census_cancer_genes, grepl("N|F|Mis", Mutation.Types))
mutations$split1 <- sub(".*ENSG", "", mutations$Synonyms)
#this removed ENSG and gave me everything after
#can now split by comma and then re-add ENSG as a prefix
mutations$ensg <- "ENSG"
mutations$paste <- paste(mutations$ensg, mutations$split1, sep = "")
mut2 <- cSplit(mutations, "paste", ",")
#rename gene_id
names(mut2)[23] <- "gene_id"
#cut off after the full stop for ENSGs (otherwise when merged with length_exon not enough overlap)
mut3 <- cSplit(mut2, "gene_id", ".")
#filter to TSG and onco
oncogene <- mut3[grepl('oncogene',tolower(mut3$Role.in.Cancer)) & !grepl('tsg',tolower(mut3$Role.in.Cancer)),]
TSG <- mut3[!grepl('oncogene',tolower(mut3$Role.in.Cancer)) & grepl('tsg',tolower(mut3$Role.in.Cancer)),]
#merge length exon
allUTRlengths <- cSplit(length_exon, "gene_id", ".")
#merge
tsg_len <- merge(TSG, allUTRlengths, by = "gene_id_1") #166
onco_len <- merge(oncogene, allUTRlengths, by = "gene_id_1") #109
##########################3. DS, hap and trip
#Collins RL, Glessner JT, Porcu E, Lepamets M, Brandon R, Lauricella C, et al. A cross-disorder dosage sensitivity map of the human genome. Cell. 2022 Aug 4;185(16):3041-3055.e25
#2,987 HI (pHaplo ≥ 0.86) and 1,559 TS (pTriplo ≥ 0.94) genes
ds <- read_excel("collins DS.xlsx")#18641
names(ds)[1]<-"gene_name"
#gives warnings but think its fine
hap <- ds %>% filter(pHaplo>=0.86)#2987
trip <- ds %>% filter(pTriplo>=0.94)#1559
hap2 <- merge(length_exon, hap, by="gene_name")#2856
trip2 <- merge(length_exon, trip, by="gene_name")#1487
#merged by gene bec no other info provided
#4. All Genes
#length_exon, added above
```
#2. Grouped Length plot
```{r}
#Get each sub group into its own df with gene, UTR length, and group its in, (and intron_final bec i need it later) Then can rbind them all together
#FACET plot
#1. DD
mono <- subset(monolof, select = c(gene, UTR_length, intron_final))
mono$group <- "Dominant (n=328)"
mono$disease <- "Developmental Disorder"
bi <- subset(bilof, select = c(gene, UTR_length, intron_final))
bi$group <- "Recessive (n=948)"
bi$disease <- "Developmental Disorder"
#2. Cancer
#subset onco_len and TSG_len
onc <- data.frame("gene"=onco_len$gene_name, "UTR_length"=onco_len$UTR_length, "group"="Oncogene (n=109)", "disease"="Cancer", "intron_final"=onco_len$intron_final)
TSG2 <- data.frame("gene"=tsg_len$gene_name, "UTR_length"=tsg_len$UTR_length, "group"="TSG (n=166)", "disease"="Cancer","intron_final"=tsg_len$intron_final)
#3. DS
hap_len <- data.frame("gene"=hap2$gene_name, "UTR_length"=hap2$UTR_length, "group"="Haploinsufficient (n=2856)", "disease"="Dosage Sensitive", "intron_final"=hap2$intron_final)
trip_len <- data.frame("gene"=trip2$gene_name, "UTR_length"=trip2$UTR_length, "group"="Triplosensitive (n=1487)", "disease"="Dosage Sensitive", "intron_final"=trip2$intron_final)
#4. All genes
all <- data.frame("gene"=length_exon$gene_name, "UTR_length"=length_exon$UTR_length, "group"="All Genes (n=18764)", "disease"="All Genes", "intron_final"=length_exon$intron_final)
#bind into one df and then plot a faceted boxplot?
length_plot <-bind_rows(all, hap_len, trip_len,onc,TSG2,mono, bi)
#plot
#get median for x intercept
median(all$UTR_length)
#raincloud for this
#this works
source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")
ggplot(data = length_plot, aes(y=UTR_length, x = group)) + geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8, fill="#793A92") + geom_point(aes(y = UTR_length, fill = group), position = position_jitter(width = 0.15), size = 1, alpha = 0.1, color="grey48") + geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5, fill="#793A92") + labs(x = "5'UTR Length (bp)") + guides(fill = FALSE, color = FALSE) +theme_light() + coord_flip(ylim=c(0,2000)) + geom_hline(yintercept = 136, colour="black",linetype = "dotted") + facet_grid(disease ~ ., scales = "free", space = "free", labeller = labeller(disease=label_wrap_gen(width=20))) + facet_grid(disease ~ ., scales = "free", space = "free", labeller = labeller(disease=label_wrap_gen(width=20))) + theme(axis.text.y = element_text(size = 10)) + theme(strip.background =element_rect(fill="gray47"))+ theme(strip.text = element_text(colour = 'white', size = '6', face="bold")) + ylab("5'UTR Length (bp)") + xlab("Disease Gene Group")
#some stats
#means for all and stats to show whether increase/decrease in length is sig
#means
mean(all$UTR_length)#201.6124
mean(mono$UTR_length)#368.9695
mean(bi$UTR_length)#168.6814
mean(onc$UTR_length)#259.4495
mean(TSG2$UTR_length)#254.0181
mean(hap_len$UTR_length)#278.5063
mean(trip_len$UTR_length)# 252.694
#wilcox
al_len <- all$UTR_length
m_len <- mono$UTR_length
bi_le<- bi$UTR_length
on_le<- onc$UTR_length
tsg_le<- TSG2$UTR_length
ha_le<- hap_len$UTR_length
tr_le <- trip_len$UTR_length
#DD
wilcox.test(al_len, m_len)$p.value #4.202658e-39
wilcox.test(al_len, bi_le)$p.value #1.270368e-07
wilcox.test(al_len, on_le)$p.value #1.71202e-05
wilcox.test(al_len, tsg_le)$p.value #0.0003257773
wilcox.test(al_len, ha_le)$p.value #4.777048e-100
wilcox.test(al_len, tr_le)$p.value #2.510808e-30
#do stats for 5utr length against all genes with disease genes removed
n_dom <- anti_join(all,mono, by="gene")#
n_rec <- anti_join(all,bi, by="gene")#
n_hap <- anti_join(all,hap_len, by="gene")#
n_trip <- anti_join(all,trip_len, by="gene")#
n_onc <- anti_join(all,onc, by="gene")#
n_tsg <- anti_join(all,TSG2, by="gene")#
n_dom <- n_dom$UTR_length
n_rec <- n_rec$UTR_length
n_hap <- n_hap$UTR_length
n_trip <- n_trip$UTR_length
n_onc <- n_onc$UTR_length
n_tsg <- n_tsg$UTR_length
wilcox.test(n_dom, m_len)$p.value #1.967071e-40
wilcox.test(n_rec, bi_le)$p.value #2.743379e-08
wilcox.test(n_onc, on_le)$p.value #1.528611e-05
wilcox.test(n_tsg, tsg_le)$p.value #0.0002880399
wilcox.test(n_hap, ha_le)$p.value #2.904055e-135
wilcox.test(n_trip, tr_le)$p.value #2.953262e-35
```
#3. uAUG Types
```{r}
#bring in data
orf_gene <- read.table("orf_gene.txt")
#merge with length disese plot?
#this is only transcript ids. need transcript ids added to length disease plot
info <- data.frame(gene=length_exon$gene_name, transcript=length_exon$transcript_id)
names(info)[1]<-"gene"
len_plot2 <- merge(length_plot, info, by="gene")
#remove dup trnascript bec 1 gene is in twice
len_plot <- len_plot2 %>% distinct()
#now merge with orfs
orf_dis <- merge(len_plot, orf_gene, by="transcript")
#need percentages in each group which have each type of uaug
orf_plot <- orf_dis %>% group_by(group, orf_type, disease) %>% summarise(n())
#can "grep" the number out of the group col to work out percentages
library(tidyr)
orf_plot$total <- parse_number(orf_plot$group)
orf_plot$perc <- (orf_plot$`n()`/orf_plot$total) *100
#plot
#change to Start-Stop
orf_plot[orf_plot=="start-stop"] <- "Start-Stop"
#order bars
orf_plot$orf_type <- factor(orf_plot$orf_type, levels = c("Start-Stop","oORF", "uORF"))
orf_plot[orf_plot=="Developmental Disorder"] <- "DD"
orf_plot[orf_plot=="Dosage Sensitive"] <- "DS"
orf_plot[orf_plot=="All Genes"] <- "All"
ggplot(orf_plot, aes(x=perc, y=group, fill=orf_type)) + geom_bar(position="dodge", stat="identity", width=0.8) + theme_light() + ylab("Gene Groups") + xlab("Percentage")+ scale_fill_manual(values=c("deepskyblue1","palevioletred1","navy" )) + guides(fill=guide_legend(title="uAUG Type")) +facet_grid(disease ~ ., scales = "free", space = "free", labeller = labeller(disease=label_wrap_gen(width=20)))+ theme(axis.text.y = element_text(size = 10)) + theme(strip.background =element_rect(fill="gray47"))+ theme(strip.text = element_text(colour = 'white', size = '9', face="bold")) + geom_vline(xintercept = 34.4, colour="navy",linetype = "dotted") + geom_vline(xintercept = 15, colour="palevioletred1",linetype = "dotted") + geom_vline(xintercept = 5, colour="deepskyblue1",linetype = "dotted") + theme(axis.text.x = element_text(vjust = 0.5))
#write.table(orf_plot, "disease_orfs4plot_m1.txt")
#######stats############
#chi-square between having 0 and 1 uorf and start stop compared to all genes
#stats for disease groups having uorfs and start-stops
#make matrix chi squared tables for each against all genes-disease genes. so essentially testing whether each disease group has more vs all genes. between having 0 uorfs/start-stops and 1+
#1. get all genes with 0 or with 1+ uorfs - 2 df's (no start-stops)
uORF <- orf_gene %>% filter(orf_type=="uORF")#6461
#merge length_exon to get info
names(length_exon)[3]<-"transcript"
uorf_per_gene <- merge(uORF, length_exon, by="transcript")
#all genes no uorf
no_uorf <- anti_join(length_exon, uorf_per_gene, by="transcript")#12303
#2. merge each of these with disease groups to get the all gene set for comparison for each section
#better way to do this is have them as lists/vectors so not making a million dfs. then can put values in more easily i think
#need all genes - each of the disease cat
no_dom <- anti_join(length_exon, mono, by="gene")
no_rec <- anti_join(length_exon, bi, by="gene")
no_onc <- anti_join(length_exon, onc, by="gene")
no_tsg <- anti_join(length_exon, TSG2, by="gene")
no_hap <- anti_join(length_exon, hap_len, by="gene")
no_trip <- anti_join(length_exon, trip_len, by="gene")
#DD:
no_dom <- anti_join(length_exon, mono, by="gene")#all gene set without DD dom genes in
no_d_no<- merge(no_dom, no_uorf, by="gene")#all genes with no DD dom in which have no uorfs
no_d_uorf <- merge(no_dom, uorf_per_gene, by="gene")#all genes with no dd dom in , which have 1+uorf
no_rec <- anti_join(length_exon, bi, by="gene") #
no_r_no <- merge(no_rec, no_uorf, by="gene")#
no_r_uorf <- merge(no_rec, uorf_per_gene, by="gene")#
#cancer
no_onc_no <- merge(no_onc, no_uorf, by="gene")
no_onc_uorf <- merge(no_onc, uorf_per_gene, by="gene")#
no_tsg_no<- merge(no_tsg, no_uorf, by="gene")#
no_tsg_uorf <- merge(no_tsg, uorf_per_gene, by="gene")#
#ds
no_hap_no <- merge(no_hap, no_uorf, by="gene")
no_hap_uorf <- merge(no_hap, uorf_per_gene, by="gene")#
no_trip_no <- merge(no_trip, no_uorf, by="gene")
no_trip_uorf <- merge(no_trip, uorf_per_gene, by="gene")#
#stats. make a matrix for each dis set but without manual coding
#1.DD
#dd dom genes with no uorf and 1+ uorf
d2 <- anti_join(mono, uorf_per_gene, by="gene")#
dom_uorf <- merge(uorf_per_gene, mono, by="gene")#
#get 0 uorf for all and dd dom
nouo <- c(nrow(no_d_no), nrow(d2))
uo <- c(nrow(no_d_uorf), nrow(dom_uorf))
monuorf <- data.frame("0"=nouo, "1"=uo)
rownames(monuorf) <- c("all", "dom")
colnames(monuorf) <- c("0", "1")
chisq.test(monuorf)$p.value
#p = 2.776507e-19
#rec
r2 <- anti_join(bi, uorf_per_gene, by="gene")#691
rec_uorf <- merge(uorf_per_gene, bi, by="gene")#260
#get 0 uorf for all and dd rec
nouo <- c(nrow(no_r_no), nrow(r2))
uo <- c(nrow(no_r_uorf), nrow(rec_uorf))
biuorf <- data.frame("0"=nouo, "1"=uo)
rownames(biuorf) <- c("all", "rec")
colnames(biuorf) <- c("0", "1")
chisq.test(biuorf)$p.value
#2.700602e-06
#2. cancer
#onc
o2 <- anti_join(onc, uorf_per_gene, by="gene")#
onc_uorf <- merge(uorf_per_gene, onc, by="gene")#43
#get 0 uorf for all and dd rec
nouo <- c(nrow(no_onc_no), nrow(o2))
uo <- c(nrow(no_onc_uorf), nrow(onc_uorf))
oncuorf <- data.frame("0"=nouo, "1"=uo)
rownames(oncuorf) <- c("all", "onc")
colnames(oncuorf) <- c("0", "1")
chisq.test(oncuorf)$p.value
#0.06968515
#TSG
t2 <- anti_join(TSG2, uorf_per_gene, by="gene")#84
tsg_uorf <- merge(uorf_per_gene, TSG2, by="gene")#75
#get 0 uorf for all and dd rec
nouo <- c(nrow(no_tsg_no), nrow(t2))
uo <- c(nrow(no_tsg_uorf), nrow(tsg_uorf))
tsguorf <- data.frame("0"=nouo, "1"=uo)
rownames(tsguorf) <- c("all", "tsg")
colnames(tsguorf) <- c("0", "1")
chisq.test(tsguorf)$p.value
#6.471464e-05
#DS
#hap
h2 <- anti_join(hap_len, uorf_per_gene, by="gene")#84
hap_uorf <- merge(uorf_per_gene, hap_len, by="gene")#75
#get 0 uorf for all and dd rec
nouo <- c(nrow(no_hap_no), nrow(h2))
uo <- c(nrow(no_hap_uorf), nrow(hap_uorf))
hapuorf <- data.frame("0"=nouo, "1"=uo)
rownames(hapuorf) <- c("all", "hap")
colnames(hapuorf) <- c("0", "1")
chisq.test(hapuorf)$p.value
# 3.320967e-43
#trip
t2 <- anti_join(trip_len, uorf_per_gene, by="gene")#173
trip_uorf <- merge(uorf_per_gene, trip_len, by="gene")#95
#get 0 uorf for all and dd rec
nouo <- c(nrow(no_trip_no), nrow(t2))
uo <- c(nrow(no_trip_uorf), nrow(trip_uorf))
tripuorf <- data.frame("0"=nouo, "1"=uo)
rownames(tripuorf) <- c("all", "trip")
colnames(tripuorf) <- c("0", "1")
chisq.test(tripuorf)$p.value
#1.414266e-07
#now for stats for start-stops
#1. get all genes with 0 or with 1+ start-stops - 2 df's
#genes with start-stops
ss_per_gene <- orf_gene %>% filter(orf_type=="start-stop")
ss_per_gene <- merge(ss_per_gene, length_exon, by="transcript")
#all genes no ss
no_ss <- anti_join(length_exon, ss_per_gene, by="gene")
#2. All genes with no disease genes in with and without start-stops
no_d_no<- merge(no_dom, no_ss, by="gene")#
no_d_ss <- merge(no_dom, ss_per_gene, by="gene")#
no_r_no <- merge(no_rec, no_ss, by="gene")#
no_r_ss <- merge(no_rec, ss_per_gene, by="gene")#
#cancer
no_onc_no <- merge(no_onc, no_ss, by="gene")
no_onc_ss <- merge(no_onc, ss_per_gene, by="gene")#
no_tsg_no<- merge(no_tsg, no_ss, by="gene")#
no_tsg_ss <- merge(no_tsg, ss_per_gene, by="gene")#
#ds
no_hap_no <- merge(no_hap, no_ss, by="gene")
no_hap_ss <- merge(no_hap, ss_per_gene, by="gene")#
no_trip_no <- merge(no_trip, no_ss, by="gene")
no_trip_ss <- merge(no_trip, ss_per_gene, by="gene")#
#3. stats
#make a matrix for each dis set but without manual coding
#1.DD
#dd dom genes with no ss and 1+ ss
d2 <- anti_join(mono, ss_per_gene, by="gene")#305
dom_ss <- merge(ss_per_gene, mono, by="gene")#23
#get 0 ss for all and dd dom
noss <- c(nrow(no_d_no), nrow(d2))
ss1 <- c(nrow(no_d_ss), nrow(dom_ss))
monss <- data.frame("0"=noss, "1"=ss1)
rownames(monss) <- c("all", "dom")
colnames(monss) <- c("0", "1")
chisq.test(monss)$p.value
# 0.07098842
#rec
r2 <- anti_join(bi, ss_per_gene, by="gene")#
rec_ss <- merge(ss_per_gene, bi, by="gene")#
#get 0 ss for all and dd rec
noss <- c(nrow(no_r_no), nrow(r2))
ss1 <- c(nrow(no_r_ss), nrow(rec_ss))
recss <- data.frame("0"=noss, "1"=ss1)
rownames(recss) <- c("all", "rec")
colnames(recss) <- c("0", "1")
chisq.test(recss)$p.value
#0.0001345775
#CANCER
#onc
o2 <- anti_join(onc, ss_per_gene, by="gene")#
onc_ss <- merge(ss_per_gene, onc, by="gene")#
#get 0 uorf for all and dd rec
noss <- c(nrow(no_onc_no), nrow(o2))
ss1 <- c(nrow(no_onc_ss), nrow(onc_ss))
oncss <- data.frame("0"=noss, "1"=ss1)
rownames(oncss) <- c("all", "onc")
colnames(oncss) <- c("0", "1")
chisq.test(oncss)$p.value
# 0.0752052
#tsg
t2 <- anti_join(TSG2, ss_per_gene, by="gene")#
tsg_ss <- merge(ss_per_gene, TSG2, by="gene")#
#get 0 uorf for all and dd rec
noss <- c(nrow(no_tsg_no), nrow(t2))
ss1 <- c(nrow(no_tsg_ss), nrow(tsg_ss))
tsgss <- data.frame("0"=noss, "1"=ss1)
rownames(tsgss) <- c("all", "tsg")
colnames(tsgss) <- c("0", "1")
chisq.test(tsgss)$p.value
#0.9472937
#DS
#hap
h2 <- anti_join(hap_len, ss_per_gene, by="gene")#
hap_ss <- merge(ss_per_gene, hap_len, by="gene")#
#get 0 uorf for all and dd rec
noss <- c(nrow(no_hap_no), nrow(h2))
ss1 <- c(nrow(no_hap_ss), nrow(hap_ss))
hapss <- data.frame("0"=noss, "1"=ss1)
rownames(hapss) <- c("all", "hap")
colnames(hapss) <- c("0", "1")
chisq.test(hapss)$p.value
#3.062198e-08
#trip
ti2 <- anti_join(trip_len, ss_per_gene, by="gene")#
trip_ss <- merge(ss_per_gene, trip_len, by="gene")#
#get 0 uorf for all and dd rec
noss <- c(nrow(no_trip_no), nrow(ti2))
ss1 <- c(nrow(no_trip_ss), nrow(trip_ss))
tripss <- data.frame("0"=noss, "1"=ss1)
rownames(tripss) <- c("all", "trip")
colnames(tripss) <- c("0", "1")
chisq.test(tripss)$p.value
#0.04723285
```
#4. PhyloP
```{r}
#make a grouped boxplot of mean phylop for 5utrs
phylop <- read.table("five_utr_ph_m1.txt")
#add phylop to each disease sub and rbind into 1
#hap_len, trip_len,onc,TSG2,mono, bi, all
hap_ph <- merge(phylop, hap_len, by="gene")
trip_ph <- merge(phylop, trip_len, by="gene")
onc_ph <- merge(phylop, onc, by="gene")
tsg_ph <- merge(phylop, TSG2, by="gene")
mono_ph <- merge(phylop, mono, by="gene")
bi_ph <- merge(phylop, bi, by="gene")
all_ph <- merge(phylop, all, by="gene")
phylop_plot <- rbind(all_ph, hap_ph, trip_ph,onc_ph,tsg_ph,mono_ph, bi_ph)
#add median intercept
median(all_ph$agg_PGS, na.rm = TRUE)
#0.2765763
#raincloud
ggplot(data = phylop_plot, aes(x=group, y = agg_PGS)) + geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8, fill="#793A92") + geom_point(aes(y = agg_PGS, fill = group), position = position_jitter(width = 0.15), size = 1, alpha = 0.1, color="grey48") + geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5, fill="#793A92") + labs(y = "Mean PhyloP per 5'UTR") + guides(fill = FALSE, color = FALSE) +theme_light() + coord_flip() + geom_hline(yintercept = 0.2765763, colour="black",linetype = "dotted") + facet_grid(disease ~ ., scales = "free", space = "free", labeller = labeller(disease=label_wrap_gen(width=20))) + facet_grid(disease ~ ., scales = "free", space = "free", labeller = labeller(disease=label_wrap_gen(width=20))) + theme(axis.text.y = element_text(size = 10)) + theme(strip.background =element_rect(fill="gray47"))+ theme(strip.text = element_text(colour = 'white', size = '6', face="bold"))
#some stats
#mean 5'utr phylop per group
#function
means <- function(df){
x <- sum(df$agg_PGS[which(df$agg_PGS>=0)])
y <- sum(df$agg_PGS[which(df$agg_PGS<=0)])
z <- x+y #(x-y added it)
#get number of values in df
return(z/nrow(df))
}
means(hap_ph)#1.072923
means(trip_ph)#1.133459
means(onc_ph)#0.8545772
means(tsg_ph)#0.89462
means(mono_ph)#1.487475
means(bi_ph)# 0.2837177
means(all_ph)# 0.4173547
#need stats for this
##############do students t test for these
#dd dominant
n_dom_ph <- anti_join(all_ph,mono_ph, by="gene")#all genes with no dd dom in
no_domp <- n_dom_ph$agg_PGS
domp <- mono_ph$agg_PGS
t.test(no_domp, domp)$p.value
#1.378929e-46
#dd recessive
n_rec_ph <- anti_join(all_ph,bi_ph, by="gene")
no_recp <- n_rec_ph$agg_PGS
recp <- bi_ph$agg_PGS
t.test(no_recp, recp)$p.value
#4.933076e-08
#onc
n_onc_ph <- anti_join(all_ph,onc_ph, by="gene")
no_oncp <- n_onc_ph$agg_PGS
oncp <- onc_ph$agg_PGS
t.test(no_oncp,oncp)$p.value
#9.040601e-06
#tsg
n_tsg_ph <- anti_join(all_ph,tsg_ph, by="gene")
no_tsgp <- n_tsg_ph$agg_PGS
tsgp <- tsg_ph$agg_PGS
t.test(no_tsgp,tsgp)$p.value
#4.298317e-08
#hap
n_hap_ph <- anti_join(all_ph,hap_ph, by="gene")
no_happ <- n_hap_ph$agg_PGS
happ <- hap_ph$agg_PGS
t.test(no_happ, happ)$p.value
#2.915245e-243
#trip
n_trip_ph <- anti_join(all_ph,trip_ph, by="gene")
no_tripp <- n_trip_ph$agg_PGS
tripp<- trip_ph$agg_PGS
t.test(no_tripp, tripp)$p.value
#6.059623e-133
#do chi square test on how many in each group had phylop >2 or < 2. make a matrix of each disease group. think how to set up, bec i was originally comparing top and bottom loeuf, so wht am i comparign ehre? all genes minus disease group? yes
#1. need all genes minus the disease set
#2.set up individual matrices for each disease set
#(all_ph, hap_ph, trip_ph,onc_ph,tsg_ph,mono_ph, bi_ph)
#all genes without specific disease set in
n_dom_ph <- anti_join(all_ph,mono_ph, by="gene")#
n_rec_ph <- anti_join(all_ph,bi_ph, by="gene")#
n_hap_ph <- anti_join(all_ph,hap_ph, by="gene")#
n_trip_ph <- anti_join(all_ph,trip_ph, by="gene")#
n_onc_ph <- anti_join(all_ph,onc_ph, by="gene")#
n_tsg_ph <- anti_join(all_ph,tsg_ph, by="gene")#
dom_p_stat <- n_dom_ph %>% mutate(two_cutoff = agg_PGS>2)#allgenes
dom_p_stat2 <- mono_ph %>% mutate(two_cutoff = agg_PGS>2)#dom
dom_p_stat <- dom_p_stat %>% mutate(two_cutoff=str_replace(two_cutoff, "TRUE", ">2"))
dom_p_stat <- dom_p_stat %>% mutate(two_cutoff=str_replace(two_cutoff, "FALSE", "<2"))
dom_p_stat2 <- dom_p_stat2 %>% mutate(two_cutoff=str_replace(two_cutoff, "TRUE", ">2"))
dom_p_stat2 <- dom_p_stat2 %>% mutate(two_cutoff=str_replace(two_cutoff, "FALSE", "<2"))
#get totals for each group
dom <- nrow(mono_ph) #327
n_dom <- nrow(n_dom_ph) #16339
dom_st <- data.frame(table(dom_p_stat$two_cutoff))
dom_st2 <- data.frame(table(dom_p_stat2$two_cutoff))
dom_st$total <- n_dom
dom_st2$total <- dom
dom_st$perc <- (dom_st$Freq/dom_st$total)*100
dom_st2$perc <- (dom_st2$Freq/dom_st2$total)*100
alldom_fin <- as.data.frame(t(dom_st))
dom_fin <- as.data.frame(t(dom_st2))
#select the only row I want
alldom_fin <- alldom_fin %>% slice(2)
names(alldom_fin)[1]<-"<2"
names(alldom_fin)[2]<-">2"
rownames(alldom_fin) <- c("all")
dom_fin <- dom_fin %>% slice(2)
names(dom_fin)[1]<-"<2"
names(dom_fin)[2]<-">2"
rownames(dom_fin) <- c("dom")
phydom_stats <- rbind(alldom_fin, dom_fin)
#2 issues here - need to be round numbers, numeric and unlisted i think
#make numeric
phydom_stats$`<2` <- as.numeric(phydom_stats$`<2`)
phydom_stats$`>2` <- as.numeric(phydom_stats$`>2`)
#round
phydom_stats <- phydom_stats %>% mutate_if(is.numeric, round)
#fisher test
chisq.test(phydom_stats) $p.value
#p3.131061e-65
#make function
phy_stats <- function(df){
df <- df%>% mutate(two_cutoff = agg_PGS>2)#differentiate above and below 2
df <- df %>% mutate(two_cutoff=str_replace(two_cutoff, "TRUE", ">2"))#change to meaningful vals
df <- df %>% mutate(two_cutoff=str_replace(two_cutoff, "FALSE", "<2"))
total <- nrow(df) #16339
df2 <- data.frame(table(df$two_cutoff))
df2$total <- total
df2$perc <- (df2$Freq/df2$total)*100
#transpose
df3 <- as.data.frame(t(df2))
#select the only row I want
df4<- df3 %>% slice(2)
names(df4)[1]<-"<2"
names(df4)[2]<-">2"
rownames(df4) <- c("dis")
df4$`<2` <- as.numeric(df4$`<2`)
df4$`>2` <- as.numeric(df4$`>2`)
#round
df4<- df4 %>% mutate_if(is.numeric, round)
return (df4)
}
#to run
nrec <- phy_stats(n_rec_ph)
#now can run on all 12 and then bind them and do stats on. make them into a list and
nrec <- phy_stats(n_rec_ph)
nonc <- phy_stats(n_onc_ph)
ntsg <- phy_stats(n_tsg_ph)
nhap<- phy_stats(n_hap_ph)
ntrip <- phy_stats(n_trip_ph)
rec<- phy_stats(bi_ph)
onc<- phy_stats(onc_ph)
tsg<- phy_stats(tsg_ph)
hap<- phy_stats(hap_ph)
trip<- phy_stats(trip_ph)
rec_stat <- rbind(nrec, rec)
onc_stat <- rbind(nonc, onc)
tsg_stat <- rbind(ntsg, tsg)
hap_stat <- rbind(nhap, hap)
trip_stat <- rbind(ntrip, trip)
#chisq
chisq.test(rec_stat)$p.value#0.0008868729
chisq.test(onc_stat)$p.value#0.01144827
chisq.test(tsg_stat)$p.value#9.542434e-05
chisq.test(hap_stat)$p.value#3.976505e-181
chisq.test(trip_stat)$p.value#6.16831e-12
##for paper i want percentages in each group
perc_fun <- function(df){
df$total <- df$`<2` + df$`>2`
df$perc_less <- (df$`<2` / df$total) *100
df$perc_more <- (df$`>2` / df$total) *100
return(df)
}
#list of df i want to run this on
dfs <- list(phydom_stats, rec_stat, onc_stat, tsg_stat, hap_stat, trip_stat)
#run perc fun on all
lapply(dfs, perc_fun)
#dom
#5% of all genes (minuns dd dom) had phylop>2
#25% of DD dom genes had phylop>2
#rec
#5% of all genes (minuns dd rec) had phylop>2
#3% of DD rec genes had phylop>2
#onc
#5% of all genes (minuns onc) had phylop>2
#11% of onc genes had phylop>2
#tsg
# 5% of all genes (minuns tsg) had phylop>2
# 12% of tsg genes had phylop>2
#hap
# 3% of all genes (minuns hap) had phylop>2
# 16% of hap genes had phylop>2
#trip
# 5% of all genes (minuns trip) had phylop>2
# 14% of trip genes had phylop>2
#all genes with no disease removed % for >2 and <2
all_ph2 <- phy_stats(all_ph)
perc_fun(all_ph2)
#5% have >2 phlyop
```
#5. Introns
```{r}
#length_plot from length chunk has intron count in
dis_int <- length_plot %>% mutate(intron_cutoff = intron_final>0)#this makes true if intron number is >0, false if less
#count how many
dis_int2 <- dis_int %>% group_by(group, disease) %>% dplyr::count(intron_cutoff)#this looks nice
#get percentages
dis_int2$total <- parse_number(dis_int2$group)
dis_int2$n <- as.numeric(dis_int2$n)
dis_int2$perc <- (dis_int2$n/dis_int2$total)*100
#prepare for plot
#change intron true/false into meaningful
dis_int2 <- dis_int2 %>% mutate(intron_cutoff=str_replace(intron_cutoff, "TRUE", "1+ Intron"))
dis_int2 <- dis_int2 %>% mutate(intron_cutoff=str_replace(intron_cutoff, "FALSE", "0 Intron"))
dis_int2$intron_cutoff <- factor(dis_int2$intron_cutoff, levels=c("1+ Intron","0 Intron")) #but now figure leg is wrong
ggplot(dis_int2, aes(x=perc, y=group, fill=intron_cutoff)) + geom_bar(position="dodge", stat="identity", width=0.8) + theme_light() + ylab("Gene Groups") + xlab("Percentage") +facet_grid(disease ~ ., scales = "free", space = "free", labeller = labeller(disease=label_wrap_gen(width=20))) + theme(axis.text.y = element_text(size = 10)) + theme(strip.background =element_rect(fill="gray47"))+ theme(strip.text = element_text(colour = 'white', size = '9', face="bold")) +scale_fill_manual(values = c("0 Intron" = "#BFACC8", "1+ Intron" = "#2B4595"))
#change group names
dis_int2[dis_int2=="Developmental Disorder"] <- "DD"
dis_int2[dis_int2=="Dosage Sensitive"] <- "DS"
dis_int2[dis_int2=="All Genes"] <- "All"
ggplot(dis_int2, aes(x=perc, y=group, fill=intron_cutoff)) + geom_bar(position="dodge", stat="identity", width=0.8) + theme_light() + ylab("Gene Groups") + xlab("Percentage") +facet_grid(disease ~ ., scales = "free", space = "free", labeller = labeller(disease=label_wrap_gen(width=20))) + theme(axis.text.y = element_text(size = 10)) + theme(strip.background =element_rect(fill="gray47"))+ theme(strip.text = element_text(colour = 'white', size = '9', face="bold")) +scale_fill_manual(values = c("0 Intron" = "#BFACC8", "1+ Intron" = "#2B4595")) + theme(legend.title=element_blank())
#######STATS############
#make matrices of each group plus all gene set
##redo to be against all genes minus individual disease sets
#in the phylop stats i have already generated all gene sets minus individual disease genes and it contains introns
#in each set split between how many have 0 introns and how nany have 1+ intron?
#then compare proportions between the 2 groups?
#so i have proprtions already for all disease genes, now i need for all genes minus the disease ones
all_min_dom <- n_dom_ph %>% mutate(intron_cutoff = intron_final.x>0)
all_min_rec <- n_rec_ph %>% mutate(intron_cutoff = intron_final.x>0)
all_min_onc <- n_onc_ph %>% mutate(intron_cutoff = intron_final.x>0)
all_min_tsg <- n_tsg_ph %>% mutate(intron_cutoff = intron_final.x>0)
all_min_hap <- n_hap_ph %>% mutate(intron_cutoff = intron_final.x>0)
all_min_trip <- n_trip_ph %>% mutate(intron_cutoff = intron_final.x>0)
all_min_dom <- as.data.frame(table(all_min_dom$intron_cutoff))
all_min_dom <- as.data.frame(t(all_min_dom))
all_min_rec <- as.data.frame(table(all_min_rec$intron_cutoff))
all_min_rec <- as.data.frame(t(all_min_rec))
all_min_onc <- as.data.frame(table(all_min_onc$intron_cutoff))
all_min_onc <- as.data.frame(t(all_min_onc))
all_min_tsg <- as.data.frame(table(all_min_tsg$intron_cutoff))
all_min_tsg <- as.data.frame(t(all_min_tsg))
all_min_hap <- as.data.frame(table(all_min_hap$intron_cutoff))
all_min_hap <- as.data.frame(t(all_min_hap))
all_min_trip <- as.data.frame(table(all_min_trip$intron_cutoff))
all_min_trip <- as.data.frame(t(all_min_trip))
#now we have corrected all genes sets, now compare to disease gene sets
group_int <- subset(dis_int2, select=c(group, n, intron_cutoff))
#go through each dis subgroup, pull out of this df and do a stats test on it
#dominant
dom_in <- group_int %>% filter(group=="Dominant (n=328)")
dom_int <- subset(dom_in, select=-c(group))
dom_int <- as.data.frame(t(dom_int))
#get into matrix with all_min_dom
names(dom_int)[1]<-"0"
names(dom_int)[2]<-"1"
names(all_min_dom)[1]<-"0"
names(all_min_dom)[2]<-"1"
dom_in2 <- rbind(all_min_dom, dom_int)
dom_in2 <- dom_in2 %>% dplyr::slice(2,3)
dom_in2$`0` <- as.integer(dom_in2$`0`)
dom_in2$`1` <- as.integer(dom_in2$`1`)
chisq.test(dom_in2)$p.value
# 0.06648969
#recessive
rec_in <- group_int %>% filter(group=="Recessive (n=948)")
rec_int <- subset(rec_in, select=-c(group))
rec_int <- as.data.frame(t(rec_int))
#get into matrix with all_min_dom
names(rec_int)[1]<-"0"
names(rec_int)[2]<-"1"
names(all_min_rec)[1]<-"0"
names(all_min_rec)[2]<-"1"
rec_in2 <- rbind(all_min_rec, rec_int)
rec_in2 <- rec_in2 %>% dplyr::slice(2,3)
rec_in2$`0` <- as.integer(rec_in2$`0`)
rec_in2$`1` <- as.integer(rec_in2$`1`)
chisq.test(rec_in2)$p.value
# 4.691054e-06
#onc
onc_in <- group_int %>% filter(group=="Oncogene (n=109)")
onc_int <- subset(onc_in, select=-c(group))
onc_int <- as.data.frame(t(onc_int))
#get into matrix with all_min_dom
names(onc_int)[1]<-"0"
names(onc_int)[2]<-"1"
names(all_min_onc)[1]<-"0"
names(all_min_onc)[2]<-"1"
onc_in2 <- rbind(all_min_onc, onc_int)
onc_in2 <- onc_in2 %>% dplyr::slice(2,3)
onc_in2$`0` <- as.integer(onc_in2$`0`)
onc_in2$`1` <- as.integer(onc_in2$`1`)
chisq.test(onc_in2)$p.value
#0.09402826
#tsg
tsg_in <- group_int %>% filter(group=="TSG (n=166)")
tsg_int <- subset(tsg_in, select=-c(group))
tsg_int <- as.data.frame(t(tsg_int))
#get into matrix with all_min_dom
names(tsg_int)[1]<-"0"
names(tsg_int)[2]<-"1"
names(all_min_tsg)[1]<-"0"
names(all_min_tsg)[2]<-"1"
tsg_in2 <- rbind(all_min_tsg, tsg_int)
tsg_in2 <- tsg_in2 %>% dplyr::slice(2,3)
tsg_in2$`0` <- as.integer(tsg_in2$`0`)
tsg_in2$`1` <- as.integer(tsg_in2$`1`)
chisq.test(tsg_in2)$p.value
#0.148594
#hap
hap_in <- group_int %>% filter(group=="Haploinsufficient (n=2856)")
hap_int <- subset(hap_in, select=-c(group))
hap_int <- as.data.frame(t(hap_int))
#get into matrix with all_min_dom
names(hap_int)[1]<-"0"
names(hap_int)[2]<-"1"
names(all_min_hap)[1]<-"0"
names(all_min_hap)[2]<-"1"
hap_in2 <- rbind(all_min_hap, hap_int)
hap_in2 <- hap_in2 %>% dplyr::slice(2,3)
hap_in2$`0` <- as.integer(hap_in2$`0`)
hap_in2$`1` <- as.integer(hap_in2$`1`)
chisq.test(hap_in2)$p.value
#0.1771262
#trip
trip_in <- group_int %>% filter(group=="Triplosensitive (n=1487)")
trip_int <- subset(trip_in, select=-c(group))
trip_int <- as.data.frame(t(trip_int))
#get into matrix with all_min_dom
names(trip_int)[1]<-"0"
names(trip_int)[2]<-"1"
names(all_min_trip)[1]<-"0"
names(all_min_trip)[2]<-"1"
trip_in2 <- rbind(all_min_trip, trip_int)
trip_in2 <- onc_in2 %>% dplyr::slice(2,3)
trip_in2$`0` <- as.integer(trip_in2$`0`)
trip_in2$`1` <- as.integer(trip_in2$`1`)
chisq.test(trip_in2)$p.value
# 0.3886635
###stats version against all genes (with dis genes)
int_st <- subset(dis_int2, select=c(group, n, intron_cutoff))
zero_int <- int_st %>% filter(intron_cutoff=="0 Intron")
names(zero_int)[2]<-"zero"
one_int <- int_st %>% filter(intron_cutoff=="1+ Intron")
names(one_int)[2]<-"one"
#merge
int_stat <- merge(zero_int, one_int, by="group")
int_stat <- subset(int_stat, select=-c(intron_cutoff.x, intron_cutoff.y))
int_stat$zero <- as.numeric(int_stat$zero)
int_stat$one <- as.numeric(int_stat$one)
#slice df to get all the individual ones
int_stat <- subset(int_stat, select=-c(group))
dom_s <- int_stat %>% slice(1,2)
chisq.test(dom_s)$p.value
#0.0712268
hap_s <- int_stat %>% slice(1,3)
chisq.test(hap_s)$p.value
#0.2505068
onc_s <- int_stat %>% slice(1,4)
chisq.test(onc_s)$p.value
#0.09584875
rec_s <- int_stat %>% slice(1,5)
chisq.test(rec_s)$p.value
#1.332505e-05
trip_s <- int_stat %>% slice(1,6)
chisq.test(trip_s)$p.value
#0.7378612
tsg_s <- int_stat %>% slice(1,7)
chisq.test(tsg_s)$p.value
#0.1519951
```
#6. Recessive Genes vs Middle LEOUF genes
```{r}
#1. middle leouf deciles vs recessive genes length
bi_genes <- bilof$gene.symbol #948
#run gnomad from gnomad file
middle <- gn_final1 %>% filter(decile %in% (5:6))
middle_genes <- middle$gene
#how many recessive are in middle
df <- intersect(bi_genes, middle_genes) #319
length_exon <- read.table("length_introns.txt")
names(length_exon)[1]<-"gene"
mid_len <- merge(middle, length_exon, by="gene")#3507
mean(mid_len$UTR_length)#176.5332
#mean middle leouf genes 5 utr length
rec <- bi$UTR_length
mid <- mid_len$UTR_length
wilcox.test(rec, mid)
#p-value = 0.03791525
#2. introns
intron <- data.frame(gene=length_exon$gene, intron=length_exon$intron_final)
midgn2 <- merge(intron, middle, by="gene")#3507
#comapre to rec. remember a bunch in middle 2 dec contian recessive
bi_int <- merge(intron, bilof, by="gene")
midgn2 <- merge(intron, midgn, by="gene")
mid_int <- midgn2 %>% filter(intron>=1)#1318
bi_int2 <- bi_int %>% filter(intron_final>=1)#290
midz <- nrow(midgn2) - nrow(mid_int)#2189
biz <- nrow(bi_int) - nrow(bi_int2)#658
zeroint <- c(midz, biz)
int <- c(nrow(mid_int), nrow(bi_int2))
rec_int <- data.frame("0"=zeroint, "1+"=int)
rownames(rec_int) <- c("mid", "rec")
colnames(rec_int) <- c("0", "1+")
chisq.test(rec_int)$p.value
# 8.198708e-05
#3. uorfs