4. 5'UTRs of disease genes.Rmd

---
title: "4. 5'UTRs across LEOUF"
output: rmarkdown::github_document
---

#5'UTR Analysis for disease genes (DD, cancer, DS)
1. Bring in gene sets and clean data
2. 5UTR Length
3. uAUG type 
4. PhyloP
5. 5UTR Intron
6. Recessive DD genes vs middle LEOUF deciles

Load Libraries
```{r}
library(splitstackshape)
library(stringr)
library(dplyr)
library(tidyverse)
library("readxl")
library(ggplot2)
```


#1. Bring all data in
```{r}
#plan
#1. Developmental Disorders (DDG2P) restrict to LoF and split dom+rec
#2. Cancer, split onco and TSG
#3. Dosage Sensitive, hap and trip
#4. all genes


################1. DD
#data: downloaded 18 February 2021 from DDG2P (McRae JF, Clayton S, Fitzgerald TW, Kaplanis J, Prigmore E, Rajan D, et al. Prevalence and architecture of de novo mutations in developmental disorders. Nature. 2017 Feb;542(7642):433–8)
#limited to confirmed or probable roles in developmental disorders and loss-of-function disease mechanism.
mst_ddG2P<-read.csv(file="DDG2P_18_2_2021.csv.gz")
ddG2P <- mst_ddG2P
#bring in MANE summary file and merge to get transcript/gene IDs
mane_summary <- read.delim(file = "MANE.GRCh38.v1.0.summary.txt.gz")
mane <- mane_summary
#remove "HGNC:" from HGNC column
manesplit <- cSplit(mane, "HGNC_ID", ":")
#subset
mane2 <- data.frame("gene_id" = manesplit$Ensembl_Gene, "transcript_id" = manesplit$Ensembl_nuc, "HGNC" = manesplit$HGNC_ID_2, "gene"=manesplit$symbol)
#filter DDD.category to confirmed or probable
ddconf <- ddG2P %>% filter(DDD.category == "confirmed")
ddprob <- ddG2P %>% filter(DDD.category == "probable")
dd1 <- rbind(ddconf, ddprob) #2037
#merge MANE with DD data to get transcript/gene ids
names(dd1)[13] <- "HGNC"
dd <- merge(dd1, mane2, by = "HGNC")#2071

#bring length data in
length_exon <- read.table("length_introns.txt")

#filter to just LoF, mono/bi
lof <- dd %>% filter(mutation.consequence == "loss of function")#1443
#merge with lengths to get info
loflen <- merge(lof, length_exon, by = "transcript_id")#1416
monolof <- loflen %>% filter(allelic.requirement == "monoallelic")#328
bilof <- loflen %>% filter(allelic.requirement == "biallelic")#948

########################2. Cancer, onco and TSG
#data: The COSMIC Cancer Gene Census was downloaded 22nd February 2021.  
#Restricted to nonsense, frameshift and missense mutation types and then filtered to oncogene or TSG only as cancer gene type.

#bring in gene census data to extract if they are TSG/oncogenes
copy_census <-  read.csv("Census_allMon Feb 22 14_01_37 2021.csv")
census_cancer_genes <- copy_census
#nonsense, frameshift, missense mutations 
mutations <- dplyr::filter(census_cancer_genes, grepl("N|F|Mis", Mutation.Types))
mutations$split1 <- sub(".*ENSG", "", mutations$Synonyms) 
#this removed ENSG and gave me everything after 
#can now split by comma and then re-add ENSG as a prefix
mutations$ensg <- "ENSG"
mutations$paste <- paste(mutations$ensg, mutations$split1, sep = "")
mut2 <- cSplit(mutations, "paste", ",")

#rename gene_id
names(mut2)[23] <- "gene_id"
#cut off after the full stop for ENSGs  (otherwise when merged with length_exon not enough overlap)
mut3 <- cSplit(mut2, "gene_id", ".")

#filter to TSG and onco
oncogene <- mut3[grepl('oncogene',tolower(mut3$Role.in.Cancer)) & !grepl('tsg',tolower(mut3$Role.in.Cancer)),]
TSG <- mut3[!grepl('oncogene',tolower(mut3$Role.in.Cancer)) & grepl('tsg',tolower(mut3$Role.in.Cancer)),]

#merge length exon
allUTRlengths <- cSplit(length_exon, "gene_id", ".")
#merge
tsg_len <- merge(TSG, allUTRlengths, by = "gene_id_1") #166
onco_len <- merge(oncogene, allUTRlengths, by = "gene_id_1") #109


##########################3. DS, hap and trip
#Collins RL, Glessner JT, Porcu E, Lepamets M, Brandon R, Lauricella C, et al. A cross-disorder dosage sensitivity map of the human genome. Cell. 2022 Aug 4;185(16):3041-3055.e25 
#2,987 HI (pHaplo ≥ 0.86) and 1,559 TS (pTriplo ≥ 0.94) genes

ds <- read_excel("collins DS.xlsx")#18641
names(ds)[1]<-"gene_name"
#gives warnings but think its fine
hap <- ds %>% filter(pHaplo>=0.86)#2987
trip <- ds %>% filter(pTriplo>=0.94)#1559
hap2 <- merge(length_exon, hap, by="gene_name")#2856
trip2 <- merge(length_exon, trip, by="gene_name")#1487
#merged by gene bec no other info provided

#4. All Genes
#length_exon, added above

```
#2. Grouped Length plot 
```{r}
#Get each sub group into its own df with gene, UTR length, and group its in, (and intron_final bec i need it later) Then can rbind them all together

#FACET plot

#1. DD
mono <- subset(monolof, select = c(gene, UTR_length, intron_final))
mono$group <- "Dominant (n=328)"
mono$disease <- "Developmental Disorder"
bi <- subset(bilof, select = c(gene, UTR_length, intron_final))
bi$group <- "Recessive (n=948)"
bi$disease <- "Developmental Disorder"

#2. Cancer
#subset onco_len and TSG_len
onc <- data.frame("gene"=onco_len$gene_name, "UTR_length"=onco_len$UTR_length, "group"="Oncogene (n=109)", "disease"="Cancer", "intron_final"=onco_len$intron_final)
TSG2 <- data.frame("gene"=tsg_len$gene_name, "UTR_length"=tsg_len$UTR_length, "group"="TSG (n=166)", "disease"="Cancer","intron_final"=tsg_len$intron_final)

#3. DS
hap_len <- data.frame("gene"=hap2$gene_name, "UTR_length"=hap2$UTR_length, "group"="Haploinsufficient (n=2856)", "disease"="Dosage Sensitive", "intron_final"=hap2$intron_final)
trip_len <- data.frame("gene"=trip2$gene_name, "UTR_length"=trip2$UTR_length, "group"="Triplosensitive (n=1487)", "disease"="Dosage Sensitive", "intron_final"=trip2$intron_final)

#4. All genes 
all <- data.frame("gene"=length_exon$gene_name, "UTR_length"=length_exon$UTR_length, "group"="All Genes (n=18764)", "disease"="All Genes", "intron_final"=length_exon$intron_final)

#bind into one df and then plot a faceted boxplot?
length_plot <-bind_rows(all, hap_len, trip_len,onc,TSG2,mono, bi)

#plot
#get median for x intercept
median(all$UTR_length)


#raincloud for this
#this works 
source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R") 

ggplot(data = length_plot, aes(y=UTR_length, x = group)) +   geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8, fill="#793A92") +     geom_point(aes(y = UTR_length, fill = group), position = position_jitter(width = 0.15), size = 1, alpha = 0.1, color="grey48") +     geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5, fill="#793A92") +     labs(x = "5'UTR Length (bp)") + guides(fill = FALSE, color = FALSE) +theme_light()  + coord_flip(ylim=c(0,2000)) + geom_hline(yintercept = 136, colour="black",linetype = "dotted") + facet_grid(disease ~ ., scales = "free", space = "free", labeller = labeller(disease=label_wrap_gen(width=20))) + facet_grid(disease ~ ., scales = "free", space = "free", labeller = labeller(disease=label_wrap_gen(width=20))) + theme(axis.text.y = element_text(size = 10)) + theme(strip.background =element_rect(fill="gray47"))+ theme(strip.text = element_text(colour = 'white', size = '6', face="bold"))  + ylab("5'UTR Length (bp)") + xlab("Disease Gene Group")


#some stats 
#means for all and stats to show whether increase/decrease in length is sig

#means
mean(all$UTR_length)#201.6124
mean(mono$UTR_length)#368.9695
mean(bi$UTR_length)#168.6814
mean(onc$UTR_length)#259.4495
mean(TSG2$UTR_length)#254.0181
mean(hap_len$UTR_length)#278.5063
mean(trip_len$UTR_length)# 252.694


#wilcox
al_len <- all$UTR_length
m_len <- mono$UTR_length
bi_le<- bi$UTR_length
on_le<- onc$UTR_length
tsg_le<- TSG2$UTR_length
ha_le<- hap_len$UTR_length
tr_le <- trip_len$UTR_length

#DD
wilcox.test(al_len, m_len)$p.value #4.202658e-39
wilcox.test(al_len, bi_le)$p.value #1.270368e-07
wilcox.test(al_len, on_le)$p.value #1.71202e-05
wilcox.test(al_len, tsg_le)$p.value #0.0003257773
wilcox.test(al_len, ha_le)$p.value #4.777048e-100
wilcox.test(al_len, tr_le)$p.value #2.510808e-30

#do stats for 5utr length against all genes with disease genes removed
n_dom <- anti_join(all,mono,  by="gene")#
n_rec <- anti_join(all,bi,  by="gene")#
n_hap <- anti_join(all,hap_len,  by="gene")#
n_trip <- anti_join(all,trip_len,  by="gene")#
n_onc <- anti_join(all,onc,  by="gene")#
n_tsg <- anti_join(all,TSG2,  by="gene")#

n_dom <- n_dom$UTR_length
n_rec <- n_rec$UTR_length
n_hap <- n_hap$UTR_length
n_trip <- n_trip$UTR_length
n_onc <- n_onc$UTR_length
n_tsg <- n_tsg$UTR_length


wilcox.test(n_dom, m_len)$p.value #1.967071e-40
wilcox.test(n_rec, bi_le)$p.value #2.743379e-08
wilcox.test(n_onc, on_le)$p.value #1.528611e-05
wilcox.test(n_tsg, tsg_le)$p.value #0.0002880399
wilcox.test(n_hap, ha_le)$p.value #2.904055e-135
wilcox.test(n_trip, tr_le)$p.value #2.953262e-35

```

#3. uAUG Types
```{r}
#bring in data
orf_gene <- read.table("orf_gene.txt")

#merge with length disese plot?
#this is only transcript ids. need transcript ids added to length disease plot
info <- data.frame(gene=length_exon$gene_name, transcript=length_exon$transcript_id)
names(info)[1]<-"gene"
len_plot2 <- merge(length_plot, info, by="gene")
#remove dup trnascript bec 1 gene is in twice
len_plot <- len_plot2 %>% distinct()

#now merge with orfs
orf_dis <- merge(len_plot, orf_gene, by="transcript")

#need percentages in each group which have each type of uaug
orf_plot <- orf_dis %>% group_by(group, orf_type, disease) %>% summarise(n())
#can "grep" the number out of the group col to work out percentages
library(tidyr)
orf_plot$total <- parse_number(orf_plot$group)
orf_plot$perc <- (orf_plot$`n()`/orf_plot$total) *100
#plot
#change to Start-Stop
orf_plot[orf_plot=="start-stop"] <- "Start-Stop"
#order bars
orf_plot$orf_type <- factor(orf_plot$orf_type, levels = c("Start-Stop","oORF", "uORF"))


orf_plot[orf_plot=="Developmental Disorder"] <- "DD"
orf_plot[orf_plot=="Dosage Sensitive"] <- "DS"
orf_plot[orf_plot=="All Genes"] <- "All"

ggplot(orf_plot, aes(x=perc, y=group, fill=orf_type)) + geom_bar(position="dodge", stat="identity", width=0.8) + theme_light()  + ylab("Gene Groups") + xlab("Percentage")+ scale_fill_manual(values=c("deepskyblue1","palevioletred1","navy" ))  + guides(fill=guide_legend(title="uAUG Type")) +facet_grid(disease ~ ., scales = "free", space = "free", labeller = labeller(disease=label_wrap_gen(width=20)))+ theme(axis.text.y = element_text(size = 10)) + theme(strip.background =element_rect(fill="gray47"))+ theme(strip.text = element_text(colour = 'white', size = '9', face="bold")) + geom_vline(xintercept = 34.4, colour="navy",linetype = "dotted") + geom_vline(xintercept = 15, colour="palevioletred1",linetype = "dotted") + geom_vline(xintercept = 5, colour="deepskyblue1",linetype = "dotted") + theme(axis.text.x = element_text(vjust = 0.5))

#write.table(orf_plot, "disease_orfs4plot_m1.txt")

#######stats############
#chi-square between having 0 and 1 uorf and start stop compared to all genes

#stats for disease groups having uorfs and start-stops
#make matrix chi squared tables for each against all genes-disease genes. so essentially testing whether each disease group has more vs all genes. between having 0 uorfs/start-stops and 1+

#1. get all genes with 0 or with 1+ uorfs - 2 df's (no start-stops)
uORF <- orf_gene %>% filter(orf_type=="uORF")#6461
#merge length_exon to get info
names(length_exon)[3]<-"transcript"
uorf_per_gene <- merge(uORF, length_exon, by="transcript")
#all genes no uorf
no_uorf <- anti_join(length_exon, uorf_per_gene, by="transcript")#12303

#2. merge each of these with disease groups to get the all gene set for comparison for each section
#better way to do this is have them as lists/vectors so not making a million dfs. then can put values in more easily i think


#need all genes - each of the disease cat
no_dom <- anti_join(length_exon, mono, by="gene")
no_rec <- anti_join(length_exon, bi, by="gene") 
no_onc <- anti_join(length_exon, onc, by="gene")
no_tsg <- anti_join(length_exon, TSG2, by="gene") 
no_hap <- anti_join(length_exon, hap_len, by="gene")
no_trip <- anti_join(length_exon, trip_len, by="gene")

#DD:
no_dom <- anti_join(length_exon, mono, by="gene")#all gene set without DD dom genes in
no_d_no<- merge(no_dom, no_uorf, by="gene")#all genes with no DD dom in which have no uorfs
no_d_uorf <- merge(no_dom, uorf_per_gene, by="gene")#all genes with no dd dom in , which have 1+uorf

no_rec <- anti_join(length_exon, bi, by="gene") #
no_r_no <- merge(no_rec, no_uorf, by="gene")#
no_r_uorf <- merge(no_rec, uorf_per_gene, by="gene")#

#cancer
no_onc_no <- merge(no_onc, no_uorf, by="gene") 
no_onc_uorf <- merge(no_onc, uorf_per_gene, by="gene")#
no_tsg_no<- merge(no_tsg, no_uorf, by="gene")#
no_tsg_uorf <- merge(no_tsg, uorf_per_gene, by="gene")#
#ds
no_hap_no <- merge(no_hap, no_uorf, by="gene")
no_hap_uorf <- merge(no_hap, uorf_per_gene, by="gene")#
no_trip_no <- merge(no_trip, no_uorf, by="gene")
no_trip_uorf <- merge(no_trip, uorf_per_gene, by="gene")#

#stats. make a matrix for each dis set but without manual coding
#1.DD

#dd dom genes with no uorf and 1+ uorf
d2 <- anti_join(mono, uorf_per_gene, by="gene")#
dom_uorf <- merge(uorf_per_gene, mono, by="gene")#

#get 0 uorf for all and dd dom
nouo <- c(nrow(no_d_no), nrow(d2))
uo <- c(nrow(no_d_uorf), nrow(dom_uorf))
monuorf <- data.frame("0"=nouo, "1"=uo)
rownames(monuorf) <- c("all", "dom")
colnames(monuorf) <- c("0", "1")

chisq.test(monuorf)$p.value
#p =  2.776507e-19

#rec
r2 <- anti_join(bi, uorf_per_gene, by="gene")#691
rec_uorf <- merge(uorf_per_gene, bi, by="gene")#260
#get 0 uorf for all and dd rec
nouo <- c(nrow(no_r_no), nrow(r2))
uo <- c(nrow(no_r_uorf), nrow(rec_uorf))
biuorf <- data.frame("0"=nouo, "1"=uo)
rownames(biuorf) <- c("all", "rec")
colnames(biuorf) <- c("0", "1")
chisq.test(biuorf)$p.value
#2.700602e-06

#2. cancer
#onc
o2 <- anti_join(onc, uorf_per_gene, by="gene")#
onc_uorf <- merge(uorf_per_gene, onc, by="gene")#43
#get 0 uorf for all and dd rec
nouo <- c(nrow(no_onc_no), nrow(o2))
uo <- c(nrow(no_onc_uorf), nrow(onc_uorf))
oncuorf <- data.frame("0"=nouo, "1"=uo)
rownames(oncuorf) <- c("all", "onc")
colnames(oncuorf) <- c("0", "1")
chisq.test(oncuorf)$p.value
#0.06968515

#TSG
t2 <- anti_join(TSG2, uorf_per_gene, by="gene")#84
tsg_uorf <- merge(uorf_per_gene, TSG2, by="gene")#75
#get 0 uorf for all and dd rec
nouo <- c(nrow(no_tsg_no), nrow(t2))
uo <- c(nrow(no_tsg_uorf), nrow(tsg_uorf))
tsguorf <- data.frame("0"=nouo, "1"=uo)
rownames(tsguorf) <- c("all", "tsg")
colnames(tsguorf) <- c("0", "1")
chisq.test(tsguorf)$p.value
#6.471464e-05

#DS
#hap
h2 <- anti_join(hap_len, uorf_per_gene, by="gene")#84
hap_uorf <- merge(uorf_per_gene, hap_len, by="gene")#75
#get 0 uorf for all and dd rec
nouo <- c(nrow(no_hap_no), nrow(h2))
uo <- c(nrow(no_hap_uorf), nrow(hap_uorf))
hapuorf <- data.frame("0"=nouo, "1"=uo)
rownames(hapuorf) <- c("all", "hap")
colnames(hapuorf) <- c("0", "1")
chisq.test(hapuorf)$p.value
# 3.320967e-43

#trip
t2 <- anti_join(trip_len, uorf_per_gene, by="gene")#173
trip_uorf <- merge(uorf_per_gene, trip_len, by="gene")#95
#get 0 uorf for all and dd rec
nouo <- c(nrow(no_trip_no), nrow(t2))
uo <- c(nrow(no_trip_uorf), nrow(trip_uorf))
tripuorf <- data.frame("0"=nouo, "1"=uo)
rownames(tripuorf) <- c("all", "trip")
colnames(tripuorf) <- c("0", "1")
chisq.test(tripuorf)$p.value
#1.414266e-07


#now for stats for start-stops
#1. get all genes with 0 or with 1+ start-stops - 2 df's 
#genes with start-stops
ss_per_gene <- orf_gene %>% filter(orf_type=="start-stop")
ss_per_gene <- merge(ss_per_gene, length_exon, by="transcript")
#all genes no ss
no_ss <- anti_join(length_exon, ss_per_gene, by="gene")

#2. All genes with no disease genes in with and without start-stops
no_d_no<- merge(no_dom, no_ss, by="gene")#
no_d_ss <- merge(no_dom, ss_per_gene, by="gene")#
no_r_no <- merge(no_rec, no_ss, by="gene")#
no_r_ss <- merge(no_rec, ss_per_gene, by="gene")#

#cancer
no_onc_no <- merge(no_onc, no_ss, by="gene") 
no_onc_ss <- merge(no_onc, ss_per_gene, by="gene")#
no_tsg_no<- merge(no_tsg, no_ss, by="gene")#
no_tsg_ss <- merge(no_tsg, ss_per_gene, by="gene")#
#ds
no_hap_no <- merge(no_hap, no_ss, by="gene")
no_hap_ss <- merge(no_hap, ss_per_gene, by="gene")#
no_trip_no <- merge(no_trip, no_ss, by="gene")
no_trip_ss <- merge(no_trip, ss_per_gene, by="gene")#

#3. stats
#make a matrix for each dis set but without manual coding
#1.DD
#dd dom genes with no ss and 1+ ss
d2 <- anti_join(mono, ss_per_gene, by="gene")#305
dom_ss <- merge(ss_per_gene, mono, by="gene")#23

#get 0 ss for all and dd dom
noss <- c(nrow(no_d_no), nrow(d2))
ss1 <- c(nrow(no_d_ss), nrow(dom_ss))
monss <- data.frame("0"=noss, "1"=ss1)
rownames(monss) <- c("all", "dom")
colnames(monss) <- c("0", "1")
chisq.test(monss)$p.value
# 0.07098842

#rec
r2 <- anti_join(bi, ss_per_gene, by="gene")#
rec_ss <- merge(ss_per_gene, bi, by="gene")#

#get 0 ss for all and dd rec
noss <- c(nrow(no_r_no), nrow(r2))
ss1 <- c(nrow(no_r_ss), nrow(rec_ss))
recss <- data.frame("0"=noss, "1"=ss1)
rownames(recss) <- c("all", "rec")
colnames(recss) <- c("0", "1")
chisq.test(recss)$p.value
#0.0001345775

#CANCER
#onc
o2 <- anti_join(onc, ss_per_gene, by="gene")#
onc_ss <- merge(ss_per_gene, onc, by="gene")#
#get 0 uorf for all and dd rec
noss <- c(nrow(no_onc_no), nrow(o2))
ss1 <- c(nrow(no_onc_ss), nrow(onc_ss))
oncss <- data.frame("0"=noss, "1"=ss1)
rownames(oncss) <- c("all", "onc")
colnames(oncss) <- c("0", "1")
chisq.test(oncss)$p.value
# 0.0752052


#tsg

t2 <- anti_join(TSG2, ss_per_gene, by="gene")#
tsg_ss <- merge(ss_per_gene, TSG2, by="gene")#
#get 0 uorf for all and dd rec
noss <- c(nrow(no_tsg_no), nrow(t2))
ss1 <- c(nrow(no_tsg_ss), nrow(tsg_ss))
tsgss <- data.frame("0"=noss, "1"=ss1)
rownames(tsgss) <- c("all", "tsg")
colnames(tsgss) <- c("0", "1")
chisq.test(tsgss)$p.value
#0.9472937

#DS
#hap
h2 <- anti_join(hap_len, ss_per_gene, by="gene")#
hap_ss <- merge(ss_per_gene, hap_len, by="gene")#
#get 0 uorf for all and dd rec
noss <- c(nrow(no_hap_no), nrow(h2))
ss1 <- c(nrow(no_hap_ss), nrow(hap_ss))
hapss <- data.frame("0"=noss, "1"=ss1)
rownames(hapss) <- c("all", "hap")
colnames(hapss) <- c("0", "1")
chisq.test(hapss)$p.value
#3.062198e-08

#trip
ti2 <- anti_join(trip_len, ss_per_gene, by="gene")#
trip_ss <- merge(ss_per_gene, trip_len, by="gene")#
#get 0 uorf for all and dd rec
noss <- c(nrow(no_trip_no), nrow(ti2))
ss1 <- c(nrow(no_trip_ss), nrow(trip_ss))
tripss <- data.frame("0"=noss, "1"=ss1)
rownames(tripss) <- c("all", "trip")
colnames(tripss) <- c("0", "1")
chisq.test(tripss)$p.value
#0.04723285
```

#4. PhyloP 
```{r}
#make a grouped boxplot of mean phylop for 5utrs 
phylop <- read.table("five_utr_ph_m1.txt")

#add phylop to each disease sub and rbind into 1
#hap_len, trip_len,onc,TSG2,mono, bi, all

hap_ph <- merge(phylop, hap_len, by="gene")
trip_ph <- merge(phylop, trip_len, by="gene")
onc_ph <- merge(phylop, onc, by="gene")
tsg_ph <- merge(phylop, TSG2, by="gene")
mono_ph <- merge(phylop, mono, by="gene")
bi_ph <- merge(phylop, bi, by="gene")
all_ph <- merge(phylop, all, by="gene")

phylop_plot <- rbind(all_ph, hap_ph, trip_ph,onc_ph,tsg_ph,mono_ph, bi_ph)


#add median intercept
median(all_ph$agg_PGS, na.rm = TRUE) 
#0.2765763

#raincloud
ggplot(data = phylop_plot, aes(x=group, y = agg_PGS)) +   geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8, fill="#793A92") +     geom_point(aes(y = agg_PGS, fill = group), position = position_jitter(width = 0.15), size = 1, alpha = 0.1, color="grey48") +     geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5, fill="#793A92") +     labs(y = "Mean PhyloP per 5'UTR") + guides(fill = FALSE, color = FALSE) +theme_light()   + coord_flip() + geom_hline(yintercept = 0.2765763, colour="black",linetype = "dotted") + facet_grid(disease ~ ., scales = "free", space = "free", labeller = labeller(disease=label_wrap_gen(width=20))) + facet_grid(disease ~ ., scales = "free", space = "free", labeller = labeller(disease=label_wrap_gen(width=20))) + theme(axis.text.y = element_text(size = 10)) + theme(strip.background =element_rect(fill="gray47"))+ theme(strip.text = element_text(colour = 'white', size = '6', face="bold")) 


#some stats
#mean 5'utr phylop per group
#function
means <- function(df){
  x <- sum(df$agg_PGS[which(df$agg_PGS>=0)])
y <- sum(df$agg_PGS[which(df$agg_PGS<=0)])
z <- x+y #(x-y added it)
#get number of values in df
return(z/nrow(df))
}

means(hap_ph)#1.072923
means(trip_ph)#1.133459
means(onc_ph)#0.8545772
means(tsg_ph)#0.89462
means(mono_ph)#1.487475
means(bi_ph)#  0.2837177
means(all_ph)# 0.4173547


#need stats for this

##############do students t test for these
#dd dominant
n_dom_ph <- anti_join(all_ph,mono_ph,  by="gene")#all genes with no dd dom in 
no_domp <- n_dom_ph$agg_PGS
domp <- mono_ph$agg_PGS

t.test(no_domp, domp)$p.value
#1.378929e-46

#dd recessive
n_rec_ph <- anti_join(all_ph,bi_ph,  by="gene")
no_recp <- n_rec_ph$agg_PGS
recp <- bi_ph$agg_PGS
t.test(no_recp, recp)$p.value
#4.933076e-08
#onc
n_onc_ph <- anti_join(all_ph,onc_ph,  by="gene")
no_oncp <- n_onc_ph$agg_PGS
oncp <- onc_ph$agg_PGS
t.test(no_oncp,oncp)$p.value
#9.040601e-06
#tsg
n_tsg_ph <- anti_join(all_ph,tsg_ph,  by="gene")
no_tsgp <- n_tsg_ph$agg_PGS
tsgp <- tsg_ph$agg_PGS
t.test(no_tsgp,tsgp)$p.value
#4.298317e-08
#hap
n_hap_ph <- anti_join(all_ph,hap_ph,  by="gene")
no_happ <- n_hap_ph$agg_PGS
happ <- hap_ph$agg_PGS
t.test(no_happ, happ)$p.value
#2.915245e-243
#trip
n_trip_ph <- anti_join(all_ph,trip_ph,  by="gene")
no_tripp <- n_trip_ph$agg_PGS
tripp<- trip_ph$agg_PGS
t.test(no_tripp, tripp)$p.value
#6.059623e-133


#do chi square test on how many in each group had phylop >2 or < 2. make a matrix of each disease group. think how to set up, bec i was originally comparing top and bottom loeuf, so wht am i comparign ehre? all genes minus disease group? yes

#1. need all genes minus the disease set
#2.set up individual matrices for each disease set

#(all_ph, hap_ph, trip_ph,onc_ph,tsg_ph,mono_ph, bi_ph)
#all genes without specific disease set in
n_dom_ph <- anti_join(all_ph,mono_ph,  by="gene")#
n_rec_ph <- anti_join(all_ph,bi_ph,  by="gene")#
n_hap_ph <- anti_join(all_ph,hap_ph,  by="gene")#
n_trip_ph <- anti_join(all_ph,trip_ph,  by="gene")#
n_onc_ph <- anti_join(all_ph,onc_ph,  by="gene")#
n_tsg_ph <- anti_join(all_ph,tsg_ph,  by="gene")#

dom_p_stat <- n_dom_ph %>% mutate(two_cutoff = agg_PGS>2)#allgenes
dom_p_stat2 <- mono_ph %>% mutate(two_cutoff = agg_PGS>2)#dom

dom_p_stat <- dom_p_stat %>% mutate(two_cutoff=str_replace(two_cutoff, "TRUE", ">2"))
dom_p_stat <- dom_p_stat %>% mutate(two_cutoff=str_replace(two_cutoff, "FALSE", "<2"))

dom_p_stat2 <- dom_p_stat2 %>% mutate(two_cutoff=str_replace(two_cutoff, "TRUE", ">2"))
dom_p_stat2 <- dom_p_stat2 %>% mutate(two_cutoff=str_replace(two_cutoff, "FALSE", "<2"))

#get totals for each group
dom <- nrow(mono_ph) #327
n_dom <- nrow(n_dom_ph) #16339

dom_st <- data.frame(table(dom_p_stat$two_cutoff))
dom_st2 <-  data.frame(table(dom_p_stat2$two_cutoff))
dom_st$total <- n_dom
dom_st2$total <- dom

dom_st$perc <- (dom_st$Freq/dom_st$total)*100
dom_st2$perc <- (dom_st2$Freq/dom_st2$total)*100

alldom_fin <- as.data.frame(t(dom_st))
dom_fin <- as.data.frame(t(dom_st2))


#select the only row I want
alldom_fin <- alldom_fin %>% slice(2)
names(alldom_fin)[1]<-"<2"
names(alldom_fin)[2]<-">2"
rownames(alldom_fin) <- c("all")

dom_fin <- dom_fin %>% slice(2)
names(dom_fin)[1]<-"<2"
names(dom_fin)[2]<-">2"
rownames(dom_fin) <- c("dom")

phydom_stats <- rbind(alldom_fin, dom_fin)
#2 issues here - need to be round numbers, numeric and unlisted i think
#make numeric
phydom_stats$`<2` <- as.numeric(phydom_stats$`<2`)
phydom_stats$`>2` <- as.numeric(phydom_stats$`>2`)
#round
phydom_stats <- phydom_stats %>% mutate_if(is.numeric, round)

#fisher test
chisq.test(phydom_stats) $p.value
#p3.131061e-65

#make function
phy_stats <- function(df){
df <- df%>% mutate(two_cutoff = agg_PGS>2)#differentiate above and below 2
df <- df %>% mutate(two_cutoff=str_replace(two_cutoff, "TRUE", ">2"))#change to meaningful vals
df <- df %>% mutate(two_cutoff=str_replace(two_cutoff, "FALSE", "<2"))
total <- nrow(df) #16339
df2 <- data.frame(table(df$two_cutoff))
df2$total <- total
df2$perc <- (df2$Freq/df2$total)*100
#transpose
df3 <- as.data.frame(t(df2))
#select the only row I want
df4<- df3 %>% slice(2)
names(df4)[1]<-"<2"
names(df4)[2]<-">2"
rownames(df4) <- c("dis")
df4$`<2` <- as.numeric(df4$`<2`)
df4$`>2` <- as.numeric(df4$`>2`)
#round
df4<- df4 %>% mutate_if(is.numeric, round)
return (df4)
}

#to run
nrec <- phy_stats(n_rec_ph)

#now can run on all 12 and then bind them and do stats on. make them into a list and 
nrec  <- phy_stats(n_rec_ph)
nonc <- phy_stats(n_onc_ph)
ntsg <- phy_stats(n_tsg_ph)
nhap<- phy_stats(n_hap_ph)
ntrip <- phy_stats(n_trip_ph)

rec<- phy_stats(bi_ph)
onc<- phy_stats(onc_ph)
tsg<- phy_stats(tsg_ph)
hap<- phy_stats(hap_ph)
trip<- phy_stats(trip_ph)

rec_stat <- rbind(nrec, rec)
onc_stat <- rbind(nonc, onc)
tsg_stat <- rbind(ntsg, tsg)
hap_stat <- rbind(nhap, hap)
trip_stat <- rbind(ntrip, trip)

#chisq
chisq.test(rec_stat)$p.value#0.0008868729
chisq.test(onc_stat)$p.value#0.01144827
chisq.test(tsg_stat)$p.value#9.542434e-05
chisq.test(hap_stat)$p.value#3.976505e-181
chisq.test(trip_stat)$p.value#6.16831e-12

##for paper i want percentages in each group
perc_fun <- function(df){
  df$total <- df$`<2` + df$`>2`
  df$perc_less <- (df$`<2` / df$total) *100
  df$perc_more <- (df$`>2` / df$total) *100
    return(df)
}

#list of df i want to run this on
dfs <- list(phydom_stats, rec_stat, onc_stat, tsg_stat, hap_stat, trip_stat)

#run perc fun on all
lapply(dfs, perc_fun)

#dom
#5% of all genes (minuns dd dom) had phylop>2
#25% of DD dom genes had phylop>2

#rec
#5% of all genes (minuns dd rec) had phylop>2
#3% of DD rec genes had phylop>2

#onc
#5% of all genes (minuns onc) had phylop>2
#11% of onc genes had phylop>2

#tsg
# 5% of all genes (minuns tsg) had phylop>2
# 12% of tsg genes had phylop>2

#hap
# 3% of all genes (minuns hap) had phylop>2
# 16% of hap genes had phylop>2

#trip
# 5% of all genes (minuns trip) had phylop>2
# 14% of trip genes had phylop>2  


#all genes with no disease removed % for >2 and <2
all_ph2 <- phy_stats(all_ph)
perc_fun(all_ph2)
#5% have >2 phlyop
```

#5. Introns
```{r}
#length_plot from length chunk has intron count in

dis_int <- length_plot %>% mutate(intron_cutoff = intron_final>0)#this makes true if intron number is >0, false if less
#count how many 
dis_int2 <- dis_int %>% group_by(group, disease) %>% dplyr::count(intron_cutoff)#this looks nice
#get percentages
dis_int2$total <- parse_number(dis_int2$group)
dis_int2$n <- as.numeric(dis_int2$n)

dis_int2$perc <- (dis_int2$n/dis_int2$total)*100

#prepare for plot
#change intron true/false into meaningful
dis_int2 <- dis_int2 %>% mutate(intron_cutoff=str_replace(intron_cutoff, "TRUE", "1+ Intron"))
dis_int2 <- dis_int2 %>% mutate(intron_cutoff=str_replace(intron_cutoff, "FALSE", "0 Intron"))

dis_int2$intron_cutoff <- factor(dis_int2$intron_cutoff, levels=c("1+ Intron","0 Intron")) #but now figure leg is wrong


ggplot(dis_int2, aes(x=perc, y=group, fill=intron_cutoff)) + geom_bar(position="dodge", stat="identity", width=0.8) + theme_light() + ylab("Gene Groups") + xlab("Percentage")  +facet_grid(disease ~ ., scales = "free", space = "free", labeller = labeller(disease=label_wrap_gen(width=20))) + theme(axis.text.y = element_text(size = 10)) + theme(strip.background =element_rect(fill="gray47"))+ theme(strip.text = element_text(colour = 'white', size = '9', face="bold")) +scale_fill_manual(values = c("0 Intron" = "#BFACC8", "1+ Intron" = "#2B4595")) 

#change group names
dis_int2[dis_int2=="Developmental Disorder"] <- "DD"
dis_int2[dis_int2=="Dosage Sensitive"] <- "DS"
dis_int2[dis_int2=="All Genes"] <- "All"

ggplot(dis_int2, aes(x=perc, y=group, fill=intron_cutoff)) + geom_bar(position="dodge", stat="identity", width=0.8) + theme_light() + ylab("Gene Groups") + xlab("Percentage")  +facet_grid(disease ~ ., scales = "free", space = "free", labeller = labeller(disease=label_wrap_gen(width=20))) + theme(axis.text.y = element_text(size = 10)) + theme(strip.background =element_rect(fill="gray47"))+ theme(strip.text = element_text(colour = 'white', size = '9', face="bold")) +scale_fill_manual(values = c("0 Intron" = "#BFACC8", "1+ Intron" = "#2B4595")) + theme(legend.title=element_blank())


#######STATS############
#make matrices of each group plus all gene set
##redo to be against all genes minus individual disease sets

#in the phylop stats i have already generated all gene sets minus individual disease genes and it contains introns 
#in each set split between how many have 0 introns and how nany have 1+ intron?
#then compare proportions between the 2 groups?
#so i have proprtions already for all disease genes, now i need for all genes minus the disease ones

all_min_dom <- n_dom_ph %>% mutate(intron_cutoff = intron_final.x>0)
all_min_rec <- n_rec_ph %>% mutate(intron_cutoff = intron_final.x>0)
all_min_onc <- n_onc_ph %>% mutate(intron_cutoff = intron_final.x>0)
all_min_tsg <- n_tsg_ph %>% mutate(intron_cutoff = intron_final.x>0)
all_min_hap <- n_hap_ph %>% mutate(intron_cutoff = intron_final.x>0)
all_min_trip <- n_trip_ph %>% mutate(intron_cutoff = intron_final.x>0)

all_min_dom <- as.data.frame(table(all_min_dom$intron_cutoff))
all_min_dom <- as.data.frame(t(all_min_dom))

all_min_rec <- as.data.frame(table(all_min_rec$intron_cutoff))
all_min_rec <- as.data.frame(t(all_min_rec))

all_min_onc <- as.data.frame(table(all_min_onc$intron_cutoff))
all_min_onc <- as.data.frame(t(all_min_onc))


all_min_tsg <- as.data.frame(table(all_min_tsg$intron_cutoff))
all_min_tsg <- as.data.frame(t(all_min_tsg))

all_min_hap <- as.data.frame(table(all_min_hap$intron_cutoff))
all_min_hap <- as.data.frame(t(all_min_hap))

all_min_trip <- as.data.frame(table(all_min_trip$intron_cutoff))
all_min_trip <- as.data.frame(t(all_min_trip))

#now we have corrected all genes sets, now compare to disease gene sets
group_int <- subset(dis_int2, select=c(group, n, intron_cutoff))

#go through each dis subgroup, pull out of this df and do a stats test on it
#dominant
dom_in <- group_int %>% filter(group=="Dominant (n=328)")
dom_int <- subset(dom_in, select=-c(group))
dom_int <- as.data.frame(t(dom_int))
#get into matrix with all_min_dom
names(dom_int)[1]<-"0"
names(dom_int)[2]<-"1"
names(all_min_dom)[1]<-"0"
names(all_min_dom)[2]<-"1"

dom_in2 <- rbind(all_min_dom, dom_int)
dom_in2 <- dom_in2 %>% dplyr::slice(2,3)  
dom_in2$`0` <- as.integer(dom_in2$`0`)
dom_in2$`1` <- as.integer(dom_in2$`1`)
chisq.test(dom_in2)$p.value
# 0.06648969


#recessive
rec_in <- group_int %>% filter(group=="Recessive (n=948)")
rec_int <- subset(rec_in, select=-c(group))
rec_int <- as.data.frame(t(rec_int))
#get into matrix with all_min_dom
names(rec_int)[1]<-"0"
names(rec_int)[2]<-"1"
names(all_min_rec)[1]<-"0"
names(all_min_rec)[2]<-"1"

rec_in2 <- rbind(all_min_rec, rec_int)
rec_in2 <- rec_in2 %>% dplyr::slice(2,3)  
rec_in2$`0` <- as.integer(rec_in2$`0`)
rec_in2$`1` <- as.integer(rec_in2$`1`)
chisq.test(rec_in2)$p.value
# 4.691054e-06

#onc
onc_in <- group_int %>% filter(group=="Oncogene (n=109)")
onc_int <- subset(onc_in, select=-c(group))
onc_int <- as.data.frame(t(onc_int))
#get into matrix with all_min_dom
names(onc_int)[1]<-"0"
names(onc_int)[2]<-"1"
names(all_min_onc)[1]<-"0"
names(all_min_onc)[2]<-"1"

onc_in2 <- rbind(all_min_onc, onc_int)
onc_in2 <- onc_in2 %>% dplyr::slice(2,3)  
onc_in2$`0` <- as.integer(onc_in2$`0`)
onc_in2$`1` <- as.integer(onc_in2$`1`)
chisq.test(onc_in2)$p.value
#0.09402826

#tsg
tsg_in <- group_int %>% filter(group=="TSG (n=166)")
tsg_int <- subset(tsg_in, select=-c(group))
tsg_int <- as.data.frame(t(tsg_int))
#get into matrix with all_min_dom
names(tsg_int)[1]<-"0"
names(tsg_int)[2]<-"1"
names(all_min_tsg)[1]<-"0"
names(all_min_tsg)[2]<-"1"

tsg_in2 <- rbind(all_min_tsg, tsg_int)
tsg_in2 <- tsg_in2 %>% dplyr::slice(2,3)  
tsg_in2$`0` <- as.integer(tsg_in2$`0`)
tsg_in2$`1` <- as.integer(tsg_in2$`1`)
chisq.test(tsg_in2)$p.value
#0.148594

#hap
hap_in <- group_int %>% filter(group=="Haploinsufficient (n=2856)")
hap_int <- subset(hap_in, select=-c(group))
hap_int <- as.data.frame(t(hap_int))
#get into matrix with all_min_dom
names(hap_int)[1]<-"0"
names(hap_int)[2]<-"1"
names(all_min_hap)[1]<-"0"
names(all_min_hap)[2]<-"1"

hap_in2 <- rbind(all_min_hap, hap_int)
hap_in2 <- hap_in2 %>% dplyr::slice(2,3)  
hap_in2$`0` <- as.integer(hap_in2$`0`)
hap_in2$`1` <- as.integer(hap_in2$`1`)
chisq.test(hap_in2)$p.value
#0.1771262


#trip
trip_in <- group_int %>% filter(group=="Triplosensitive (n=1487)")
trip_int <- subset(trip_in, select=-c(group))
trip_int <- as.data.frame(t(trip_int))
#get into matrix with all_min_dom
names(trip_int)[1]<-"0"
names(trip_int)[2]<-"1"
names(all_min_trip)[1]<-"0"
names(all_min_trip)[2]<-"1"

trip_in2 <- rbind(all_min_trip, trip_int)
trip_in2 <- onc_in2 %>% dplyr::slice(2,3)  
trip_in2$`0` <- as.integer(trip_in2$`0`)
trip_in2$`1` <- as.integer(trip_in2$`1`)
chisq.test(trip_in2)$p.value
# 0.3886635


###stats version against all genes (with dis genes)
int_st <- subset(dis_int2, select=c(group, n, intron_cutoff))
zero_int <- int_st %>% filter(intron_cutoff=="0 Intron")
names(zero_int)[2]<-"zero"
one_int <- int_st %>% filter(intron_cutoff=="1+ Intron")
names(one_int)[2]<-"one"

#merge
int_stat <- merge(zero_int, one_int, by="group")
int_stat <- subset(int_stat, select=-c(intron_cutoff.x, intron_cutoff.y))

int_stat$zero <- as.numeric(int_stat$zero)
int_stat$one <- as.numeric(int_stat$one)
#slice df to get all the individual ones
int_stat <- subset(int_stat, select=-c(group))
dom_s <- int_stat %>% slice(1,2)
chisq.test(dom_s)$p.value
#0.0712268
hap_s  <- int_stat %>% slice(1,3)
chisq.test(hap_s)$p.value
#0.2505068
onc_s  <- int_stat %>% slice(1,4)
chisq.test(onc_s)$p.value
#0.09584875
rec_s <- int_stat %>% slice(1,5)
chisq.test(rec_s)$p.value
#1.332505e-05
trip_s  <- int_stat %>% slice(1,6)
chisq.test(trip_s)$p.value
#0.7378612
tsg_s  <- int_stat %>% slice(1,7)
chisq.test(tsg_s)$p.value
#0.1519951

```

#6. Recessive Genes vs Middle LEOUF genes 
```{r}
#1. middle leouf deciles vs recessive genes length

bi_genes <- bilof$gene.symbol #948

#run gnomad from gnomad file
middle <- gn_final1 %>% filter(decile %in% (5:6))
middle_genes <- middle$gene
#how many recessive are in middle
df <- intersect(bi_genes, middle_genes) #319

length_exon <- read.table("length_introns.txt")

names(length_exon)[1]<-"gene"
mid_len <- merge(middle, length_exon, by="gene")#3507
mean(mid_len$UTR_length)#176.5332
#mean middle leouf genes 5 utr length


rec <- bi$UTR_length
mid <- mid_len$UTR_length

wilcox.test(rec, mid)
#p-value = 0.03791525

#2. introns

intron <- data.frame(gene=length_exon$gene, intron=length_exon$intron_final)

midgn2 <- merge(intron, middle, by="gene")#3507

#comapre to rec. remember a bunch in middle 2 dec contian recessive

bi_int <- merge(intron, bilof, by="gene")
midgn2 <- merge(intron, midgn, by="gene")


mid_int <- midgn2 %>% filter(intron>=1)#1318
bi_int2 <- bi_int %>% filter(intron_final>=1)#290

midz <- nrow(midgn2) - nrow(mid_int)#2189
biz <- nrow(bi_int) - nrow(bi_int2)#658

zeroint <- c(midz, biz)
int <- c(nrow(mid_int), nrow(bi_int2))
rec_int <- data.frame("0"=zeroint, "1+"=int)

rownames(rec_int) <- c("mid", "rec")
colnames(rec_int) <- c("0", "1+")

chisq.test(rec_int)$p.value
# 8.198708e-05

#3. uorfs

#middle leouf
mid_uo <- merge(middle, uorf_per_gene, by="gene")#1077
mid_zero <- nrow(middle) - nrow(mid_uo)#2754

#rec
rec_uo <- orf_plot %>% filter(group=="Recessive (n=948)")
rec_uo <- rec_uo %>% filter(orf_type=="uORF")
rec_uo$no <- rec_uo$total- rec_uo$`n()`
rec_zero <- rec_uo$no
rec_uo1 <- rec_uo$`n()`

# stats of having 0 or 1+uorfs

nouo1 <- c(mid_zero, rec_zero)
uo_1 <- c(nrow(mid_uo), rec_uo1)
rec_uo2 <- data.frame("0"=nouo1, "1"=uo_1)

rownames(rec_uo2) <- c("mid", "rec")
colnames(rec_uo2) <- c("0", "1")

chisq.test(rec_uo2)$p.value
#0.6554969

#4. phlyop
#compare phylop recessive vs middle louef phylop
recp <- bi_ph$agg_PGS

phylop <- read.table("five_utr_ph_m1.txt")
#merge
utr_ph <- merge(phylop, gn_final1,by="gene")
mid_ph <- utr_ph %>% filter(decile %in% (5:6))

midp <- mid_ph$agg_PGS

t.test(recp, midp)$p.value
#0.9987832
```