# Figure 7: Venn Diagram

In [None]:
# Project root: the directory that holds the gnomAD/, IGM/, UKBB/, LCR/ and
# venn/ folders used by the relative paths in this notebook.
# setwd("") always fails with "cannot change working directory", so the
# working directory is only changed once a real path has been filled in;
# with the empty placeholder the notebook runs from the current directory.
project_dir <- ""
if (nzchar(project_dir)) {
  setwd(project_dir)
}

Step 0: Make sure the BED files are all in the same format (first three columns only, no "chr" prefix on chromosome names)

In [None]:
# Reformat the UKBB BED file: keep only the first three tab-separated columns
# and delete the text "chr" from the chromosome column.
# R turns each \t into a literal tab before the shell sees the command.
ukbb_raw_bed <- 'UKBB/UKBB_rAF_hi_regions_10bp_CHGR37.bed'
ukbb_formatted_bed <- 'UKBB/UKBB_rAF_hi_regions_10bp_CHGR37_formatted.bed'

ukbb_format_cmd <- paste0(
  'cut -f 1,2,3 -d "\t" ', ukbb_raw_bed,
  ' | awk -F"\t" \'{gsub("chr", "", $1)}1\' OFS="\t" > ', ukbb_formatted_bed
)
system(ukbb_format_cmd, intern = TRUE)


Step 1: Find the regions that do not overlap with LCR 

In [None]:
# Show the working directory so the relative paths below can be checked.
system("pwd", intern = TRUE)

# The shell redirects below fail if the output folder is missing, so create
# it up front (a no-op when it already exists).
dir.create("venn", showWarnings = FALSE)

# Remove the parts of `input_bed` that overlap the LCR track, then sort by
# chromosome/start/end and drop exact duplicate lines.
#   input_bed  - path of the BED file to filter
#   output_bed - path the filtered BED file is written to
#   lcr_bed    - BED file of regions to subtract
# Returns whatever system(..., intern = TRUE) captured (normally nothing,
# because stdout is redirected into `output_bed`).
subtract_lcr <- function(input_bed, output_bed,
                         lcr_bed = "LCR/GRCh37_AllTandemRepeatsandHomopolymers_slop5.bed") {
  cmd <- paste(
    "bedtools subtract -a", shQuote(input_bed),
    "-b", shQuote(lcr_bed),
    "| sort -k1,1 -k2,2n -k3,3n | uniq >", shQuote(output_bed)
  )
  system(cmd, intern = TRUE)
}

subtract_lcr(
  "gnomAD/gnomad.exomes.r2.1.1.sites_indelsonly_rAF_bp10_rAF_hiIndels.lt50bp.region.bed",
  "venn/gnomAD_10bp_rAF_hi_not_LCR_37.bed"
)

subtract_lcr(
  "IGM/2023-03-23_IGM_n39367_indelsonly_rAF_bp10_rAF_hiIndels.lt50bp.region.bed",
  "venn/IGM_10bp_rAF_hi_not_LCR_37.bed"
)

subtract_lcr(
  "UKBB/UKBB_rAF_hi_regions_10bp_CHGR37_formatted.bed",
  "venn/UKBB_10bp_rAF_hi_not_LCR_37.bed"
)


Step 2: Find the regions that are in all three data sets. Let's call this X.

In [None]:
# Triple overlap X, built in two passes: gnomAD vs IGM first, then that
# result vs UKBB. Each pass is sorted and exact duplicate lines are dropped.
sort_and_dedup <- "sort -k1,1 -k2,2n -k3,3n  | uniq"

gnomad_vs_igm_cmd <- paste0(
  "bedtools intersect -a venn/gnomAD_10bp_rAF_hi_not_LCR_37.bed",
  " -b venn/IGM_10bp_rAF_hi_not_LCR_37.bed | ", sort_and_dedup,
  " > venn/gnomAD_v_IGM.bed"
)
system(gnomad_vs_igm_cmd, intern = TRUE)

all_three_cmd <- paste0(
  "bedtools intersect -a venn/gnomAD_v_IGM.bed",
  " -b venn/UKBB_10bp_rAF_hi_not_LCR_37.bed | ", sort_and_dedup,
  " > venn/X.bed"
)
system(all_three_cmd, intern = TRUE)

Step 3: Find the three pairwise intersections (each one still contains the triple overlap X)

    1) gnomAD v. IGM (Let's call this the D and X region)
    
    2) IGM v. UKBB (Let's call this the E and X region)
    
    3) gnomAD v. UKBB (Let's call this the F and X region)

In [None]:
# Pairwise overlaps between the LCR-filtered cohort files. Each output still
# contains the triple overlap, hence the "<region>_and_X" file names.
#   cohort_a, cohort_b - cohort prefixes of the venn/*_10bp_rAF_hi_not_LCR_37.bed files
#   region             - output file stem under venn/
pairwise_overlap <- function(cohort_a, cohort_b, region) {
  overlap_cmd <- sprintf(
    "bedtools intersect -a venn/%s_10bp_rAF_hi_not_LCR_37.bed -b venn/%s_10bp_rAF_hi_not_LCR_37.bed | sort -k1,1 -k2,2n -k3,3n  | uniq > venn/%s.bed",
    cohort_a, cohort_b, region
  )
  system(overlap_cmd, intern = TRUE)
}

pairwise_overlap("gnomAD", "IGM", "D_and_X")

pairwise_overlap("IGM", "UKBB", "E_and_X")

pairwise_overlap("gnomAD", "UKBB", "F_and_X")

Step 4: Read in all the files! Time to do some calculations. 

In [None]:
library(tidyverse)
library(data.table)
library(readxl)
library(dplyr)
library(stringr)

In [None]:
# Load every BED file produced by the bedtools steps from the venn/ folder.
venn_dir <- "venn"

# Per-cohort regions after LCR removal.
gnomAD_tot <- fread(file.path(venn_dir, "gnomAD_10bp_rAF_hi_not_LCR_37.bed"))
IGM_tot <- fread(file.path(venn_dir, "IGM_10bp_rAF_hi_not_LCR_37.bed"))
UKBB_tot <- fread(file.path(venn_dir, "UKBB_10bp_rAF_hi_not_LCR_37.bed"))

# Regions shared by all three cohorts.
X <- fread(file.path(venn_dir, "X.bed"))

# Pairwise overlaps (each still includes X).
D_and_X <- fread(file.path(venn_dir, "D_and_X.bed"))
E_and_X <- fread(file.path(venn_dir, "E_and_X.bed"))
F_and_X <- fread(file.path(venn_dir, "F_and_X.bed"))

In [None]:

## Venn-diagram region sizes (in base pairs) for gnomAD, IGM and UKBB.
##
## BED intervals, as read and written by bedtools, are 0-based and half-open,
## so an interval covers V3 - V2 bases. Adding 1 would count one extra base
## for every fragment that bedtools subtract/intersect produces and would make
## the region arithmetic below inconsistent.
## NOTE(review): the sums assume that intervals inside one file do not overlap
## each other; if the input region files can contain overlapping intervals,
## pass them through `bedtools merge` first - TODO confirm.

# Return `bed` with a bp_length column holding each interval's length.
add_bp_length <- function(bed) {
  # A zero-row table (e.g. an empty overlap file) may have no V2/V3 columns,
  # so it is returned unchanged.
  if (nrow(bed) > 0L) {
    bed$bp_length <- bed$V3 - bed$V2
  }
  bed
}

# Total number of base pairs in a table prepared by add_bp_length().
total_bp <- function(bed) {
  # as.numeric() keeps the sum from overflowing R's integer range.
  if (nrow(bed) == 0L) 0 else sum(as.numeric(bed$bp_length))
}

## Per-interval lengths

gnomAD_tot <- add_bp_length(gnomAD_tot)
IGM_tot <- add_bp_length(IGM_tot)
UKBB_tot <- add_bp_length(UKBB_tot)

X <- add_bp_length(X)

D_and_X <- add_bp_length(D_and_X)
E_and_X <- add_bp_length(E_and_X)
F_and_X <- add_bp_length(F_and_X)

## Total base pairs per file

gnomAD_num <- total_bp(gnomAD_tot)
IGM_num <- total_bp(IGM_tot)
UKBB_num <- total_bp(UKBB_tot)

X_num <- total_bp(X)

D_and_X_num <- total_bp(D_and_X)
E_and_X_num <- total_bp(E_and_X)
F_and_X_num <- total_bp(F_and_X)

## Exclusive Venn regions
## D, E, F: shared by exactly two cohorts (pairwise overlap minus X).
D_num <- D_and_X_num - X_num
E_num <- E_and_X_num - X_num
F_num <- F_and_X_num - X_num

## A, B, C: unique to gnomAD, IGM and UKBB respectively.
A_num <- gnomAD_num - D_num - F_num - X_num
B_num <- IGM_num - D_num - E_num - X_num
C_num <- UKBB_num - E_num - F_num - X_num

venn_counts <- c(
  A = A_num, B = B_num, C = C_num,
  D = D_num, E = E_num, F = F_num,
  X = X_num
)

# A negative region size means the inputs violate the assumptions above.
if (any(venn_counts < 0)) {
  warning(
    "Negative Venn region size for: ",
    paste(names(venn_counts)[venn_counts < 0], collapse = ", "),
    " - check the BED files for overlapping intervals"
  )
}

# print (format() keeps large counts out of scientific notation)
for (region in names(venn_counts)) {
  print(paste0("This is ", region, ": ", format(venn_counts[[region]], scientific = FALSE)))
}

