# The purpose of this code is to investigate the prevalence of suspicious deleterious indels in genes associated with dominant disorders.

Deleterious indels were defined as those annotated as frameshift, splice region, splice donor, splice acceptor, stop gained, start lost, stop lost, or exon loss. 

In [1]:
library(tidyverse)
library(data.table)
library(readxl)
library(dplyr)
library(stringr)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.1     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.1     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Attaching package: ‘data.table’


The following objects are masked from ‘package:lubridate’:

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year

Set the threshold used to define a rare variant.

In [2]:
# Rarity cutoff (1 in 10,000).
# NOTE(review): not referenced anywhere else in the visible notebook -- confirm
# whether it is still needed.
rare_IGM <- 10^-4

Set bp ranges. 

In [3]:
# bp settings to iterate over, kept as character strings because they are
# pasted into object names (e.g. "all_sus_bp10") further down.
bp_range <- as.character(seq(10, 40, by = 10))

Set variant effects. 

In [4]:
# Effect annotations that count as deleterious in this analysis.
# NOTE(review): the description at the top of the notebook also lists
# "splice region", but no splice-region term is present here -- confirm which
# of the two is intended.
variants_effects <- c(
  "frameshift_variant",
  "splice_donor_variant",
  "splice_acceptor_variant",
  "stop_gained",
  "start_lost",
  "stop_lost",
  "exon_loss_variant"
)

Set the total number of samples. 

In [5]:
# Cohort size; used as the denominator when converting per-gene unique-sample
# counts into a prevalence percentage.
tot_num_samples <- 39367

Read in all files.

In [6]:
# Suspicious-indel tables, one CSV per bp setting. The object names must stay
# in the form all_sus_bp<N> because they are looked up by name later on.
all_sus_bp10 <- fread("IGM_all_suspicious_bp10.csv")
all_sus_bp20 <- fread("IGM_all_suspicious_bp20.csv")
all_sus_bp30 <- fread("IGM_all_suspicious_bp30.csv")
all_sus_bp40 <- fread("IGM_all_suspicious_bp40.csv")

In [7]:
# Sample-level metadata export; later joined on its sample_internal_name
# column and read for its BroadPhenotype column.
seqDB <- fread("2023-05-31_seqDB_exome_genome_in_dragendb.csv")

In [8]:
# Per-genotype indel rows. Columns are renamed positionally to the names the
# later merges rely on.
sample_name_gene_name <- fread(
  "2023-03-24_11-47-14_IGM_n39367_indels_genotypes_selectcols.csv"
)
names(sample_name_gene_name) <- c(
  "VarID", "geneName", "sampleName", "coveredCtrl", "AC"
)

# Variant -> effect annotation lookup.
effects <- fread(
  "2023-04-21_IGM_n39367_indels_genotypes_effects.csv",
  header = TRUE
)
names(effects) <- c("VarID", "Effect")

# Gene-level annotations with exact duplicate rows removed.
# Original author's note: 18231 (presumably the row count after
# de-duplication -- TODO confirm).
annotations <- fread(
  "2023-04-21_IGM_genename_gnomadpli_gnomadloeuf_omimdisease.csv"
) %>%
  distinct()
names(annotations) <- c("geneName", "pLI", "oe_lof_upper", "OMIM_disease")



In [9]:
# Manual curation of the gene annotation table.
#
# Gene symbols in `geneName` carry literal single quotes (e.g. "'HTT'"), so
# both the lookups and the replacement values below include them. `%in%` is
# used instead of `==` so that an NA gene name can never become an NA index,
# and a symbol that is absent from the table is simply a no-op.

# Set the OMIM disease entry for HTT.
annotations$OMIM_disease[annotations$geneName %in% "'HTT'"] <-
  "Huntington disease, 143100 (3), Autosomal dominant"

# Rename two gene symbols (GLTSCR1 -> BICRA, FAM46A -> TENT5A).
# The new symbols keep the surrounding quotes: `annotations` is later merged
# on `geneName`, and an unquoted "BICRA" / "TENT5A" cannot equal a quoted key.
# NOTE(review): the original code wrote the unquoted symbols; confirm that the
# genotype table stores these genes as 'BICRA' / 'TENT5A', and re-check the
# downstream tables, since these genes can now take part in the join.
annotations$geneName[annotations$geneName %in% "'GLTSCR1'"] <- "'BICRA'"
annotations$geneName[annotations$geneName %in% "'FAM46A'"] <- "'TENT5A'"

# Supplementary Tables S12-S14

In [10]:
# For every bp setting in `bp_range`, build and keep in the workspace:
#   with_effects_bp<N>      suspicious indels joined to their effect
#   with_samples_bp<N>      ... joined to the per-sample genotype rows
#   with_annotations_bp<N>  ... joined to the gene-level annotations
#   masterfile_sus_bp<N>    same table as with_annotations_bp<N>
#   constrained_sus_<N>     deleterious effects in constrained genes only
#   table_<N>               one row per gene with its unique-sample count and
#                           prevalence (Supplementary Tables S12-S14)
#
# Intermediate results are held in ordinary local variables and written to
# their workspace names once, at the end of each iteration. This avoids
# re-reading objects through get() and avoids a variable called `table`,
# which would shadow base::table().

# A gene is reported only when MORE than this many distinct samples carry a
# qualifying indel in it.
min_uniq_samples <- 50

for (bp in bp_range) {
  # All three merges are inner joins, so rows lacking an effect, a genotype
  # row, or a gene annotation are dropped.
  sus_effects <- merge(get(paste0("all_sus_bp", bp)), effects, by = "VarID")
  sus_samples <- merge(sus_effects, sample_name_gene_name, by = "VarID")
  sus_annotated <- merge(sus_samples, annotations, by = "geneName")

  # Deleterious effects in constrained genes
  # (gnomAD pLI > 0.5 and oe_lof_upper < 0.35).
  sus_constrained <- sus_annotated %>%
    filter(Effect %in% variants_effects) %>%
    filter(oe_lof_upper < 0.35) %>%
    filter(pLI > 0.5)

  # Per-gene summary: count distinct carrier samples, keep genes above the
  # threshold, strip the literal quotes from the gene symbol, and express the
  # count as a percentage of the cohort.
  gene_summary <- sus_constrained %>%
    group_by(geneName) %>%
    mutate(uniqSamples = n_distinct(sampleName)) %>%
    filter(uniqSamples > min_uniq_samples) %>%
    ungroup() %>%
    mutate(
      geneName = gsub("'", "", geneName),
      prevalence_percent = round(
        uniqSamples / tot_num_samples * 100,
        digits = 2
      )
    ) %>%
    select(
      geneName, uniqSamples, prevalence_percent,
      pLI, oe_lof_upper, OMIM_disease
    ) %>%
    distinct()

  # Publish the results under the names the rest of the notebook expects.
  assign(paste0("with_effects_bp", bp), sus_effects)
  assign(paste0("with_samples_bp", bp), sus_samples)
  assign(paste0("with_annotations_bp", bp), sus_annotated)
  assign(paste0("masterfile_sus_bp", bp), sus_annotated)
  assign(paste0("constrained_sus_", bp), sus_constrained)
  assign(paste0("table_", bp), gene_summary)
}


In [11]:
# Display the per-gene summary table for each bp setting (10, 20, 30, 40).
table_10
table_20
table_30
table_40

geneName,uniqSamples,prevalence_percent,pLI,oe_lof_upper,OMIM_disease
<chr>,<int>,<dbl>,<dbl>,<dbl>,<chr>
ATXN2,59,0.15,0.85289,0.333,"{Amyotrophic lateral sclerosis, susceptibility to, 13}, 183090 (3), Autosomal dominant; Spinocerebellar ataxia 2, 183090 (3), Autosomal dominant; {Parkinson disease, late-onset, susceptibility to}, 168600 (3), Autosomal dominant, Multifactorial"
EP400,112,0.28,1.0,0.208,
HTT,63,0.16,1.0,0.176,"Huntington disease, 143100 (3), Autosomal dominant"
KMT2B,84,0.21,1.0,0.07,"Intellectual developmental disorder, autosomal dominant 68, 619934 (3), Autosomal dominant; Dystonia 28, childhood-onset, 617284 (3), Autosomal dominant"
NABP2,57,0.14,0.97257,0.27,
TRRAP,69,0.18,1.0,0.06,"?Deafness, autosomal dominant 75, 618778 (3), Autosomal dominant; Developmental delay with or without dysmorphic facies and autism, 618454 (3), Autosomal dominant"


geneName,uniqSamples,prevalence_percent,pLI,oe_lof_upper,OMIM_disease
<chr>,<int>,<dbl>,<dbl>,<dbl>,<chr>
ATXN2,62,0.16,0.85289,0.333,"{Amyotrophic lateral sclerosis, susceptibility to, 13}, 183090 (3), Autosomal dominant; Spinocerebellar ataxia 2, 183090 (3), Autosomal dominant; {Parkinson disease, late-onset, susceptibility to}, 168600 (3), Autosomal dominant, Multifactorial"
EP400,112,0.28,1.0,0.208,
HTT,63,0.16,1.0,0.176,"Huntington disease, 143100 (3), Autosomal dominant"
KMT2B,89,0.23,1.0,0.07,"Intellectual developmental disorder, autosomal dominant 68, 619934 (3), Autosomal dominant; Dystonia 28, childhood-onset, 617284 (3), Autosomal dominant"
NABP2,57,0.14,0.97257,0.27,
TRRAP,69,0.18,1.0,0.06,"?Deafness, autosomal dominant 75, 618778 (3), Autosomal dominant; Developmental delay with or without dysmorphic facies and autism, 618454 (3), Autosomal dominant"


geneName,uniqSamples,prevalence_percent,pLI,oe_lof_upper,OMIM_disease
<chr>,<int>,<dbl>,<dbl>,<dbl>,<chr>
APC2,56,0.14,0.99994,0.225,"Cortical dysplasia, complex, with other brain malformations 10, 618677 (3), Autosomal recessive; Intellectual developmental disorder, autosomal recessive 74, 617169 (3), Autosomal recessive"
ATXN2,70,0.18,0.85289,0.333,"{Amyotrophic lateral sclerosis, susceptibility to, 13}, 183090 (3), Autosomal dominant; Spinocerebellar ataxia 2, 183090 (3), Autosomal dominant; {Parkinson disease, late-onset, susceptibility to}, 168600 (3), Autosomal dominant, Multifactorial"
EP400,112,0.28,1.0,0.208,
HTT,63,0.16,1.0,0.176,"Huntington disease, 143100 (3), Autosomal dominant"
KMT2B,90,0.23,1.0,0.07,"Intellectual developmental disorder, autosomal dominant 68, 619934 (3), Autosomal dominant; Dystonia 28, childhood-onset, 617284 (3), Autosomal dominant"
NABP2,57,0.14,0.97257,0.27,
TRRAP,69,0.18,1.0,0.06,"?Deafness, autosomal dominant 75, 618778 (3), Autosomal dominant; Developmental delay with or without dysmorphic facies and autism, 618454 (3), Autosomal dominant"


geneName,uniqSamples,prevalence_percent,pLI,oe_lof_upper,OMIM_disease
<chr>,<int>,<dbl>,<dbl>,<dbl>,<chr>
APC2,93,0.24,0.99994,0.225,"Cortical dysplasia, complex, with other brain malformations 10, 618677 (3), Autosomal recessive; Intellectual developmental disorder, autosomal recessive 74, 617169 (3), Autosomal recessive"
ARID1B,56,0.14,1.0,0.102,"Coffin-Siris syndrome 1, 135900 (3), Autosomal dominant"
ATXN2,75,0.19,0.85289,0.333,"{Amyotrophic lateral sclerosis, susceptibility to, 13}, 183090 (3), Autosomal dominant; Spinocerebellar ataxia 2, 183090 (3), Autosomal dominant; {Parkinson disease, late-onset, susceptibility to}, 168600 (3), Autosomal dominant, Multifactorial"
EP400,114,0.29,1.0,0.208,
HTT,63,0.16,1.0,0.176,"Huntington disease, 143100 (3), Autosomal dominant"
KMT2B,90,0.23,1.0,0.07,"Intellectual developmental disorder, autosomal dominant 68, 619934 (3), Autosomal dominant; Dystonia 28, childhood-onset, 617284 (3), Autosomal dominant"
NABP2,57,0.14,0.97257,0.27,
SCAF1,52,0.13,0.99819,0.254,
TRRAP,69,0.18,1.0,0.06,"?Deafness, autosomal dominant 75, 618778 (3), Autosomal dominant; Developmental delay with or without dysmorphic facies and autism, 618454 (3), Autosomal dominant"


# Table 2
Find the Broad Phenotypes for the people who have suspicious indels in constrained genes. 

Merge the suspicious indels in constrained genes with the seqDB data by sample name.

In [12]:
# Table 2: genes from the bp10 summary whose OMIM annotation mentions
# "Autosomal dominant" (case-sensitive pattern match).
table2 <- table_10 %>%
  filter(OMIM_disease %like% "Autosomal dominant")

In [13]:
# Attach seqDB sample metadata to the bp10 constrained-gene indel rows.
# Inner join: rows whose sampleName has no matching sample_internal_name in
# seqDB are dropped.
constrained_sus_10_merged <- merge(
  constrained_sus_10,
  seqDB,
  by.x = "sampleName",
  by.y = "sample_internal_name"
)

In [14]:
# One row per distinct (sample, broad phenotype, gene) combination, computed
# once and shared by all per-gene tabulations below.
sample_phenotype_gene <- constrained_sus_10_merged %>%
  select(sampleName, BroadPhenotype, geneName) %>%
  distinct()

# Count samples per BroadPhenotype for a single gene.
#   carriers    : data frame with columns sampleName, BroadPhenotype, geneName
#   quoted_gene : gene symbol INCLUDING the literal single quotes stored in
#                 geneName, e.g. "'EP400'"
# Returns a data frame with columns Var1 (phenotype) and Freq (count).
count_broad_phenotypes <- function(carriers, quoted_gene) {
  gene_rows <- filter(carriers, geneName == quoted_gene)
  # Pass a `$` expression (not a bare symbol) so table() leaves the dimension
  # unnamed and the resulting column is still called Var1.
  as.data.frame(table(gene_rows$BroadPhenotype))
}

EP400 <- count_broad_phenotypes(sample_phenotype_gene, "'EP400'")
KMT2B <- count_broad_phenotypes(sample_phenotype_gene, "'KMT2B'")
TRRAP <- count_broad_phenotypes(sample_phenotype_gene, "'TRRAP'")
HTT <- count_broad_phenotypes(sample_phenotype_gene, "'HTT'")
ATXN2 <- count_broad_phenotypes(sample_phenotype_gene, "'ATXN2'")
NABP2 <- count_broad_phenotypes(sample_phenotype_gene, "'NABP2'")

In [15]:
# Display Table 2 followed by the BroadPhenotype counts for each gene, in the
# order EP400, KMT2B, TRRAP, HTT, ATXN2, NABP2.
table2
EP400
KMT2B
TRRAP
HTT
ATXN2
NABP2

geneName,uniqSamples,prevalence_percent,pLI,oe_lof_upper,OMIM_disease
<chr>,<int>,<dbl>,<dbl>,<dbl>,<chr>
ATXN2,59,0.15,0.85289,0.333,"{Amyotrophic lateral sclerosis, susceptibility to, 13}, 183090 (3), Autosomal dominant; Spinocerebellar ataxia 2, 183090 (3), Autosomal dominant; {Parkinson disease, late-onset, susceptibility to}, 168600 (3), Autosomal dominant, Multifactorial"
HTT,63,0.16,1.0,0.176,"Huntington disease, 143100 (3), Autosomal dominant"
KMT2B,84,0.21,1.0,0.07,"Intellectual developmental disorder, autosomal dominant 68, 619934 (3), Autosomal dominant; Dystonia 28, childhood-onset, 617284 (3), Autosomal dominant"
TRRAP,69,0.18,1.0,0.06,"?Deafness, autosomal dominant 75, 618778 (3), Autosomal dominant; Developmental delay with or without dysmorphic facies and autism, 618454 (3), Autosomal dominant"


Var1,Freq
<fct>,<int>
amyotrophic lateral sclerosis,8
cardiovascular disease,1
congenital disorder,2
control,5
dementia,2
epilepsy,35
healthy family member,15
kidney and urological disease,28
liver disease,5
obsessive compulsive disorder,5


Var1,Freq
<fct>,<int>
brain malformation,1
congenital disorder,5
control,8
dementia,5
epilepsy,17
fetal ultrasound anomaly,1
healthy family member,14
kidney and urological disease,24
obsessive compulsive disorder,1
other neurological disease,2


Var1,Freq
<fct>,<int>
amyotrophic lateral sclerosis,4
control,6
dementia,2
epilepsy,49
healthy family member,1
kidney and urological disease,2
obsessive compulsive disorder,1
pulmonary disease,4


Var1,Freq
<fct>,<int>
,1
amyotrophic lateral sclerosis,6
congenital disorder,1
control,5
covid-19,1
dementia,4
epilepsy,10
fetal ultrasound anomaly,1
healthy family member,8
kidney and urological disease,14


Var1,Freq
<fct>,<int>
amyotrophic lateral sclerosis,15
control,5
dementia,5
epilepsy,4
healthy family member,9
infectious disease,1
kidney and urological disease,12
liver disease,1
obsessive compulsive disorder,3
other neurodevelopmental disease,1


Var1,Freq
<fct>,<int>
control,1
dementia,1
epilepsy,14
gastrointestinal disease,1
healthy family member,7
kidney and urological disease,30
liver disease,1
obsessive compulsive disorder,1
schizophrenia,1
