In [1]:
library(tidyverse)
library(data.table)
library(readxl)
library(dplyr)
library(stringr)

── [1mAttaching core tidyverse packages[22m ────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.1     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
── [1mConflicts[22m ──────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: ‘data.table’


The following objects are masked from ‘package:lubridate’:

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday,

Make a BED file of the non-suspicious indels, where the start is the indel position (POS) and the end is POS + length(REF) - 1.

In [2]:
## Build one BED file per bin size for the non-suspicious (rare) IGM indels.
##
## Columns written (tab-separated, no header): CHR | START | END
##   START = POS parsed from VarID
##   END   = POS + nchar(REF) - 1
## NOTE(review): START is the VarID position written unchanged — confirm this
## matches the coordinate convention of the gnomAD bin BED files it is
## intersected with.

nonsus_bp10 <- fread("2023-03-23_IGM_n39367_indelsonly_rAF_bp10_RareIndels.lt50bp.csv")
nonsus_bp20 <- fread("2023-03-23_IGM_n39367_indelsonly_rAF_bp20_RareIndels.lt50bp.csv")
nonsus_bp30 <- fread("2023-03-23_IGM_n39367_indelsonly_rAF_bp30_RareIndels.lt50bp.csv")
nonsus_bp40 <- fread("2023-03-23_IGM_n39367_indelsonly_rAF_bp40_RareIndels.lt50bp.csv")

df_name <- "IGM"

bp_range <- c("10", "20", "30", "40")

for (i in bp_range) {
  nonsus_name <- paste0("nonsus_bp", i)
  bed_name <- paste0("bed_bp", i)

  ## Split VarID into its four components and record the REF allele length.
  indels <- get(nonsus_name) %>%
    separate(VarID, c("CHR", "POS", "REF", "ALT")) %>%
    mutate(refLen = nchar(REF))

  ## END is kept as an integer on purpose: fwrite() may print round doubles in
  ## scientific notation (e.g. 1e+05), which is not a valid BED coordinate.
  bed_df <- data.frame(
    CHR = indels$CHR,
    START = indels$POS,
    END = as.integer(indels$POS) + indels$refLen - 1L
  )

  ## Keep the same global objects the rest of the notebook may look up:
  ## nonsus_bp<i> (now with CHR/POS/REF/ALT/refLen) and bed_bp<i>.
  assign(nonsus_name, indels)
  assign(bed_name, bed_df)

  fwrite(
    bed_df,
    paste0(df_name, "_non_sus_indels_bp", i, ".bed"),
    col.names = FALSE,
    sep = "\t"
  )
}


Concatenate the BED files of gnomAD bins that are common and of gnomAD bins that are suspicious into one file per bin size.
Bash script below. 

In [3]:
## Concatenate, per bin size, the gnomAD "suspicious" and "common" bin BED
## files into <output_path><df_name>_suspicious_and_common_bins_bp<size>.bed.
##
## R's system() hands the command to /bin/sh, so the script sticks to POSIX sh
## syntax (a plain word list in the for loop) instead of bash-only arrays,
## which are a syntax error when /bin/sh is not bash (e.g. dash).
## intern = TRUE returns the script's stdout (the echoed bin sizes) to R.
system("output_path=''
df_name='gnomAD'

for i in 10 20 30 40
do
   echo \"$i\"
   cat \"gnomad.exomes.r2.1.1.sites_indelsonly_rAF_bp${i}_SuspiciousIndels.lt50bp.bin.bed\" \"gnomad.exomes.r2.1.1.sites_indelsonly_rAF_bp${i}_CommonIndels.lt50bp.bin.bed\" > \"${output_path}${df_name}_suspicious_and_common_bins_bp${i}.bed\"
done", intern = TRUE)

Use bedtools intersect to find the IGM indels that are non-suspicious by IGM rAF but overlap gnomAD bins that are common or suspicious.
Bash script below.

In [4]:
## For each bin size, report the IGM non-suspicious indel intervals (-a, kept
## whole via -wa) that overlap the combined gnomAD suspicious/common bins (-b),
## then sort and drop duplicate lines before writing
## <output_path>IGM_non_sus_indels_annotated_bp<size>.bed.
##
## R's system() hands the command to /bin/sh, so the script sticks to POSIX sh
## syntax (a plain word list in the for loop) instead of bash-only arrays,
## which are a syntax error when /bin/sh is not bash (e.g. dash).
## NOTE(review): `sort -k1,1n` compares column 1 numerically, so non-numeric
## chromosome names (if any) all compare equal on that key — confirm the
## resulting order is acceptable; it does not affect the de-duplication.
## intern = TRUE returns the script's stdout (the echoed bin sizes) to R.
system("output_path=''
df_1_name='gnomAD'
df_2_name='IGM'

for i in 10 20 30 40
do
   echo \"$i\"
   bedtools intersect -wa -a \"${output_path}${df_2_name}_non_sus_indels_bp${i}.bed\" -b \"${output_path}${df_1_name}_suspicious_and_common_bins_bp${i}.bed\" | sort -k1,1n -k2,2n | uniq > \"${output_path}${df_2_name}_non_sus_indels_annotated_bp${i}.bed\"
done", intern = TRUE)


Get files with: 
 1. File 1 (all_sus): [indels that are sus by IGM rAF] + [indels that are overlapping with gnomAD sus and common bins]
 2. File 2 (reduced_sus): [indels that are non-sus by IGM rAF] - [indels that are overlapping with gnomAD sus and common bins]

In [5]:
library(tidyverse)
library(data.table)
library(readxl)
library(dplyr)
library(stringr)

## Goal of this cell: produce, per bin size, (1) the full set of suspicious IGM
## indels — suspicious by IGM rAF, plus indels that are non-suspicious by IGM
## rAF but overlap common/suspicious gnomAD bins — and (2) the remaining
## ("reduced") non-suspicious indels.

bp_range <- c("10", "20", "30", "40")
output_path <- ""

df_name <- "IGM"

###############################################################
###############################################################

## Shared prefix of the per-bin-size IGM rAF input files.
igm_raf_prefix <- "2023-03-23_IGM_n39367_indelsonly_rAF_bp"

## sus_bp<size>: IGM indels that are suspicious by IGM rAF only
for (bp in bp_range) {
  assign(
    paste0("sus_bp", bp),
    fread(paste0(igm_raf_prefix, bp, "_SuspiciousIndels.lt50bp.csv"))
  )
}

## nonsus_bp<size>: IGM indels that are non-suspicious by IGM rAF
for (bp in bp_range) {
  assign(
    paste0("nonsus_bp", bp),
    fread(paste0(igm_raf_prefix, bp, "_RareIndels.lt50bp.csv"))
  )
}

## annotated_bp<size>: BED intervals of the non-suspicious IGM indels that
## overlapped the common/suspicious gnomAD bins (output of bedtools intersect)
for (bp in bp_range) {
  assign(
    paste0("annotated_bp", bp),
    fread(paste0("IGM_non_sus_indels_annotated_bp", bp, ".bed"))
  )
}

###############################################################
###############################################################

## For each bin size: split the non-suspicious VarIDs into CHR/POS/REF/ALT
## (stored back into nonsus_bp<size>) and keep, as varID_annotated_bp<size>,
## the indels whose CHR + POS appear in the first two columns of the
## annotated BED file.
## NOTE(review): matching uses CHR and start POS only, not the BED end
## coordinate, so indels sharing a start position are all selected together —
## confirm this is intended.

for (i in bp_range) {
  nonsus_name <- paste0("nonsus_bp", i)

  split_nonsus <- get(nonsus_name) %>%
    separate(VarID, c("CHR", "POS", "REF", "ALT"))
  assign(nonsus_name, split_nonsus)

  ## "CHR-POS" lookup keys on both sides of the match
  annotated <- get(paste0("annotated_bp", i))
  annotated_keys <- paste0(annotated$V1, "-", annotated$V2)
  indel_keys <- paste0(split_nonsus$CHR, "-", split_nonsus$POS)

  assign(
    paste0("varID_annotated_bp", i),
    split_nonsus %>% filter(indel_keys %in% annotated_keys)
  )
}

## For each bin size: stack the gnomAD-annotated non-suspicious indels on top
## of the indels that are suspicious by IGM rAF, and write the result to
## <output_path><df_name>_all_suspicious_bp<size>.csv.

for (i in bp_range) {
  annotated_name <- paste0("varID_annotated_bp", i)

  ## Rebuild VarID as CHR-POS-REF-ALT, then drop the four component columns so
  ## the table lines up with sus_bp<size> again.
  annotated_indels <- get(annotated_name) %>%
    mutate(VarID = paste0(CHR, "-", POS, "-", REF, "-", ALT))
  annotated_indels <- subset(annotated_indels, select = -c(CHR, POS, REF, ALT))
  assign(annotated_name, annotated_indels)

  ## all suspicious = annotated non-suspicious + suspicious by IGM rAF
  combined <- rbind(annotated_indels, get(paste0("sus_bp", i)))
  assign(paste0("all_sus_bp", i), combined)

  fwrite(combined, paste0(output_path, df_name, "_all_suspicious_bp", i, ".csv"))
}

## For each bin size: derive the reduced non-suspicious indels, i.e. the
## non-suspicious indels whose VarID is NOT among the gnomAD-annotated ones,
## and write them to <output_path><df_name>_reduced_nonsus_bp<size>.csv.
for (i in bp_range) {
  nonsus_name <- paste0("nonsus_bp", i)

  ## Rebuild VarID as CHR-POS-REF-ALT and drop the four component columns.
  nonsus <- get(nonsus_name) %>%
    mutate(VarID = paste0(CHR, "-", POS, "-", REF, "-", ALT))
  nonsus <- subset(nonsus, select = -c(CHR, POS, REF, ALT))
  assign(nonsus_name, nonsus)

  ## VarIDs flagged through the gnomAD common/suspicious bins
  flagged_ids <- get(paste0("varID_annotated_bp", i))$VarID

  kept <- nonsus %>% filter(!(VarID %in% flagged_ids))
  assign(paste0("reduced_nonsus_bp", i), kept)

  fwrite(kept, paste0(output_path, df_name, "_reduced_nonsus_bp", i, ".csv"))
}

Find the number of individuals who carry only suspicious deleterious indels in constrained genes associated with an autosomal dominant disorder, i.e. excluding individuals who also carry at least one non-suspicious deleterious indel in such a gene.


In [6]:
## Table 3.

library(tidyverse)
library(data.table)
library(readxl)
library(dplyr)
library(stringr)

bp_range <- c("10", "20", "30", "40")

## Cohort size used as the denominator for the "out of total" percentages.
total_samples <- 39367

## Variant effects treated as deleterious.
variants_effects <- c(
  "frameshift_variant", "splice_donor_variant", "splice_acceptor_variant",
  "stop_gained", "start_lost", "stop_lost", "exon_loss_variant"
)

## all_sus_bp<size>: all suspicious IGM indels (IGM rAF + gnomAD-annotated)
for (bp in bp_range) {
  assign(
    paste0("all_sus_bp", bp),
    fread(paste0("IGM_all_suspicious_bp", bp, ".csv"))
  )
}

## reduced_nonsus_bp<size>: non-suspicious IGM indels with those in
## common/suspicious gnomAD bins removed
for (bp in bp_range) {
  assign(
    paste0("reduced_nonsus_bp", bp),
    fread(paste0("IGM_reduced_nonsus_bp", bp, ".csv"))
  )
}

## Per-genotype table: which sample carries which indel, in which gene.
sample_name_gene_name <- fread("2023-03-24_11-47-14_IGM_n39367_indels_genotypes_selectcols.csv")
names(sample_name_gene_name) <- c("VarID", "geneName", "sampleName", "coveredCtrl", "AC")

## Variant effect per indel.
effects <- fread("2023-04-21_IGM_n39367_indels_genotypes_effects.csv", header = TRUE)
names(effects) <- c("VarID", "Effect")

## Gene-level annotations, de-duplicated.
## (original note: "#18231" — presumably the row count; not verifiable here)
annotations <- distinct(fread("2023-04-21_IGM_genename_gnomadpli_gnomadloeuf_omimdisease.csv"))
names(annotations) <- c("geneName", "pLI", "oe_lof_upper", "OMIM_disease")

## Manual corrections: set the OMIM entry for HTT and rename two genes.
## NOTE(review): the match keys carry literal single quotes ('HTT') while the
## replacement names (BICRA, TENT5A) are written without them — confirm these
## still match geneName in sample_name_gene_name when merging.
annotations[geneName == "'HTT'"]$OMIM_disease <- "Huntington disease, 143100 (3), Autosomal dominant"
annotations[geneName == "'GLTSCR1'"]$geneName <- "BICRA"
annotations[geneName == "'FAM46A'"]$geneName <- "TENT5A"


## For each bin size:
##   1. annotate the suspicious and the reduced non-suspicious indels with
##      sample/gene, variant effect, and gene-level pLI / LOEUF / OMIM data;
##   2. keep only indels that are autosomal dominant (by OMIM text),
##      deleterious (Effect in variants_effects) and constrained
##      (oe_lof_upper < 0.35 and pLI > 0.5);
##   3. count the individuals carrying a qualifying non-suspicious indel, and
##      the individuals carrying qualifying suspicious indels ONLY.
## Every intermediate table is stored globally as <name>_bp<size>.

for (i in bp_range) {
  ## ---- Suspicious indels -------------------------------------------------
  sus_with_gene <- merge(get(paste0("all_sus_bp", i)), sample_name_gene_name, by = "VarID")
  sus_with_effects <- merge(sus_with_gene, effects, by = "VarID")
  sus_with_annot <- merge(sus_with_effects, annotations, by = "geneName")

  assign(paste0("sus_with_gene_bp", i), sus_with_gene)
  assign(paste0("sus_with_effects_bp", i), sus_with_effects)
  ## suspicious indels with all annotations attached
  assign(paste0("sus_with_annot_bp", i), sus_with_annot)

  ## autosomal dominant + deleterious + constrained
  filtered_sus <- sus_with_annot[OMIM_disease %like% "Autosomal dominant", ] %>%
    filter(Effect %in% variants_effects) %>%
    filter(oe_lof_upper < 0.35) %>%
    filter(pLI > 0.5)
  assign(paste0("filtered_sus_bp", i), filtered_sus)

  ## ---- Reduced non-suspicious indels -------------------------------------
  nonsus_with_gene <- merge(get(paste0("reduced_nonsus_bp", i)), sample_name_gene_name, by = "VarID")
  nonsus_with_effects <- merge(nonsus_with_gene, effects, by = "VarID")
  nonsus_with_annot <- merge(nonsus_with_effects, annotations, by = "geneName")

  assign(paste0("nonsus_with_gene_bp", i), nonsus_with_gene)
  assign(paste0("nonsus_with_effects_bp", i), nonsus_with_effects)
  ## reduced non-suspicious indels with all annotations attached
  assign(paste0("nonsus_with_annot_bp", i), nonsus_with_annot)

  ## autosomal dominant + deleterious + constrained
  filtered_nonsus <- nonsus_with_annot[OMIM_disease %like% "Autosomal dominant", ] %>%
    filter(Effect %in% variants_effects) %>%
    filter(oe_lof_upper < 0.35) %>%
    filter(pLI > 0.5)
  assign(paste0("filtered_nonsus_bp", i), filtered_nonsus)

  ## ---- Individuals -------------------------------------------------------
  ## Unique individuals with at least one qualifying non-suspicious indel.
  ## Stored as nonsus_individuals_bp<size>; the earlier code stored this table
  ## under the only_sus_individuals_bp<size> name, where it was overwritten
  ## a few lines later.
  nonsus_samples <- unique(filtered_nonsus$sampleName)
  assign(paste0("nonsus_individuals_bp", i), data.frame(sampleName = nonsus_samples))
  assign(paste0("num_nonsus_individuals_bp", i), length(nonsus_samples))

  ## Unique individuals with qualifying suspicious indels who are NOT among
  ## the individuals with a qualifying non-suspicious indel.
  sus_samples <- filtered_sus$sampleName
  only_sus_samples <- unique(sus_samples[!(sus_samples %in% filtered_nonsus$sampleName)])
  assign(paste0("only_sus_individuals_bp", i), data.frame(sampleName = only_sus_samples))
  assign(paste0("num_only_sus_individuals_bp", i), length(only_sus_samples))
}

## Collect the per-bin-size counts into one summary table.
nonsus_individuals <- unlist(
  mget(paste0("num_nonsus_individuals_bp", bp_range)),
  use.names = FALSE
)
only_sus_individuals <- unlist(
  mget(paste0("num_only_sus_individuals_bp", bp_range)),
  use.names = FALSE
)

summary <- data.frame(
  bp_range = bp_range,
  nonsus_individuals = nonsus_individuals,
  only_sus_individuals = only_sus_individuals
)

## Percentage of `part` in `whole`, rounded to two decimals.
pct <- function(part, whole) {
  round((as.numeric(part) / whole) * 100, 2)
}

## Fixed denominator for the last column. In the recorded output below,
## nonsus_individuals + only_sus_individuals equals 5299 for every bin size.
denominator_5299 <- 5299

## Proportions relative to the whole IGM cohort, to the non-suspicious
## carriers, and to the fixed 5299 denominator.
summary$prct_nonsus_out_of_total <- pct(summary$nonsus_individuals, total_samples)
summary$prct_only_sus_out_of_total <- pct(summary$only_sus_individuals, total_samples)
summary$prct_only_sus_out_of_nonsus <- pct(summary$only_sus_individuals, as.numeric(summary$nonsus_individuals))
summary$prct_only_sus_out_of_5299 <- pct(summary$only_sus_individuals, denominator_5299)


summary

bp_range,nonsus_individuals,only_sus_individuals,prct_nonsus_out_of_total,prct_only_sus_out_of_total,prct_only_sus_out_of_nonsus,prct_only_sus_out_of_5299
<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>
10,4113,1186,10.45,3.01,28.84,22.38
20,3834,1465,9.74,3.72,38.21,27.65
30,3617,1682,9.19,4.27,46.5,31.74
40,3412,1887,8.67,4.79,55.3,35.61
