In [1]:
library(tidyverse)
library(data.table)
library(readxl)
library(dplyr)
library(stringr)

── Attaching core tidyverse packages ───────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.1     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.1     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ─────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Attaching package: ‘data.table’


The following objects are masked from ‘package:lubridate’:

    hour, isoweek, mday, minute,

In [2]:
# Numeric threshold of 1e-4.
# NOTE(review): presumably the cutoff below which a variant counts as "rare";
# it is not referenced in the cells visible here — confirm where it is used.
rare <- 10^(-4)

In [3]:
# Load the pre-computed suspicious-indel tables: one CSV per bp window
# (10, 20, 30, 40) for each of the two datasets. The object names follow the
# pattern sus_bp_<window>_df_<dataset>, which later code relies on when it
# looks the tables up by name.

# Dataset 1: gnomAD exomes r2.1.1
sus_bp_10_df_1 <- fread("gnomad.exomes.r2.1.1.sites_indelsonly_rAF_bp10_rAF_hiIndels.lt50bp.csv")
sus_bp_20_df_1 <- fread("gnomad.exomes.r2.1.1.sites_indelsonly_rAF_bp20_rAF_hiIndels.lt50bp.csv")
sus_bp_30_df_1 <- fread("gnomad.exomes.r2.1.1.sites_indelsonly_rAF_bp30_rAF_hiIndels.lt50bp.csv")
sus_bp_40_df_1 <- fread("gnomad.exomes.r2.1.1.sites_indelsonly_rAF_bp40_rAF_hiIndels.lt50bp.csv")

# Dataset 2: IGM (n39367 export dated 2023-03-23)
sus_bp_10_df_2 <- fread("2023-03-23_IGM_n39367_indelsonly_rAF_bp10_rAF_hiIndels.lt50bp.csv")
sus_bp_20_df_2 <- fread("2023-03-23_IGM_n39367_indelsonly_rAF_bp20_rAF_hiIndels.lt50bp.csv")
sus_bp_30_df_2 <- fread("2023-03-23_IGM_n39367_indelsonly_rAF_bp30_rAF_hiIndels.lt50bp.csv")
sus_bp_40_df_2 <- fread("2023-03-23_IGM_n39367_indelsonly_rAF_bp40_rAF_hiIndels.lt50bp.csv")

In [4]:
# Build the VarID -> geneName lookup for dataset 1 (gnomAD).
# VarID is assembled as CHR-POS-REF-ALT so it can serve as the merge key
# against the suspicious-indel tables; only VarID and the gene symbol
# (renamed to geneName) are kept.
# NOTE(review): the file name suggests SYMBOL is the first gene symbol
# annotated for each variant — confirm against how the file was produced.
sample_name_gene_name_df1 <- fread("gnomad.exomes.r2.1.1.sites_indelsonly.vcf_withFirstSYMBOL.txt") %>%
  mutate(VarID = paste0(CHR, "-", POS, "-", REF, "-", ALT)) %>%
  select(VarID, geneName = SYMBOL)

In [5]:
# Load the per-genotype annotation table for dataset 2 (IGM) and give its
# columns the names used by the rest of the notebook. The rename is purely
# positional.
# NOTE(review): assumes the CSV has exactly these five columns in this order —
# verify against the export.
sample_name_gene_name_df2 <- fread("2023-03-24_11-47-14_IGM_n39367_indels_genotypes_selectcols.csv")
names(sample_name_gene_name_df2) <- c("VarID", "geneName", "sampleName", "coveredCtrl", "AC")

In [6]:
# bp windows to iterate over. Kept as character strings because they are
# pasted into object names (e.g. "sus_bp_10_df_1") further down.
bp_range <- as.character(seq(10, 40, by = 10))

In [7]:
# Human-readable labels for the two datasets (suffix _1 = gnomAD, _2 = IGM).
df_name_1 <- "gnomAD"
df_name_2 <- "IGM"

In [8]:
## Count, for every bp window, how many distinct genes carry a suspicious indel
## in each dataset. Target sentence for the write-up:
## "Suspicious indels were identified in XX genes in the IGM dataset and YY
## genes in the gnomAD dataset."
##
## For each window <bp> the loop creates these global objects:
##   df_1_with_gene_bp<bp>, df_2_with_gene_bp<bp>    suspicious indels joined to gene names
##   uniq_genes_df1_bp<bp>, uniq_genes_df2_bp<bp>    number of distinct geneName values

for (i in bp_range) {

  ## names of the already-loaded suspicious-indel tables for this window
  sus_df_1 <- sprintf("sus_bp_%s_df_1", i)
  sus_df_2 <- sprintf("sus_bp_%s_df_2", i)

  ## names under which the gene-annotated tables will be stored
  df1_with_gene <- sprintf("df_1_with_gene_bp%s", i)
  df2_with_gene <- sprintf("df_2_with_gene_bp%s", i)

  ## join on VarID; with merge()'s defaults only VarIDs present in both
  ## tables are kept
  assign(df1_with_gene, merge(get(sus_df_1), sample_name_gene_name_df1, by = "VarID"))
  assign(df2_with_gene, merge(get(sus_df_2), sample_name_gene_name_df2, by = "VarID"))

  ## names under which the distinct-gene counts will be stored
  uniq_genes_df1 <- sprintf("uniq_genes_df1_bp%s", i)
  uniq_genes_df2 <- sprintf("uniq_genes_df2_bp%s", i)

  ## number of distinct geneName values in each merged table
  assign(uniq_genes_df1, length(unique(get(df1_with_gene)$geneName)))
  assign(uniq_genes_df2, length(unique(get(df2_with_gene)$geneName)))
}

In [9]:
# Gather the per-window distinct-gene counts (objects named
# uniq_genes_df<dataset>_bp<window>, created by the loop over bp_range) into
# one integer vector per dataset, ordered like bp_range.
#
# The object names are derived from bp_range instead of being spelled out, so
# the vectors always have one entry per window and stay aligned with bp_range
# when windows are added or removed. mget() stops with an error if any
# expected count object is missing.
df1_genes <- unlist(mget(paste0("uniq_genes_df1_bp", bp_range)), use.names = FALSE)
df2_genes <- unlist(mget(paste0("uniq_genes_df2_bp", bp_range)), use.names = FALSE)

In [10]:
# One row per bp window: the window label and the number of distinct genes
# with suspicious indels in dataset 1 (gnomAD) and dataset 2 (IGM).
summary_df <- data.frame(
  bp_range  = bp_range,
  df1_genes = df1_genes,
  df2_genes = df2_genes
)

In [11]:
# Display the per-window gene-count summary (df1_genes = gnomAD, df2_genes = IGM).
summary_df

bp_range,df1_genes,df2_genes
<chr>,<int>,<int>
10,13137,4784
20,14176,5781
30,14739,6463
40,15153,6938
