# The goal of this code is to count, for each data set, the number of genes that contain suspicious indels at each sliding-window size (10, 20, 30, and 40 bp).

In [1]:
library(tidyverse)
library(data.table)
library(readxl)
library(dplyr)
library(stringr)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.1     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.1     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Attaching package: ‘data.table’


The following objects are masked from ‘package:lubridate’:

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    y

Set the `rare` threshold (10^-4).

In [2]:
# Rarity threshold: 10^-4 (i.e. 1 in 10,000).
# NOTE(review): `rare` is not referenced by any of the cells visible here;
# confirm whether a later step still needs it.
rare <- 10^-4

Read in the suspicious-indel files for each data set (data set 1: gnomAD exomes; data set 2: IGM), one file per window size.

In [3]:
# Data set 1 (gnomAD exomes r2.1.1): suspicious indels, one table per
# window size (10, 20, 30, 40 bp). The loop further down looks these tables
# up by name, so the object names must keep the "sus_bp_<size>_df_1" pattern.
sus_bp_10_df_1 <- fread(
  "/Users/sy3115/Library/CloudStorage/OneDrive-cumc.columbia.edu/rAF_scripts_and_inputs/PublicationReady_Data/gnomAD/gnomad.exomes.r2.1.1.sites_indelsonly_rAF_bp10_SuspiciousIndels.lt50bp.csv"
)
sus_bp_20_df_1 <- fread(
  "/Users/sy3115/Library/CloudStorage/OneDrive-cumc.columbia.edu/rAF_scripts_and_inputs/PublicationReady_Data/gnomAD/gnomad.exomes.r2.1.1.sites_indelsonly_rAF_bp20_SuspiciousIndels.lt50bp.csv"
)
sus_bp_30_df_1 <- fread(
  "/Users/sy3115/Library/CloudStorage/OneDrive-cumc.columbia.edu/rAF_scripts_and_inputs/PublicationReady_Data/gnomAD/gnomad.exomes.r2.1.1.sites_indelsonly_rAF_bp30_SuspiciousIndels.lt50bp.csv"
)
sus_bp_40_df_1 <- fread(
  "/Users/sy3115/Library/CloudStorage/OneDrive-cumc.columbia.edu/rAF_scripts_and_inputs/PublicationReady_Data/gnomAD/gnomad.exomes.r2.1.1.sites_indelsonly_rAF_bp40_SuspiciousIndels.lt50bp.csv"
)

In [4]:
# Data set 2 (IGM, n39367): suspicious indels, one table per window size
# (10, 20, 30, 40 bp). The loop further down looks these tables up by name,
# so the object names must keep the "sus_bp_<size>_df_2" pattern.
sus_bp_10_df_2 <- fread(
  "/Users/sy3115/Library/CloudStorage/OneDrive-cumc.columbia.edu/rAF_scripts_and_inputs/PublicationReady_Data/IGM/2023-03-23_IGM_n39367_indelsonly_rAF_bp10_SuspiciousIndels.lt50bp.csv"
)
sus_bp_20_df_2 <- fread(
  "/Users/sy3115/Library/CloudStorage/OneDrive-cumc.columbia.edu/rAF_scripts_and_inputs/PublicationReady_Data/IGM/2023-03-23_IGM_n39367_indelsonly_rAF_bp20_SuspiciousIndels.lt50bp.csv"
)
sus_bp_30_df_2 <- fread(
  "/Users/sy3115/Library/CloudStorage/OneDrive-cumc.columbia.edu/rAF_scripts_and_inputs/PublicationReady_Data/IGM/2023-03-23_IGM_n39367_indelsonly_rAF_bp30_SuspiciousIndels.lt50bp.csv"
)
sus_bp_40_df_2 <- fread(
  "/Users/sy3115/Library/CloudStorage/OneDrive-cumc.columbia.edu/rAF_scripts_and_inputs/PublicationReady_Data/IGM/2023-03-23_IGM_n39367_indelsonly_rAF_bp40_SuspiciousIndels.lt50bp.csv"
)

Read in file with gene name for each VarID for df 1. 

In [5]:
# Variant-level annotation for data set 1. The next cell reads its CHR, POS,
# REF, ALT and SYMBOL columns.
sample_name_gene_name_df1 <- fread(
  "/Users/sy3115/Library/CloudStorage/OneDrive-cumc.columbia.edu/rAF_scripts_and_inputs/PublicationReady_Data/gnomAD/gnomad.exomes.r2.1.1.sites_indelsonly.vcf_withFirstSYMBOL.txt"
)

Modify file as needed to get a file with two columns: VarID (CHR-POS-REF-ALT) and geneName. 

In [6]:
# Build the variant key VarID as "CHR-POS-REF-ALT" so this table can be
# joined to the suspicious-indel tables on VarID.
sample_name_gene_name_df1$VarID <- with(
  sample_name_gene_name_df1,
  paste(CHR, POS, REF, ALT, sep = "-")
)

# Keep only the key and the gene symbol, exposing SYMBOL as geneName.
sample_name_gene_name_df1 <- sample_name_gene_name_df1 %>%
  select(VarID, geneName = SYMBOL)

Read in file with gene name for each VarID for df 2. 

In [7]:
# Annotation/genotype table for data set 2; the next cell assigns its column
# names.
sample_name_gene_name_df2 <- fread(
  "/Users/sy3115/Library/CloudStorage/OneDrive-cumc.columbia.edu/rAF_scripts_and_inputs/PublicationReady_Data/IGM/2023-03-24_11-47-14_IGM_n39367_indels_genotypes_selectcols.csv"
)

Rename the columns so that the table has VarID (CHR-POS-REF-ALT) and geneName columns, alongside sampleName, coveredCtrl, and AC.

In [8]:
# Assign the five column names positionally; VarID and geneName are the two
# columns used by the merge and the gene count below.
# NOTE(review): this assumes the file has exactly these five columns in this
# order - confirm against the input file.
names(sample_name_gene_name_df2) <- c(
  "VarID",
  "geneName",
  "sampleName",
  "coveredCtrl",
  "AC"
)

Set bp ranges. 

In [9]:
# Window sizes (in bp) to summarise, kept as character strings because they
# are pasted into object names below: "10", "20", "30", "40".
bp_range <- as.character(seq(10, 40, by = 10))

Find number of genes that have a suspicious indel in each bp window for each dataset. 

In [10]:
# For every window size, attach gene names to the suspicious indels of each
# data set and count the distinct gene names. Results are stored as objects
# named after the window size:
#   df_<k>_with_gene_bp<size>  - suspicious indels merged with gene names
#   uniq_genes_df<k>_bp<size>  - number of distinct geneName values
# NOTE(review): unique() treats NA (and an empty string) as a value of its
# own, so if any merged row lacks a gene name it is counted as one extra
# "gene" - confirm whether that is intended.
for (i in bp_range) {

  ## ---- data set 1 ----
  sus_df_1 <- sprintf("sus_bp_%s_df_1", i)
  df1_with_gene <- sprintf("df_1_with_gene_bp%s", i)
  uniq_genes_df1 <- sprintf("uniq_genes_df1_bp%s", i)

  # merge() with its defaults keeps only VarIDs present in both tables
  assign(
    df1_with_gene,
    merge(get(sus_df_1), sample_name_gene_name_df1, by = "VarID")
  )
  assign(uniq_genes_df1, length(unique(get(df1_with_gene)$geneName)))

  ## ---- data set 2 ----
  sus_df_2 <- sprintf("sus_bp_%s_df_2", i)
  df2_with_gene <- sprintf("df_2_with_gene_bp%s", i)
  uniq_genes_df2 <- sprintf("uniq_genes_df2_bp%s", i)

  assign(
    df2_with_gene,
    merge(get(sus_df_2), sample_name_gene_name_df2, by = "VarID")
  )
  assign(uniq_genes_df2, length(unique(get(df2_with_gene)$geneName)))
}

Make summary table. 

In [11]:
# Gene counts per window size, listed in the same order as bp_range
# (10, 20, 30, 40 bp).
df1_genes <- c(
  uniq_genes_df1_bp10,
  uniq_genes_df1_bp20,
  uniq_genes_df1_bp30,
  uniq_genes_df1_bp40
)
df2_genes <- c(
  uniq_genes_df2_bp10,
  uniq_genes_df2_bp20,
  uniq_genes_df2_bp30,
  uniq_genes_df2_bp40
)

# One row per window size, one gene-count column per data set.
summary_df <- data.frame(
  bp_range = bp_range,
  df1_genes = df1_genes,
  df2_genes = df2_genes
)

# Display the table as the cell output.
summary_df


bp_range,df1_genes,df2_genes
<chr>,<int>,<int>
10,12124,4784
20,13027,5781
30,13510,6463
40,13887,6938
