# Regional Allele Frequency Calculation script for the IGM dataset

Step 1: Pull the data from the IGM ATAV database for unrelated samples that are available for use as controls

ATAV command:
atav_7.4.sh --list-var-geno --exclude-artifacts --exclude-snv --gene-boundary /nfs/goldstein/software/atav_home/data/ccds/addjusted.CCDS.genes.index.r20.hg19.ensembl87.txt --het-percent-alt-read 0.2-0.8 --sample 2023-03-23_n39367_IGM_cohort.txt --out /nfs/labs/gharavi/sk4759/RecurrentIndels/IGM_n39367_indels

Database server: db11 (0 running jobs)

Total samples: 39367 (0 cases and 39367 controls)

Start running list variant genotype function



Total runtime: 129470 seconds (aka 2157 minutes or 35 hours)

Step 2.1: Reduce the large genotype file to a smaller dataset by keeping only the columns needed downstream




In [None]:
#!command csvcut -c 1,9,23,58,63 2023-03-24_11-47-14_IGM_n39367_indels_genotypes.csv >2023-03-24_11-47-14_IGM_n39367_indels_genotypes_selectcols.csv

Step 2.2: Convert the data from comma-delimited to tab-delimited format and add the appropriate header

In [None]:
# Keep comma-separated fields 1, 4 and 5 of the column-reduced file, turn every
# "-" and "," into a tab, de-duplicate the rows (sort -u) and drop any line
# containing "Variant ID" (the original header row).
# NOTE(review): presumably field 1 is a dash-separated variant ID
# (CHR-POS-REF-ALT) and fields 4/5 are AN and AC, matching the header written
# below — confirm against the csvcut column selection. The sed expression
# replaces *every* "-", so any other dash in these fields would also be split.
!command cut -d, -f1,4,5 2023-03-24_11-47-14_IGM_n39367_indels_genotypes_selectcols.csv | sed 's/-/\t/g;s/,/\t/g' | sort -u | grep -v "Variant ID" >2023-03-23_IGM_n39367_unique_indel_ordered_noheader.csv
# Write the six-column, tab-separated header line to a temporary file.
!command echo -e "CHR\tPOS\tREF\tALT\tAN\tAC" >header.txt
# Prepend the header to the data rows. The result is tab-delimited even though
# the file name ends in .csv.
!command cat header.txt 2023-03-23_IGM_n39367_unique_indel_ordered_noheader.csv >2023-03-23_IGM_n39367_unique_indel_ordered.csv
# Remove the two intermediate files.
!command rm 2023-03-23_IGM_n39367_unique_indel_ordered_noheader.csv header.txt


Step 3: Create bins for different genomic window sizes (10 bp, 20 bp, 30 bp, 40 bp)

i) Load the file using pandas

In [None]:
import pandas as pd
from functools import reduce
# Load the tab-delimited indel table. CHR, POS, REF and ALT are read as
# strings and AC as an integer; AN is left to pandas' type inference.
column_types = {'CHR': 'str', 'POS': 'str', 'REF': 'str', 'ALT': 'str', 'AC': 'int'}
df = pd.read_csv(
    "2023-03-23_IGM_n39367_unique_indel_ordered.csv",
    sep="\t",
    low_memory=False,
    dtype=column_types,
)


In [None]:
# Indel length is the absolute difference between the REF and ALT string lengths.
ref_len = df['REF'].astype(str).map(len)
alt_len = df['ALT'].astype(str).map(len)
df['indel_length'] = (ref_len - alt_len).abs()

# Keep rows whose length difference is below 51 (i.e. at most 50) and
# renumber the remaining rows from 0.
df = df[df['indel_length'] < 51].reset_index(drop=True)

In [None]:
def cluster(pos_list, gap):
    """Group positions into runs of neighbours lying close to each other.

    The positions are converted to ``int`` and sorted in ascending order.
    A position joins the current group when its distance to the previous
    (sorted) position is at most ``gap / 2``; otherwise it starts a new group.
    Groups are therefore chained: the total span of a group may exceed
    ``gap`` as long as every consecutive step is within ``gap / 2``.

    Parameters
    ----------
    pos_list : iterable
        Positions as integers or integer-like strings (for example a pandas
        Series, a list or a tuple). The input is not modified.
    gap : int or float
        Window size; half of it is the largest allowed distance between
        consecutive positions within one group.

    Returns
    -------
    list[list[int]]
        Groups in ascending order, each holding its positions in ascending
        order (duplicates are kept). An empty input yields an empty list.
    """
    maxgap = gap / 2
    # Work on a plain sorted list so the result does not depend on pandas'
    # label-versus-position indexing rules and any iterable is accepted.
    ordered = sorted(int(p) for p in pos_list)
    groups = []
    for x in ordered:
        # Input is sorted, so the difference to the previous position is >= 0.
        if groups and x - groups[-1][-1] <= maxgap:
            groups[-1].append(x)
        else:
            groups.append([x])
    return groups

def group_ids(pairs,chr,bp):
    """Map each clustered position to a bin identifier.

    The ``POS`` column of ``pairs`` is clustered with ``cluster`` using window
    size ``bp``. Every position in the i-th group (groups taken in sorted
    order) is assigned the label ``"chr" + str(chr) + "_" + str(i)``.

    Parameters
    ----------
    pairs : mapping or DataFrame
        Object whose ``'POS'`` entry holds the positions to cluster.
    chr : object
        Chromosome name used in the label (converted with ``str``).
    bp : int or float
        Window size forwarded to ``cluster``.

    Returns
    -------
    dict
        Integer position -> bin label.
    """
    label_prefix = "chr"+str(chr)+"_"
    ordered_groups = sorted(cluster(pairs['POS'], bp))
    return {
        position: label_prefix+str(index)
        for index, members in enumerate(ordered_groups)
        for position in members
    }

In [None]:
# Build, chromosome by chromosome, one bin identifier column (GID_bp<N>) and
# one bin-level allele-count column (AC_bp<N>) for every window size in `bp`,
# then stack the per-chromosome tables into IGM_rAF.
chrom = df['CHR'].unique()
bp = [10, 20, 30, 40]

chrom_output = []
creategroups = {}
binned_groups = {}
for i in chrom:
    print(i)
    # Rows of the current chromosome (named chrom_df rather than `input`
    # so the builtin of that name is not shadowed).
    chrom_df = df[df["CHR"] == i]
    for j in bp:
        creategroups[j] = group_ids(chrom_df, i, j)
        binned_groups[j] = pd.DataFrame(creategroups[j].items(), columns=['POS', "GID_bp"+str(j)])
        # group_ids keys are ints; POS in df is a string column, so convert
        # back to str for the merge key to match.
        binned_groups[j]['POS'] = binned_groups[j]['POS'].astype(str)

    # List of DataFrames to merge, derived from `bp` so that changing the
    # window sizes above does not require editing this line as well.
    dfs = [chrom_df] + [binned_groups[j] for j in bp]

    # Merge all DataFrames into one on POS.
    final_df = reduce(lambda left, right: pd.merge(left, right, on=['POS'], how='outer'), dfs)
    # Bin-level AC: sum of AC over all rows sharing the same bin identifier.
    for j in bp:
        final_df["AC_bp"+str(j)] = final_df['AC'].groupby(final_df["GID_bp"+str(j)]).transform('sum')
    chrom_output.append(final_df)
# ignore_index gives the combined table unique row labels; without it every
# chromosome's rows would restart at 0 and labels would be duplicated.
IGM_rAF = pd.concat(chrom_output, ignore_index=True)

Step 4: Calculate sAF = AC/AN

In [None]:
# sAF is the per-row ratio of the AC column to the AN column.
IGM_rAF['sAF'] = IGM_rAF['AC'].div(IGM_rAF['AN'])

Step 5: Calculate the regional allele number (rAN) for each bin size as the mean AN of the indels within the same bin, then use it to calculate the regional allele frequency: rAF = rAC/rAN

In [None]:
# For every window size: bin-level AN is the mean AN of the rows sharing the
# same chromosome and bin identifier; rAF is bin-level AC over bin-level AN.
for window in bp:
    gid_col = "GID_bp"+str(window)
    ran_col = "AN_bp"+str(window)
    an_by_bin = IGM_rAF.groupby(['CHR', gid_col])['AN']
    IGM_rAF[ran_col] = an_by_bin.transform('mean')
    IGM_rAF["rAF_bp"+str(window)] = IGM_rAF["AC_bp"+str(window)] / IGM_rAF[ran_col]


Step 5 (continued): Save the output to a file

In [None]:
# Write the full table (all bin and frequency columns) without the row index.
raf_table_path = "2023-03-23_IGM_n39367_indelsonly_rAF_lt50bp.csv"
IGM_rAF.to_csv(raf_table_path, index=False)

Step 6: Subset for suspicious indels (SI) (sAF ≤ 10^-4 & rAF > 10^-4) 

In [None]:
# Threshold applied to both the single-variant and the regional frequency.
si_AF = 0.0001
IGM_rAF_SI = {}

# The single-variant condition does not depend on the window size.
low_single_af = IGM_rAF['sAF'] <= si_AF
for window in bp:
    high_regional_af = IGM_rAF["rAF_bp"+str(window)] > si_AF
    # Rows with sAF at or below the threshold but rAF above it for this window.
    IGM_rAF_SI[window] = IGM_rAF[low_single_af & high_regional_af]
    IGM_rAF_SI[window].to_csv(
        "2023-03-23_IGM_n39367_indelsonly_rAF_bp"+str(window)+"_SuspiciousIndels.lt50bp.csv",
        index=False,
    )
