# Regional Allele Frequency Calculation script for the gnomAD dataset

*Step 1:* Download the gnomAD v2.1.1 exome sites VCF (and its `.tbi` index) from: https://gnomad.broadinstitute.org/downloads

In [None]:
#!command wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.vcf.bgz
#!command wget https://gnomad-public-us-east-1.s3.amazonaws.com/release/2.1.1/vcf/exomes/gnomad.exomes.r2.1.1.sites.vcf.bgz.tbi

Step 2: Using BCFtools, filter the gnomAD file for indels only, extract the required columns (CHROM, POS, REF, ALT, AC, AN), and create a second file that also contains the gene symbols

In [None]:

# Keep only indel records from the downloaded gnomAD sites VCF.
# This command is commented out, so it is not executed when the cell runs; the
# commands below read its output file, which must therefore already exist.
# NOTE(review): `-O b` asks bcftools for compressed BCF output while the target
# file is named `.vcf.gz` -- confirm whether `-O z` was intended.
#!command bcftools view --include 'TYPE="INDEL"' gnomad.exomes.r2.1.1.sites.vcf.bgz -O b >gnomad.exomes.r2.1.1.sites_indelsonly.vcf.gz

# Save a tab-separated table with CHROM, POS, REF, ALT, AC and AN for each indel.
# The query output has no header, so a header line (CHR, POS, REF, ALT, AC, AN)
# is written to a temporary file, concatenated in front of the data, and both
# temporary files are removed afterwards.
# NOTE(review): `echo -e` is not handled identically by every shell -- confirm
# that the header line is written with real tab characters.
!command bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%AC\t%AN\n' gnomad.exomes.r2.1.1.sites_indelsonly.vcf.gz >gnomad.exomes.r2.1.1.sites_indelsonly_AC_AN_noheader.txt
!command echo -e "CHR\tPOS\tREF\tALT\tAC\tAN" >header.txt
!command cat header.txt gnomad.exomes.r2.1.1.sites_indelsonly_AC_AN_noheader.txt >gnomad.exomes.r2.1.1.sites_indelsonly_AC_AN.txt
!command rm gnomad.exomes.r2.1.1.sites_indelsonly_AC_AN_noheader.txt header.txt

# Save a second table with CHROM, POS, REF, ALT and the SYMBOL field taken from
# the `vep` annotation. `cut -d, -f1` keeps only the text before the first comma
# on each line (presumably the first gene symbol when several are reported --
# verify). The header is added the same way as for the AC/AN table above.
!command bcftools +split-vep gnomad.exomes.r2.1.1.sites_indelsonly.vcf.gz -f '%CHROM\t%POS\t%REF\t%ALT\t%SYMBOL\n' -a vep | cut -d, -f1 >gnomad.exomes.r2.1.1.sites_indelsonly.vcf_withFIRSTSYMBOL_noheader.txt
!command echo -e "CHR\tPOS\tREF\tALT\tSYMBOL" >header.txt
!command cat header.txt gnomad.exomes.r2.1.1.sites_indelsonly.vcf_withFIRSTSYMBOL_noheader.txt >gnomad.exomes.r2.1.1.sites_indelsonly.vcf_withFIRSTSYMBOL.txt
!command rm gnomad.exomes.r2.1.1.sites_indelsonly.vcf_withFIRSTSYMBOL_noheader.txt header.txt


Step 3: Create bins for different genomic window sizes (10 bp, 20 bp, 30 bp, 40 bp)

i) Load the file using pandas

In [None]:
import pandas as pd
from functools import reduce
# Load the tab-separated indel table (CHR, POS, REF, ALT, AC, AN) with an
# explicit dtype per column so nothing is left to type inference.
column_types = {
    'CHR': 'str',
    'POS': 'int',
    'REF': 'str',
    'ALT': 'str',
    'AC': 'int',
    'AN': 'int',
}
df = pd.read_csv(
    "gnomad.exomes.r2.1.1.sites_indelsonly_AC_AN.txt",
    sep="\t",
    dtype=column_types,
)

In [None]:
# Drop sites with AN == 0 or AC == 0 in a single pass. The explicit copy makes
# the column assignment below act on an independent frame instead of a slice of
# the loaded table, which would otherwise raise pandas' SettingWithCopyWarning.
df = df[(df['AN'] != 0) & (df['AC'] != 0)].copy()

# Indel length = absolute difference between the REF and ALT allele lengths.
df['indel_length'] = (
    df['REF'].astype(str).map(len) - df['ALT'].astype(str).map(len)
).abs()

# Keep indels whose length is below 51 (i.e. at most 50) and renumber rows from 0.
df = df[df['indel_length'] < 51].reset_index(drop=True)

ii) Functions that group indels falling within the given distance of each other into the same bin and assign each bin a unique ID

In [None]:
def cluster(pos_list, gap):
    """Group positions into clusters of closely spaced neighbours.

    The positions are sorted in ascending order and walked once. A position
    joins the current cluster when it lies at most ``gap / 2`` after the
    previous position; otherwise it opens a new cluster.

    Args:
        pos_list: Positions to cluster. A pandas Series or any other iterable
            whose items can be converted with ``int()``.
        gap: Bin size; consecutive members of a cluster are at most half of
            this value apart.

    Returns:
        A list of clusters, each a list of ints in ascending order. An empty
        input yields an empty list.
    """
    maxgap = gap / 2
    # Sorting a plain list avoids an in-place sort on a pandas object and
    # works for any iterable input.
    pos = sorted(int(p) for p in pos_list)
    groups = []
    for x in pos:
        # `groups` is empty only for the very first position, which always
        # starts a new cluster.
        if groups and x - groups[-1][-1] <= maxgap:
            groups[-1].append(x)
        else:
            groups.append([x])
    return groups

def group_ids(pairs,chr,bp):
    """Map every position in ``pairs['POS']`` to the ID of the bin it falls in.

    The positions are clustered with ``cluster(pairs['POS'], bp)``; the
    clusters are sorted and numbered from 0, and each position receives the
    label ``"chr" + str(chr) + "_" + str(cluster_number)``.

    Args:
        pairs: Table-like object providing a ``'POS'`` column.
        chr: Chromosome identifier used in the label prefix.
        bp: Bin size passed on to ``cluster`` as its ``gap``.

    Returns:
        dict mapping position -> bin ID string.
    """
    ordered_clusters = sorted(cluster(pairs['POS'], bp))
    prefix = "chr" + str(chr) + "_"
    return {
        position: prefix + str(cluster_number)
        for cluster_number, members in enumerate(ordered_clusters)
        for position in members
    }


In [None]:
# Chromosomes present in the filtered table and the bin sizes to evaluate.
# Every bin size listed in `bp` is processed; nothing below hard-codes the sizes.
chrom=df['CHR'].unique()
bp=[10,20,30,40]

chrom_output=[]
creategroups={}
binned_groups={}
for i in chrom:
    print(i)  # progress indicator: chromosome currently being processed
    # Rows of the current chromosome (named so the builtin `input` is not shadowed).
    chrom_df = df[df["CHR"] == i]
    for j in bp:
        # POS -> bin ID mapping for this chromosome and bin size, as a two-column table.
        creategroups[j] = group_ids(chrom_df, i, j)
        binned_groups[j] = pd.DataFrame(creategroups[j].items(), columns=['POS', "GID_bp"+str(j)])
        binned_groups[j]['POS'] = binned_groups[j]['POS'].astype(int)

    # Indel table followed by one bin-ID table per configured bin size.
    dfs = [chrom_df] + [binned_groups[j] for j in bp]

    # Attach all bin-ID columns to the indel rows by joining on POS.
    final_df = reduce(lambda left, right: pd.merge(left, right, on=['POS'], how='outer'), dfs)

    # Bin-level AC: sum of AC over all indels sharing the same bin ID.
    for j in bp:
        final_df["AC_bp"+str(j)] = final_df['AC'].groupby(final_df["GID_bp"+str(j)]).transform('sum')
    chrom_output.append(final_df)

# Stack the per-chromosome tables; ignore_index gives the combined table a
# unique 0..n-1 index instead of repeating each chromosome's row labels.
gnomAD_rAF = pd.concat(chrom_output, ignore_index=True)

Step 4: Calculate sAF = AC/AN

In [None]:
# Site-level allele frequency of each indel: its own AC divided by its own AN.
gnomAD_rAF['sAF'] = gnomAD_rAF['AC'].div(gnomAD_rAF['AN'])

Step 5: For each bin size, calculate rAN as the mean AN of the indels inside the same bin, then use it to calculate rAF = rAC/rAN

In [None]:
for bin_size in bp:
    # Regional AN: mean AN over all indels sharing a chromosome and bin ID.
    regional_an = gnomAD_rAF.groupby(['CHR', f"GID_bp{bin_size}"])['AN'].transform('mean')
    gnomAD_rAF[f"AN_bp{bin_size}"] = regional_an
    # Regional AF: bin-level AC divided by the regional AN.
    gnomAD_rAF[f"rAF_bp{bin_size}"] = gnomAD_rAF[f"AC_bp{bin_size}"] / gnomAD_rAF[f"AN_bp{bin_size}"]


Step 5 (continued): Save the output to a file

In [None]:
# Write the complete table (all indels, all bin sizes) to CSV without the row index.
gnomAD_rAF.to_csv(
    path_or_buf="gnomad.exomes.r2.1.1.sites_indelsonly_rAF_lt50bp.csv",
    index=False,
)

Step 6: For each bin size, subset the suspicious indels (SI), i.e. indels with sAF ≤ 10^-4 and rAF > 10^-4

In [None]:
# Frequency threshold shared by both conditions of the suspicious-indel filter.
si_AF=0.0001
gnomAD_rAF_SI={}
for bin_size in bp:
    # An indel is flagged when its own frequency is at or below the threshold
    # while the regional frequency of its bin is above it.
    low_site_af = gnomAD_rAF['sAF'] <= si_AF
    high_regional_af = gnomAD_rAF[f"rAF_bp{bin_size}"] > si_AF
    flagged = gnomAD_rAF[low_site_af & high_regional_af]
    gnomAD_rAF_SI[bin_size] = flagged
    # One CSV per bin size, written without the row index.
    flagged.to_csv(
        f"gnomad.exomes.r2.1.1.sites_indelsonly_rAF_bp{bin_size}_SuspiciousIndels.lt50bp.csv",
        index=False,
    )
