# Regional Allele Frequency Calculation script for the IGM dataset

Step 1: Pull the data from IGM ATAV database for unrelated samples available for control use

ATAV command:
atav_7.4.sh --list-var-geno --exclude-artifacts --exclude-snv --gene-boundary /nfs/goldstein/software/atav_home/data/ccds/addjusted.CCDS.genes.index.r20.hg19.ensembl87.txt --het-percent-alt-read 0.2-0.8 --sample 2023-03-23_n39367_IGM_cohort.txt --out /nfs/labs/gharavi/sk4759/RecurrentIndels/IGM_n39367_indels

Database server: db11 (0 running jobs)

Total samples: 39367 (0 cases and 39367 controls)

Start running list variant genotype function



Total runtime: 129470 seconds (aka 2157 minutes or 35 hours)

Step 2.1: Reduce the large genotype file to a smaller dataset by keeping only the columns needed downstream




In [None]:
#select columns Variant ID,Genename,Sample ID, AC,Covered Controls
#!command csvcut -c 1,9,23,58,63 2023-03-24_11-47-14_IGM_n39367_indels_genotypes.csv >2023-03-24_11-47-14_IGM_n39367_indels_genotypes_selectcols.csv

#!command csvcut -c 1,7,9 2023-03-24_11-47-14_IGM_n39367_indels_genotypes.csv | sort -u | grep -v "Variant ID" >2023-03-24_11-47-14_IGM_n39367_indels_genotypes_effects_noheader.csv
#!command echo -e "CHR,POS,REF,ALT,Effect,Gene_Name" >header.txt
#!command cat header.txt 2023-03-24_11-47-14_IGM_n39367_indels_genotypes_effects_noheader.csv >2023-03-24_11-47-14_IGM_n39367_indels_genotypes_effects.csv 
#!command rm 2023-03-24_11-47-14_IGM_n39367_indels_genotypes_effects_noheader.csv header.txt

Step 2.2: Convert the data from comma-delimited to tab-delimited format and add the appropriate header

In [None]:
# Keep fields 1, 4 and 5 of the selected-columns CSV, turn every "-" and "," into a tab,
# de-duplicate the rows, and drop any line containing "Variant ID" (the original header).
# Presumably the hyphen replacement is what splits the first field into CHR/POS/REF/ALT.
# NOTE(review): sed replaces every hyphen on the line, so a "-" inside any other kept
# field would also be split into an extra column — confirm none occur.
# NOTE(review): the Step 2.1 comment lists fields 4 and 5 as AC then Covered Controls,
# while the header written below labels the last two columns AN then AC — confirm the order.
!command cut -d, -f1,4,5 2023-03-24_11-47-14_IGM_n39367_indels_genotypes_selectcols.csv | sed 's/-/\t/g;s/,/\t/g' | sort -u | grep -v "Variant ID" >2023-03-23_IGM_n39367_unique_indel_ordered_noheader.tsv
# Write the tab-separated header row to a temporary file.
!command echo -e "CHR\tPOS\tREF\tALT\tAN\tAC" >header.txt
# Prepend the header to the data rows to produce the final table.
!command cat header.txt 2023-03-23_IGM_n39367_unique_indel_ordered_noheader.tsv >2023-03-23_IGM_n39367_unique_indel_ordered.tsv
# Remove the intermediate files.
!command rm 2023-03-23_IGM_n39367_unique_indel_ordered_noheader.tsv header.txt

Setting up parameters

In [None]:
# Tab-delimited indel table written by Step 2.2 (note the .tsv extension: the
# file is read with sep="\t" and Step 2.2 writes "..._ordered.tsv", not ".csv").
input_file="2023-03-23_IGM_n39367_unique_indel_ordered.tsv"
# Prefix shared by every output file written by this notebook.
out_prefix="2023-03-23_IGM_n39367_indelsonly_rAF"
# Low-frequency cut-off applied to both sAF and rAF.
AF=0.0001
# Genomic range (bin) sizes in base pairs.
bp=[10,20,30,40]

Step 3: Create BINs for different genomic ranges (10bp,20bp,30bp,40bp)

i) Load the file using pandas

In [None]:
import pandas as pd
from functools import reduce
# Load the tab-separated indel table: coordinates and alleles are kept as text,
# the AN/AC counts are parsed as integers.
df = pd.read_csv(
    input_file,
    sep="\t",
    low_memory=False,
    dtype=dict(CHR='str', POS='str', REF='str', ALT='str', AN='int', AC='int'),
)

In [None]:
# Indel length is the absolute difference between the REF and ALT allele lengths.
df['indel_length'] = (
    df['REF'].astype(str).str.len() - df['ALT'].astype(str).str.len()
).abs()
# Keep only indels whose length is below 51 and renumber the rows from zero.
df = df.loc[df['indel_length'] < 51].reset_index(drop=True)

In [None]:
def cluster(pos_list, gap):
    """Group positions into runs of neighbouring coordinates.

    Positions are converted to integers and sorted in ascending order. A
    position joins the current group when it lies within ``gap / 2`` of the
    previous position in that group; otherwise it starts a new group.

    Args:
        pos_list: Iterable of positions (integers or integer-like strings),
            for example a pandas Series.
        gap: Bin size; half of it is the largest distance allowed between
            consecutive positions inside one group.

    Returns:
        A list of groups in ascending order, each a list of ints. An empty
        input yields an empty list.

    Raises:
        ValueError: If a position cannot be converted to an integer.
    """
    max_gap = gap / 2
    groups = []
    # Sorting up front means each position only has to be compared with the
    # last member of the most recent group, and the difference is never negative.
    for position in sorted(int(p) for p in pos_list):
        if groups and position - groups[-1][-1] <= max_gap:
            groups[-1].append(position)
        else:
            groups.append([position])
    return groups

def group_ids(pairs,chr,bp):
    """Map every position in ``pairs['POS']`` to a bin identifier.

    The positions are clustered with ``cluster`` using ``bp`` as the gap, and
    each cluster is numbered in sorted order.

    Args:
        pairs: Table with a ``'POS'`` column of positions.
        chr: Chromosome label embedded in every identifier.
        bp: Bin size passed to ``cluster`` as the gap.

    Returns:
        Dict mapping each integer position to ``"chr<chr>_<index>"``, where
        ``index`` is the cluster's rank among the sorted clusters.
    """
    prefix = "chr" + str(chr) + "_"
    ordered_clusters = sorted(cluster(pairs['POS'], bp))
    return {
        position: prefix + str(index)
        for index, members in enumerate(ordered_clusters)
        for position in members
    }

In [None]:
# Assign every indel to a bin for each size in `bp`, one chromosome at a time,
# and attach the summed allele count (AC) of each bin to its member rows.
chrom=df['CHR'].unique()

chrom_output=[]
creategroups={}
binned_groups={}
for i in chrom:
    # Rows of the current chromosome (named chrom_df to avoid shadowing the builtin `input`).
    chrom_df=df[df["CHR"]==i]
    for j in bp:
        # Position -> bin-ID lookup for this bin size, turned into a two-column table.
        creategroups[j]=group_ids(chrom_df,i,j)
        binned_groups[j] = pd.DataFrame(creategroups[j].items(), columns=['POS',"GID_bp"+str(j)])
        # POS is text in `df`, so the merge key must be text here as well.
        binned_groups[j]['POS']=binned_groups[j]['POS'].astype(str)

    # Merge the chromosome rows with one bin table per size actually listed in
    # `bp` (previously hard-coded to 10/20/30/40, which broke when `bp` changed).
    dfs = [chrom_df] + [binned_groups[j] for j in bp]
    final_df = reduce(lambda  left,right: pd.merge(left,right,on=['POS'], how='outer'), dfs)
    # Bin-level AC: sum of AC over all rows sharing the same bin ID.
    for j in bp:
        final_df["AC_bp"+str(j)]= final_df['AC'].groupby(final_df["GID_bp"+str(j)]).transform('sum')
    chrom_output.append(final_df)
# Stack the per-chromosome results into one table.
df_rAF = pd.concat(chrom_output)

Step 4: Calculate sAF = AC / (2 × AN). For the IGM dataset, AN must be multiplied by 2 because the AN column counts covered individuals, not the number of covered alleles.

In [None]:
# Site-level allele frequency: AC divided by AN, then halved.
df_rAF['sAF'] = df_rAF['AC'].div(df_rAF['AN']).div(2)

Step 5: Calculate rAN for each bin size as the mean AN of the indels inside the same bin, then use rAN to calculate rAF = rAC / (2 × rAN). For the IGM dataset, AN must be multiplied by 2 because the AN column counts covered individuals, not the number of covered alleles.

In [None]:
# For every bin size: bin-level AN is the mean AN of the rows in the bin, and
# bin-level frequency is bin AC divided by bin AN, then halved.
for window in bp:
    tag = str(window)
    bin_keys = ['CHR', "GID_bp" + tag]
    df_rAF["AN_bp" + tag] = df_rAF.groupby(bin_keys)['AN'].transform('mean')
    df_rAF["rAF_bp" + tag] = df_rAF["AC_bp" + tag].div(df_rAF["AN_bp" + tag]).div(2)


Step 5: Save the output to a file

In [None]:
# Rebuild the hyphen-joined variant identifier (CHR-POS-REF-ALT) for later use.
var_id = df_rAF['CHR'] + "-" + df_rAF['POS']
var_id = var_id + "-" + df_rAF['REF'] + "-" + df_rAF['ALT']
df_rAF['VarID'] = var_id
# CHR, POS, REF and ALT are kept in the table; dropping them would reduce the
# file size, but that step is intentionally left disabled.
# Move VarID to the front, leaving the remaining columns in their current order.
remaining_columns = [name for name in df_rAF.columns if name != 'VarID']
df_rAF = df_rAF[['VarID', *remaining_columns]]
df_rAF.to_csv(out_prefix+"_lt50bp.csv",index = False)

Step 6: Subset for rAF-hi indels (sAF ≤ 10^-4 & rAF > 10^-4)

In [None]:
# rAF-hi indels: site-level frequency at or below the cut-off, bin-level
# frequency above it. One subset (and one CSV) per bin size.
df_rAF_hi = {
    window: df_rAF[(df_rAF['sAF'] <= AF) & (df_rAF["rAF_bp" + str(window)] > AF)]
    for window in bp
}
for window, subset in df_rAF_hi.items():
    subset.to_csv(out_prefix+"_bp"+str(window)+"_rAF_hiIndels.lt50bp.csv",index = False)


Step 7: Subset for rAF-lo indels (sAF ≤ 10^-4 & rAF ≤ 10^-4)

In [None]:
# rAF-lo indels: both the site-level and the bin-level frequency are at or
# below the cut-off. One subset (and one CSV) per bin size.
df_rAF_lo = {
    window: df_rAF[(df_rAF['sAF'] <= AF) & (df_rAF["rAF_bp" + str(window)] <= AF)]
    for window in bp
}
for window, subset in df_rAF_lo.items():
    subset.to_csv(out_prefix+"_bp"+str(window)+"_rAF_loIndels.lt50bp.csv",index = False)


Step 8: Subset for sAF-hi indels (sAF > 10^-4)

In [None]:
# sAF-hi indels: site-level frequency above the cut-off. The filter does not
# depend on the bin size, so it is evaluated once and the same subset is
# registered (and written out) under every bin size.
df_sAF_hi={}
sAF_hi_rows = df_rAF[ (df_rAF['sAF']>AF) ]
for i in bp:
    df_sAF_hi[i] = sAF_hi_rows
    df_sAF_hi[i].to_csv(out_prefix+"_bp"+str(i)+"_sAF_hiIndels.lt50bp.csv",index = False)


In [None]:
# For every bin size, write one interval (Chr, Start, End) per bin that
# contains at least one rAF-hi indel.
df_rAF_hi_bin={}
df_rAF_hi_bin_bed={}
for i in bp:
    gid_col = 'GID_bp'+str(i)
    # All rows of df_rAF that fall in a bin holding an rAF-hi indel.
    df_rAF_hi_bin[i] = df_rAF[df_rAF[gid_col].isin(df_rAF_hi[i][gid_col].unique())]
    # POS is stored as text; aggregate on integers so min/max are numeric
    # rather than lexicographic (as text, "100003" sorts before "99998").
    pos_numeric = df_rAF_hi_bin[i]['POS'].astype(int)
    bed = pos_numeric.groupby(df_rAF_hi_bin[i][gid_col]).agg(['min', 'max'])
    bed.columns = ['Start', 'End']
    # Bin IDs have the form "chr<CHR>_<n>": take the part before the first
    # underscore and strip "chr". `.str[0]` (unlike expand=True) also works
    # when no bins were selected.
    bed['Chr'] = bed.index.to_series().str.split('_').str[0].str.replace('chr', '', regex=False)
    # Order by chromosome and then start so the output order is deterministic.
    bed = bed[['Chr','Start','End']].sort_values(by=['Chr','Start'], ascending=True)
    df_rAF_hi_bin_bed[i] = bed
    df_rAF_hi_bin_bed[i].to_csv(out_prefix+"_bp"+str(i)+"_rAF_hiIndels.lt50bp.region.bed",index = False,sep='\t')


#Generate BED files for regions containing rAF-lo indels

In [None]:
# For every bin size, write one interval (Chr, Start, End) per bin that
# contains at least one rAF-lo indel.
df_rAF_lo_bin={}
df_rAF_lo_bin_bed={}
for i in bp:
    gid_col = 'GID_bp'+str(i)
    # All rows of df_rAF that fall in a bin holding an rAF-lo indel.
    df_rAF_lo_bin[i] = df_rAF[df_rAF[gid_col].isin(df_rAF_lo[i][gid_col].unique())]
    # POS is stored as text; aggregate on integers so min/max are numeric
    # rather than lexicographic (as text, "100003" sorts before "99998").
    pos_numeric = df_rAF_lo_bin[i]['POS'].astype(int)
    bed = pos_numeric.groupby(df_rAF_lo_bin[i][gid_col]).agg(['min', 'max'])
    bed.columns = ['Start', 'End']
    # Bin IDs have the form "chr<CHR>_<n>": take the part before the first
    # underscore and strip "chr". `.str[0]` (unlike expand=True) also works
    # when no bins were selected.
    bed['Chr'] = bed.index.to_series().str.split('_').str[0].str.replace('chr', '', regex=False)
    # Order by chromosome and then start so the output order is deterministic.
    bed = bed[['Chr','Start','End']].sort_values(by=['Chr','Start'], ascending=True)
    df_rAF_lo_bin_bed[i] = bed
    df_rAF_lo_bin_bed[i].to_csv(out_prefix+"_bp"+str(i)+"_rAF_loIndels.lt50bp.region.bed",index = False,sep='\t')


#Generate BED files for regions containing sAF-hi indels

In [None]:
# For every bin size, write one interval (Chr, Start, End) per bin that
# contains at least one sAF-hi indel.
df_sAF_hi_bin={}
df_sAF_hi_bin_bed={}
for i in bp:
    gid_col = 'GID_bp'+str(i)
    # All rows of df_rAF that fall in a bin holding an sAF-hi indel.
    df_sAF_hi_bin[i] = df_rAF[df_rAF[gid_col].isin(df_sAF_hi[i][gid_col].unique())]
    # POS is stored as text; aggregate on integers so min/max are numeric
    # rather than lexicographic (as text, "100003" sorts before "99998").
    pos_numeric = df_sAF_hi_bin[i]['POS'].astype(int)
    bed = pos_numeric.groupby(df_sAF_hi_bin[i][gid_col]).agg(['min', 'max'])
    bed.columns = ['Start', 'End']
    # Bin IDs have the form "chr<CHR>_<n>": take the part before the first
    # underscore and strip "chr". `.str[0]` (unlike expand=True) also works
    # when no bins were selected.
    bed['Chr'] = bed.index.to_series().str.split('_').str[0].str.replace('chr', '', regex=False)
    # Order by chromosome and then start so the output order is deterministic.
    bed = bed[['Chr','Start','End']].sort_values(by=['Chr','Start'], ascending=True)
    df_sAF_hi_bin_bed[i] = bed
    df_sAF_hi_bin_bed[i].to_csv(out_prefix+"_bp"+str(i)+"_sAF_hiIndels.lt50bp.region.bed",index = False,sep='\t')