# Step 1

### Using the data from UK Biobank (input_file)

Setting up parameters 

In [1]:
# Tab-separated indel table to analyse, and the prefix shared by every output file.
input_file = "rAF_scripts_and_inputs/PublicationReady_Data/UKBB/UKBB_n442734_Indels.tsv"
out_prefix = "UK.BB.exomes.430k.sites_indelsonly_rAF"

# Genomic window sizes (bp) for which regions are built.
bp = [10, 20, 30, 40]

# Allele-frequency cutoff separating "lo" from "hi" in the sAF / rAF subsets below.
AF = 1e-4

Step 2: Create Regions for different genomic ranges (10bp,20bp,30bp,40bp)

i) Load the file using pandas

In [2]:
import pandas as pd
from functools import reduce
# Pin the column types up front: chromosome and alleles as text, position and
# allele counts as integers.
column_types = {
    'CHR': 'str',
    'POS': 'int',
    'REF': 'str',
    'ALT': 'str',
    'AC': 'int',
    'AN': 'int',
}
df = pd.read_csv(input_file, sep="\t", dtype=column_types)

In [3]:
# Longest indel (in bp) kept for the analysis.
MAX_INDEL_LENGTH = 50

# Keep only sites with a non-zero allele number (AN) and allele count (AC).
# The explicit copy makes the column assignment below write to an independent
# frame instead of a slice of the loaded table (avoids SettingWithCopyWarning).
observed = (df['AN'] != 0) & (df['AC'] != 0)
df = df.loc[observed].copy()

# Indel length = absolute difference between REF and ALT allele lengths.
ref_len = df['REF'].astype(str).map(len)
alt_len = df['ALT'].astype(str).map(len)
df['indel_length'] = (ref_len - alt_len).abs()

df = df[df['indel_length'] <= MAX_INDEL_LENGTH].reset_index(drop=True)

In [4]:
# Make sure that there are no indels greater than 50 bp. An explicit assertion
# is used because a bare sort_values() call in the middle of a cell is neither
# displayed nor stored, so it verifies nothing.
assert (df['indel_length'] <= 50).all(), "found indels longer than 50 bp after filtering"

# On the recorded run this reported 439786 indels of 50 bp or shorter.
print("The number of indels with a base pair length less than or equal to 50 is", len(df))

The number of indels with a base pair length less than or equal to 50 is 439786


ii) Functions that group indels lying close together into the same region and assign each region a unique region ID

In [5]:
def cluster(pos_list, gap):
    """Group positions into runs of neighbours lying within ``gap / 2`` of each other.

    Positions are converted to ``int`` and sorted ascending. A position joins
    the current group when its distance to the previous (sorted) position is at
    most ``gap / 2``; otherwise it starts a new group. Groups are therefore
    chained: a group may span more than ``gap / 2`` overall as long as each
    consecutive pair is close enough.

    Parameters
    ----------
    pos_list : iterable of int-convertible values
        Positions to group (e.g. a pandas Series, list or array). The input is
        not modified.
    gap : int or float
        Window size; half of it is the maximum distance between neighbours.

    Returns
    -------
    list of list of int
        Groups in ascending order, each group sorted ascending. Duplicate
        positions are kept. An empty input yields an empty list.
    """
    maxgap = gap / 2
    # Work on a plain sorted list: no in-place Series sorting and no dependence
    # on how a Series interprets integer slices.
    positions = sorted(int(p) for p in pos_list)

    groups = []
    for x in positions:
        # Positions are sorted, so x - previous is never negative.
        if groups and x - groups[-1][-1] <= maxgap:
            groups[-1].append(x)
        else:
            groups.append([x])
    return groups

def group_ids(pairs,chr,bp):
    """Map every position in ``pairs['POS']`` to a region label.

    Positions are grouped with ``cluster(pairs['POS'], bp)``; the groups are
    numbered in sorted order and every member of group ``n`` is labelled
    ``"chr" + str(chr) + "_" + str(n)``.

    Parameters
    ----------
    pairs : mapping / DataFrame exposing a ``'POS'`` entry
    chr : value used (via ``str``) as the chromosome part of the label
    bp : window size forwarded to ``cluster``

    Returns
    -------
    dict
        ``{position: region_label}``.
    """
    label_prefix = "chr" + str(chr) + "_"
    ordered_groups = sorted(cluster(pairs['POS'], bp))
    return {
        position: label_prefix + str(region_no)
        for region_no, members in enumerate(ordered_groups)
        for position in members
    }


In [6]:
chrom = df['CHR'].unique()

chrom_output = []
creategroups = {}
regioned_groups = {}
for i in chrom:
    # Variants of the current chromosome (named so it does not shadow the
    # builtin ``input``).
    chrom_df = df[df["CHR"] == i]

    # One POS -> region-ID lookup table per window size.
    for j in bp:
        creategroups[j] = group_ids(chrom_df, i, j)
        regioned_groups[j] = pd.DataFrame(
            list(creategroups[j].items()), columns=['POS', "GID_bp" + str(j)]
        )
        regioned_groups[j]['POS'] = regioned_groups[j]['POS'].astype(int)

    # Chromosome frame followed by the lookup table of every configured
    # window, so that changing ``bp`` needs no edit here.
    dfs = [chrom_df] + [regioned_groups[j] for j in bp]

    # Merge all dataframes into one, attaching a GID_bp<window> column each.
    final_df = reduce(lambda left, right: pd.merge(left, right, on=['POS'], how='outer'), dfs)

    # Region-level AC: sum of AC over all variants sharing a region ID.
    for j in bp:
        gid_col = "GID_bp" + str(j)
        final_df["AC_bp" + str(j)] = final_df.groupby(gid_col)['AC'].transform('sum')
    chrom_output.append(final_df)

# ignore_index gives the combined frame unique row labels; each per-chromosome
# frame otherwise restarts its index at 0.
df_rAF = pd.concat(chrom_output, ignore_index=True)
# POS is kept as text from here on (it is concatenated into VarID later).
df_rAF['POS'] = df_rAF['POS'].astype(str)

Step 3: Calculate sAF = AC/AN

In [7]:
# Site-level allele frequency: allele count divided by allele number, per variant.
df_rAF['sAF'] = df_rAF['AC'].div(df_rAF['AN'])

Step 4: Calculate rAN for each region size as the mean AN of the indels inside the same region, then use rAN to calculate rAF = rAC/rAN

In [8]:
for window in bp:
    region_id_col = "GID_bp"+str(window)
    region_an_col = "AN_bp"+str(window)

    # Regional AN: mean AN over the variants sharing a chromosome and region ID.
    an_by_region = df_rAF.groupby(['CHR', region_id_col])['AN']
    df_rAF[region_an_col] = an_by_region.transform('mean')

    # Regional AF: regional AC divided by regional AN.
    df_rAF["rAF_bp"+str(window)] = df_rAF["AC_bp"+str(window)].div(df_rAF[region_an_col])


Step 5: Save the output to a file

In [9]:
# Rebuild the variant identifier "CHR-POS-REF-ALT" for later use.
chrom_part, pos_part, ref_part, alt_part = (
    df_rAF[name] for name in ('CHR', 'POS', 'REF', 'ALT')
)
df_rAF['VarID'] = chrom_part + "-" + pos_part + "-" + ref_part + "-" + alt_part

# Put VarID first. CHR, POS, REF and ALT are kept in the table (dropping them
# would only reduce the file size).
remaining_columns = [name for name in df_rAF.columns if name != 'VarID']
df_rAF = df_rAF[['VarID'] + remaining_columns]

# TODO: check whether the overall bed output file can be produced here.
df_rAF.to_csv(out_prefix+"_lt50bp.csv", index=False)

Step 6: Subset for rAF_hi indels (SI) (sAF ≤ 10^-4 & rAF > 10^-4) 

In [10]:
# rAF_hi indels: site frequency at or below the cutoff, regional frequency above it.
df_rAF_rAF_hi = {}
for window in bp:
    low_site_af = df_rAF['sAF'] <= AF
    high_region_af = df_rAF["rAF_bp"+str(window)] > AF
    selected = df_rAF[low_site_af & high_region_af]
    df_rAF_rAF_hi[window] = selected
    selected.to_csv(out_prefix+"_bp"+str(window)+"_rAF_hiIndels.lt50bp.csv", index=False)


Step 7: Subset for rAF_lo indels (sAF ≤ 10^-4 & rAF ≤ 10^-4)

In [11]:
# rAF_lo indels: both the site frequency and the regional frequency are at or
# below the cutoff.
df_rAF_rAF_lo = {}
for window in bp:
    low_site_af = df_rAF['sAF'] <= AF
    low_region_af = df_rAF["rAF_bp"+str(window)] <= AF
    selected = df_rAF[low_site_af & low_region_af]
    df_rAF_rAF_lo[window] = selected
    selected.to_csv(out_prefix+"_bp"+str(window)+"_rAF_loIndels.lt50bp.csv", index=False)


Step 8: Subset for sAF_hi indels (sAF > 10^-4)

In [12]:
# sAF_hi indels: site frequency above the cutoff. The filter itself does not
# depend on the window size; one entry and one file per window are still
# produced so the layout matches the rAF_hi / rAF_lo outputs.
df_rAF_sAF_hi = {}
for window in bp:
    high_site_af = df_rAF['sAF'] > AF
    selected = df_rAF[high_site_af]
    df_rAF_sAF_hi[window] = selected
    selected.to_csv(out_prefix+"_bp"+str(window)+"_sAF_hiIndels.lt50bp.csv", index=False)


Step 9: Generate bed file for the regions containing rAF_hi indels

In [13]:
# For every window size, write a tab-separated "#Chr / Start / End" table with
# one row per region that contains at least one rAF_hi indel.
df_rAF_rAF_hi_region={}
df_rAF_rAF_hi_region_bed={}
for i in bp:
    gid_col = 'GID_bp'+str(i)

    # All variants lying in a region that holds at least one rAF_hi indel.
    hi_region_ids = df_rAF_rAF_hi[i][gid_col].unique()
    df_rAF_rAF_hi_region[i] = df_rAF[df_rAF[gid_col].isin(hi_region_ids)]

    # POS is stored as text in df_rAF, so it is cast back to int before
    # aggregating; otherwise min/max compare lexicographically
    # (e.g. "1000003" < "999995") and yield wrong region bounds.
    bed = (
        df_rAF_rAF_hi_region[i]
        .assign(POS=lambda d: d['POS'].astype(int))
        .groupby(gid_col)['POS']
        .agg(['min', 'max'])
    )
    bed.columns = ['Start', 'End']

    # Region IDs look like "chr<CHR>_<n>": drop the trailing counter (splitting
    # from the right so chromosome names containing "_" survive), then strip
    # the "chr" text as before.
    region_chr = (
        bed.index.to_series()
        .str.rsplit('_', n=1).str[0]
        .str.replace('chr', '', regex=False)
    )
    bed.insert(0, '#Chr', region_chr)

    # Sort by chromosome (as text) and then by start so row order is deterministic.
    # NOTE(review): Start/End are the raw min/max POS; BED is conventionally
    # 0-based half-open -- confirm what downstream tools expect.
    df_rAF_rAF_hi_region_bed[i] = bed.sort_values(by=['#Chr', 'Start'], ascending=True)
    df_rAF_rAF_hi_region_bed[i].to_csv(out_prefix+"_bp"+str(i)+"_rAF_hiIndels.lt50bp.region.bed",index = False,sep='\t')


Step 10: Generate bed file for the regions containing rAF_lo indels

In [None]:
# For every window size, write a tab-separated "#Chr / Start / End" table with
# one row per region that contains at least one rAF_lo indel.
df_rAF_rAF_lo_region={}
df_rAF_rAF_lo_region_bed={}
for i in bp:
    gid_col = 'GID_bp'+str(i)

    # All variants lying in a region that holds at least one rAF_lo indel.
    lo_region_ids = df_rAF_rAF_lo[i][gid_col].unique()
    df_rAF_rAF_lo_region[i] = df_rAF[df_rAF[gid_col].isin(lo_region_ids)]

    # POS is stored as text in df_rAF, so it is cast back to int before
    # aggregating; otherwise min/max compare lexicographically
    # (e.g. "1000003" < "999995") and yield wrong region bounds.
    bed = (
        df_rAF_rAF_lo_region[i]
        .assign(POS=lambda d: d['POS'].astype(int))
        .groupby(gid_col)['POS']
        .agg(['min', 'max'])
    )
    bed.columns = ['Start', 'End']

    # Region IDs look like "chr<CHR>_<n>": drop the trailing counter (splitting
    # from the right so chromosome names containing "_" survive), then strip
    # the "chr" text as before.
    region_chr = (
        bed.index.to_series()
        .str.rsplit('_', n=1).str[0]
        .str.replace('chr', '', regex=False)
    )
    bed.insert(0, '#Chr', region_chr)

    # Sort by chromosome (as text) and then by start so row order is deterministic.
    # NOTE(review): Start/End are the raw min/max POS; BED is conventionally
    # 0-based half-open -- confirm what downstream tools expect.
    df_rAF_rAF_lo_region_bed[i] = bed.sort_values(by=['#Chr', 'Start'], ascending=True)
    df_rAF_rAF_lo_region_bed[i].to_csv(out_prefix+"_bp"+str(i)+"_rAF_loIndels.lt50bp.region.bed",index = False,sep='\t')

Step 11: Generate bed file for the regions containing sAF_hi indels

In [None]:
# For every window size, write a tab-separated "#Chr / Start / End" table with
# one row per region that contains at least one sAF_hi indel.
df_rAF_sAF_hi_region={}
df_rAF_sAF_hi_region_bed={}
for i in bp:
    gid_col = 'GID_bp'+str(i)

    # All variants lying in a region that holds at least one sAF_hi indel.
    sAF_hi_region_ids = df_rAF_sAF_hi[i][gid_col].unique()
    df_rAF_sAF_hi_region[i] = df_rAF[df_rAF[gid_col].isin(sAF_hi_region_ids)]

    # POS is stored as text in df_rAF, so it is cast back to int before
    # aggregating; otherwise min/max compare lexicographically
    # (e.g. "1000003" < "999995") and yield wrong region bounds.
    bed = (
        df_rAF_sAF_hi_region[i]
        .assign(POS=lambda d: d['POS'].astype(int))
        .groupby(gid_col)['POS']
        .agg(['min', 'max'])
    )
    bed.columns = ['Start', 'End']

    # Region IDs look like "chr<CHR>_<n>": drop the trailing counter (splitting
    # from the right so chromosome names containing "_" survive), then strip
    # the "chr" text as before.
    region_chr = (
        bed.index.to_series()
        .str.rsplit('_', n=1).str[0]
        .str.replace('chr', '', regex=False)
    )
    bed.insert(0, '#Chr', region_chr)

    # Sort by chromosome (as text) and then by start so row order is deterministic.
    # NOTE(review): Start/End are the raw min/max POS; BED is conventionally
    # 0-based half-open -- confirm what downstream tools expect.
    df_rAF_sAF_hi_region_bed[i] = bed.sort_values(by=['#Chr', 'Start'], ascending=True)
    df_rAF_sAF_hi_region_bed[i].to_csv(out_prefix+"_bp"+str(i)+"_sAF_hiIndels.lt50bp.region.bed",index = False,sep='\t')