In [None]:
import ast
import itertools
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from Bio import SeqIO
from matplotlib.lines import Line2D

In [None]:
#input data
##list of GSE studies
gse_id_list = ['GSE145926', 'GSE154244', 'GSE150316']

##variant frequency window used as defaults by apobec_mutagenesis (min_VF <= VF < max_VF)
min_VF = 0.001
max_VF = 1
##min coverage
min_DP = 50

#hairpins coordinates in SARS-CoV-2 (positions in the file start with 1)
hairpin_SARS_coordinates = 'hairpin_list.txt'
df_hairpin = pd.read_csv(hairpin_SARS_coordinates, skiprows=1, names=['Loop_length', 'Pos', 'Loop_sequence', 'Stem_length'])
##parse every 'Pos' literal once; only its first (start, end) pair is used
first_pos_pairs = [ast.literal_eval(pos)[0] for pos in df_hairpin['Pos']]
##subtract 1 so that the stored coordinates start with 0
df_hairpin['loop_start_pos'] = [pair[0] - 1 for pair in first_pos_pairs]
df_hairpin['loop_end_pos'] = [pair[1] - 1 for pair in first_pos_pairs]
start_loop_positions = df_hairpin.loop_start_pos.to_list()
end_loop_positions = df_hairpin.loop_end_pos.to_list()
##pd.Interval defaults to closed='right', which leaves loop_start_pos itself outside the
##interval; closed='both' keeps both loop ends inside it.
##NOTE(review): assumes 'Pos' stores inclusive loop ends - confirm against hairpin_list.txt
list_loop_coordinates = [
    pd.Interval(left=loop_start, right=loop_end, closed='both')
    for loop_start, loop_end in zip(start_loop_positions, end_loop_positions)
]

##palette for pictures: colors_list[0] - position not in a loop, colors_list[1] - position in a loop
colors_list = ['#C6878F', '#81B29A']

In [None]:
def _read_vcf_records(vcf_file):
    """Read the tab-separated records of a VCF file, skipping the leading '##' lines.

    The last column is renamed to 'FORMAT_numbers' and rows containing any
    missing value are dropped.
    """
    ##count the description lines without loading the whole file into memory
    n_description_lines = 0
    with open(vcf_file, 'r') as handle:
        for line in handle:
            if not line.startswith('##'):
                break
            n_description_lines += 1
    df_vcf = pd.read_csv(vcf_file, skiprows=n_description_lines, sep='\t')
    ##assign a new column list instead of mutating df_vcf.columns.values in place
    df_vcf.columns = [*df_vcf.columns[:-1], 'FORMAT_numbers']
    return df_vcf.dropna()


def apobec_mutagenesis(vcf_file, ID, align_method, min_vf=min_VF, max_vf=max_VF, min_dp=min_DP, hairpin_SARS_coordinates=list_loop_coordinates, colors=colors_list):
    """Plot the variant frequency of the C>T mutations found in one VCF file.

    Parameters
    ----------
    vcf_file : path of the VCF file; its second-to-last column must hold the
        FORMAT keys (including 'VF' and 'DP') and its last column the values.
    ID : str, sample name used in the figure title.
    align_method : str, alignment method name used in the figure title.
    min_vf, max_vf : rows are kept when min_vf <= VF < max_vf.
    min_dp : rows are kept when DP >= min_dp.
    hairpin_SARS_coordinates : iterable of intervals; POS - 1 of every mutation
        is tested against each of them with `in`.
    colors : two colors, colors[0] for positions outside loops and colors[1]
        for positions inside loops.

    Returns
    -------
    The string 'empty' when no row survives one of the filters, otherwise the
    (already closed) matplotlib figure with one bar per position sorted by VF.

    Raises
    ------
    ValueError if 'VF' or 'DP' is missing from the FORMAT keys.
    """
    ##on crowded plots, rows with VF below abbr_vf are merged into summary bars
    abbr_vf = 0.002
    ##the merge happens only when there are more rows than this
    abbr_min_rows = 200

    df_vcf = _read_vcf_records(vcf_file)

    #choose positions with C>T mutation
    is_c_to_t = (df_vcf['REF'] == 'C') & (df_vcf['ALT'] == 'T')
    ##explicit copy: new columns are added below
    df_vcf_C_T = df_vcf[is_c_to_t].reset_index(drop=True).copy()
    if df_vcf_C_T.empty:
        return 'empty'

    #filter by min and max VF
    ##FORMAT keys are read from the first record and applied to every record
    FORMAT = df_vcf_C_T.iloc[0, -2].split(':')
    VF_ind = FORMAT.index('VF')
    df_vcf_C_T['VF'] = [float(values.split(':')[VF_ind]) for values in df_vcf_C_T['FORMAT_numbers']]
    vf_in_range = (df_vcf_C_T['VF'] >= min_vf) & (df_vcf_C_T['VF'] < max_vf)
    df_vcf_C_T = df_vcf_C_T[vf_in_range].copy()
    if df_vcf_C_T.empty:
        return 'empty'

    #filter by min coverage
    DP_ind = FORMAT.index('DP')
    df_vcf_C_T['coverage'] = [float(values.split(':')[DP_ind]) for values in df_vcf_C_T['FORMAT_numbers']]
    df_vcf_C_T = df_vcf_C_T[df_vcf_C_T['coverage'] >= min_dp].copy()
    if df_vcf_C_T.empty:
        return 'empty'

    #write if mutation is in a loop ('+') or not ('-')
    df_vcf_C_T['pos_in_loop'] = [
        '+' if any((pos - 1) in interval for interval in hairpin_SARS_coordinates) else '-'
        for pos in df_vcf_C_T['POS']
    ]

    #sorting by VF
    df_vcf_C_T = df_vcf_C_T.sort_values(by='VF', ascending=False)
    df_vcf_C_T = df_vcf_C_T.reset_index(drop=True)
    df_vcf_C_T = df_vcf_C_T[['POS', 'VF', 'coverage', 'pos_in_loop']]

    #make plot
    ##too many bars: write positions with VF < abbr_vf to one row per loop status
    abbr_lines = []
    if len(df_vcf_C_T) > abbr_min_rows:
        is_low_vf = df_vcf_C_T['VF'] < abbr_vf
        n_low_not_in_loop = int((is_low_vf & (df_vcf_C_T['pos_in_loop'] == '-')).sum())
        n_low_in_loop = int((is_low_vf & (df_vcf_C_T['pos_in_loop'] == '+')).sum())
        df_vcf_C_T_abbr = df_vcf_C_T[df_vcf_C_T['VF'] >= abbr_vf].reset_index(drop=True).copy()
        ##each summary row is independent, so a sample with only in-loop low-VF
        ##positions still gets its legend line
        if n_low_not_in_loop != 0:
            df_vcf_C_T_abbr.loc[len(df_vcf_C_T_abbr.index)] = ['1*', abbr_vf, '-', '-']
            abbr_lines.append('1* - ' + str(n_low_not_in_loop) + ' positions not in loops with VF<' + str(abbr_vf))
        if n_low_in_loop != 0:
            df_vcf_C_T_abbr.loc[len(df_vcf_C_T_abbr.index)] = ['2*', abbr_vf, '-', '+']
            abbr_lines.append('2* - ' + str(n_low_in_loop) + ' positions in loops with VF<' + str(abbr_vf))
        df_vcf_C_T = df_vcf_C_T_abbr
    abbr_text = '\n'.join(abbr_lines)

    ##change figure size
    figure_size = 4 if len(df_vcf_C_T) < 20 else 10

    cols = [colors[1] if x == '+' else colors[0] for x in df_vcf_C_T.pos_in_loop]

    f, ax = plt.subplots(figsize=(figure_size, 4))
    sns.barplot(x=df_vcf_C_T.index, y=df_vcf_C_T.VF, palette=cols, ax=ax)
    f.suptitle('C>T mutations in '+ID+'\naligned using '+align_method)
    ax.set_xticklabels(df_vcf_C_T.POS, rotation=90)
    ax.tick_params(axis='x', labelsize=4)
    ax.set(xlabel='Position in the genome')

    if abbr_text:
        print(abbr_text)
        #add text below figure
        f.subplots_adjust(bottom=0.2)
        ##x stays in data units (second bar), y is a fraction of the axes height,
        ##so the text sits below the plot whatever the VF scale is
        ax.text(1, -0.2, abbr_text, ha='left', fontsize=6, transform=ax.get_xaxis_transform())
    f.tight_layout()
    ##closed so that the figure is not displayed; it can still be saved by the caller
    plt.close(f)
    return f

In [None]:
method = 'BWA'
##resolution of the saved pictures
picture_dpi = 800
for gse_id in gse_id_list:

    ##get list of srr for gse_id
    srr_acc_file_txt = "SRR_Acc_List_"+gse_id+".txt"
    df_srr_acc = pd.read_csv(srr_acc_file_txt, names=['acc'])
    srr_list = df_srr_acc.acc.to_list()

    ##path for directory with vcf files for gse_id
    directory = 'bwa/vcf_'+gse_id + '/'
    ##savefig does not create folders, so make sure the output folder exists;
    ##parents are not created on purpose: a missing vcf directory should fail loudly
    pictures_dir = Path(directory) / 'pictures'
    pictures_dir.mkdir(exist_ok=True)
    for srr_id in srr_list:
        print(srr_id, end=', ')
        vcf = directory + srr_id + '_sorted.vcf'
        plot = apobec_mutagenesis(vcf, srr_id, method)
        ##a string result ('empty') means there was nothing to plot
        if isinstance(plot, str):
            print(plot)
        else:
            plot.savefig(pictures_dir / (srr_id+'_CT_mutations_'+method+'.png'), dpi=picture_dpi)