In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as pplt
import seaborn as sns
import pandas as pd
import numpy as np
import pickle
from gtfparse import read_gtf

In [None]:
# Common file-name prefix of this analysis run; it is concatenated into the
# pickle paths under '01_dataframes/' and into the per-variant TSV file names.
prefix_variant_file = 'transcripts_under_10_percent_expression'

In [None]:
# Load the variant table from '01_dataframes/<prefix>_cDNA_protSeq'.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
imp_vars = pd.read_pickle('01_dataframes/'+prefix_variant_file+'_cDNA_protSeq')

In [None]:
def search_col(df, inp):
    """Return the column names of ``df`` that contain ``inp`` (case-insensitive).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are searched; labels must be strings.
    inp : str
        Substring to look for.

    Returns
    -------
    list
        Matching column labels, in column order.
    """
    hits = []
    for name in df.columns:
        if inp.lower() in name.lower():
            hits.append(name)
    return hits

### Protein domain analysis

In [None]:
# Domain annotation table; prot_domains() filters it by 'HUGO_Symbol' and reads
# the 'start_pos', 'end_pos' and 'info_domain' columns.
# NOTE(review): presumably exported from UniProt (per the file name) - confirm.
up_df = pd.read_pickle('01_dataframes/sdhb_uniprot_domains')

### NMD line (nonsense-mediated decay boundary)

In [None]:
# Pickled GTF table with sequence information; 'exon_number' is cast to int
# in the same step so that exon numbers can be compared numerically.
gtf_df = pd.read_pickle('01_dataframes/sdhb_gtf_pickle').astype({'exon_number': int})

In [None]:
def nmd_pos(gene, gtf=None):
    """Locate the NMD boundary of ``gene`` in cDNA and protein coordinates.

    The boundary is placed 50 nt upstream of the 3' end of the penultimate
    exon (``max(exon_number) - 1``). The cDNA position is the summed length of
    all exons numbered below the penultimate one, plus the penultimate exon's
    length minus 50. If the penultimate exon is 50 nt or shorter, nothing is
    added for it, i.e. the boundary sits at the start of that exon.

    NOTE(review): exon length is computed as ``end - start``. If the table
    holds 1-based inclusive GTF coordinates this undercounts every exon by one
    base - confirm against the source of the pickle.
    NOTE(review): the protein position is simply ``cDNA position / 3``; this
    assumes cDNA position 0 is the first coding base - confirm.

    Parameters
    ----------
    gene : str
        Value to match in the ``'gene'`` column.
    gtf : pandas.DataFrame, optional
        Table with ``'gene'``, ``'exon_number'``, ``'start'`` and ``'end'``
        columns, one row per exon. Defaults to the module-level ``gtf_df``.

    Returns
    -------
    tuple of (int, int)
        ``(cDNA position, protein position)`` where the protein position is
        ``round(cDNA position / 3)``.

    Raises
    ------
    ValueError
        If the gene has no rows, has several rows for one exon number, or has
        no penultimate exon (e.g. a single-exon gene).
    """
    # Length of the stretch upstream of the last exon junction that is
    # excluded from the NMD-sensitive region.
    boundary_offset = 50

    if gtf is None:
        gtf = gtf_df

    gtf_spec = gtf[gtf['gene'] == gene]
    if gtf_spec.empty:
        raise ValueError('no GTF rows found for gene %r' % (gene,))

    # Exon lengths keyed by exon number. Building this once from the filtered
    # frame avoids masking gtf_spec with a boolean Series taken from the
    # unfiltered table, and avoids the deprecated int(<Series>) conversion.
    exon_len = pd.Series(
        gtf_spec['end'].astype(int).to_numpy() - gtf_spec['start'].astype(int).to_numpy(),
        index=gtf_spec['exon_number'].to_numpy(),
    )
    if not exon_len.index.is_unique:
        raise ValueError('gene %r has several rows for the same exon number' % (gene,))

    nmd_exon = exon_len.index.max() - 1
    if nmd_exon not in exon_len.index:
        raise ValueError('gene %r has no penultimate exon (exon %s)' % (gene, nmd_exon))

    # Everything upstream of the penultimate exon counts in full.
    len_cdna = int(exon_len[exon_len.index < nmd_exon].sum())

    # The strand does not matter here: on '+' the boundary is end - 50 and the
    # contribution is (end - 50) - start; on '-' it is start + 50 and the
    # contribution is end - (start + 50). Both equal len_exon - 50.
    len_exon = int(exon_len.loc[nmd_exon])
    if len_exon > boundary_offset:
        len_cdna += len_exon - boundary_offset

    return (len_cdna, round(len_cdna / 3))

In [None]:
# One [gene, cDNA position, protein position] record per distinct gene symbol
# in the variant table.
nmd_lst = [[symbol, *nmd_pos(symbol)] for symbol in imp_vars['HUGO_Symbol'].unique()]

nmd_df = pd.DataFrame(nmd_lst, columns=['gene', 'NMD_cDNA', 'NMD_protein'])

In [None]:
# Persist the NMD positions to '01_dataframes/<prefix>_NMD'.
nmd_df.to_pickle('01_dataframes/'+prefix_variant_file+'_NMD')

In [None]:
# Reload the NMD positions from the file written by the to_pickle call above,
# rebinding nmd_df to the loaded frame.
nmd_df = pd.read_pickle('01_dataframes/'+prefix_variant_file+'_NMD')

### Visualization Exons

In [None]:
def prot_domains(ind, df_var, up_df):
    """Plot the protein domains of one variant and export the affected domains.

    For the variant in row ``ind`` of ``df_var`` this draws every annotated
    domain of its gene as a horizontal bar, marks the position of the amino
    acid change (green), the affected stretch (green, or magenta for
    'probably frameshift'), the stop position (red) and the NMD position
    (blue), and prints a short alignment of the altered versus reference
    protein sequence around the change.

    Side effects:
      * saves the figure to ``02_output_analysis/02_domain_figs/dom_<shortcut>.png``
      * writes the domains overlapping the affected stretch to
        ``02_output_analysis/03_domain_info/<prefix>_<shortcut>_domain_info.tsv``
      * prints a message and returns without plotting when ``ind`` is out of
        range or the variant is flagged ``'no_change'``.

    Reads the module-level ``nmd_df`` and ``prefix_variant_file``.

    Parameters
    ----------
    ind : int
        0-based row position in ``df_var`` (rows are read with ``.iloc``).
    df_var : pandas.DataFrame
        Variant table with the columns ``HUGO_Symbol``, ``shortcut``,
        ``prot_change_pos``, ``aa_next_pos``, ``termin_prot_len``,
        ``ref_protein_seq_len``, ``protein_seq`` and ``ref_protein_seq``.
    up_df : pandas.DataFrame
        Domain table with the columns ``HUGO_Symbol``, ``start_pos``,
        ``end_pos`` and ``info_domain``.

    Returns
    -------
    None
    """
    # Rows are accessed positionally, so validate ind as a position rather
    # than as an index label (the two only coincide for a default RangeIndex).
    if not 0 <= ind < len(df_var):
        print('index not in dataframe')
        return

    row = df_var.iloc[ind]
    prot = row['HUGO_Symbol']
    shortcut = row['shortcut']
    prot_change_pos = row['prot_change_pos']
    aa_next_pos = row['aa_next_pos']
    termin_pos = row['termin_prot_len']
    max_pos = row['ref_protein_seq_len']

    # Compare the value itself: comparing type(...) with a string is always
    # unequal, which made this branch unreachable.
    if prot_change_pos == 'no_change':
        print('no change in protein sequence')
        return

    ips_prot = up_df[up_df['HUGO_Symbol'] == prot]
    # .iloc[0] instead of int(<Series>), which pandas has deprecated.
    nmd_boundary = int(nmd_df.loc[nmd_df['gene'] == prot, 'NMD_protein'].iloc[0])

    # Sequence window around the change. The start is clamped at 0 because a
    # negative slice start would count from the end of the sequence.
    window_start = max(prot_change_pos - 10, 0)
    window_end = prot_change_pos + 9
    seq_alt = row['protein_seq'][window_start:window_end]
    seq_ref = row['ref_protein_seq'][window_start:window_end]
    seq_match = ''.join('|' if a == r else ' ' for a, r in zip(seq_alt, seq_ref))

    starts = ips_prot['start_pos']
    ends = ips_prot['end_pos']
    ids = ips_prot['info_domain']
    # One y level per domain; a gene without annotated domains gets no bars
    # instead of a ZeroDivisionError.
    ys = np.arange(0.01, 1, 1/len(ips_prot)) if len(ips_prot) else []

    # NOTE: the figure is deliberately left open so that the notebook can
    # render it inline; close figures explicitly when calling this in bulk.
    fig, ax1 = plt.subplots(figsize=(14, 6))

    ax1.set_ylim([0, 1])
    ax1.set_xlim([0, max_pos])
    ax1.get_yaxis().set_visible(False)
    ax1.set_xlabel('amino acid position', fontsize=12)
    ax1.xaxis.set_label_coords(0.5, -0.1)

    ax1.set_title('Protein domains of %s (%s)' %(prot, shortcut), fontsize = 18, fontweight = 'bold',
                  loc = 'center')

    # Domain bars with their labels to the right of the axis.
    for s, e, y, t in zip(starts, ends, ys, ids):
        ax1.add_patch(pplt.Rectangle((s, y), e-s, 0.01, fc = 'black', ec = 'black', alpha = 1))
        ax1.text(max_pos+5, y, t, size = 13, ha='left')

    # Position of the amino acid change.
    ax1.add_patch(pplt.Rectangle((prot_change_pos, 0), 0.01, 1, fc = 'green', ec = 'green', alpha = 1))
    ax1.text(x=prot_change_pos, y=-0.10, s='aa substitution', ha='center', c='green')

    # Affected stretch and stop marker; len_rel_for_dom is the length used
    # below to decide which domains count as affected.
    if aa_next_pos not in ['probably frameshift', 'error']:
        len_rel_for_dom = aa_next_pos-prot_change_pos
        ax1.add_patch(pplt.Rectangle((prot_change_pos, 0), len_rel_for_dom, 1,
                                     fc = 'green', ec = 'green', alpha = 0.5))
        stop_pos = termin_pos+(aa_next_pos-prot_change_pos)
        ax1.add_patch(pplt.Rectangle((stop_pos, 0), 0.01, 1, fc = 'red', ec = 'red', alpha = 1))
        ax1.text(x=stop_pos, y=-0.07, s='stop', ha='center', c='red')
    elif aa_next_pos == 'probably frameshift':
        len_rel_for_dom = termin_pos-prot_change_pos
        ax1.add_patch(pplt.Rectangle((prot_change_pos, 0), len_rel_for_dom, 1,
                                     fc = 'magenta', ec = 'magenta', alpha = 0.5))
        ax1.add_patch(pplt.Rectangle((termin_pos, 0), 0.01, 1, fc = 'red', ec = 'red', alpha = 1))
        ax1.text(x=termin_pos, y=-0.07, s='stop', ha='center', c='red')
    else:
        # aa_next_pos == 'error': no stretch and no stop marker are drawn.
        len_rel_for_dom = 1

    # NMD position and the three-line sequence alignment.
    ax1.add_patch(pplt.Rectangle((nmd_boundary, 0), 0.01, 1, fc = 'blue', ec = 'blue', alpha = 1))
    ax1.text(x=nmd_boundary, y=-0.13, s='NMD', ha='center', c='blue')
    ax1.text(x=1, y=0.9, s=seq_alt, size = 15, ha='left', family='monospace')
    ax1.text(x=1, y=0.85, s=seq_match, size = 15, ha='left', family='monospace')
    ax1.text(x=1, y=0.8, s=seq_ref, size = 15, ha='left', family='monospace')
    fig.subplots_adjust(bottom = 0.12, top = 0.94, left = 0.01, right = 0.6)
    fig.savefig('02_output_analysis/02_domain_figs/dom_%s.png'%shortcut)

    # A domain is affected if it contains the change position or lies
    # entirely inside the affected stretch.
    out_domain_file = '02_output_analysis/03_domain_info/'+prefix_variant_file+'_'+shortcut+'_domain_info.tsv'
    prot_doms_aff = []
    for s, e, d in zip(starts, ends, ids):
        if prot_change_pos >= s and prot_change_pos <= e:
            prot_doms_aff.append(d)
        elif s >= prot_change_pos and e <= prot_change_pos+len_rel_for_dom:
            prot_doms_aff.append(d)
    ips_prot[ips_prot['info_domain'].isin(prot_doms_aff)].to_csv(out_domain_file, sep='\t', index=False)

In [None]:
# Generate the domain figure and TSV for every variant; i is the 0-based row
# position passed to prot_domains.
for i in range(len(imp_vars)):
    prot_domains(i, imp_vars, up_df)

In [None]:
# Run prot_domains for the first variant only (row position 0).
prot_domains(0, imp_vars, up_df)