In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import sys

In [None]:
# Input
# Tab-separated homology table; read into `df` by the loading cell.
homology_file = "../data/output/homology/homology.tsv"

# Output
# Image file the histogram figure is saved to.
homology_plot = "../data/output/homology/homology.png"
# Tab-separated copy of `df` written after the 0/1 filter columns are added.
homology_table = "../data/output/homology/homology_filters.tsv"

In [None]:
# Load the tab-separated homology table. Cells containing the literal text
# "None" are parsed as missing values (NaN) instead of strings.
df = pd.read_csv(
    homology_file,
    sep='\t',
    na_values=['None'],
)
df

In [None]:
# Draw a 2x3 grid of sequence-identity histograms: one row per `db` group of
# `df`, one column per identity measure, each with a dashed red line at the
# group median and the bar counts written above the bars. The figure is
# saved to `homology_plot`.

# plt.rc changes matplotlib's global defaults, so this font size also applies
# to any figure created later in the same kernel.
font = {'size'   : 32}
plt.rc('font', **font)

# Identity columns to plot (one per grid column) and their display names.
columns = ['blast_id', 'local_id', 'global_id']
titles = ['BLAST', 'Local', 'Global']
# Display names for the grid rows. groupby() yields its keys in sorted order
# by default, so this list has to follow the sorted values of `db`.
dbs = ['DisProt-Old', 'PDB-Seqres']

# Bin edges every 5 units from 0 to 100; built once and shared by all panels.
bins = list(range(0, 101, 5))

fig, axes = plt.subplots(2, 3, figsize=(40, 20))

for i, (db, df_g) in enumerate(df.groupby("db")):

    row_axes = axes[i, :]
    df_g.hist(ax=row_axes, bins=bins, rwidth=0.9, column=columns)

    for ax, column, title in zip(row_axes, columns, titles):
        ax.set_ylabel("Alignments", labelpad=40, fontdict=font)
        ax.set_xlabel("Sequence identity (%)", labelpad=20, fontdict=font)
        ax.set_title("CAID3 Vs. {} - {}".format(dbs[i], title), pad=40)

        # Look the values up by column name rather than by position, so the
        # statistics always describe the column named in the printout and do
        # not depend on the column order of the input table.
        values = df_g[column]
        mean = values.mean()
        median = values.median()

        print(db,column,title, mean, median)

        ax.axvline(median, lw=4, color='red', linestyle='--')

        # Write each bar's count just above the bar.
        for p in ax.patches:
            ax.annotate(str(int(p.get_height())), (p.get_x() + p.get_width() / 2.0, p.get_height() + 1.0), ha='center', va='bottom', fontsize=18)

        # Add 10% headroom so the count labels of the tallest bars fit.
        _, top = ax.get_ylim()
        ax.set_ylim(0, top*1.1)


plt.tight_layout()
fig.savefig(homology_plot, bbox_inches='tight')


In [None]:
# Add one 0/1 column per (identity measure, database) pair, named
# no_<db>_<method>. A row gets 1 when its `db` equals that database and its
# identity for that measure is below the threshold; every other row gets 0.
# Rows whose identity is missing (NaN) compare False and therefore stay 0.
identity_threshold = 30.0

# Outer loop over measures, inner loop over databases: this keeps the new
# columns in the order blast (pdb, disprot), local (...), global (...).
for method in ('blast', 'local', 'global'):
    below_threshold = df['{}_id'.format(method)] < identity_threshold
    for db_name in ('pdb', 'disprot'):
        flag = 'no_{}_{}'.format(db_name, method)
        df[flag] = 0
        df.loc[(df['db'] == db_name) & below_threshold, flag] = 1

df.to_csv(homology_table, sep="\t", index=False)
df