# Statistical Model Construction
Optimizing BepiPred performance based on errors in predictions

In [5]:
import json
import os
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pandas as pd
import scipy.stats
import numpy as np
import utils

In [64]:
# Load the dataset from JSON. The path is relative to the directory the
# notebook is run from.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the file
# is decoded the same way regardless of the platform's locale default.
with open('../bepipred3.json', encoding='utf-8') as fi:
    data = json.load(fi)

In [65]:
# Pool the per-chain records into one entry per PDB id. Chains that share an
# id are concatenated in the order they occur in `data`, so every list inside
# an entry grows by the same number of items per chain (assuming 'preds',
# 'epitope' and 'seq' have equal length for a chain -- TODO confirm).
pdbs = {}

for chain in data:
    # 'desc' must consist of exactly three space-separated fields; anything
    # else makes the unpacking raise ValueError.
    pdbid, chainid, partition = chain['desc'].split(' ')
    preds = chain['preds']

    if pdbid in pdbs:
        entry = pdbs[pdbid]
    else:
        # Key order here determines the column order of the table built later.
        entry = {'PDB': [], 'predictions': [], 'targets': [], 'aminoacid': [], 'partition': []}
        pdbs[pdbid] = entry

    entry['predictions'] += preds
    entry['targets'] += chain['epitope']
    # Extending a list with the sequence adds one item per residue letter.
    entry['aminoacid'] += list(chain['seq'])
    # Repeat the chain-level partition and PDB id once per predicted position.
    entry['partition'] += [int(partition) for _ in preds]
    entry['PDB'] += [pdbid] * len(preds)


In [67]:
# Stack the per-PDB records into one long table (one row per list position),
# then correlate, across PDB ids, the summed 'targets' with the summed
# 'predictions'.

# Build every per-PDB frame first and concatenate once. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and calling it inside the loop
# re-copied the whole accumulated table on each iteration.
frames = []
for pdb in sorted(pdbs):
    dataframe = pd.DataFrame.from_dict(pdbs[pdb])
    frames.append(dataframe)

if frames:
    # ignore_index is left at its default (False) so each PDB keeps its own
    # 0..n-1 row labels, exactly as the append-based version produced.
    pdbs_dataframe = pd.concat(frames)
else:
    # pd.concat raises on an empty list; fall back to the same empty table
    # the original started from.
    pdbs_dataframe = pd.DataFrame.from_dict({'PDB': list(), 'predictions': list(),'targets': list(),'aminoacid':list(), 'partition': list() })

# Both sums come from the same grouping, so the two Series share one PDB index
# and line up element-wise for the correlation.
per_pdb = pdbs_dataframe.groupby('PDB')
epitopes = per_pdb['targets'].sum()
pred_sum = per_pdb['predictions'].sum()

# Left as the final bare expression so the notebook displays the result.
scipy.stats.pearsonr(epitopes, pred_sum)

(0.30025515170948264, 8.916592157958962e-05)