# Purity Value Analysis

First select which data set you would like to analyze.

In [None]:
import ipywidgets as widgets
from glob import glob
from os.path import basename, splitext

# Every entry directly under PurityData/ is offered as a selectable data set.
# glob() yields entries in arbitrary, platform-dependent order, so they are
# sorted to make the dropdown listing reproducible from run to run.
datasets = sorted(basename(x) for x in glob("PurityData/*"))
w = widgets.Dropdown(
    options=datasets,
    description='Data Set:',
)
# The chosen entry is read back from `w.value` by the cells below.
display(w)

## Extract the Purity Values

In [None]:
# Freeze the dropdown selection; all paths below are built from `dataset`.
dataset = w.value

# Stems (file name without extension) of every YAML file in the data set,
# sorted by name. labels.yaml is read separately below for the legend labels
# and plot title, so it is excluded from the list of data files.
filelist = sorted(
    splitext(basename(f))[0]
    for f in glob("PurityData/"+dataset+"/*.yaml")
    if "labels.yaml" not in f
)

# `load` is kept in scope in case other cells of this notebook still call it.
from yaml import load, safe_load

# safe_load replaces the bare load(stream) call: calling yaml.load without a
# Loader is deprecated since PyYAML 5.1 and raises TypeError from PyYAML 6.0.
with open("PurityData/"+dataset+"/labels.yaml") as ifile:
    values = safe_load(ifile)

# Legend labels; the plotting cells index this list by position in `filelist`.
# NOTE(review): presumably listed in the same order as the sorted files - confirm.
label_list = [str(x) for x in values["data"]]
plot_title = values["title"]

In [None]:

from numpy import array

# Imported locally so this cell does not rely on a bare yaml.load(), which is
# deprecated since PyYAML 5.1 and raises TypeError from PyYAML 6.0.
from yaml import safe_load

# Progress bar advanced once per file read.
prog = widgets.IntProgress(
    max=len(filelist),
    description='Loading:',
)
display(prog)

# Parallel lists: key_values[i] is the file name whose fragment purities are
# stored, as a numpy array, in purity_values[i].
key_values = []
purity_values = []

for f in filelist:
    logfile = f+".yaml"
    key_values.append(logfile)
    with open("PurityData/"+dataset+"/"+logfile) as ifile:
        values = safe_load(ifile)
    # One purity indicator per entry of the "Fragment multipoles" list.
    purity_values.append(
        array([frag["Purity indicator"] for frag in values["Fragment multipoles"]])
    )
    prog.value += 1
prog.bar_style="success"
prog.description="Finished"

## Analysis

First, we'll plot the sorted absolute purity values, with one series for each file in the selected data set.

In [None]:
from matplotlib import pyplot as plt
fig, ax = plt.subplots(1,1)
ax.set_xlabel("Fragment")
ax.set_ylabel("Purity")

# Title with the data set that was actually loaded (`dataset`) rather than the
# live widget value, which may have been changed since the data were read.
plt.title("Purity Values for Data Set: "+ dataset)

# One series per loaded file: the magnitudes of its purity values in ascending
# order, so the x position is a rank rather than the original fragment index.
# key_values and purity_values are filled together, entry by entry.
for key, purities in zip(key_values, purity_values):
    plt.plot(sorted(abs(purities)), '.', label=key)

plt.legend(loc="best")
plt.margins(x=0.1, y=0.1)
plt.show()

Next, we'll break down the data into the protein, the water molecules, and the ions in solution.

In [None]:
# Imported locally so this cell does not rely on a bare yaml.load(), which is
# deprecated since PyYAML 5.1 and raises TypeError from PyYAML 6.0.
from yaml import safe_load

# Per-file purity arrays split by fragment category; index i of each list
# corresponds to filelist[i].
protein_purity_values = []
na_purity_values = []
cl_purity_values = []
h2o_purity_values = []

prog2 = widgets.IntProgress(
    max=len(filelist),
    description='Loading:',
)
display(prog2)

for stem in filelist:
    logfile = stem+".yaml"
    valh2o = []
    valprotein = []
    valcl = []
    valna = []
    with open("PurityData/"+dataset+"/"+logfile) as ifile:
        values = safe_load(ifile)
    for frag in values["Fragment multipoles"]:
        # NOTE(review): presumably the collection of atom names in the
        # fragment - confirm against the file format.
        name = frag["Atom names"]
        purity = frag["Purity indicator"]
        # Order matters: 'Na' and 'Cl' are tested before 'N' and 'C'. If
        # `name` is a plain string, 'N'/'C' would also match them as
        # substrings.
        if 'Na' in name:
            valna.append(purity)
        elif 'Cl' in name:
            valcl.append(purity)
        elif 'N' in name or 'C' in name:
            valprotein.append(purity)
        else:
            # Everything without Na, Cl, N or C is counted as water.
            valh2o.append(purity)
    h2o_purity_values.append(array(valh2o))
    protein_purity_values.append(array(valprotein))
    cl_purity_values.append(array(valcl))
    na_purity_values.append(array(valna))
    prog2.value += 1

prog2.bar_style="success"
prog2.description="Finished"

And now plot that data side by side.

In [None]:
import matplotlib.ticker as plticker
from numpy import nditer, abs
# 2x2 grid: protein and water on the top row, Na and Cl on the bottom row.
# The y axis is shared; each panel keeps its own x axis.
fig, ax = plt.subplots(2, 2, figsize=(8, 6), sharex=False, sharey=True)

# Common y-range taken from the magnitudes of all loaded purity values,
# widened by 10% at both ends.
magnitudes = [abs(x) for x in purity_values]
ymin = min(min(m) for m in magnitudes)/1.1
ymax = max(max(m) for m in magnitudes)*1.1

for axis in ax.reshape(-1):
    axis.set_ylim(ymin, ymax)
    axis.margins(x=0.1, y=0.1)
ax[0,0].set_ylabel("Purity Value", fontsize=10)
ax[1,0].set_ylabel("Purity Value", fontsize=10)

# (axes, x label, per-file purity arrays) for each panel.
panels = [
    (ax[0,0], "Protein Fragment", protein_purity_values),
    (ax[0,1], "H2O Fragment", h2o_purity_values),
    (ax[1,0], "Na Fragment", na_purity_values),
    (ax[1,1], "Cl Fragment", cl_purity_values),
]
for panel, xlabel, series in panels:
    panel.set_xlabel(xlabel, fontsize=10)
    # The index ties together the parallel lists: file i, its values, its label.
    for i in range(len(filelist)):
        panel.plot(abs(series[i]), 'x', label=label_list[i])

# Integer-spaced x ticks on the two ion panels. Each axis gets its own
# locator: a Locator instance keeps a reference to the axis it is installed
# on and must not be shared between axes.
ax[1,1].xaxis.set_major_locator(plticker.MultipleLocator(base=1.0))
ax[1,0].xaxis.set_major_locator(plticker.MultipleLocator(base=1.0))

# A single figure-level legend, built from one panel's handles since every
# panel plots the same labels.
handles, labels = ax[1,1].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', ncol=len(filelist))

# Leave room at the top of the figure for the legend.
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

fig.savefig("Pictures/"+dataset+".eps")

plt.show()

In [None]:
import matplotlib.ticker as plticker
from numpy import nditer, abs
# Single row of four panels (protein, water, Na, Cl) with unequal widths and
# a shared y axis.
fig, ax = plt.subplots(1, 4, figsize=(12, 4), sharex=False, sharey=True,
                       gridspec_kw={'width_ratios': [2, 3, 1.0, 1.5]})

# Common y-range taken from the magnitudes of all loaded purity values,
# widened by 10% at both ends.
magnitudes = [abs(x) for x in purity_values]
ymin = min(min(m) for m in magnitudes)/1.1
ymax = max(max(m) for m in magnitudes)*1.1

for axis in ax.reshape(-1):
    axis.set_ylim(ymin, ymax)
    axis.margins(x=0.1, y=0.1)
ax[0].set_ylabel("Purity Value", fontsize=14)
from numpy import arange

# Only the first six files are drawn here. A local slice is used so the
# notebook-wide `filelist` is not truncated for cells that are re-run later.
shown = filelist[0:6]
# One colour per drawn file, indexed by the file's position.
col = ["black",'b', 'g', 'r', 'c', 'm', 'y', 'k', 'b', 'g']

# (axes, x label, per-file purity arrays) for each panel.
panels = [
    (ax[0], "Protein Residues", protein_purity_values),
    (ax[1], "Water Molecules", h2o_purity_values),
    (ax[2], "Na Ions", na_purity_values),
    (ax[3], "Cl Ions", cl_purity_values),
]
for panel, xlabel, series in panels:
    panel.set_xlabel(xlabel, fontsize=14)
    # The index ties together the parallel lists: file i, its label, its colour.
    for i in range(len(shown)):
        panel.plot(abs(series[i]), 'x', label=label_list[i], c=col[i])

# Integer-spaced x ticks on the two ion panels. Each axis gets its own
# locator: a Locator instance keeps a reference to the axis it is installed
# on and must not be shared between axes.
ax[3].xaxis.set_major_locator(plticker.MultipleLocator(base=1.0))
ax[2].xaxis.set_major_locator(plticker.MultipleLocator(base=1.0))

# A single figure-level legend, titled with the title read from labels.yaml.
handles, labels = ax[1].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', ncol=len(shown), title=plot_title)

plt.subplots_adjust(wspace=0.05, hspace=0)
fig.savefig("Pictures/"+dataset+".eps", bbox_inches="tight")

plt.show()