# Parsing multiple KinExA activity logs.
The KinExA software can export the analysis of a completed experiment as a TSV file.  This script operates on a master folder, which itself contains sub-folders, each holding several TSV files.  Within each of these sub-folders, the script creates a `Results` folder containing an Excel file with the relevant values from all of that sub-folder's TSV files, plus graphical summaries of the results (one figure per nCurve and one combined summary graph).

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import matplotlib as mpl
import glob
import os
import math
import seaborn as sns
from matplotlib import gridspec

# Apply seaborn's default theme, then switch to the "ticks" style for all figures.
sns.set()
sns.set_style("ticks")

# We'll make sure to display the full dataframe, so we're not confused about its nature and contents.
# (No line-wrapping of wide frames, and no cap on the number of rows printed.)
pd.set_option('display.expand_frame_repr', False)
pd.set_option("display.max_rows", None)

# Make the data folder the working directory for the rest of the session.
# NOTE(review): this is a machine-specific absolute path; it must be edited
# before running the notebook anywhere else.
os.chdir("/Users/DanielMA/Root/Hinge/Data analysis/Sandbox")







def parse_nCurve_TSV(path_and_name_of_nCurve_TSV):
    """Parse one exported nCurve analysis (TSV) and its component experiment files.

    The function relies on a fixed column layout in the exported file:
    columns 26-33 hold the per-curve summary, and the per-curve data points
    start at column 36 in groups of six columns per curve.  It also expects
    a folder named ``Experiments`` next to the nCurve file, containing one
    ``<Name>.tsv`` file per component curve (``Name`` being the value in the
    curve summary's 'Name' column).

    Parameters
    ----------
    path_and_name_of_nCurve_TSV : str
        Path (absolute or relative) of the nCurve TSV file.

    Returns
    -------
    tuple
        ``(master, summary, curve_master, curve_summary)`` where

        master : DataFrame
            All the information from the file, plus 'File Name' and
            'Number of curves' columns.
        summary : Series
            The summary values relevant to the whole file (first row of
            ``master``), plus a 'CBP list' entry holding the CBP molecular
            concentration of every component curve.
        curve_master : DataFrame
            All data points for graphing the individual curves, concatenated
            with the curve number (0, 1, ...) as the outer index level.
        curve_summary : DataFrame
            The summary columns relevant to the curves in this nCurve,
            extended with values read from each component experiment file.

    Raises
    ------
    FileNotFoundError
        If the nCurve file or one of the component experiment files is missing.
    KeyError
        If an expected column is absent from one of the files.
    """
    nCurve_path = path_and_name_of_nCurve_TSV

    # Master
    master = pd.read_csv(nCurve_path, index_col=None, header=0, sep='\t')
    master['File Name'] = os.path.basename(nCurve_path)
    # Column 26 is the first curve-summary column; it has one non-empty
    # cell per component curve.
    number_of_curves = master.iloc[:, 26].count()
    master['Number of curves'] = number_of_curves

    # curve_master
    # Work on copies so the in-place clean-up below never touches (or
    # depends on) `master` itself.
    first_curve = master.iloc[:, 36:42].copy()
    first_curve.dropna(axis='index', how='all', inplace=True)  # Drop rows that are entirely NaN.
    curve_list = [first_curve]
    proper_headers = first_curve.columns.values.tolist()

    for i in range(1, number_of_curves):
        n = i * 6

        current_curve = master.iloc[:, 36 + n:42 + n].copy()  # Next set of data
        # Later curves carry differently named headers; rename them so that
        # they match the first set.
        column_dict = dict(zip(current_curve.columns.values.tolist(), proper_headers))
        # NOTE(review): index=str also converts the row labels of these
        # curves to strings (the first curve keeps integer labels).  Kept
        # unchanged so the returned index is the same as before.
        current_curve.rename(index=str, columns=column_dict, inplace=True)
        current_curve.dropna(axis='index', how='all', inplace=True)  # Drop rows that are entirely NaN.
        curve_list.append(current_curve)

    curve_master = pd.concat(curve_list, keys=range(number_of_curves))

    # curve_summary
    # These columns always contain the component curve summaries.
    curve_summary = master.iloc[:number_of_curves, 26:34].copy()
    curve_summary.drop(['ABC', 'Drift (%/run)'], axis=1, inplace=True)

    # os.path.join keeps a bare file name (empty dirname) relative to the
    # current directory instead of pointing at "/Experiments/".
    experiment_folder = os.path.join(os.path.dirname(nCurve_path), "Experiments")
    component_curve_names = curve_summary['Name']

    header_list = ['Kd', 'Kd High', 'Kd Low', '%Error', 'CBP Molecular Concentration',
                   'CBP Binding Site Concentration', 'CBP %Activity', 'Comments']
    collected = {header: [] for header in header_list}

    for name in component_curve_names:
        component_path = os.path.join(experiment_folder, name + ".tsv")
        df = pd.read_csv(component_path, index_col=None, header=0, sep='\t')

        # Only the first row of each component file is used.
        for header in header_list:
            collected[header].append(df[header].iloc[0])

    for header in header_list:
        curve_summary[header] = collected[header]

    # Summary
    summary = master.loc[0, ['File Name', 'Experiment Name', 'Number of curves', 'Kd',
                             'Kd High', 'Kd Low', '%Error', 'CBP %Activity', 'Comments']].copy()
    summary['CBP list'] = curve_summary['CBP Molecular Concentration'].values.tolist()

    return (master, summary, curve_master, curve_summary)


    
    
    
    
    
    
def plot_nCurve(summary, curve_master, curve_summary):
    """Draw the three-panel figure for one nCurve on a fresh matplotlib figure.

    Takes three of the outputs of parse_nCurve_TSV and produces:

    1. a semi-log plot of every component curve (theory line plus the
       calculated data points),
    2. a horizontal bar spanning 'Kd Low' to 'Kd High' on a log axis, with a
       black line at the Kd and one coloured line per component curve at its
       CBP molecular concentration,
    3. a table listing the individual experiments bundled into this nCurve.

    All open figures are closed first; the new figure is left as the current
    figure so the caller can save or show it.  Nothing is returned.

    Parameters
    ----------
    summary : Series
        Whole-file summary; reads 'File Name', 'Kd', 'Kd Low', 'Kd High',
        '%Error' and 'CBP %Activity'.
    curve_master : DataFrame
        Data points of every curve, with the curve number as outer index.
    curve_summary : DataFrame
        One row per component curve; reads 'Name', 'Ratio', '%Error',
        'CBP Molecular Concentration' and 'CBP %Activity'.
    """
    import itertools  # Local import: only needed to cycle the colour palette.

    plt.close('all')

    sns.set(font_scale=2)
    sns.set_style("ticks")
    color_swatch = sns.color_palette('deep')

    fig = plt.figure(figsize=(13, 18))
    gs = gridspec.GridSpec(3, 1, height_ratios=[4, 1, 2])

    # First, the plot of component curves.
    ax1 = plt.subplot(gs[0, 0])
    ax1.set_xscale('log')
    ax1.set_xlim(1e-13, 1e-6)
    ax1.set_xticks([1e-12, 1e-9, 1e-6])
    ax1.set_xticklabels(['1 pM', '1 nM', '1 µM'])
    ax1.set_ylim(-10, 110)
    # splitext drops the extension whatever its length.
    ax1.set_title('nCurve: ' + os.path.splitext(summary['File Name'])[0])

    legend_labels = curve_summary['Name'].values.tolist()
    theory_lines = []

    for i in range(len(curve_summary)):
        curve = curve_master.loc[i]
        non_empty = curve['Calculated Concentration'] > 0  # Removes NaN cells, Sig100, and NSB.
        calculated_x = curve['Calculated Concentration'][non_empty]
        calculated_y = curve['Calculated % Free'][non_empty]

        theory_x = curve['Theory Curve Concentration']
        theory_y = curve['Theory Curve % Free']

        line, = ax1.plot(theory_x, theory_y, linestyle='-', linewidth=2)
        theory_lines.append(line)
        ax1.scatter(calculated_x, calculated_y)

    # Bind each label to its theory line explicitly, so the legend does not
    # depend on the order in which matplotlib enumerates lines and scatters.
    ax1.legend(theory_lines, legend_labels, loc=3)

    # Second, set up a bar plot of the overall results.
    ax2 = plt.subplot(gs[1, 0])
    bar_width = summary.loc['Kd High'] - summary.loc['Kd Low']
    ax2.barh(0, bar_width, left=summary.loc['Kd Low'], align='center',
             edgecolor='k', linewidth=1, color='#87ae73')

    # Black line marks the Kd prediction.  Coloured lines mark the CBP
    # concentration for component runs.
    ax2.vlines(summary.loc['Kd'], -0.5, 0.5, linewidth=5, color='k')

    CBP_list = curve_summary['CBP Molecular Concentration']

    # Cycle the palette so every component run gets a line, even when
    # there are more runs than colours.
    for CBP, hue in zip(CBP_list, itertools.cycle(color_swatch)):
        ax2.vlines(CBP, -0.5, 0.5, linewidth=5, color=hue)

    ax2.set_ylim(-0.5, 0.5)
    ax2.set_xscale('log')

    ax2.grid(which="major", axis='x', color='k', linestyle='--', linewidth=0.5)
    ax2.grid(which="minor", axis='x', color='b', linestyle='-', linewidth=0.1)

    ax2.set_xticks([1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8])
    ax2.set_xticklabels(['100 fM', '1 pM', '10 pM', '100 pM', '1 nM', '10 nM'])
    # Booleans, not the string 'off' (which is truthy and would leave the
    # ticks switched on).
    ax2.tick_params(axis='y', length=0, left=False, labelleft=False)

    # Kd values are shown in pM (value * 1e12).
    box_kd = "{0:.2f}".format(round(summary['Kd']*1e12, 2))
    box_kd_lo = "{0:.2f}".format(round(summary['Kd Low']*1e12, 2))
    box_kd_hi = "{0:.2f}".format(round(summary['Kd High']*1e12, 2))
    error_ncurve = "{0:.2f}".format(summary['%Error'])
    box_cbp = "{0:.1f}".format(summary['CBP %Activity'])
    box_text = ('Kd: ' + box_kd + " pM (95% CI: " + box_kd_lo + " - " + box_kd_hi +
                ").        Error: " + error_ncurve + "%" +
                '        CBP Activity: ' + box_cbp + "%")

    ax2.set_title(box_text)

    # Third, create a table with relevant information.
    ax3 = plt.subplot(gs[2, 0])
    col_labels = ['File Name', 'Ratio', '% Error', 'CBP Molecular Concentration', 'CBP %Activity']

    file_names = curve_summary['Name'].values.tolist()
    ratio = ["{0:.3f}".format(round(x, 3)) for x in curve_summary['Ratio'].values]
    error = ["{0:.2f}".format(x) for x in curve_summary['%Error'].values]
    cbp_mol = ["{0:.0f}".format(round(x*1e12, 0)) + ' pM'
               for x in curve_summary['CBP Molecular Concentration'].values]
    cbp_percent = ["{0:.0f}".format(round(x, 0)) for x in curve_summary['CBP %Activity'].values]

    # One table row per component curve.
    cell_text = [list(row) for row in zip(file_names, ratio, error, cbp_mol, cbp_percent)]

    # cellLoc/colLoc centre the body and header text through the public API.
    the_table = ax3.table(cellText=cell_text, loc='upper center',
                          colWidths=[0.12, 0.03, 0.03, 0.10, 0.06],
                          colLabels=col_labels, colLoc='center', cellLoc='center')
    the_table.scale(3, 3)

    the_table.auto_set_font_size(False)
    the_table.set_fontsize(16)

    ax3.set_title('Individual experiments bundled into this nCurve')
    # The table axis is only a canvas: hide every tick, label and spine.
    ax3.set_xticks([])
    ax3.set_xticklabels([])
    ax3.set_yticks([])
    ax3.set_yticklabels([])
    ax3.tick_params(axis='both', length=0, left=False, labelleft=False)
    sns.despine(ax=ax3, top=True, right=True, left=True, bottom=True)

    gs.tight_layout(fig, h_pad=3.0)

    
    
    
def plot_summary(summary_of_summaries):
    """Plot the most important bits of several nCurves together, for ready comparison.

    One horizontal panel is drawn per nCurve (per column of the input).  Each
    panel shows a bar spanning 'Kd Low' to 'Kd High' on a log axis, a black
    line at the Kd, and one coloured line per entry in that nCurve's
    'CBP list'.  The panel title carries the nCurve name and its numbers.

    All open figures are closed first; the new figure is left as the current
    figure so the caller can save or show it.  Nothing is returned.

    Parameters
    ----------
    summary_of_summaries : DataFrame
        One column per nCurve (the column label is the file name), with rows
        labelled 'Kd', 'Kd High', 'Kd Low', '%Error', 'CBP %Activity' and
        'CBP list'.
    """
    import itertools  # Local import: only needed to cycle the colour palette.

    plt.close('all')

    sns.set(font_scale=1.5)
    sns.set_style("ticks")
    color_swatch = sns.color_palette('deep')

    fig = plt.figure(figsize=(13, 18))
    number_of_ncurves = len(summary_of_summaries.columns)
    gs = gridspec.GridSpec(number_of_ncurves, 1)

    for i in range(number_of_ncurves):
        ax = plt.subplot(gs[i, 0])

        # Select this nCurve's column once, by position; the values are then
        # read by label instead of relying on integer fallback indexing.
        ncurve = summary_of_summaries.iloc[:, i]
        ncurve_name = os.path.splitext(summary_of_summaries.columns[i])[0]

        bar_width = ncurve['Kd High'] - ncurve['Kd Low']
        ax.barh(0, bar_width, left=ncurve['Kd Low'], align='center', edgecolor='k',
                linewidth=1, color='#87ae73')

        # Black line marks the Kd prediction.  Coloured lines mark the CBP
        # concentration for component runs.
        ax.vlines(ncurve['Kd'], -0.5, 0.5, linewidth=5, color='k')

        # Cycle the palette so there is always a colour for every run.
        for CBP, hue in zip(ncurve['CBP list'], itertools.cycle(color_swatch)):
            ax.vlines(CBP, -0.5, 0.5, linewidth=5, color=hue)

        ax.set_ylim(-0.5, 0.5)
        ax.set_xscale('log')

        ax.grid(which="major", axis='x', color='k', linestyle='--', linewidth=0.5)
        ax.grid(which="minor", axis='x', color='b', linestyle='-', linewidth=0.2)

        ax.set_xticks([1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8])
        ax.set_xticklabels(['100 fM', '1 pM', '10 pM', '100 pM', '1 nM', '10 nM'])
        # Booleans, not the string 'off' (which is truthy and would leave the
        # ticks switched on).
        ax.tick_params(axis='y', length=0, left=False, labelleft=False)

        # Kd values are shown in pM (value * 1e12).
        box_kd = "{0:.2f}".format(round(ncurve['Kd']*1e12, 2))
        box_kd_lo = "{0:.2f}".format(round(ncurve['Kd Low']*1e12, 2))
        box_kd_hi = "{0:.2f}".format(round(ncurve['Kd High']*1e12, 2))
        error_ncurve = "{0:.2f}".format(ncurve['%Error'])
        box_cbp = "{0:.1f}".format(ncurve['CBP %Activity'])

        ax.set_title('nCurve: ' + ncurve_name + '        Kd: ' +
                     box_kd + " pM (95% CI: " + box_kd_lo + " - " + box_kd_hi +
                     ").    Error: " + error_ncurve + "%" +
                     '    CBP Activity: ' + box_cbp + "%")

    gs.tight_layout(fig, h_pad=1.0)
    
    
    
    
def process_nCurves_in_folder(folder_path):
    """Parse all the nCurves in a folder and export an Excel file and figures.

    Every ``*.tsv`` file directly inside ``folder_path`` is parsed with
    parse_nCurve_TSV.  The outputs are written to a ``Results`` sub-folder of
    ``folder_path`` (created if needed):

    * ``Summary Table.xlsx`` - the summaries of all files, one column per file,
    * ``Summary Graph.png``  - the figure drawn by plot_summary,
    * ``<file name>.png``    - one figure per nCurve, drawn by plot_nCurve.

    Progress messages are printed along the way.  Nothing is returned.

    Parameters
    ----------
    folder_path : str
        Folder containing the nCurve TSV files.
    """
    print('Now processing Excel summary')
    # glob does not return the files in alphabetical order, so sort them.
    file_paths = sorted(glob.glob(os.path.join(folder_path, "*.tsv")))
    if not file_paths:
        # Nothing to concatenate or plot; stop here with a clear message
        # instead of failing later inside pd.concat.
        print('No TSV files found in: ' + folder_path)
        return
    file_names = [os.path.basename(x) for x in file_paths]

    # Parse every file once; the results feed both the summary table and
    # the per-file figures below.
    parsed = [parse_nCurve_TSV(path) for path in file_paths]

    frames = [result[1] for result in parsed]  # Collect the summary from each.
    summary_of_summaries = pd.concat(frames, axis=1, keys=file_names)

    results_folder = os.path.join(folder_path, 'Results')
    os.makedirs(results_folder, exist_ok=True)

    # The context manager saves and closes the workbook on exit.
    with pd.ExcelWriter(os.path.join(results_folder, 'Summary Table.xlsx')) as writer:
        summary_of_summaries.to_excel(writer, sheet_name='Sheet1')

    print('Now processing Summary Graph')
    plot_summary(summary_of_summaries)
    plt.savefig(os.path.join(results_folder, 'Summary Graph.png'), bbox_inches='tight')

    for path, (master, summary, curve_master, curve_summary) in zip(file_paths, parsed):
        plot_nCurve(summary, curve_master, curve_summary)
        this_file_name = os.path.splitext(os.path.basename(path))[0]
        plt.savefig(os.path.join(results_folder, this_file_name + '.png'), bbox_inches='tight')
        print('Now processing figure: ' + this_file_name)

    print('Done!')

In [8]:
# Test Cell 2

# For one folder with many nCurves: parse every TSV file in the folder and
# write the Excel summary and all figures into its "Results" sub-folder.
# NOTE(review): machine-specific absolute path; edit before running elsewhere.
folder_path = "/Users/DanielMA/Root/Hinge/Data analysis/Sandbox"
process_nCurves_in_folder(folder_path)

Now processing Excel summary
Now processing Summary Graph
Now processing figure: Fab_K0
Now processing figure: Fab_K1
Now processing figure: Fab_O0
Now processing figure: Fab_O1
Now processing figure: K0
Now processing figure: K1
Now processing figure: O0
Now processing figure: O1
Now processing figure: Y0
Now processing figure: Y1
Now processing figure: Y2
Now processing figure: Y4
Done!


In [3]:
# # Test Cell 3 (creates a summary of summaries)

# folder_path = "/Users/DanielMA/Root/Hinge/Data analysis/Sandbox"

# file_paths = glob.glob(folder_path+"/*.tsv")  # Parse all the TSV files in this folder into a list.
# file_paths.sort()  # Sort them, because the list of files out of glob.glob isn't alphabetical.
# file_names = [os.path.basename(x) for x in file_paths]

# frames = [ parse_nCurve_TSV(path)[1] for path in file_paths ] # Collect the summary from each.
# summary_of_summaries = pd.concat(frames, axis=1, keys=file_names) 



In [None]:
# Test Cell 1

# # For one nCurve
# path = "/Users/DanielMA/Root/Hinge/Data analysis/Sandbox/Y2_all.tsv"
# master, summary, curve_master, curve_summary = parse_nCurve_TSV(path)
# plot_nCurve(summary, curve_master, curve_summary)
# # plt.show()