In [None]:
%load_ext autoreload
%autoreload 2

from IPython.display import display, Markdown

import sys, os
sys.path.insert(0, '../py_scripts')

import numpy as np
import scipy as sp
import pandas as pd
import numpy.linalg as la
import numpy.random as rand
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import scipy.stats as stats


import time
import pickle


import noise_models as noise
import model_fitting as fit
import fig_plot as fplot
import thermo_models as thermo



# Global seaborn/matplotlib theme applied to every figure in this notebook:
# 'talk' context, 'ticks' style, inward-pointing tick marks, 'cm' math font,
# and mathtext rendering without an external LaTeX install (text.usetex=False).
sns.set(context='talk', font_scale=1.0, color_codes=True, palette='deep', style='ticks', 
        rc={'mathtext.fontset': 'cm', 'xtick.direction': 'in','ytick.direction': 'in',
            'axes.linewidth': 1.5, 'figure.dpi':100, 'text.usetex':False})

# Load Data

In [None]:
seed = 42

print("Seed:", seed)

# seed numpy's legacy global RNG (numpy.random is imported as `rand`)
rand.seed(seed)


# name of dataset folder
# label = "21_10_15_medhighgating"
label = "22_05_05_twolayer"

components = ["phospho", "substrate", "kinase", 'pptase', 'kinase2', 'kinase2phospho']

# Placeholder antibody value for components that an experiment's model does not include.
# It is strictly positive, so such rows survive the "> 0" filter applied further down.
placeholder_anti = 1e-8

# The key files separate fields with commas surrounded by optional whitespace.
# The pattern must be a raw string: in a plain literal "\s" is an invalid escape
# sequence, which current Python versions warn about.
key_sep = r'\s*,\s*'

df_dataset_key = pd.read_csv("../data/"+label+"/dataset_key.csv", sep=key_sep, engine='python').set_index("exp_name")
# df_dataset_key = df_dataset_key.query("model!='pushpull'")
display(df_dataset_key)

df_MOCU_key = pd.read_csv("../data/"+label+"/MOCU_key.csv", sep=key_sep, engine='python').set_index("component")
display(df_MOCU_key)


# load datasets

# the only columns kept from each raw file: one "<component>_anti_exp" column per component
anti_cols = [comp + '_anti_exp' for comp in components]

df_list = []
for exp_name, row in df_dataset_key.iterrows():

    df = pd.read_csv("../data/{}/{}.csv".format(label, row['file_name']))
    # Resample the cells with replacement (same size, rows may repeat), seeded by `seed`.
    # NOTE(review): replace=True makes this a bootstrap resample rather than a plain
    # shuffle -- confirm that is intended.
    df = df.drop("Unnamed: 0", axis=1, errors='ignore').sample(frac=1.0, replace=True, random_state=seed).reset_index(drop=True)

    # map the experiment-specific column names (taken from the dataset key) onto standard names
    df = df.rename(columns={row['substrate_col']:'substrate_anti_exp',
                            row['phospho_col']:'phospho_anti_exp',
                            row['kinase_col']:'kinase_anti_exp'})

    if row['model'] == 'pushpull' or row['model'] == 'two_layer':
        df = df.rename(columns={row['pptase_col']:'pptase_anti_exp'})
    else:
        df['pptase_anti_exp'] = placeholder_anti

    if row['model'] == 'two_layer':
        df = df.rename(columns={row['kinase2_col']:'kinase2_anti_exp'})
        df = df.rename(columns={row['kinase2phospho_col']:'kinase2phospho_anti_exp'})
        # NOTE(review): the kinase2phospho readout is scaled by 1/3; the rationale for
        # this factor is not visible in this notebook -- TODO document it.
        df['kinase2phospho_anti_exp'] = 1/3 * df['kinase2phospho_anti_exp']
    else:
        df['kinase2_anti_exp'] = placeholder_anti
        df['kinase2phospho_anti_exp'] = placeholder_anti

    # Keep only the standardized antibody columns. `columns=` replaces the positional
    # axis argument of DataFrame.drop, which pandas 2.0 no longer accepts.
    df = df.drop(columns=df.columns.difference(anti_cols))

    df['exp_name'] = exp_name
    df.index.name = 'cell_index'

    df_list.append(df)

# dataframe containing all datasets, indexed by (exp_name, cell_index)
df_data = pd.concat(df_list) #.drop("Unnamed: 0", axis=1, errors='ignore')
df_data = df_data.reset_index().set_index(['cell_index', 'exp_name'])
df_data = df_data.reorder_levels(df_data.index.names[::-1])

# row counts: after concatenation, after dropping NaN rows, after the positivity filter
print(len(df_data.index))
df_data = df_data.dropna()
print(len(df_data.index))
# Keep rows in which every measurement is strictly positive. The explicit copy makes
# df_data an independent frame, so the column assignments in later cells do not
# trigger pandas' SettingWithCopyWarning.
df_data = df_data[(df_data > 0.0).all(axis=1)].copy()
print(len(df_data.index))


# setup noise model dictionary: one (initially empty) dict of models per component
noise_models = {c:dict() for c in components}
print(noise_models)


display(df_data)

# Plot raw data for each experiment

In [None]:
# Dispatch each experiment to the summary plot that matches its model type.
# Experiments whose model is in neither group are skipped without plotting.
push_like_models = ('push', "substrate_only", "non-pplatable")
pushpull_like_models = ('pushpull', 'two_layer')

for exp_name, row in df_dataset_key.iterrows():

    model_type = row['model']

    if model_type in push_like_models:
        fplot.plot_push_dataset_summary(df_data, exp_name)
    elif model_type in pushpull_like_models:
        fplot.plot_pushpull_dataset_summary(df_data, exp_name)
                

    

# Load noise model data and construct probability density functions

In [None]:
# points per bin
ppbin = 100

for c in components:

    # Build antibody and GFP densities for the control file of this component ('')
    # and for its matching empty control ('empty_'). Only cells in which both
    # readings are strictly positive are used.
    for prefix in ('', 'empty_'):
        key = prefix + c
        control = pd.read_csv("../data/{}/{}.csv".format(label, df_MOCU_key.loc[key, 'file_name']))
        anti_vals = control[df_MOCU_key.loc[key, 'anti_col_name']].values
        gfp_vals = control[df_MOCU_key.loc[key, 'GFP_col_name']].values
        both_positive = (anti_vals > 0.0) & (gfp_vals > 0.0)
        noise_models[c][prefix + 'anti'] = noise.Density(anti_vals[both_positive], ppbin=ppbin)
        noise_models[c][prefix + 'GFP'] = noise.Density(gfp_vals[both_positive], ppbin=ppbin)

    binrange = (0, 6)
    # histogram settings shared by all four curves
    hist_style = dict(binrange=binrange, log_scale=True, bins=64,
                      element='step', fill=False, stat='density')

    fig, axes = plt.subplots(1, 2, figsize=(12, 4), squeeze=False)

    fig.suptitle(c, y=1.05)

    # one panel per channel: (density key, x-axis label, control legend label, control color)
    panels = (
        ('anti', "Antibody", 'control w/ GFP', 'r'),
        ('GFP', "GFP", 'control', 'b'),
    )

    for col, (channel, xlabel, control_label, control_color) in enumerate(panels):
        ax = axes[0, col]

        sns.histplot(noise_models[c]['empty_' + channel].df['vals'], ax=ax,
                     label='empty control', color='g', **hist_style)
        sns.histplot(noise_models[c][channel].df['vals'], ax=ax,
                     label=control_label, color=control_color, **hist_style)

        ax.set_xscale('log')
        ax.set_xlabel(xlabel)

        # only the antibody panel carries a legend, placed just above the axes
        if channel == 'anti':
            ax.legend(loc='lower left', fontsize='xx-small', bbox_to_anchor=(0.0, 1.0), ncol=3)

    plt.show()
    

# Noise models for converting antibody measurements to MOCU (GFP units)

In [None]:
# points per bin
ppbin = 10

gaussian_cutoff_percentile = 0.99
empty_prior = 0.5


def build_conversion_models(models, src, dst, ppbin, empty_prob, cutoff_percent):
    """Build the src -> dst conditional noise models for one component.

    Parameters
    ----------
    models : dict
        Per-component model dictionary (an entry of ``noise_models``). It must already
        hold the densities ``src``, ``dst``, ``'empty_' + src`` and ``'empty_' + dst``.
    src, dst : str
        Channel names, ``'anti'`` or ``'GFP'``; the models convert from ``src`` to ``dst``.
    ppbin : int
        Points per bin, forwarded to ``noise.RandomConditionalNoise``.
    empty_prob : float
        Forwarded to ``noise.CompositeConditionalNoise`` as ``empty_prob``.
    cutoff_percent : float
        Forwarded to ``noise.CompositeConditionalNoise`` as ``cutoff_percent``.

    Returns
    -------
    The composite model, which is also stored under ``'composite_<src>2<dst>'``.
    The two intermediate models are stored under ``'empty_<src>2<dst>'`` and
    ``'nonempty_<src>2<dst>'``.
    """
    tag = src + '2' + dst

    models['empty_' + tag] = noise.RandomConditionalNoise(models['empty_' + src].get_data(),
                                                          models['empty_' + dst].get_data(),
                                                          ppbin=ppbin)

    models['nonempty_' + tag] = noise.RandomConditionalNoise(models[src].get_data(),
                                                             models[dst].get_data(),
                                                             ppbin=ppbin)

    models['composite_' + tag] = noise.CompositeConditionalNoise(models['empty_' + tag],
                                                                 models['nonempty_' + tag],
                                                                 empty_prob=empty_prob,
                                                                 cutoff_percent=cutoff_percent)
    return models['composite_' + tag]


def plot_conversion_model(model, title, in_label, out_label):
    """Show the three diagnostic panels of a composite conversion model.

    The panels are drawn by ``model.plot``, ``model.plot_composite`` and
    ``model.plot_conditional_prob`` (with a colorbar), left to right. They share both
    axes and use an equal aspect ratio. ``title`` is placed on the first panel;
    ``in_label`` and ``out_label`` name the x and y quantities.
    """
    fig = plt.figure(figsize=(14, 4))
    ax_left = fig.add_subplot(1, 3, 1)
    ax_mid = fig.add_subplot(1, 3, 2, sharex=ax_left, sharey=ax_left)
    ax_right = fig.add_subplot(1, 3, 3, sharex=ax_left, sharey=ax_left)

    panels = (
        (ax_left, model.plot, {}),
        (ax_mid, model.plot_composite, {}),
        (ax_right, model.plot_conditional_prob, {'cbar': True}),
    )

    for ax, draw, draw_kwargs in panels:
        draw(ax, **draw_kwargs)

        if ax is ax_left:
            ax.set_title(title)

        ax.set_ylabel('out:' + out_label)
        ax.set_xlabel('in:' + in_label)
        ax.set_aspect('equal')

    plt.show()


# Forward models (antibody -> GFP) for every component.
# An earlier variant used noise.CompositeConditionalNoiseNoEmpty (no empty-cell part)
# for every component except 'phospho'; all components now use the same composite model.
for c in components:
    composite = build_conversion_models(noise_models[c], 'anti', 'GFP', ppbin=ppbin,
                                        empty_prob=empty_prior,
                                        cutoff_percent=gaussian_cutoff_percentile)
    plot_conversion_model(composite, c, in_label='antibody', out_label='GFP')


# Reverse models (GFP -> antibody). They are built only for 'phospho' and
# 'kinase2phospho', the two components whose GFP values are converted back to
# antibody units later in this notebook.
for c in ('phospho', 'kinase2phospho'):
    composite = build_conversion_models(noise_models[c], 'GFP', 'anti', ppbin=ppbin,
                                        empty_prob=empty_prior,
                                        cutoff_percent=gaussian_cutoff_percentile)
    plot_conversion_model(composite, c, in_label='GFP', out_label='antibody')



# Transform to MOCU

In [None]:
# Pre-create the output columns so that every row has a value (0.0) even for
# components that a given experiment did not measure.
for c in components:
    df_data[c+'_GFP_infer'] = 0.0

    if c == 'phospho':
        # NOTE(review): this column is initialized here but never filled in this notebook.
        df_data[c+'_frac_infer'] = 0.0


for exp_name, row in df_dataset_key.iterrows():

    print(exp_name)

    # rows of this experiment that contain no NaN
    df_tmp = df_data.query("exp_name==@exp_name").dropna()

    for c in components:

        # A blank "<component>_col" entry in the dataset key is read as NaN and means
        # the experiment has no measurement for this component: leave its column at 0.0.
        if pd.isna(row[c+'_col']):
            continue

        anti = df_data.loc[df_tmp.index, c+'_anti_exp']

        # convert antibody readings to GFP units with the component's composite noise model
        df_data.loc[df_tmp.index, c+'_GFP_infer'] = noise_models[c]['composite_anti2GFP'].transform(anti)

display(df_data)

# Save Inferred MOCU Values

In [None]:
# Checkpoint the measurements together with the inferred *_GFP_infer columns.
# The last cell of this notebook writes to this same model_predictions.csv path again.
df_data.to_csv("../data/"+label+"/model_predictions.csv", sep=',')

In [None]:

# Histogram settings shared by every curve in this cell.
binrange = (0, 6)
hist_kwargs = dict(binrange=binrange, log_scale=True, bins=64, stat='density')
# experiment / inferred curves are drawn as unfilled step outlines on top of the controls
outline_kwargs = dict(element='step', fill=False, **hist_kwargs)

for exp_name, row in df_dataset_key.iterrows():

    # rows of this experiment that contain no NaN
    df_tmp = df_data.query("exp_name==@exp_name").dropna()

    for c in components:

        # A blank "<component>_col" entry in the dataset key is read as NaN and means
        # the experiment has no measurement for this component: nothing to plot.
        if pd.isna(row[c+'_col']):
            continue

        models = noise_models[c]

        fig, axes = plt.subplots(1, 2, figsize=(12, 4), squeeze=False)

        fig.suptitle(exp_name+": " + c)

        # left panel: antibody distributions of the controls and of the experiment
        ax = axes[0, 0]

        sns.histplot(models['empty_anti'].df['vals'], ax=ax,
                     label='empty control', color='g', **hist_kwargs)
        sns.histplot(models['anti'].df['vals'], ax=ax,
                     label='active control', color='r', **hist_kwargs)

        sns.histplot(df_data.loc[df_tmp.index, c+'_anti_exp'], ax=ax,
                     label='experiment', color='k', **outline_kwargs)

        # dashed lines at the low/high cutoffs of the composite antibody -> GFP model
        ax.vlines([models['composite_anti2GFP'].low_cutoff,
                   models['composite_anti2GFP'].high_cutoff], ymin=0, ymax=1.5, color='k', ls='--')

        # Round-trip check for the components that have a reverse model:
        # map the inferred GFP values back to antibody units and overlay them.
        if c == 'phospho' or c == 'kinase2phospho':
            GFP_infer = df_data.loc[df_tmp.index, c+'_GFP_infer']
            anti_check = models['composite_GFP2anti'].transform(GFP_infer)

            sns.histplot(anti_check, ax=ax,
                         label='backwards conversion check', color='b', **outline_kwargs)

        ax.set_xscale('log')
        ax.set_xlabel("Antibody")

        ax.legend(loc='upper right', fontsize='xx-small', title=c)

        # right panel: GFP distributions of the controls and the inferred GFP values
        ax = axes[0, 1]

        sns.histplot(models['empty_GFP'].df['vals'], ax=ax,
                     label='empty control', color='g', **hist_kwargs)
        sns.histplot(models['GFP'].df['vals'], ax=ax,
                     label='active control', color='b', **hist_kwargs)
        sns.histplot(df_tmp[c+"_GFP_infer"], ax=ax,
                     label='inferred GFP', color='k', **outline_kwargs)

        ax.set_xscale('log')
        ax.set_xlabel("GFP")

        ax.legend(loc='upper right', fontsize='xx-small', title=c)

        plt.tight_layout()
        plt.show()
        

                

    

In [None]:

# For every experiment, show phospho against substrate as a 2-D log-log histogram:
# raw antibody values on the left, inferred GFP values on the right.
for exp_name, row in df_dataset_key.iterrows():

    df_tmp = df_data.query("exp_name==@exp_name").dropna()

    fig, axes = plt.subplots(1,2, constrained_layout=True, figsize=(8, 5), squeeze=False)

    for ax, suffix in zip(axes[0], ('anti_exp', 'GFP_infer')):
        sns.histplot(df_tmp, x='substrate_' + suffix, y='phospho_' + suffix,
                     ax=ax, log_scale=(True, True))
        # dashed guide line y = 3x
        ax.plot([1e0, 1e5], [1e0*3, 1e5*3], 'k--')

    fig.suptitle(exp_name)

    plt.show()
        
                

    

# Load prefit parameters

In [None]:
# Reuse previously fitted parameters when a parameter file exists. A missing or
# empty file means "no previous fit" and leaves df_params as None. Any other
# failure (for example a malformed file) is allowed to surface instead of being
# silently swallowed by a bare except.
try:
    df_params = pd.read_csv("../data/"+label+"/model_params.csv", sep=',', engine='python')
except (FileNotFoundError, pd.errors.EmptyDataError):
    df_params = None

# Uncomment this to overwrite all previous fits
# df_params = None

display(df_params)

# Fit all datasets simultaneously (except non-pplatable and two layer)

In [None]:
# Joint fit over every dataset whose model is neither 'non-pplatable' nor 'two_layer'.
# NOTE(review): this cell unpacks two return values from fit.fit, whereas the per-model
# fit cells later in the notebook assign its result to df_params directly -- confirm
# which form matches fit.fit's actual return value.
joint_selection = "model!='non-pplatable' and model!='two_layer'"

df_params, variance = fit.fit(df_dataset_key.query(joint_selection), df_data, df_params=df_params)

df_params = fit.calc_error(df_dataset_key.query(joint_selection), df_data, df_params=df_params)

display(df_params)

# Save model parameters


In [None]:
# Persist the current parameter table (without the row index) to the same
# model_params.csv path that the "Load prefit parameters" cell reads.
df_params.to_csv("../data/"+label+"/model_params.csv", sep=',', index=False)

# Fit substrate only data sets

In [None]:
# Fit, then run fit.calc_error, on the 'substrate_only' datasets only.
# The current df_params is passed in and replaced by each call's result.
substrate_only_selection = "model=='substrate_only'"

df_params = fit.fit(df_dataset_key.query(substrate_only_selection), df_data, df_params=df_params)

df_params = fit.calc_error(df_dataset_key.query(substrate_only_selection), df_data, df_params=df_params)

display(df_params)

# Save model parameters


In [None]:
# Persist the parameter table after the substrate-only fit (row index omitted).
df_params.to_csv("../data/"+label+"/model_params.csv", sep=',', index=False)

# Fit push data sets

In [None]:
# Fit, then run fit.calc_error, on the 'push' datasets only.
# The current df_params is passed in and replaced by each call's result.
push_selection = "model=='push'"

df_params = fit.fit(df_dataset_key.query(push_selection), df_data, df_params=df_params)

df_params = fit.calc_error(df_dataset_key.query(push_selection), df_data, df_params=df_params)

display(df_params)

# Save model parameters


In [None]:
# Persist the parameter table after the push fit (row index omitted).
df_params.to_csv("../data/"+label+"/model_params.csv", sep=',', index=False)

# Fit push-pull data sets

In [None]:
# Fit, then run fit.calc_error, on the 'pushpull' datasets only.
# The current df_params is passed in and replaced by each call's result.
pushpull_selection = "model=='pushpull'"

df_params = fit.fit(df_dataset_key.query(pushpull_selection), df_data, df_params=df_params)

df_params = fit.calc_error(df_dataset_key.query(pushpull_selection), df_data, df_params=df_params)

display(df_params)

# Save model parameters


In [None]:
# Persist the parameter table after the push-pull fit (row index omitted).
df_params.to_csv("../data/"+label+"/model_params.csv", sep=',', index=False)

# Fit two layer data sets

In [None]:
# Fit, then run fit.calc_error, on the 'two_layer' datasets only.
# The current df_params is passed in and replaced by each call's result.
two_layer_selection = "model=='two_layer'"

df_params = fit.fit(df_dataset_key.query(two_layer_selection), df_data, df_params=df_params)

df_params = fit.calc_error(df_dataset_key.query(two_layer_selection), df_data, df_params=df_params)

display(df_params)

# Save model parameters


In [None]:
# Persist the parameter table after the two-layer fit (row index omitted).
df_params.to_csv("../data/"+label+"/model_params.csv", sep=',', index=False)

# Calculate model predictions

In [None]:
# Rebuild the parameter layout for all datasets, then fill a parameter vector
# (same shape as the initial guess x0) from the fitted values stored in df_params.
prefit_params, param_to_index, dataset_to_params, x0, bounds = fit.setup_model_params(df_dataset_key, df_params=df_params)

x = np.zeros_like(x0)
for p in param_to_index:
    # first row of df_params whose 'name' equals this parameter
    fitted_row = df_params.query("name==@p").iloc[0]
    slot = param_to_index[p]
    x[slot] = fitted_row['val']

args = (df_dataset_key, df_data, prefit_params, param_to_index, dataset_to_params)

# presumably adds the *_GFP_predict columns to df_data in place -- the next cell
# reads them and nothing else in this notebook creates them
fit.predict(x, args, df_data)

display(df_data)

# row counts before and after discarding rows that contain any NaN
print(len(df_data))
print(len(df_data.dropna()))

# Convert predicted MOCU values to antibody values

In [None]:
# The two components that have a reverse (GFP -> antibody) noise model.
reverse_components = ('phospho', 'kinase2phospho')

# start every row at 0.0; rows that are dropped below keep this value
for comp in reverse_components:
    df_data[comp + '_anti_predict'] = 0.0

for exp_name, row in df_dataset_key.iterrows():

    # rows of this experiment that contain no NaN
    df_tmp = df_data.query("exp_name==@exp_name").dropna()

    for comp in reverse_components:
        predicted_gfp = df_data.loc[df_tmp.index, comp + '_GFP_predict']
        reverse_model = noise_models[comp]['composite_GFP2anti']
        df_data.loc[df_tmp.index, comp + '_anti_predict'] = reverse_model.transform(predicted_gfp)


display(df_data)

# Save predictions

In [None]:
# Write df_data, now including the *_GFP_predict and *_anti_predict columns, to the
# dataset folder. This overwrites the checkpoint that the "Save Inferred MOCU Values"
# cell wrote to the same path.
df_data.to_csv("../data/"+label+"/model_predictions.csv", sep=',')