In [None]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore", "use_inf_as_na")

from IPython.display import display

import sys
sys.path.insert(0, '../src')

import numpy as np
import pandas as pd
import numpy.random as rand
import matplotlib.pyplot as plt
import seaborn as sns


import pickle


import noise_models as noise
import model_fitting as fit
import fig_plot as fplot
import thermo_models as thermo

In [None]:
# Seed the global NumPy RNG; the same seed is also used for the row shuffle below.
seed = 42

print("Seed:", seed)

rand.seed(seed)


# Name of dataset folder
label = "220520_pushpull"

components = ["phospho", "substrate", "kinase", 'pptase', 'kinase2', 'kinase2_phospho']

# Model names whose datasets provide kinase2 columns / a pptase column.
two_layer_models = ('two_layer', 'two_layer_nowriter', 'two_layer_noeraser')
pptase_models = ('pushpull',) + two_layer_models

# Placeholder written into the column of a component that the experiment's model
# does not provide. It is positive, so these rows pass the `> 0` filter below.
missing_component_fill = 1e-8

# The only columns kept for each experiment (one per component).
anti_exp_cols = [name + '_anti_exp' for name in components]

# Raw string so the regex reaches pandas unchanged: in a normal literal `\s` is an
# invalid escape sequence, which newer Python versions warn about.
key_sep = r'\s*,\s*'

df_dataset_key = pd.read_csv("../data/"+label+"/dataset_key.csv", sep=key_sep, engine='python').set_index("exp_name")
display(df_dataset_key)


df_MOCU_key = pd.read_csv("../data/"+label+"/MOCU_key.csv", sep=key_sep, engine='python').set_index("component")
display(df_MOCU_key)


# load datasets

df_list = []
for exp_name, row in df_dataset_key.iterrows():

    df = pd.read_csv("../data/{}/{}.csv".format(label, row['file_name']))
    # Drop a saved index column if present, then shuffle the rows reproducibly.
    df = (
        df.drop("Unnamed: 0", axis=1, errors='ignore')
        .sample(frac=1.0, replace=False, random_state=seed)
        .reset_index(drop=True)
    )

    # Map the experiment-specific column names onto the common names.
    df = df.rename(columns={row['substrate_col']: 'substrate_anti_exp',
                            row['phospho_col']: 'phospho_anti_exp',
                            row['kinase_col']: 'kinase_anti_exp'})

    if row['model'] in pptase_models:
        df = df.rename(columns={row['pptase_col']: 'pptase_anti_exp'})
    else:
        df['pptase_anti_exp'] = missing_component_fill

    if row['model'] in two_layer_models:
        df = df.rename(columns={row['kinase2_col']: 'kinase2_anti_exp'})
        df = df.rename(columns={row['kinase2_phospho_col']: 'kinase2_phospho_anti_exp'})
    else:
        df['kinase2_anti_exp'] = missing_component_fill
        df['kinase2_phospho_anti_exp'] = missing_component_fill

    # Discard every column that is not one of the common antibody columns.
    df = df.drop(columns=df.columns.difference(anti_exp_cols))

    df['exp_name'] = exp_name
    df.index.name = 'cell_index'

    df_list.append(df)

# dataframe containing all datasets, indexed by (exp_name, cell_index)
df_data = pd.concat(df_list)
df_data = df_data.reset_index().set_index(['exp_name', 'cell_index'])

# Row counts: before cleaning, after dropping NaNs, after dropping non-positive rows.
print(len(df_data.index))
df_data = df_data.dropna()
print(len(df_data.index))

# Keep only rows in which every measurement is strictly positive.
# Explicit copy: later cells add columns to df_data, and assigning into a filtered
# selection can raise pandas' SettingWithCopyWarning.
df_data = df_data[(df_data > 0.0).all(axis=1)].copy()
print(len(df_data.index))


display(df_data)

# Load Noise Models

In [None]:
# One (initially empty) dictionary of noise objects per component.
noise_models = {name: {} for name in components}
print(noise_models)

# NOTE(review): pickle.load can execute arbitrary code while loading; only use it
# on files produced by a trusted source.
noise_params_path = "../data/"+label+"/results/noise_model_params.pkl"
with open(noise_params_path, 'rb') as pkl_file:
    noise_model_params = pickle.load(pkl_file)

display(noise_model_params)

# points per bin
ppbin = 100


def _load_positive_pairs(key):
    """Read the antibody and GFP columns listed under `key` in df_MOCU_key.

    Prints the number of values before and after filtering, and returns the
    two arrays restricted to entries where both values are strictly positive.
    """
    table = pd.read_csv("../data/{}/{}.csv".format(label, df_MOCU_key.loc[key, 'file_name']))
    anti_vals = table[df_MOCU_key.loc[key, 'anti_col_name']].values
    gfp_vals = table[df_MOCU_key.loc[key, 'GFP_col_name']].values
    print(len(anti_vals), len(gfp_vals))

    both_positive = (anti_vals > 0.0) & (gfp_vals > 0.0)
    anti_vals, gfp_vals = anti_vals[both_positive], gfp_vals[both_positive]
    print(len(anti_vals), len(gfp_vals))

    return anti_vals, gfp_vals


for c in components:

    print(c)
    models = noise_models[c]

    # distribution of antibodies and GFP for non-empty cells
    anti, GFP = _load_positive_pairs(c)
    models['anti'] = noise.BackgroundDist(anti, ppbin=ppbin)
    models['GFP'] = noise.BackgroundDist(GFP, ppbin=ppbin)

    # linear model for converting antibody to GFP measurements
    models['anti2GFP'] = noise.LinearNoise(anti, GFP)

    # distribution of antibodies and GFP for empty cells
    anti, GFP = _load_positive_pairs('empty_'+c)
    models['anti_background'] = noise.BackgroundDist(anti, ppbin=ppbin)
    models['GFP_background'] = noise.BackgroundDist(GFP, ppbin=ppbin)

    # convert antibody background to GFP units
    background_anti = models['anti_background'].get_data()
    empty_anti_as_GFP = models['anti2GFP'].transform(background_anti)
    models['anti_as_GFP_background'] = noise.BackgroundDist(empty_anti_as_GFP, ppbin=ppbin)

    # lognormal noise model with background
    models['GFP2MOCU'] = noise.LogNormalBGNoise(models['anti_as_GFP_background'])
        

# Convert antibody measurements to GFP

In [None]:
# Start every inferred-GFP column at zero. Components that an experiment does not
# list a column for keep this value.
for c in components:
    df_data[c+'_GFP_infer'] = 0.0

for exp_name, row in df_dataset_key.iterrows():

    print(exp_name)

    # Rows of df_data belonging to this experiment.
    df_tmp = df_data.query("exp_name==@exp_name")
    exp_rows = df_tmp.index

    for c in components:

        # Skip components with no column listed for this experiment
        # (a missing entry in the dataset key is read in as NaN).
        if pd.isna(row[c+'_col']):
            continue

        # Convert the antibody measurement with the component's anti2GFP model.
        anti_values = df_data.loc[exp_rows, c+'_anti_exp']
        df_data.loc[exp_rows, c+'_GFP_infer'] = noise_models[c]['anti2GFP'].transform(anti_values)


display(df_data)

# Convert to AU (remove background)

In [None]:
# The two phospho components are skipped in this cell; all others are converted.
mocu_components = [c for c in components if c not in ('phospho', 'kinase2_phospho')]

# Start every inferred-MOCU column at zero.
for c in mocu_components:
    df_data[c+'_MOCU_infer'] = 0.0

for exp_name, row in df_dataset_key.iterrows():

    print(exp_name)

    # Rows of df_data belonging to this experiment.
    df_tmp = df_data.query("exp_name==@exp_name")
    exp_rows = df_tmp.index

    for c in mocu_components:

        # Skip components with no column listed for this experiment
        # (a missing entry in the dataset key is read in as NaN).
        if pd.isna(row[c+'_col']):
            continue

        MOCU_noise = noise_models[c]['GFP2MOCU']

        # Per-experiment, per-component parameters loaded from the pickle file.
        (c0, sigma) = noise_model_params[exp_name][c]

        GFP = df_data.loc[exp_rows, c+'_GFP_infer']

        MOCU = MOCU_noise.cal_mean_conc(GFP, c0*np.ones_like(GFP), sigma)

        df_data.loc[exp_rows, c+'_MOCU_infer'] = MOCU

        # Add the mean of the GFP-unit antibody background back onto the estimate.
        # NOTE(review): unlike `_MOCU_infer`, this column is never initialised, so rows
        # of experiments that skip this component are left as NaN - confirm intended.
        df_data.loc[exp_rows, c+'_GFP_denoise'] = MOCU + noise_models[c]['anti_as_GFP_background'].mean

display(df_data)

# Load fit parameters

In [None]:
# Read the table of fitted model parameters for this dataset folder.
model_params_path = "../data/"+label+"/results/model_params.csv"
df_params = pd.read_csv(model_params_path, sep=',', engine='python')

display(df_params)


# Calculate model predictions

In [None]:

# Build the parameter bookkeeping objects from the dataset key and the loaded parameters.
model_setup = fit.setup_model_params(df_dataset_key, df_params=df_params, noise_models=noise_models)
prefit_params, param_to_index, dataset_to_params, x0, bounds = model_setup

# Fill a vector shaped like x0 with the stored value of each indexed parameter
# (first matching row of df_params for each name).
x = np.zeros_like(x0)
for p in param_to_index:
    matching_rows = df_params.query("name==@p")
    x[param_to_index[p]] = matching_rows.iloc[0]['val']

args = (
    df_dataset_key,
    df_data,
    prefit_params,
    param_to_index,
    dataset_to_params,
    noise_models,
)

# NOTE(review): the return value is discarded, so this presumably writes its
# predictions into df_data in place - confirm against fit.predict.
fit.predict(x, args, df_data)

display(df_data)

# Total rows versus rows without any NaN entry.
print(len(df_data))
print(len(df_data.dropna()))


In [None]:
# Experiment whose rows are summarised here.
exp_name = "pptase"
df_subset = df_data.query(f"exp_name=='{exp_name}'")


def _log_space_mean(values):
    """Return exp(mean(log(values))), i.e. the geometric mean of `values`."""
    return np.exp(np.mean(np.log(values)))


kinase_avg, pptase_avg, substrate_avg = (
    _log_space_mean(df_subset[column].values)
    for column in ('kinase_MOCU_infer', 'pptase_MOCU_infer', 'substrate_MOCU_infer')
)

print(kinase_avg, pptase_avg, substrate_avg)

In [None]:
# Show the dataset_to_params entry for the currently selected experiment as the cell output.
dataset_to_params[exp_name]

In [None]:

# Collect the prefit value of every parameter listed for this experiment,
# printing each one alongside its position and name.
log10_params = []
for position, param_name in enumerate(dataset_to_params[exp_name]):
    prefit_value = prefit_params[param_name]
    print(position, param_name, prefit_value)
    log10_params.append(prefit_value)

print(log10_params)

# The collected values are used as base-10 exponents.
params = 10.0 ** np.array(log10_params)


samples = 1000

# Sweep the kinase level over 1e1..1e7 while holding pptase and substrate at
# their averages.
# params[2] = 1e-4
kinase = np.logspace(1, 7, samples, base=10)
pptase = np.full(samples, pptase_avg, dtype=float)

# Alternative (disabled): sweep pptase instead and hold kinase at its average.
# params[4] = 1e-4
# kinase = kinase_avg  * np.ones(samples)
# pptase = np.logspace(1, 7, samples, base=10)

substrate = np.full(samples, substrate_avg, dtype=float)

pplation = thermo.predict_pushpull(kinase, pptase, substrate, *params)



In [None]:
# Rows of the selected experiment.
df_subset = df_data.query(f"exp_name=='{exp_name}'")

fig, ax = plt.subplots(1, 1, figsize=(8, 6))

# Per-cell points: kinase/pptase ratio against predicted phospho per substrate.
# NOTE(review): "phospho_MOCU_predict" is not created in the visible cells -
# presumably added to df_data by fit.predict; confirm.
cell_ratio = df_subset['kinase_MOCU_infer']/df_subset['pptase_MOCU_infer']
cell_fraction = df_subset["phospho_MOCU_predict"] / df_subset["substrate_MOCU_infer"]
ax.scatter(cell_ratio, cell_fraction, marker='.')

# Dashed black line: the swept curve computed with thermo.predict_pushpull.
curve_ratio = kinase / pptase
curve_fraction = pplation/substrate
ax.plot(curve_ratio, curve_fraction, 'k--')

ax.set(
    xlabel="synKin (AU) / synPhos (AU)",
    ylabel="pplation (AU) / synSub (AU)",
)

ax.set_xscale("log")
# ax.set_yscale("log")
plt.show()