# Profiles
This notebook computes threshold-based profiles.

### imports

In [None]:
%run notebook_imports
%run notebooks/matplotlib_nb

In [None]:
# 3rd party libraries
import pandas as pd
import numpy as np
import scipy.stats
from collections import defaultdict
import glob
from tqdm import tqdm
import pickle
from os import makedirs
from os.path import exists
from functools import reduce
from scipy.stats import kruskal
from os import listdir, mkdir

In [None]:
from utilities import PartialPca
from utilities.paths import path
from utilities.constants import column_order, index_order, CONC, TREAT
from utilities.data_cleaning import label_data, format_column_name, normalise_data, clean_data
from utilities.thresholds import _THRESHOLD_ABOVE, _THRESHOLD_BELOW, threshold_dict,extract_threshold_percentile, extract_from_replicate
from utilities.plots import plot_heatmap

### parameters

`_THRESHOLD` can be set to either `_THRESHOLD_BELOW` or `_THRESHOLD_ABOVE`. Some parts of this notebook consider the percentage of cells going below (or above) the thresholds; let's call them *"one-sided"*. This constant selects which of the two directions those parts use.

In [None]:
# Direction used by the "one-sided" parts of this notebook.
# Must be one of _THRESHOLD_BELOW / _THRESHOLD_ABOVE (imported from
# utilities.thresholds); it selects the figure sub-folder and which of
# thresholds_below / thresholds_above is bound to `thresholds`.
_THRESHOLD = _THRESHOLD_BELOW

whether to save figures, and show plots in the notebook.

In [None]:
# show_figures: when True, the plotting cell calls fig.show() on the last figure it made.
# save_figures: when True, the plotting cell writes one PNG per treatment to disk.
show_figures = True
save_figures = False

In [None]:
figure_path = path.plots
# One-sided figures are saved in a sub-folder named after the kind of
# thresholding selected through _THRESHOLD.
if _THRESHOLD == _THRESHOLD_ABOVE:
    figure_path_onesided = figure_path + 'threshold_above/'
elif _THRESHOLD == _THRESHOLD_BELOW:
    figure_path_onesided = figure_path + 'threshold_below/'
else:
    # fail here with a clear message rather than leaving
    # figure_path_onesided undefined for later cells
    raise ValueError(
        '_THRESHOLD must be _THRESHOLD_ABOVE or _THRESHOLD_BELOW, got {!r}'.format(_THRESHOLD))

### data loading

In [None]:
# folder holding the per-replicate csv files
data_path = path.data_thresholds
files_list = listdir(data_path)
# only the csv files in that folder are datasets
dataset_list = list(filter(lambda filename: filename.endswith('.csv'), files_list))

load both thresholds above and below. if they have not been calculated yet, do the calculation and save them.

In [None]:
PICKLE_NAME = 'results.pkl'
pickle_path = path.pickles


def _load_cached(prefix):
    """Return the object pickled at ``pickle_path + prefix + PICKLE_NAME``.

    Raises FileNotFoundError when that cache file does not exist yet.
    """
    # NOTE: pickle.load can execute arbitrary code. Only point pickle_path
    # at cache files written by this notebook itself.
    with open(pickle_path + prefix + PICKLE_NAME, 'rb') as handle:
        return pickle.load(handle)


def _save_cached(obj, prefix):
    """Pickle ``obj`` to ``pickle_path + prefix + PICKLE_NAME``."""
    with open(pickle_path + prefix + PICKLE_NAME, 'wb') as handle:
        pickle.dump(obj, handle)


def _compute_from_datasets(threshold_kind):
    """Process every csv in ``dataset_list`` for one thresholding direction.

    Returns three dicts keyed by file name: the output of
    extract_from_replicate, and the two count objects returned by clean_data.
    """
    extracted, file_counts, file_normalised_counts = dict(), dict(), dict()
    for filename in tqdm(dataset_list):
        data = pd.read_csv(data_path + filename)
        preprocessed_data, count, normalised_count = clean_data(data)
        extracted[filename] = extract_from_replicate(preprocessed_data, _THRESHOLD=threshold_kind)
        file_counts[filename] = count
        file_normalised_counts[filename] = normalised_count
    return extracted, file_counts, file_normalised_counts


# thresholds below + cell counts: load from cache, or compute and cache them
try:
    thresholds_below = _load_cached('threshold_below-')
    counts = _load_cached('counts-')
    normalised_counts = _load_cached('normalised_count-')
except FileNotFoundError:
    thresholds_below, counts, normalised_counts = _compute_from_datasets(_THRESHOLD_BELOW)
    _save_cached(thresholds_below, 'threshold_below-')
    _save_cached(counts, 'counts-')
    _save_cached(normalised_counts, 'normalised_count-')

# thresholds above: load from cache, or compute and cache them
try:
    thresholds_above = _load_cached('threshold_above-')
except FileNotFoundError:
    # clean_data() is called without the threshold direction, so the counts
    # obtained above stay valid: keep them instead of overwriting them here.
    thresholds_above, _, _ = _compute_from_datasets(_THRESHOLD_ABOVE)
    _save_cached(thresholds_above, 'threshold_above-')

# for those parts of the notebook that only work
# for one kind of thresholding at a time
if _THRESHOLD == _THRESHOLD_ABOVE:
    thresholds = thresholds_above
elif _THRESHOLD == _THRESHOLD_BELOW:
    thresholds = thresholds_below
else:
    raise ValueError(
        '_THRESHOLD must be _THRESHOLD_ABOVE or _THRESHOLD_BELOW, got {!r}'.format(_THRESHOLD))

Limit features to those that are common to all files. Discard others

In [None]:
# one set of feature names per loaded file
f = [set(per_file.keys()) for per_file in thresholds.values()]
# features present in every single file
common_features = reduce(set.intersection, f)
# `thresholds` is pruned in place, so the dict it aliases
# (thresholds_above or thresholds_below) is modified as well
for file, per_file in thresholds.items():
    for feature in list(per_file.keys()):
        if feature in common_features:
            continue
        per_file.pop(feature)
        print("from", file, "dropped", feature)

## analysis

### thresholds plot

this cell will determine which features are in the plot and in what order

In [None]:
# Features drawn in the plot, in x-axis order (left to right).
# Each active entry is looked up as a key of `feature_results`, i.e. either
# 'Count' or '<feature name> (above)' / '<feature name> (below)'.
# NOTE(review): the double space in entries such as 'Mito Int  (above)'
# presumably mirrors a feature name ending in a space — do not normalise
# the spelling without checking the data.
# Commented-out entries are excluded from the plot.
selected_features = [
# 'Perimeter  (above)',
# 'Perimeter  (below)',
# 'Clipped Perimeter  (above)',
# 'Clipped Perimeter  (below)',
# 'Tperimeter  (above)',
#  'Tperimeter  (below)',
#  'Clipped Tperimeter  (above)',
#  'Clipped Tperimeter  (below)',
#  'Length  (above)',
#  'Length  (below)',
'Count', 'Mito Int  (above)',
'Mito Int  (below)',  
'Mempercell DxA (above)',
#  'Mito Int Cell DxA (above)',
'Mito Int Cell DxA (below)',
#  'Mito Int Nuc DxA (above)',
'Area Nuc (above)',
'Form Factor (below)',
'WMOI (above)',
#  'Feret X  (above)',
#  'Feret X  (below)',
#  'Feret Y  (above)',
#  'Feret Y  (below)',
#  'Form Factor (above)',
#  'Weighted Relative Moment Of Inertia (above)',
#  'Weighted Relative Moment Of Inertia (below)',
#  'Moment (above)',
#  'Moment (below)',
#  'WMOI (below)',
#  'Area Nuc (below)',
#  'Mem Per Nuc DxA (above)',
#  'Mem Per Nuc DxA (below)',
#  'Memperm (above)',
#  'Memperm (below)',
#  'Mempercell DxA (below)',
'Area Cell (above)',
#  'Area Cell (below)',
#  'Avg Area Vac2 Per Cell (above)',
#  'Avg Area Vac2 Per Cell (below)',
#  'Feret X Cell (above)',
#  'Feret X Cell (below)',
#  'WMOI Cell (above)',
#  'WMOI Cell (below)',
#  'Length Cell (above)',
#  'Length Cell (below)',
#  'Feret Y Cell (above)',
#  'Feret Y Cell (below)',
#  'Moment Cell (above)',
#  'Moment Cell (below)',
#  'Weighted Relative Moment Inertia (above)',
#  'Weighted Relative Moment Inertia (below)',
#  'Clippedtperimeter (above)',
#  'Clippedtperimeter (below)',
#  'Tperimeter (above)',
#  'Tperimeter (below)',
#  'Clipped Perimeter (above)',
#  'Clipped Perimeter (below)',
#  'Form Factor Cell (above)',
'Form Factor Cell (below)',
'Perimeter Cell (above)',
'Perimeter Cell (below)',
'Total Area Vac2 Per Cell (above)',
'Total Area Vac2 Per Cell (below)',
#  'Avg Diam Vac Per Cell (above)',
#  'Avg Diam Vac Per Cell (below)',
#  'Area All Vac Per Cell (above)',
#  'Area All Vac Per Cell (below)',
#  '% Area Vac2 Per Cell (above)',
#  '% Area Vac2 Per Cell (below)',
#  'Count Vacuols Cell (above)',
#  'Count Vacuols Cell (below)',
#  'Form Factor Vacuols (above)',
#  'Form Factor Vacuols (below)',
#  'Mito Int Nuc DxA (below)'
]

This lists the colors to be assigned in the plot to each concentration.

In [None]:
# Line/text colour used in the plot for each concentration label.
# Every label listed in `concentrations` needs an entry here, because the
# plotting function indexes conc_colors[conc] for each plotted concentration.
# The trailing '#...' hex codes are previously used alternative colours.
conc_colors = {
    '0 ug/mL':'black',
    #'0.137 ug/mL':'#fee8c8',
    #'0.412 ug/mL':'#ff3333',
    #'1.235 ug/mL':'#fdbb84',
    '3.704 ug/mL':'#fee8c8',#'#990000',
    #'11.11 ug/mL':'#fc8d59',
    '33.33 ug/mL':'#fdbb84', #'#d7301f',
    #'100 ug/mL':'#ef6548',
    '300ug/mL':'#e34a33'#'#7f0000'
    
}

commenting a concentration here removes it from the plot.

In [None]:
# Concentrations to plot, in legend order.
# The FIRST entry is used as the negative control (NEG_CONTROL = concentrations[0]).
# NOTE(review): '300ug/mL' has no space, unlike the other labels; presumably
# it matches the label found in the data — confirm before normalising it.
concentrations = [
    '0 ug/mL',
#     '0.412 ug/mL',
    '3.704 ug/mL',
    '33.33 ug/mL',
    '300ug/mL']

these functions provide data manipulation necessary to make the plot.

In [None]:
# the first listed concentration is the negative control (i.e. '0 ug/mL')
NEG_CONTROL = concentrations[0]
# feature names and treatment labels are read off the first loaded file
features = list(list(thresholds.values())[0].keys())
treatments = [
    label
    for label in list(thresholds.values())[0][features[0]].index.tolist()
    if 'Triton' not in label  # Triton is left out
]
def _frames_to_series(frames):
    """Stack ``{experiment: DataFrame}`` into one Series with a 3-level index.

    The index levels are (experiment, row label, column label), so the result
    supports ``obj.loc[experiment, treatment, concentration]`` like a Panel.
    Cells missing from a given frame are filled with NaN, mirroring the way
    Panel aligned its frames on the union of their labels.
    """
    row_labels = list(dict.fromkeys(r for frame in frames.values() for r in frame.index))
    col_labels = list(dict.fromkeys(c for frame in frames.values() for c in frame.columns))
    index = pd.MultiIndex.from_product([list(frames), row_labels, col_labels])
    values = [
        frames[n].at[r, c] if (r in frames[n].index and c in frames[n].columns) else np.nan
        for n, r, c in index
    ]
    # a sorted index is required for slicing with ':' on a MultiIndex
    return pd.Series(values, index=index).sort_index()


def panel_feature(feat,abovebelow='above'):
    '''Return the per-experiment data of the selected feature.

    The result can be accessed as
    result.loc[experiment,treatment,concentration]
    where experiment is the position of the file in `thresholds`.

    abovebelow decides whether the function returns the percent of cells
    that went 'above' or 'below' the thresholds (read from
    thresholds_above / thresholds_below respectively).

    A pd.Panel is returned when the installed pandas still provides it;
    otherwise (Panel was removed from pandas) an equivalent Series with a
    (experiment, treatment, concentration) MultiIndex is returned.

    Raises ValueError when abovebelow is neither 'above' nor 'below'.'''
    if abovebelow == 'above':
        source = thresholds_above
    elif abovebelow == 'below':
        source = thresholds_below
    else:
        raise ValueError("abovebelow must be 'above' or 'below', got {!r}".format(abovebelow))
    frames = {n: source[file][feat] for n, file in enumerate(thresholds)}
    try:
        return pd.Panel(frames)
    except (AttributeError, TypeError):
        # AttributeError: pandas without Panel at all
        # TypeError: pandas versions exposing only an empty placeholder class
        return _frames_to_series(frames)
    
def calc_plot_data(panel, treatment=None):
    """Calculate the data that needs to be plotted for one feature.

    panel: object indexable as panel.loc[:, treatment, concentration]
        (as returned by panel_feature), giving one value per experiment.
    treatment: treatment label to analyse; defaults to the module-level
        TREATMENT so existing callers keep working.

    Returns a dict of lists with one entry per item of `concentrations`:
      'kruskal': p-value of the Kruskal-Wallis test against the negative
                 control, or the string 'ALLZEROS' when the test raises
                 ValueError
      'mean', 'std': mean and standard deviation of the values

    The negative control is the 'PP-CNPPV' treatment at NEG_CONTROL; it is
    also used as the sample for the NEG_CONTROL concentration itself.
    """
    if treatment is None:
        treatment = TREATMENT
    results = defaultdict(list)
    neg_control = panel.loc[:,'PP-CNPPV',NEG_CONTROL].tolist()
    for conc in concentrations:
        if conc == NEG_CONTROL:
            # no treatment lookup needed: the control is compared with itself
            conc_altered = neg_control
        else:
            conc_altered = panel.loc[:,treatment,conc].tolist()
        try:
            results['kruskal'].append(kruskal(neg_control, conc_altered).pvalue)
        except ValueError:
            results['kruskal'].append('ALLZEROS')
        results['mean'].append(np.mean(conc_altered))
        results['std'].append(np.std(conc_altered))
    return results

def plot_altered_pop_pretty(ax):
    """Draw the altered-population profile of the current treatment on `ax`.

    Reads these module-level names, which must be set before calling:
    TREATMENT, selected_features, concentrations, conc_colors,
    feature_results, significances and all_concs.

    For every concentration one error-bar line is drawn across
    selected_features (mean +/- std). Below the curves, one row of stars per
    concentration marks features whose 'kruskal' value is <= the levels in
    `significances` (one star per level reached).

    Raises TypeError when a 'kruskal' entry is a string other than the
    'ALLZEROS' sentinel.
    """
    xaxis = np.array(list(range(len(selected_features)))).astype(int)
    ax.set_title(TREATMENT)
    # faint vertical separators between neighbouring features
    for x in xaxis:
        ax.axvline(x-.5,linestyle='dashed',linewidth=.2,alpha=.2,color='black')
    for n,conc in enumerate(concentrations):
        if conc=='control': continue
        line = [feature_results[f]['mean'][n] for f in selected_features]
        bars = [feature_results[f]['std'][n] for f in selected_features]
        # adding (n-(all_concs/2))/(2*all_concs) to x would offset the
        # concentrations so they don't all end on the same vertical line
        ax.errorbar(x=xaxis,
                    y=line,
                    elinewidth=0.1,
                    linewidth=1.5,
                    yerr=bars,
                    label=conc,
                    color=conc_colors[conc],
                    alpha=.9,
                    capsize=0.5,
                    capthick=0.1,
                    #fmt='o', # remove this to put back lines in errorbar
                    #marker = '_' # https://matplotlib.org/api/markers_api.html
                    )
    # invisible points: they only add the significance key to the legend
    for n,s in enumerate(significances):
        ax.plot(1,0,alpha=0,label='{}: p < {} '.format((n+1)*'*',s))
    ax.axhline(0,linewidth=.05,color='k', linestyle='dashed',alpha=.3)
    ax.set_ylim(top=1.5, bottom=0)
    # let's plot the statistics results in the bottom (20% / scale_stat) of the plot
    upper_stat = ax.get_ylim()[0]-0.2
    scale_stat = 1.3
    stat_plot_height = (ax.get_ylim()[1]-ax.get_ylim()[0])/(scale_stat*3.5)
    lower_stat = upper_stat - stat_plot_height+0.1
    step = stat_plot_height/all_concs
    for n,conc in enumerate(concentrations):
        if conc=='control': continue
        for xpos,fe in zip(xaxis,selected_features):
            signif_level = feature_results[fe]['kruskal'][n]
            if isinstance(signif_level, str):
                # 'ALLZEROS' marks a test that could not be computed: no stars
                if signif_level != 'ALLZEROS':
                    raise TypeError(
                        'unexpected significance value {!r} for {!r}'.format(signif_level, fe))
                continue
            # one star per significance level reached (NaN reaches none)
            stringprint = '*' * sum(1 for s in significances if signif_level <= s)
            if stringprint:
                ax.text(xpos,upper_stat-n*step,stringprint,
                       horizontalalignment='center',
                       color=conc_colors[conc], fontsize=18)

    ax.set_xticks(xaxis)
    ax.set_ylabel('Fraction of cell population')
    ax.legend(loc='upper right',fontsize = 7, labelspacing=0.2, frameon=1,framealpha=0.5)
    # extend the y axis downwards to make room for the rows of stars
    ax.set_ylim(lower_stat,ax.get_ylim()[1])

# counts_by_concentration[treatment][concentration] -> one count per file
counts_by_concentration = defaultdict(dict)
for treat in treatments:
    for conc in concentrations:
        try:
            per_file_counts = [counts[file][treat][conc] for file in counts]
        except KeyError:
            # combination missing from at least one file: report and skip it
            print('err',treat,conc)
        else:
            counts_by_concentration[treat][conc] = per_file_counts

def kruskal_skiperror(*args):
    """Kruskal-Wallis p-value that tolerates degenerate samples.

    Some features, especially when thresholding below, give zero cells
    over threshold, which makes kruskal raise ValueError. In that case
    NaN is returned instead of propagating the error.
    """
    try:
        test_result = kruskal(*args)
    except ValueError:
        return np.nan
    return test_result.pvalue

In [None]:
# make plot dir (and any missing parent) in case it doesn't exist
makedirs(figure_path + 'pretty_plots/', exist_ok=True)

# significance levels to test
significances = [.05,.01,.005]
# just need the number of concentrations
all_concs = len(concentrations)

# now looping over treatments and features, compute all data
# necessary for the plot
fig = None
for TREATMENT in treatments:
    feature_results = dict()
    for feat in features:
        for side in ('above', 'below'):
            panel = panel_feature(feat, side)
            feature_results['{} ({})'.format(feat, side)] = calc_plot_data(panel)

    treatment_counts = counts_by_concentration[TREATMENT]
    negative_control_sample = treatment_counts[NEG_CONTROL]
    normalization = np.mean(negative_control_sample)
    # Build the lists from `concentrations` (not from the dict keys) so that
    # position n always matches concentrations[n], which is how the plotting
    # function indexes them. Concentrations without counts become NaN.
    count_samples = [treatment_counts.get(conc) for conc in concentrations]
    feature_results['Count'] = {
        'std':[np.nan if sample is None else np.std(sample)/normalization
               for sample in count_samples],
        'mean':[np.nan if sample is None else np.mean(sample)/normalization
                for sample in count_samples],
        'kruskal':[np.nan if sample is None else kruskal_skiperror(negative_control_sample, sample)
                   for sample in count_samples],
    }

    # only the last figure is kept open: close the previous one so that
    # open figures don't pile up over many treatments
    if fig is not None:
        plt.close(fig)

    # plot!
    fig = plt.figure(figsize=(3,2.5))
    ax = fig.subplots()
    plot_altered_pop_pretty(ax)
    fig.tight_layout()

    if save_figures: fig.savefig(figure_path + 'pretty_plots/' + TREATMENT + '.png', dpi=600)

# only show last figure made as an example
if fig is not None:
    if show_figures:
        fig.show()
    else:
        plt.close(fig)