In [1]:
################################################################################
#
# Importables, just the basics :P

    # My module, cuz I'm cool
from helpers import mpl_plotting_helpers as mph
from helpers import stats_helpers as sh
from helpers import general_helpers as gh
from helpers import western_helpers as wh
from helpers.mph_modules.dotplots import get_data_info, add_errorbar
#from functions import *

    # Standard packages
import matplotlib.pyplot as plt
import matplotlib.font_manager as mpl_fm
from math import floor, ceil, log2
import pandas as pd
import glob

#
#
################################################################################

Loading the module: helpers.mpl_plotting_helpers

Loading the module: helpers.general_helpers

Loading the module: helpers.argcheck_helpers

Loading the module: helpers.pandas_helpers

Loading the module: helpers.stats_helpers.py

numpy        2.2.5
scipy         1.15.2
pandas        2.2.3

pandas        2.2.3
numpy         2.2.5

matplotlib    3.10.1
numpy         2.2.5



In [2]:
def _maximal_matches(label, candidates):
    """
    Return the set of strings from candidates that occur in label,
    minus any hit that is itself contained in a longer hit.

    e.g. with candidates ["5 m", "2.5 m"] the label "dmso2.5 m"
    resolves to {"2.5 m"} only, not to {"5 m", "2.5 m"}
    """
    hits = [cand for cand in candidates if cand in label]
    return {cand for cand in hits
            if not any(cand != other and cand in other for other in hits)}

def _logical_ignore_comps(labelled_line_groups,
                          group_strs,
                          xgroup_strs):
    """
    Only want to compare along a line group (e.g. timecourse) or
    down an x-column (e.g. JE6 DMSO 0m vs JE6 U0126 0m), but not
    all the random other comparisons because statistically they're
    kind of useless

    So this function will find all of the pairs that are useless

    labelled_line_groups -> list of lists of [label, [d1,d2,...,dn]]
    group_strs -> strings identifying a line group; two labels are on
                  the same line when they contain the same group string
    xgroup_strs -> strings identifying an x-column; two labels are in
                   the same column when they contain the same x string
    #####
    Returns the list of pairs (exactly as produced by gh.make_pairs)
    that share neither a line group nor an x-column

    A group/x string only counts for a label when no longer matching
    string contains it. Plain substring tests made "5 m" match the
    labels for "2.5 m", "7.5 m" and "15 m", so unrelated timepoints
    looked like one x-column and were never ignored.

    NOTE(review): the returned pairs hold the full [label, data]
    entries, not just the labels -- confirm that is what the no_comp
    argument of the stats helpers expects.
    """
    groups_unpacked = []
    for group in labelled_line_groups:
        groups_unpacked += group
    # Resolve each label once, the same label shows up in many pairs
    line_of = {}
    column_of = {}
    for entry in groups_unpacked:
        label = entry[0]
        if label not in line_of:
            line_of[label] = _maximal_matches(label, group_strs)
            column_of[label] = _maximal_matches(label, xgroup_strs)
    # This will hold the ignored pairs
    ignore_me_senpai = []
    # First, get all pairs
    paired = gh.make_pairs(groups_unpacked,
                           dupes = False,
                           reverse = False)
    # Then iterate over and check the labels
    for p in paired:
        first, second = p[0][0], p[1][0]
        same_line = line_of[first] & line_of[second]
        same_column = column_of[first] & column_of[second]
        # If there isn't a match, in either, ignore
        if not same_line and not same_column:
            ignore_me_senpai.append(p)
    # Return the ignored pairs at the end
    return ignore_me_senpai

def perform_line_statistics(labelled_line_groups,
                            ignore_comps,
                            comp_type,
                            statsfile):
    """
    Run a multiple-comparison test over every labelled group and
    write the result to disk as a csv

    labelled_line_groups -> list of lines, each line being a list of
                            [label, [d1,d2,...,dn]] groups
    ignore_comps -> pairs that should not be compared, handed to the
                    stats helper as no_comp
    comp_type -> name of the stats helper class to use, one of
                 "HolmSidak" or "TukeyHSD"
                 (the original notes say both run an ANOVA first)
    statsfile -> output path/filename for the statistics file
    #####
    Returns None, the only product is the statsfile
    Raises AssertionError on an unsupported comp_type
    """
    assert comp_type in ["HolmSidak", "TukeyHSD"], f"Invalid comparison type: {comp_type}"
    # The stats helpers take the groups as one flat run of arguments
    flat_groups = [labelled for line in labelled_line_groups for labelled in line]
    # comp_type was validated above, so it names the class on sh directly
    run_comparison = getattr(sh, comp_type)
    outcome = run_comparison(*flat_groups,
                             labels = True,
                             override = True,
                             alpha = 0.05,
                             no_comp = ignore_comps)
    outcome.write_output(filename = statsfile,
                         file_type = "csv")
    return None

def find_centres(plotting_info):
    """
    plotting_info -> list with one entry per line, where entry[0] is
                     a dict holding that line's "centers"

    Returns the longest "centers" list (the last one on a tie) so it
    can be used for the xticks, or [] when plotting_info is empty
    """
    all_centres = [entry[0]["centers"] for entry in plotting_info]
    if not all_centres:
        return []
    # max() keeps the first of equal candidates, so walk backwards to
    # let the last of the longest lists win
    return max(reversed(all_centres), key = len)

def line_plot(labelled_line_groups,
              show_points = False,
              show_legend = False,
              colours = ["grey" for _ in range(20)],
              group_labs = [f"Thing {i}" for i in range(20)],
              markers = ["s" for _ in range(20)],
              linestyles = ["solid" for _ in range(20)],
              xlabels = [f"Time {i}" for i in range(20)],
              ylabel = ["Fold change"],
              ylims = None,
              ignore_comps = [],
              statsfile = None,
              comp_type = "HolmSidak",
              figfile = None):
    """
    Draw one line per entry of labelled_line_groups (mean with SEM
    error bars at every x position), optionally write the statistics,
    then show or save the figure

    labelled_line_groups -> list of lists, where each sublist contains
                            labelled groups [label, [d1,d2,...,dn]]
    show_points -> True: scatter the raw points of every group
                   False: put one marker on the mean of every group
    show_legend -> True: shrink the axes and put a legend to the right
    colours, group_labs, markers, linestyles
                -> per-line styling, indexed by the position of the
                   line in labelled_line_groups
    xlabels -> tick labels, only the first len(xticks) are used
    ylabel -> y axis label (a list/tuple of strings is joined by spaces)
    ylims -> (low, high) for the y axis; if None it is taken from the
             floor/ceil of the finite int/float values in the data
    ignore_comps, comp_type, statsfile
                -> handed to perform_line_statistics, which only runs
                   when statsfile is given
    figfile -> path to save the figure to; None shows it instead
    #####
    Returns None
    Raises ValueError if ylims is None and the data has no finite numbers
    """
    # First, get some basic plotting information
    plotting_info = [get_data_info(line) for line in labelled_line_groups]
    # Then manage the statistics
    if statsfile is not None:
        perform_line_statistics(labelled_line_groups,
                                ignore_comps,
                                comp_type,
                                statsfile)
    # Begin plotting c::
    if ylims is None:
        from math import isfinite
        # NaN/inf have to go: floor(nan) raises, and min/max with a NaN
        # in the list depend on where the NaN happens to sit
        numbers = [item for item in gh.unpack_list(labelled_line_groups)
                   if type(item) in [int, float] and isfinite(item)]
        if not numbers:
            raise ValueError("Cannot infer ylims: no finite numeric data found, pass ylims explicitly")
        ylims = floor(min(numbers)), ceil(max(numbers))
    # matplotlib would print the repr of a list (the default here is one),
    # so turn list-like labels into plain text
    if isinstance(ylabel, (list, tuple)):
        ylabel = " ".join(str(part) for part in ylabel)
    #
    fig, ax = plt.subplots(figsize = (6,6))
    #
    for i in range(len(labelled_line_groups)):
        summary = plotting_info[i][0]
        # The line through the group means
        ax.plot(summary["centers"],
                summary["means"],
                color = colours[i],
                label = group_labs[i],
                linestyle = linestyles[i])
        #
        for j in range(len(labelled_line_groups[i])):
            add_errorbar(ax,
                         summary["centers"][j],
                         summary["means"][j],
                         summary["sems"][j],
                         color = colours[i])
            if show_points:
                # Raw values of group j at their x positions
                ax.scatter(summary["xs"][j],
                           plotting_info[i][1][j][1],
                           color = colours[i],
                           edgecolor = "black", alpha = 0.3,
                           marker = markers[i],
                           s = 10)
        if not show_points:
            # One marker per mean, drawn once per line. This used to sit
            # in the loop above, which redrew every marker once per
            # group and stacked the alpha
            ax.scatter(summary["centers"],
                       summary["means"],
                       color = colours[i],
                       edgecolor = "black", alpha = 0.3,
                       marker = markers[i],
                       s = 30)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    xticks = find_centres(plotting_info)
    ax.set_xticks(xticks)
    ax.set_xticklabels(xlabels[:len(xticks)],
                       fontfamily = "sans-serif",
                       font = "Arial",
                       fontweight = "bold",
                       fontsize = 12,
                       rotation = 45,
                       ha = "center")
    ax.set_ylim(*ylims)
    mph.update_ticks(ax, which = "y")
    ax.set_ylabel(ylabel, fontfamily = "sans-serif",
                  font = "Arial", fontweight = "bold",
                  fontsize = 14)
    if show_legend:
        # Shrink the axes to 80% width so the legend fits on the right
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5),
                  prop = mpl_fm.FontProperties(family = "sans-serif",
                                               weight = "bold"))
    if figfile is None:
        plt.show()
    else:
        fig.savefig(figfile)
    # Close this figure explicitly, many are made in a loop
    plt.close(fig)
    return None

def replace_neg(a_list, value = float("nan")):
    """
    Return a new list where any item < 0 is replaced with value
    (NaN by default)

    Items that cannot be compared with 0 (strings, None, ...) are
    kept as they are
    """
    newlist = []
    for item in a_list:
        try:
            is_negative = item < 0
        except Exception:
            # Not comparable with a number, so not a negative number.
            # Exception (not a bare except) lets Ctrl-C through
            newlist.append(item)
        else:
            newlist.append(value if is_negative else item)
    return newlist

def safe_log2(number):
    """
    log2 that never raises on bad input

    Returns log2(number), or NaN when the log cannot be taken
    (zero, negative, or not a number at all)
    """
    try:
        # Take the log once and hand it straight back
        return log2(number)
    except Exception:
        # Exception (not a bare except) lets Ctrl-C through
        return float("nan")
    
#
#
###################################################################

In [3]:
################################################################################
#
#
# Experiment families
exp_types = [
    "proteomics",
    "titr",
]

# Western targets, and the subset used for the proteomics samples
targets = [
    "ERK",
    "LAT",
    "LCK",
    "PLC",
]
proteomics_targets = [
    "PLC",
    "LAT",
]

# Cell type/stimulation type
cell_stims = [
    "car",
    "t2kb",
    "ab",
]

# Timepoint labels, in lane order, for each experiment family
titr_conds = [
    "0 min",
    "2.5 m",
    "5 m",
    "7.5 m",
    "10 m",
    "15 m",
    "20 m",
]
prot_conds = [
    "0 min",
    "2.5 m",
    "10 m",
]


#
#
#################################################################################

In [4]:
def replace_negs(a_list):
    """
    Return a new list with every value below zero swapped for NaN,
    everything else is passed through untouched
    """
    return [float("nan") if entry < 0 else entry for entry in a_list]

def replace_partner_nans(target_list, load_list):
    """
    Blank out a target/loading pair when either half is NaN

    target_list, load_list -> equally long sequences of signals,
                              paired up by position
    #####
    Returns (new_targets, new_loads), two new lists where position i
    is NaN in both whenever it was NaN in at least one input
    """
    assert len(target_list) == len(load_list), "You need the same number of loading controls as target signals, silly"
    cleaned = []
    for idx in range(len(target_list)):
        targ, load = target_list[idx], load_list[idx]
        # NaN is the only value that is not equal to itself
        either_missing = targ != targ or load != load
        if either_missing:
            cleaned.append((float("nan"), float("nan")))
        else:
            cleaned.append((targ, load))
    newtarg = [pair[0] for pair in cleaned]
    newload = [pair[1] for pair in cleaned]
    return newtarg, newload

def manage_a_file(file, 
                  groups,
                  signals_per_group = 2,
                  target = "685Ex-720Em", 
                  load = "785Ex-820Em",
                  norm_string = "0 min"):
    """
    Read one western quantification excel file and return the
    loading-corrected signal as log2 fold change over the norm group

    file -> path of the excel file to read
    groups -> group labels, in the same order as the signals in the file
    signals_per_group -> how many consecutive signals belong to each group
    target -> passed to wh.get_signal to pick the target signal
    load -> passed to wh.get_signal to pick the loading control signal
    norm_string -> label of the group whose mean everything is divided by
    #####
    Returns a list of [label, [log2(signal/norm mean), ...]], one per
    entry of groups. Values that cannot be quantified are NaN
    Raises IndexError if norm_string is not one of groups
    """
    # First, read the file
    df = pd.read_excel(file)
    #next, extract the signal values.
    # replace any negative values with float("nan") -> these are not quantifiable
    target_signal = replace_negs(wh.get_signal(df, target))
    load_signal = replace_negs(wh.get_signal(df, load))
    # replace partner values with NANs
    target_signal, load_signal = replace_partner_nans(target_signal, load_signal)
    # do the LI-COR correction, which will give NANs if NAN is present
    cor_sig = wh.licor_correction(target_signal, load_signal)
    # group the signals using the groups list (should already be in order)
    # NOTE(review): a file with fewer than len(groups)*signals_per_group
    # signals gives short/empty groups here without complaint -- confirm
    labelled_groups = [[groups[i], cor_sig[signals_per_group*i:signals_per_group*(i+1)]] for i in range(len(groups))]
    # find the mean of the norm group
    norm_means = [sh.mean(group[1]) for group in labelled_groups if group[0] == norm_string]
    if not norm_means:
        # Same exception type as the bare [0] lookup gave, with a reason
        raise IndexError(f"Normalisation group {norm_string!r} is not in groups: {list(groups)}")
    mean = norm_means[0]

    def log2_fold(val):
        # A zero ratio (log2 domain error) or a zero norm mean is not
        # quantifiable, so it becomes NaN like every other bad signal
        try:
            return log2(val/mean)
        except (ValueError, ZeroDivisionError):
            return float("nan")

    # normalise and log-transform
    labelled_groups = [[group[0], [log2_fold(val) for val in group[1]]] for group in labelled_groups]
    return labelled_groups

def manage_all_files(files_list,
                     stim_list,
                     target_list,
                     return_dict,
                     timepoints = titr_conds,
                     signals_per_group = 2,
                     norm_string = "0 min"):
    """
    Sort the data of every file into return_dict, using the file path
    to work out which stim/target/condition the file belongs to

    files_list -> paths of the excel files
    stim_list -> stim strings to look for in each path
    target_list -> target strings to look for in each path
    return_dict -> return_dict[stim][target][condition] = {}, filled
                   in place with {timepoint label : list of values}
    timepoints, signals_per_group, norm_string -> handed to manage_a_file
    #####
    Returns return_dict (the same object that was passed in)

    NOTE(review): matching is a plain substring test on the whole
    path, folders included, so short strings such as "ab" can match
    more than intended -- check against the real file names
    """
    # The condition strings are the same for every stim/target
    condition_keys = list(return_dict[stim_list[0]][target_list[0]].keys())
    for path in files_list:
        # Every stim/target/condition combination named in this path
        matches = [(stim, targ, cond)
                   for stim in stim_list if stim in path
                   for targ in target_list if targ in path
                   for cond in condition_keys if cond in path]
        for stim, targ, cond in matches:
            # Grab the data
            data = manage_a_file(path,
                                 timepoints,
                                 signals_per_group = signals_per_group,
                                 norm_string = norm_string)
            bucket = return_dict[stim][targ][cond]
            if bucket == {}:
                # First file for this combination, start the lists
                for entry in data:
                    bucket[entry[0]] = entry[1]
            else:
                # Later files add their values onto the existing lists
                for entry in data:
                    bucket[entry[0]] += entry[1]
    return return_dict
                
                
    

In [8]:
# y axis range of the timecourse graph for each western target
titr_ylims = {
    "ERK": [-1, 7],
    "LCK": [-2, 4],
    "PLC": [-1, 7],
    "LAT": [-2, 6],
}

# Drug/concentration keys in drawing order (first key = first line)
titr_drug_keys = ["dmso", "01rdn", "_1rdn", "10rdn", "01soq", "_1soq", "10soq"]

# stim -> target -> drug -> {} (empty, manage_all_files fills these)
all_titr_data = {stim: {targ: {drug: {} for drug in titr_drug_keys}
                        for targ in targets}
                 for stim in cell_stims}

titr_files = glob.glob("./input_excels/titr*/*.xls")

d = manage_all_files(titr_files, cell_stims, targets, all_titr_data)
# Turn the innermost {timepoint: values} dicts into [[timepoint, values], ...]
d = {stim: {targ: {drug: [[timepoint, values] for timepoint, values in by_time.items()]
                   for drug, by_time in by_drug.items()}
            for targ, by_drug in by_targ.items()}
     for stim, by_targ in d.items()}

# Styling shared by every graph, one entry per drug key
titr_colours = ["hotpink",
                "lightskyblue", "steelblue", "blue",
                "lavender", "mediumpurple", "indigo"]
titr_markers = ["o", "s", "s", "s", "D", "D", "D"]
titr_linestyles = ["dashdot"] * 7
titr_legend_labels = ["DMSO",
                      r"0.1 $\mu$M RDN",
                      r"1 $\mu$M RDN",
                      r"10 $\mu$M RDN",
                      r"0.1 $\mu$M Soq.",
                      r"1 $\mu$M Soq.",
                      r"10 $\mu$M Soq."]

# key1 = stim type
for key1, value1 in d.items():
    # key2 = western target
    for key2, value2 in value1.items():
        # value2 has drug_condition : timecourse_data, and all of these
        # go on one graph for the target
        g_labs = list(value2.keys())
        # Prefix each timepoint label with its drug key so labels are unique
        labelled_vals = [[[drug + timepoint, values] for timepoint, values in series]
                         for drug, series in value2.items()]
        # Skip the comparisons that aren't relevant
        ignore = _logical_ignore_comps(labelled_vals,
                                       group_strs = g_labs,
                                       xgroup_strs = titr_conds)
        # Then plot, writing the stats and the figure to disk
        line_plot(labelled_vals,
                  show_points = False,
                  show_legend = True,
                  colours = titr_colours,
                  group_labs = titr_legend_labels,
                  markers = titr_markers,
                  linestyles = titr_linestyles,
                  xlabels = titr_conds,
                  ylabel = r"$\log_{2}$ Fold Change",
                  ylims = titr_ylims[key2],
                  ignore_comps = ignore,
                  statsfile = f"./stats/{key1}_{key2}_timecourse_stats",
                  comp_type = "HolmSidak",
                  figfile = f"./figs/{key1}_{key2}_timecourse_graph.pdf")
        

# Now I have all the data in dictionaries organised as follows:
    # Cell type/Stimulation type (ab, t2kb, car)
        # Western target (PLC ERK LAT LCK)
            # Drug/concentration
                # timepoints/data (as a key:value pair, turn into list things)
# Once I turn the sub-most dicts into lists, this should be an easy line plot <3



In [31]:
"""
prot_ylims = {"PLC": [-1,7],
              "LAT": [-2,6]}
          

all_prot_data = { stim : {targ : {"dmso" : {},
                          "rdn" : {},
                          "soq" : {}} for targ in proteomics_targets}
            for stim in cell_stims}


prot_files = glob.glob("./input_excels/proteomics*/*.xls")

dd = manage_all_files(titr_files, cell_stims, proteomics_targets, all_prot_data, signals_per_group = 5)
dd = {key1 : {key2 : {key3 : [[key4, value4] for key4, value4 in value3.items()] for key3, value3 in value2.items()}
                    for key2, value2 in value1.items()}
     for key1, value1 in dd.items()}

# Key1 = stim type
for key1, value1 in dd.items():
    # key2 = western target
    for key2, value2 in value1.items():
        print(key1, key2)
        # value has drug_condition : timecourse_data, and we want to plot all of these on 1 graph for a target
        # first, logically ignore comparisons that arent't relevant
        g_labs = [key3 for key3, value3 in value2.items()]
        ignore = _logical_ignore_comps([value3 for key3, value3 in value2.items()],
                                       group_strs = g_labs,
                                       xgroup_strs = prot_conds)
        print([value3 for key3, value3 in value2.items()])
        # Then, we need to plot with all the fun stuff
        line_plot([value3 for key3, value3 in value2.items()],
                  #ylims = prot_ylims[key2],
                  colours = ["hotpink",
                             "blue",
                             "indigo"],
                  markers = ["o", "s", "D"],
                  linestyles = ["dashdot" for i in range(3)],
                  xlabels = titr_conds,
                  show_points = False,
                  show_legend = True,
                  group_labs = ["DMSO", 
                                r"10 $\mu$M RDN",
                                r"10 $\mu$M Soq."],
                  ignore_comps = ignore,
                  statsfile = f"./stats/{key1}_{key2}_proteomics_stats",
              figfile = f"./figs/{key1}_{key2}_proteomics_graph.pdf",
              comp_type = "HolmSidak",
              ylabel = r"$\log_{2}$ Fold Change")
        break
    break
        

# Now I have all the data in dictionaries organised as follows:
    # Cell type/Stimulation type (ab, t2kb, car)
        # Western target (PLC ERK LAT LCK)
            # Drug/concentration
                # timepoints/data (as a key:value pair, turn into list things)
# Once I turn the sub-most dicts into lists, this should be an easy line plot <3

"""

  """


'\nprot_ylims = {"PLC": [-1,7],\n              "LAT": [-2,6]}\n\n\nall_prot_data = { stim : {targ : {"dmso" : {},\n                          "rdn" : {},\n                          "soq" : {}} for targ in proteomics_targets}\n            for stim in cell_stims}\n\n\nprot_files = glob.glob("./input_excels/proteomics*/*.xls")\n\ndd = manage_all_files(titr_files, cell_stims, proteomics_targets, all_prot_data, signals_per_group = 5)\ndd = {key1 : {key2 : {key3 : [[key4, value4] for key4, value4 in value3.items()] for key3, value3 in value2.items()}\n                    for key2, value2 in value1.items()}\n     for key1, value1 in dd.items()}\n\n# Key1 = stim type\nfor key1, value1 in dd.items():\n    # key2 = western target\n    for key2, value2 in value1.items():\n        print(key1, key2)\n        # value has drug_condition : timecourse_data, and we want to plot all of these on 1 graph for a target\n        # first, logically ignore comparisons that arent\'t relevant\n        g_labs = [ke