In [None]:
# Package Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import mplcursors
from scipy.optimize import curve_fit
import scipy.stats
# Qt popups for matplots instead of inline plots in jupyter notebook
%matplotlib qt

## Use the following cell if you want to use a file dialog to choose the folders containing your files

##### Note that with this method each dataset (e.g. FUS FL, SS18 PLD, etc.) needs its directory variable set in its own cell, for example:

FUS_FL_directory = select_directory()

New Cell 

SS18_PLD_directory = select_directory() 

etc. 

In [None]:
# Activate the Qt integration with Jupyter's event loop
%gui qt

from PyQt5.QtWidgets import QFileDialog

def select_directory():
    """
    Ask the user to pick a folder through a Qt dialog and return the result.

    Returns
    -------
    The value returned by ``QFileDialog.getExistingDirectory`` for a dialog titled
    "Select Folder" with no parent widget.

    Notes
    -----
    No QApplication is created here; the ``%gui qt`` magic run earlier in this cell is
    relied upon to provide the Qt event-loop integration.
    """
    return QFileDialog.getExistingDirectory(None, "Select Folder")

## Function Definitions

In [None]:
def _read_csv_or_none(file_path, source_label, include_source_file):
    """Read one CSV file; return its DataFrame, or None when it is empty or unreadable."""
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        # Deliberate best-effort: one bad file must not abort the whole batch.
        print(f"Error reading '{file_path}': {e}")
        return None
    if df.empty:
        print(f"Warning: '{file_path}' is empty and was skipped.")
        return None
    if include_source_file:
        df['source_file'] = source_label
    return df


def read_and_combine_csv_files(root_directory, file_suffix, include_source_file=True):
    """
    Read every CSV file whose name ends with ``file_suffix`` from ``root_directory`` and from
    its immediate subdirectories, and concatenate them into a single DataFrame.

    Only one level of subdirectories is scanned; deeper nesting is not visited. Files found
    directly in ``root_directory`` come first, followed by the files of each subdirectory.
    Directory listings are sorted by name so the row order is reproducible between runs.

    Parameters
    ----------
    root_directory : str
        Path of the directory to scan.
    file_suffix : str
        Suffix a file name must end with to be read (e.g. ``'_cell_df.csv'``).
    include_source_file : bool, optional
        If True (default), add a ``'source_file'`` column holding the file's path relative to
        ``root_directory`` (just the file name for files located directly in the root).

    Returns
    -------
    pd.DataFrame
        The combined data with a fresh 0..n-1 index. An empty DataFrame is returned when
        ``root_directory`` is empty/None or not an existing directory, when no file matches,
        or when every matching file is empty or unreadable.

    Notes
    -----
    Files that cannot be parsed, or that contain no rows, are reported with ``print`` and
    skipped; they never raise.
    """
    # An empty string is what a cancelled folder dialog typically hands back; treat it, and
    # any other non-directory, as "nothing to read" instead of letting os.listdir raise.
    if not root_directory or not os.path.isdir(root_directory):
        print(f"Warning: '{root_directory}' is not an existing directory; no files were read.")
        return pd.DataFrame()

    entries = sorted(os.listdir(root_directory))
    dataframes = []

    # Pass 1: matching files located directly in the root directory.
    for filename in entries:
        if filename.endswith(file_suffix):
            df = _read_csv_or_none(os.path.join(root_directory, filename), filename, include_source_file)
            if df is not None:
                dataframes.append(df)

    # Pass 2: matching files inside each immediate subdirectory.
    for folder_name in entries:
        directory = os.path.join(root_directory, folder_name)
        if not os.path.isdir(directory):
            continue
        for filename in sorted(os.listdir(directory)):
            if filename.endswith(file_suffix):
                df = _read_csv_or_none(
                    os.path.join(directory, filename),
                    os.path.join(folder_name, filename),
                    include_source_file,
                )
                if df is not None:
                    dataframes.append(df)

    if not dataframes:
        return pd.DataFrame()  # Nothing was read successfully
    return pd.concat(dataframes, ignore_index=True)


In [None]:
def generalized_relu(x, m, x0):
    """
    Rectified line with slope ``m`` and x-intercept ``x0``.

    Evaluates ``m * (x - x0)`` and replaces every value below zero with zero. ``x`` may be a
    scalar or an array; the element-wise maximum is taken with ``np.maximum``.
    """
    linear_part = m * (x - x0)
    return np.maximum(linear_part, 0)

def fit_c_sat_threshold(df_sorted, x_col, y_col):
    """
    Fit the generalized ReLU ``max(m * (x - x0), 0)`` to two columns of a DataFrame.

    Only the leading part of the data is fitted. Two row positions are located first: the
    first row whose y value is non-zero and the last row whose y value is exactly zero. The
    later of the two is the "anchor". The fit window runs from the first row up to, but not
    including, the first row whose x value reaches ``x[anchor] + 3 * std(x)``; if no x value
    is that large, the window stops one row before the end of the data.

    The threshold ``x0`` is constrained to lie between the x value at the earlier of the two
    positions and the first x value greater than ``x[anchor] + std(x) / 2``. The slope ``m``
    is unconstrained.

    Parameters
    ----------
    df_sorted : pd.DataFrame
        Data to fit, already sorted in ascending order of ``x_col``. Rows are addressed by
        position, so the index labels of the frame do not matter.
    x_col : str
        Name of the column holding the x data.
    y_col : str
        Name of the column holding the y data.

    Returns
    -------
    m_opt : float or None
        Fitted slope.
    x0_opt : float or None
        Fitted threshold (x-intercept of the linear part).
    pcov : 2D array or None
        Covariance matrix of ``(m, x0)`` as returned by ``curve_fit``.

    Notes
    -----
    All points carry the same weight; signal-to-noise based weighting is not implemented.
    Non-finite x/y pairs inside the window are dropped before fitting. Any failure (no zero
    rows in ``y_col``, no x value above the upper bound, inconsistent bounds, optimizer
    error, ...) is reported with ``print`` and signalled by returning ``(None, None, None)``.
    """
    try:
        x_series = df_sorted[x_col]
        x_all = x_series.to_numpy()
        y_all = df_sorted[y_col].to_numpy()

        std_dev = x_series.std()

        # Work with row positions throughout; mixing index labels with positional access
        # silently picks the wrong rows whenever the index is not 0..n-1.
        nonzero_positions = np.flatnonzero(y_all != 0)
        zero_positions = np.flatnonzero(y_all == 0)
        if zero_positions.size == 0:
            raise ValueError(f"column '{y_col}' contains no zero values, so no threshold region can be located")
        first_nonzero_pos = int(nonzero_positions[0]) if nonzero_positions.size else 0
        last_zero_pos = int(zero_positions[-1])

        # The earlier position sets the lower bound on x0; the later one anchors the window.
        region_start, region_end = sorted((first_nonzero_pos, last_zero_pos))
        anchor_x = x_all[region_end]

        # End of the fit window (exclusive).
        # NOTE(review): when the window would extend past the data, the last row is still
        # left out of the fit (exclusive upper bound) — kept as-is to preserve existing fit
        # results; confirm whether the final point should be included.
        target_value = anchor_x + 3 * std_dev
        if target_value > x_series.max():
            window_end = len(x_all) - 1
        else:
            window_end = int(np.flatnonzero(x_all >= target_value)[0])

        x_data = np.ravel(x_all[:window_end])
        y_data = np.ravel(y_all[:window_end])

        # Drop any pair containing NaN or Inf.
        finite_mask = np.isfinite(x_data) & np.isfinite(y_data)
        x_data = x_data[finite_mask]
        y_data = y_data[finite_mask]

        # Bounds: slope free, threshold confined to the transition region.
        above_anchor = np.flatnonzero(x_all > anchor_x + std_dev / 2)
        if above_anchor.size == 0:
            raise ValueError(f"no '{x_col}' value lies above the upper threshold bound")
        bounds = ([-np.inf, x_all[region_start]], [np.inf, x_all[int(above_anchor[0])]])

        # Uniform weights. The sigma expression is kept unchanged so that pcov, which is
        # computed with absolute_sigma=True, keeps the same scale as before.
        weights = np.ones(x_data.shape)

        popt, pcov = curve_fit(
            generalized_relu,
            x_data,
            y_data,
            bounds=bounds,
            sigma=1 / (weights + 1e-8),
            absolute_sigma=True,
        )

        m_opt, x0_opt = popt
        return m_opt, x0_opt, pcov
    except Exception as e:
        # Deliberate best-effort contract: callers check for None instead of catching.
        print(f"An error occurred during curve fitting: {e}")
        return None, None, None


In [None]:
def calculate_confidence_interval(x_data, y_data, y_pred, m_opt, x0_opt, alpha=0.05):
    """
    Point-wise confidence band around the predicted values of a two-parameter fit.

    The half-width at each x is
    ``t * s * sqrt(1/n + (x - mean(x))**2 / sum((x - mean(x))**2))``,
    where ``s`` is the residual standard deviation with ``n - 2`` degrees of freedom and
    ``t`` is the two-sided Student's t critical value for the requested level.

    Parameters
    ----------
    x_data : array-like
        Independent variable values (NumPy array or pandas Series).
    y_data : array-like
        Observed values corresponding to ``x_data``.
    y_pred : array-like
        Model predictions for ``x_data``.
    m_opt : float
        Fitted slope. Only counted as one of the two fitted parameters.
    x0_opt : float
        Fitted threshold. Only counted as one of the two fitted parameters.
    alpha : float, optional
        Significance level; the band has confidence ``1 - alpha``. Default 0.05 (95%).

    Returns
    -------
    y_lower : array-like
        Lower bound of the confidence band.
    y_upper : array-like
        Upper bound of the confidence band.

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly between 0 and 1.

    Notes
    -----
    With two or fewer observations there are no residual degrees of freedom; the band is
    then undefined and both bounds are returned filled with NaN.
    """
    if not 0 < alpha < 1:
        raise ValueError(f"alpha must lie strictly between 0 and 1, got {alpha}")

    n = len(y_data)
    p = 2  # number of fitted parameters: m_opt and x0_opt
    dof = n - p

    if dof <= 0:
        # Undefined band: avoid dividing by zero and return NaN bounds of matching length.
        undefined = np.full(len(y_pred), np.nan)
        return y_pred - undefined, y_pred + undefined

    t_stat = scipy.stats.t.ppf(1 - alpha / 2, dof)

    # Residual standard deviation and the leverage-style term of each x value.
    residual_std = np.sqrt(np.sum((y_data - y_pred) ** 2) / dof)
    x_centered = x_data - np.mean(x_data)
    ci = t_stat * residual_std * np.sqrt(1 / n + x_centered ** 2 / np.sum(x_centered ** 2))

    y_upper = y_pred + ci
    y_lower = y_pred - ci

    return y_lower, y_upper

def calculate_r_squared(y_true, y_pred):
    """
    Coefficient of determination (R^2) of a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed values of the dependent variable.
    y_pred : array-like
        Model predictions for the same observations.

    Returns
    -------
    float
        ``1 - SS_res / SS_tot``, where ``SS_res`` is the sum of squared differences between
        ``y_true`` and ``y_pred`` and ``SS_tot`` is the sum of squared deviations of
        ``y_true`` from its own mean. A value of 1 means the predictions match exactly.
    """
    residuals = y_true - y_pred
    deviations = y_true - np.mean(y_true)
    unexplained = np.sum(residuals ** 2)
    total = np.sum(deviations ** 2)
    return 1 - unexplained / total

def custom_legend(construct_name, color):
    """
    Draw a three-entry legend (data, fitted line, 95% CI) for one construct.

    Parameters
    ----------
    construct_name : str
        Name used as the prefix of every legend label.
    color : str
        Color used for the marker face of the data and CI entries and for the fitted line.

    Notes
    -----
    The legend is created with ``plt.legend`` and is built from stand-alone proxy
    ``Line2D`` handles rather than from artists already present in the plot.
    """
    from matplotlib.lines import Line2D

    # Proxy for the scatter points: a faint filled circle on an invisible (white) line.
    data_handle = Line2D(
        [0], [0], marker='o', color='w', markerfacecolor=color, markersize=10,
        label=f'{construct_name}: Data', alpha=0.2,
    )
    # Proxy for the fitted curve: a thick solid line.
    fit_handle = Line2D([0], [0], color=color, lw=4, label=f'{construct_name}: Fitted Line')
    # Proxy for the confidence band.
    ci_handle = Line2D(
        [0], [0], color='w', lw=4, label=f'{construct_name}: 95% CI', alpha=0.2,
        markerfacecolor=color, markersize=15,
    )

    plt.legend(handles=[data_handle, fit_handle, ci_handle], loc='best', fontsize='small')


In [None]:
def plot_c_sat_func(ax, df, x_col, y_col, construct_name, color, add_cursor=False):
    """
    Plot one dataset with its fitted saturation-threshold (generalized ReLU) curve.

    The data are sorted by ``x_col``, rows with NaN in ``x_col`` or ``y_col`` are dropped and
    the index is renumbered 0..n-1. ``fit_c_sat_threshold`` is then run on that frame and
    three elements are drawn on ``ax``: the raw points, the fitted curve and a shaded band
    from ``calculate_confidence_interval``. The fitted parameters, their standard errors and
    R^2 are printed to the console.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    df : pandas.DataFrame
        Data to plot; must contain ``x_col`` and ``y_col``.
    x_col : str
        Column used for the x axis.
    y_col : str
        Column used for the y axis.
    construct_name : str
        Name used in the fitted-line legend label and in the console output.
    color : str
        Color applied to the points, the line and the band.
    add_cursor : bool, optional
        If True, attach an mplcursors hover annotation showing the ``'source_file'`` and
        ``'label'`` columns of the hovered row. Default is False.

    Returns
    -------
    matplotlib.collections.PathCollection or None
        The scatter artist, or None when the fit failed (nothing is drawn in that case).
        Point ``i`` of the scatter corresponds to row ``i`` of
        ``df.sort_values(by=x_col).dropna(subset=[x_col, y_col]).reset_index(drop=True)``.

    Example
    -------
    fig, ax = plt.subplots()
    plot_c_sat_func(ax, my_dataframe, 'time', 'response', 'Experiment 1', 'blue')
    plt.show()
    """
    # Renumber the index only AFTER dropping NaN rows, so that row labels and row positions
    # coincide in the frame handed to the fit.
    df_sorted = (
        df.sort_values(by=x_col)
        .dropna(subset=[x_col, y_col])
        .reset_index(drop=True)
    )

    # Apply the curve fitting function
    try:
        m_opt, x0_opt, pcov = fit_c_sat_threshold(df_sorted, x_col, y_col)
    except Exception as e:
        print(f"Error in curve fitting: {e}")
        return None

    # fit_c_sat_threshold reports failure by returning None values rather than raising.
    if m_opt is None or x0_opt is None or pcov is None:
        print(f"Error in curve fitting: no fit could be obtained for '{construct_name}'")
        return None

    perr = np.sqrt(np.diag(pcov))  # Standard errors of the fitted parameters

    # Prepare data for plotting
    x_data = df_sorted[x_col]
    y_data = df_sorted[y_col]
    y_pred = np.maximum(0, m_opt * (x_data - x0_opt))  # Fitted function evaluated on all points

    # Scatter plot of the data points (left out of the legend on purpose: no label)
    scatter = ax.scatter(x_data, y_data, alpha=0.2, s=80, color=color)

    # Plot the fitted line
    ax.plot(x_data, y_pred, label=f'{construct_name}: Fitted Line (m={m_opt:.2f}, x0={x0_opt:.2f})', color=color, linewidth=2)

    # Calculate and plot confidence intervals
    ci_lower, ci_upper = calculate_confidence_interval(x_data, y_data, y_pred, m_opt, x0_opt)
    ax.fill_between(x_data, ci_lower, ci_upper, color=color, alpha=0.2)

    # R^2 Calculation
    r_squared = calculate_r_squared(y_data, y_pred)
    print(f'{construct_name} - Fitted Parameters: m={m_opt:.2f} ± {perr[0]:.2f}, x0={x0_opt:.2f} ± {perr[1]:.2f}, R^2={r_squared:.3f}')

    # Optionally add cursor if add_cursor is True
    if add_cursor:
        cursor = mplcursors.cursor(scatter, hover=True)
        cursor.connect("add", lambda sel: sel.annotation.set_text(
            f'File: {df_sorted["source_file"].iloc[sel.index]}\nCell: {df_sorted["label"].iloc[sel.index]}'))

    return scatter

In [None]:
def scatter_plot_func(ax, df, x_col, y_col, label, color):
    """
    Draw one dataset as a plain scatter series on an existing Axes.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    df : pandas.DataFrame
        Source of the data; ``df[x_col]`` and ``df[y_col]`` are plotted as given.
    x_col : str
        Column used for the x values.
    y_col : str
        Column used for the y values.
    label : str
        Legend label attached to the series.
    color : str
        Color of the points.

    Returns
    -------
    matplotlib.collections.PathCollection
        Whatever ``ax.scatter`` returns for this series.

    Notes
    -----
    Points are drawn with alpha 0.33 and marker size 60. Axis labels, limits, titles and
    the legend itself are left to the caller.

    Example
    -------
    fig, ax = plt.subplots()
    scatter_plot_func(ax, df, 'time', 'response', 'Dataset Label', 'blue')
    ax.set_xlabel('Time')
    ax.set_ylabel('Response')
    plt.legend()
    plt.show()
    """
    x_values = df[x_col]
    y_values = df[y_col]
    marker_style = dict(color=color, alpha=0.33, s=60, label=label)
    return ax.scatter(x_values, y_values, **marker_style)

In [None]:
def generate_scatter_plot(df_label_color_zip, x_col, y_col, x_title, y_title, x_lim, y_lim, fig_size, add_cursor=True, c_sat_mode=False):
    """
    Plot several datasets on one new figure, either as plain scatter series or as
    saturation-threshold plots (scatter + fitted curve + confidence band).

    Parameters
    ----------
    df_label_color_zip : iterable of tuples
        ``(DataFrame, label, color)`` triples, one per dataset. The iterable is consumed
        once, so a ``zip`` object must be rebuilt before calling this function again.
    x_col : str
        Column used for the x axis.
    y_col : str
        Column used for the y axis.
    x_title : str
        x-axis label.
    y_title : str
        y-axis label.
    x_lim : tuple or None
        Value passed to ``ax.set_xlim``.
    y_lim : tuple or None
        Value passed to ``ax.set_ylim``.
    fig_size : tuple
        Figure size ``(width, height)`` passed to ``plt.subplots``.
    add_cursor : bool, optional
        If True (default), attach one mplcursors hover annotation to all scatter series,
        showing the folder and file name taken from the ``'source_file'`` column and the
        value of the ``'label'`` column of the hovered row. A missing column is shown as an
        empty field.
    c_sat_mode : bool, optional
        If True, each dataset is drawn with ``plot_c_sat_func``; otherwise with
        ``scatter_plot_func``. Default is False.

    Returns
    -------
    None
        The figure is created and shown with ``plt.show()``.

    Notes
    -----
    Datasets for which ``plot_c_sat_func`` returns None (failed fit) are skipped.

    Example
    -------
    df_label_color_zip = [(df1, 'Experiment 1', 'blue'), (df2, 'Experiment 2', 'green')]
    generate_scatter_plot(df_label_color_zip, 'Time', 'Response', 'Time (s)', 'Response Value', (0, 100), (0, 10), (10, 8))
    """
    # Create figure and axes
    fig, ax = plt.subplots(figsize=fig_size)

    scatters = []  # Scatter artists handed to the hover cursor
    data_reference = {}  # Scatter artist -> DataFrame whose row i is scatter point i

    # Loop through datasets
    for df, construct_label, color in df_label_color_zip:
        if c_sat_mode:
            scatter = plot_c_sat_func(ax, df, x_col, y_col, construct_label, color)
            # plot_c_sat_func draws a sorted, NaN-free copy of df. Rebuild the same row order
            # here so that a hovered point index maps to the row that was actually plotted.
            plotted_df = df.sort_values(by=x_col).dropna(subset=[x_col, y_col]).reset_index(drop=True)
        else:
            scatter = scatter_plot_func(ax, df, x_col, y_col, construct_label, color)
            plotted_df = df

        if scatter is None:
            # Fit failed and nothing was drawn for this dataset.
            continue
        scatters.append(scatter)
        data_reference[scatter] = plotted_df

    # Configure global plot properties
    ax.set_xlabel(x_title, fontsize=16, labelpad=12)
    ax.set_ylabel(y_title, fontsize=16, labelpad=12)
    handles, _ = ax.get_legend_handles_labels()
    if handles:  # avoid an empty legend (and its warning) when nothing labeled was drawn
        ax.legend()

    ax.set_xlim(x_lim)
    ax.set_ylim(y_lim)
    fig.tight_layout()  # Optimize layout for readability on the figure level

    if add_cursor and scatters:
        def _column_value(frame, column, position):
            """Value of `column` at row `position`, or '' when the column is absent."""
            return frame[column].iloc[position] if column in frame.columns else ''

        def _annotate(sel):
            """Fill the hover box with the source folder, file name and cell label."""
            frame = data_reference[sel.artist]
            source = str(_column_value(frame, 'source_file', sel.index))
            sel.annotation.set_text(
                f"Location: {os.path.dirname(source)}\n"
                f"File: {os.path.basename(source)}\n"
                f"Cell: {_column_value(frame, 'label', sel.index)}")

        # One cursor shared by all scatter series
        cursor = mplcursors.cursor(scatters, hover=True)
        cursor.connect("add", _annotate)

    plt.show()

## The following cells are an example of how to choose a directory (with the file dialog, or by setting the path manually) and read the dataframe CSV files

In [None]:
# Choose the folder to read the FUS FL CSV files from, using the folder dialog
FUS_FL_dir = select_directory()

In [None]:
# Manual alternative to the dialog: uncomment the next line and edit the path.
#FUS_FL_dir = '/Users/christian.neureuter/Library/CloudStorage/Box-Box/Nuclear Condensates Shared Folder/Sam Analysis Folder/FUS Analysis Combined (latest version)/FUS FL Analysis Combined'
# Combine every '*_cell_df.csv' found in FUS_FL_dir and its immediate subfolders;
# each row is tagged with the file it came from in a 'source_file' column.
FUS_FL_df = read_and_combine_csv_files(FUS_FL_dir, '_cell_df.csv', include_source_file=True)

In [None]:
# Display the combined table to check that the files were read
FUS_FL_df

## The following cell is an example of how to plot one dataset 

In [None]:
# Define your datasets
df_list = [FUS_FL_df]
construct_list = ['FUS FL']
color_list = ['blue']

# Combine the datasets with the construct names and colors.
# zip() gives a one-shot iterator, so it has to be rebuilt before every call to generate_scatter_plot.
df_label_color_zip = zip(df_list, construct_list, color_list)

# Columns to plot, with the matching axis titles and (min, max) axis limits
x_column = 'intensity_total'
y_column = 'puncta_intensity_total'
x_title = 'Total Intensity (a.u.)'
y_title = 'Puncta Intensity (a.u.)'
x_limits = (0, 150)
y_limits = (0, 50)

# Figure size as (width, height)
fig_size = (10, 6)

# c_sat_mode=True draws each dataset with plot_c_sat_func instead of scatter_plot_func
generate_scatter_plot(df_label_color_zip, x_column, y_column, x_title, y_title, x_limits, y_limits, fig_size, add_cursor=True, c_sat_mode=True)

## Reading multiple datasets as separate dataframes

In [None]:
# Parent folder shared by every per-construct "... Analysis Combined" directory.
# Edit this single line when the data live somewhere else.
FUS_ANALYSIS_ROOT = '/Users/christian.neureuter/Library/CloudStorage/Box-Box/Nuclear Condensates Shared Folder/Sam Data Folder/New Q2 Data/Analysis/FUS Analysis/FUS Analysis Combined (latest version)'

# One directory per construct (the resulting strings are identical to the former full paths)
FUS_FL_directory = f'{FUS_ANALYSIS_ROOT}/FUS FL Analysis Combined'
FUS_SS18_directory = f'{FUS_ANALYSIS_ROOT}/FUS SS18 Analysis Combined'
SS18_PLD_directory = f'{FUS_ANALYSIS_ROOT}/SS18 PLD Analysis Combined'
FUS_del_RGG2_directory = f'{FUS_ANALYSIS_ROOT}/FUS del RGG2 Analysis Combined'
FUS_del_RGG3_directory = f'{FUS_ANALYSIS_ROOT}/FUS del RGG3 Analysis Combined'
FUS_del_RGG2_RGG3_directory = f'{FUS_ANALYSIS_ROOT}/FUS del RGG2 RGG3 Analysis Combined'
FUS_del_RRM_directory = f'{FUS_ANALYSIS_ROOT}/FUS del RRM Analysis Combined'
FUS_del_ZnF_directory = f'{FUS_ANALYSIS_ROOT}/FUS del ZnF Analysis Combined'

In [None]:
# Per-cell tables ('*_cell_df.csv'), one DataFrame per construct
FUS_FL_df = read_and_combine_csv_files(FUS_FL_directory, '_cell_df.csv', include_source_file=True)
FUS_SS18_df = read_and_combine_csv_files(FUS_SS18_directory, '_cell_df.csv', include_source_file=True)
SS18_PLD_df = read_and_combine_csv_files(SS18_PLD_directory, '_cell_df.csv', include_source_file=True)
FUS_del_RGG2_df = read_and_combine_csv_files(FUS_del_RGG2_directory, '_cell_df.csv', include_source_file=True)
FUS_del_RGG3_df = read_and_combine_csv_files(FUS_del_RGG3_directory, '_cell_df.csv', include_source_file=True)
FUS_del_RGG2_RGG3_df = read_and_combine_csv_files(FUS_del_RGG2_RGG3_directory, '_cell_df.csv', include_source_file=True)
# Required by the "FUS FL / FUS del RRM / FUS del ZnF" C-Sat comparison cell further down
FUS_del_RRM_df = read_and_combine_csv_files(FUS_del_RRM_directory, '_cell_df.csv', include_source_file=True)
FUS_del_ZnF_df = read_and_combine_csv_files(FUS_del_ZnF_directory, '_cell_df.csv', include_source_file=True)

# Per-punctum tables ('*_puncta_df.csv') required by the boxplot example further down
FUS_FL_puncta_df = read_and_combine_csv_files(FUS_FL_directory, '_puncta_df.csv', include_source_file=True)
FUS_SS18_puncta_df = read_and_combine_csv_files(FUS_SS18_directory, '_puncta_df.csv', include_source_file=True)
SS18_PLD_puncta_df = read_and_combine_csv_files(SS18_PLD_directory, '_puncta_df.csv', include_source_file=True)
FUS_del_RGG2_RGG3_puncta_df = read_and_combine_csv_files(FUS_del_RGG2_RGG3_directory, '_puncta_df.csv', include_source_file=True)

# Not used by any later cell; uncomment if you need them
#FUS_del_RGG2_puncta_df = read_and_combine_csv_files(FUS_del_RGG2_directory, '_puncta_df.csv', include_source_file=True)
#FUS_del_RGG3_puncta_df = read_and_combine_csv_files(FUS_del_RGG3_directory, '_puncta_df.csv', include_source_file=True)

## Setting up plot options and plotting multiple datasets on one graph (C-Sat)

In [None]:
# Define your datasets: the three lists must line up one-to-one, because zip() silently
# drops whatever is left over in the longer lists
df_list = [FUS_FL_df, FUS_SS18_df, SS18_PLD_df]
construct_list = ['FUS FL', 'FUS SS18', 'SS18 PLD']
color_list = ['g', 'b', 'r']
assert len(df_list) == len(construct_list) == len(color_list), "each dataset needs exactly one label and one color"

# Combine the datasets with the construct names and colors
# (zip() is a one-shot iterator: rebuild it before every call to generate_scatter_plot)
df_label_color_zip = zip(df_list, construct_list, color_list)

x_column = 'intensity_total'
y_column = 'puncta_intensity_total'
x_title = 'Total Intensity (a.u.)'
y_title = 'Puncta Intensity (a.u.)'
x_limits = (0, 150)
y_limits = (0, 50)

fig_size = (10, 6)

generate_scatter_plot(df_label_color_zip, x_column, y_column, x_title, y_title, x_limits, y_limits, fig_size, add_cursor=True, c_sat_mode=True)

#### Generic scatter plot setup

In [None]:
# Define your datasets: the three lists must line up one-to-one, because zip() silently
# drops whatever is left over in the longer lists
df_list = [FUS_FL_df, FUS_SS18_df, SS18_PLD_df]
construct_list = ['FUS FL', 'FUS SS18', 'SS18 PLD']
color_list = ['g', 'b', 'r']
assert len(df_list) == len(construct_list) == len(color_list), "each dataset needs exactly one label and one color"

# Combine the datasets with the construct names and colors
# (zip() is a one-shot iterator: rebuild it before every call to generate_scatter_plot)
df_label_color_zip = zip(df_list, construct_list, color_list)

x_column = 'intensity_total'
y_column = 'number_of_puncta'
x_title = 'Total Intensity (a.u.)'
y_title = 'Number of Foci per Cell'
x_limits = (0, 200)
y_limits = None  # passed straight to ax.set_ylim, i.e. no explicit y limits

fig_size = (10, 6)

# c_sat_mode=False: plain scatter series, no curve fitting
generate_scatter_plot(df_label_color_zip, x_column, y_column, x_title, y_title, x_limits, y_limits, fig_size, add_cursor=True, c_sat_mode=False)

In [None]:
# Different constructs plotted with the C-Sat function

In [None]:
# FUS_del_RRM_df and FUS_del_ZnF_df must already exist in the kernel: make sure their
# read_and_combine_csv_files lines in the dataset-loading cell have been run (enable them
# there if they are commented out), otherwise this cell stops with a NameError.
df_list = [FUS_FL_df, FUS_del_RRM_df, FUS_del_ZnF_df]
construct_list = ['FUS FL','FUS del RRM', 'FUS del ZnF']
color_list = ['g', 'y', 'k']

# Combine the datasets with the construct names and colors
df_label_color_zip = zip(df_list, construct_list, color_list)

# Columns to plot, with the matching axis titles and (min, max) axis limits
x_column = 'intensity_total'
y_column = 'puncta_intensity_total'
x_title = 'Total Intensity (a.u.)'
y_title = 'Puncta Intensity (a.u.)'
x_limits = (0, 150)
y_limits = (0, 50)

fig_size = (10, 6)

generate_scatter_plot(df_label_color_zip, x_column, y_column, x_title, y_title, x_limits, y_limits, fig_size, add_cursor=True, c_sat_mode=True)

In [None]:
# Different constructs again, but with the same plot options, hence not listed

In [None]:
# Full-length FUS compared with the RGG2/RGG3 deletion constructs
df_list = [FUS_FL_df, FUS_del_RGG2_RGG3_df, FUS_del_RGG2_df, FUS_del_RGG3_df]
construct_list = ['FUS FL','FUS del RGG2 del RGG3', 'FUS del RGG2', 'FUS del RGG3']
color_list = ['g', 'orange', 'c', 'm']

# Combine the datasets with the construct names and colors
df_label_color_zip = zip(df_list, construct_list, color_list)

# x_column, y_column, the titles, limits and fig_size are not set in this cell: they keep
# whatever values the most recently executed plotting cell assigned to them.
generate_scatter_plot(df_label_color_zip, x_column, y_column, x_title, y_title, x_limits, y_limits, fig_size, add_cursor=True, c_sat_mode=True)

## Example of boxplot 

#### Can be easily modified to work with various metrics for evaluation

In [None]:
# Datasets, labels and colors for the boxplot below (the three lists line up one-to-one).
# The *_puncta_df frames must already exist in the kernel: make sure their
# read_and_combine_csv_files(..., '_puncta_df.csv', ...) lines in the dataset-loading cell
# have been run (enable them there if they are commented out).
df_list = [FUS_FL_puncta_df, FUS_SS18_puncta_df, SS18_PLD_puncta_df, FUS_del_RGG2_RGG3_puncta_df]
construct_list = ['FUS FL', 'FUS SS18', 'SS18 PLD', 'FUS del RGG2 del RGG3']
color_list = ['g', 'b', 'r', 'orange']

In [None]:
# Stack the per-construct tables into one long DataFrame, tagging every row with its
# construct name. A single concat avoids re-copying the growing frame on every iteration,
# and ignore_index=True gives the result a clean 0..n-1 index.
df = pd.concat(
    [df_i.assign(construct=construct) for df_i, construct in zip(df_list, construct_list)],
    ignore_index=True,
)

# Create a boxplot of the 'micron area' column, one box per construct (outliers hidden)
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='construct', y='micron area', hue='construct', data=df, palette=color_list, showfliers=False, legend=False, ax=ax)
# Change the axis titles
ax.set_xlabel('Construct', fontsize=16, labelpad=12)
ax.set_ylabel('Foci Area (um^2)', fontsize=16, labelpad=12)
fig.tight_layout()
plt.show()

## Example of how to load and analyze other dataframes from PyCAT

In [None]:
# Folders holding the colocalization analysis output.
# Only GFP_mCherry_dir is read by the cells that follow in this notebook.
GFP_mCherry_dir = '/Users/christian.neureuter/Library/CloudStorage/Box-Box/Nuclear Condensates Shared Folder/Anushka Data Folder/Colocalization analysis for mCherry and GFP/mcherry and GFP 5-2-2024'
FUS_FOXG1_dir = '/Users/christian.neureuter/Library/CloudStorage/Box-Box/Nuclear Condensates Shared Folder/Anushka Data Folder/Colocalization analysis for FUSPLD and FOXG1/FUS-PLD and FOXG1 -IDR1 in vitro (2)/Colocalization analysis'


In [None]:
# Combine every '*_PWCCA_coefficient_df.csv' under GFP_mCherry_dir into one table
PWCCA_df = read_and_combine_csv_files(GFP_mCherry_dir, '_PWCCA_coefficient_df.csv', include_source_file=True)

In [None]:
# Work on a copy so that PWCCA_df itself stays untouched
df = PWCCA_df.copy()

In [None]:
# Selecting rows for a specific method
pearsons = df[df['Method'] == "Pearson's R value"]
weighted_taus = df[df['Method'] == "Weighted Tau value"]

# Row-wise mean over every column whose name contains 'Coefficient'. `.values[0]` keeps the
# result of the first matching row only, so if the table holds more than one row for a
# method, just the first one is reported.
# NOTE(review): the variables are named sum_* but hold means — confirm which was intended.
sum_pearsons = pearsons.filter(like='Coefficient').mean(axis=1).values[0]
sum_weighted_taus = weighted_taus.filter(like='Coefficient').mean(axis=1).values[0]

print("Avg of Pearson's Coefficients:", sum_pearsons)
print("Avg of Weighted Tau's Coefficients:", sum_weighted_taus)

In [None]:
# Display the working copy of the coefficient table
df

In [None]:
# Display the rows whose 'Method' is "Pearson's R value"
pearsons

# Playground

Use this area to modify code, test different parameters, etc. 

In [None]:
# Choose the FUS FL folder with the folder dialog
FUS_FL_dir = select_directory()

In [None]:
# Choose the FUS del RGG2 RGG3 folder with the folder dialog
FUS_del_RGG2_RGG3_dir = select_directory()

In [None]:
# Re-read the FUS FL per-cell tables from the folder chosen above (replaces any earlier FUS_FL_df)
FUS_FL_df = read_and_combine_csv_files(FUS_FL_dir, '_cell_df.csv', include_source_file=True)

In [None]:
# Re-read the FUS del RGG2 RGG3 per-cell tables from the folder chosen above (replaces any earlier FUS_del_RGG2_RGG3_df)
FUS_del_RGG2_RGG3_df = read_and_combine_csv_files(FUS_del_RGG2_RGG3_dir, '_cell_df.csv', include_source_file=True)

In [None]:
# Define your datasets: the three lists must line up one-to-one, because zip() silently
# drops whatever is left over in the longer lists.
# FUS_SS18_df and SS18_PLD_df are not loaded in the Playground; they come from the
# dataset-loading cell earlier in the notebook.
df_list = [FUS_FL_df, FUS_SS18_df, SS18_PLD_df]
construct_list = ['FUS FL', 'FUS SS18', 'SS18 PLD']
color_list = ['g', 'b', 'r']
assert len(df_list) == len(construct_list) == len(color_list), "each dataset needs exactly one label and one color"

# Combine the datasets with the construct names and colors
# (zip() is a one-shot iterator: rebuild it before every call to generate_scatter_plot)
df_label_color_zip = zip(df_list, construct_list, color_list)

x_column = 'intensity_total'
y_column = 'puncta_intensity_total'
x_title = 'Total Intensity (a.u.)'
y_title = 'Puncta Intensity (a.u.)'
x_limits = (0, 150)
y_limits = (0, 50)

fig_size = (10, 6)

generate_scatter_plot(df_label_color_zip, x_column, y_column, x_title, y_title, x_limits, y_limits, fig_size, add_cursor=True, c_sat_mode=True)