# **Spots analysis plotting**

This notebook plots:

*   Peak distances per condition
*   Peak FWHM (full width at half maximum) per condition
*   Heatmaps of tracks
*   Quality control of the spots data (peaks)

Written by Joanna Pylvänäinen

joanna.pylvanainen@abo.fi

In [None]:
# @title #Mount GDrive


# Mount Google Drive at /content/drive; Results_Folder in this notebook points below that path.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

!pip install fastdtw


In [None]:
# @title #Load useful functions


def normalize_to_01(series):
    """
    Linearly rescale a 1D series so that its minimum maps to 0 and its maximum to 1.

    Parameters:
        series (numpy.ndarray): The values to rescale.

    Returns:
        numpy.ndarray: The rescaled values. When every value is identical
        (zero range) an all-zero array shaped like ``series`` is returned
        instead of dividing by zero.
    """
    lowest = np.min(series)
    highest = np.max(series)

    # Only divide when there is a non-zero range to divide by
    if not (highest == lowest):
        return (series - lowest) / (highest - lowest)

    return np.zeros_like(series)

# Function to normalize by maximum amplitude
def normalize_by_max_amplitude(series):
    """
    Scale a series by its largest absolute value so the result lies in [-1, 1].

    Parameters:
        series (numpy.ndarray): The values to scale.

    Returns:
        numpy.ndarray: ``series`` divided by ``max(|series|)``. When the largest
        absolute value is 0 (all-zero input) a float array of zeros shaped like
        ``series`` is returned instead of the NaNs a 0/0 division would produce.
    """
    peak = np.max(np.abs(series))

    # All-zero input: nothing to scale, and dividing would give NaN
    if peak == 0:
        return np.zeros_like(series, dtype=float)

    return series / peak



---


# **Part 2: Analysing spot data**



---



In [None]:
# @title #Define results folder (raw data). csv files will be fetched from here.

# Set the Results_Folder path
# The cells below read their input CSV files from this folder and write their
# PDF/CSV outputs into it. The trailing "@param" comment exposes the value as a Colab form field.
Results_Folder = "/content/drive/MyDrive/Kurppa/2025_reanalysis/Cleaned_notebooks/results_spots"  # @param {type: "string"}

print(f"Results will be saved in: {Results_Folder}")


In [None]:
# @title #Load average_fwhm data and plot (raw data)

import pandas as pd
from scipy.stats import ttest_ind
from numpy import std
import os
import numpy as np
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt

# Build the input path inside Results_Folder and load the FWHM table
file_path = os.path.join(Results_Folder, 'data_for_Average_fwhm.csv')
df = pd.read_csv(file_path)

# Output PDF for this figure
pdf_path = os.path.join(Results_Folder, 'average_fwhm_raw.pdf')
print(pdf_path)

# Explicit figure/axes pair so both seaborn layers are drawn on the same axes
fig, ax = plt.subplots(figsize=(10, 5))

# Conditions sorted alphabetically; the same order is used for both layers so boxes and points line up
group_order = df['Condition'].sort_values().unique()

# Boxplot of the pooled data per condition
sns.boxplot(x='Condition', y='Average_fwhm', data=df,
            color='lightgray', order=group_order, ax=ax)

# Overlay the individual values, dodged and coloured by repeat
sns.stripplot(x='Condition', y='Average_fwhm', data=df,
              hue='Repeat', dodge=True, jitter=True,
              palette='magma', alpha=0.5, order=group_order, ax=ax)

ax.set_title('Average Full Width at Half Maximum (FWHM) by Condition and Repeat (raw)')

fig.tight_layout()

# The context manager closes the PDF even if saving fails, and the file is only
# opened once the figure has been built successfully.
with PdfPages(pdf_path) as pdf_pages:
    pdf_pages.savefig(fig)








In [None]:
# @title #Calculate statistics between conditions for Average_fwhm (raw data)

import pandas as pd
from scipy.stats import ttest_ind
from numpy import std
import os

# Read the FWHM table and keep only rows that have an Average_fwhm value
file_path = os.path.join(Results_Folder, 'data_for_Average_fwhm.csv')
df = pd.read_csv(file_path)
df = df.dropna(subset=['Average_fwhm'])

# Pairs of conditions (control, mutant) to test against each other
comparisons = [
    ('Control pool', 'Mutation #34'),
    ('Control pool', 'Mutation #38'),
    ('Control single cell', 'Mutation #34'),
    ('Control single cell', 'Mutation #38')
]

# Map every condition present in the table to its Average_fwhm values
condition_groups = {}
for condition in df['Condition'].unique():
    condition_groups[condition] = df.loc[df['Condition'] == condition, 'Average_fwhm']

# Report how many values each condition contributes
for condition, data in condition_groups.items():
    print(f'{condition} length: {len(data)}')

# Accumulators filled by the comparison loop below
t_statistics, p_values, cohen_ds = [], [], []

def cohen_d(group_a, group_b):
    """
    Cohen's d effect size for two independent samples.

    The difference of the means (group_a - group_b) is divided by the pooled
    standard deviation, i.e. the two within-group sample variances (ddof=1)
    weighted by their degrees of freedom. Using the standard deviation of the
    concatenated samples instead would mix the between-group difference into
    the denominator and shrink the effect size.

    Parameters:
        group_a: First sample; must provide len(), .mean() and .var(ddof=1)
            (a pandas Series in this notebook).
        group_b: Second sample, same requirements.

    Returns:
        float: The effect size. NaN when either group has fewer than two
        values, because its sample variance is undefined.
    """
    n_a, n_b = len(group_a), len(group_b)
    if n_a < 2 or n_b < 2:
        return float('nan')

    mean_diff = group_a.mean() - group_b.mean()
    pooled_var = ((n_a - 1) * group_a.var(ddof=1) + (n_b - 1) * group_b.var(ddof=1)) / (n_a + n_b - 2)
    return mean_diff / pooled_var ** 0.5

# Run every requested comparison; a pair is skipped when either condition is absent from the table
comparison_labels = []
for cond_a, cond_b in comparisons:
    if cond_a not in condition_groups or cond_b not in condition_groups:
        print(f'Skipping comparison {cond_a} vs {cond_b} due to missing data.')
        continue

    group_a = condition_groups[cond_a]
    group_b = condition_groups[cond_b]

    # Independent two-sample t-test (scipy defaults) and effect size
    t_stat, p_value = ttest_ind(group_a, group_b)
    d_value = cohen_d(group_a, group_b)

    comparison_labels.append(f'{cond_a} vs {cond_b}')
    t_statistics.append(t_stat)
    p_values.append(p_value)
    cohen_ds.append(d_value)

# One row per comparison that was actually run
results_df = pd.DataFrame({
    'Comparison': comparison_labels,
    'T-statistic': t_statistics,
    'P-value': p_values,
    "Cohen's d": cohen_ds
})

# All float columns are written with 5 decimals, so p-values below 0.000005 show up as 0.00000 in the CSV
results_csv_path = os.path.join(Results_Folder, 't_test_results_Average_fwhm_raw.csv')
results_df.to_csv(results_csv_path, index=False, float_format='%.5f')

# Echo the table and where it was written
print('T-test results:')
print(results_df)
print(f'Results exported to: {results_csv_path}')


In [None]:
# @title #Load average peak distance data, add Group information and plot (raw data)


import pandas as pd
from scipy.stats import ttest_ind
from numpy import std
import os
import numpy as np
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt

# Build the input path inside Results_Folder and load the peak-distance table
file_path = os.path.join(Results_Folder, 'data_for_Average_Peak_Distance.csv')
df = pd.read_csv(file_path)

# Output PDF for this figure
pdf_path = os.path.join(Results_Folder, 'average_peak_distance_raw.pdf')
print(pdf_path)

# Explicit figure/axes pair so both seaborn layers are drawn on the same axes
fig, ax = plt.subplots(figsize=(10, 5))

# Conditions sorted alphabetically; the same order is used for both layers so boxes and points line up
group_order = df['Condition'].sort_values().unique()

# Boxplot of the pooled data per condition
sns.boxplot(x='Condition', y='Average_Peak_Distance', data=df,
            color='lightgray', order=group_order, ax=ax)

# Overlay the individual values, dodged and coloured by repeat
sns.stripplot(x='Condition', y='Average_Peak_Distance', data=df,
              hue='Repeat', dodge=True, jitter=True,
              palette='magma', alpha=0.5, order=group_order, ax=ax)

ax.set_title('Average Peak Distance by Condition and Repeat (raw)')

fig.tight_layout()

# The context manager closes the PDF even if saving fails, and the file is only
# opened once the figure has been built successfully.
with PdfPages(pdf_path) as pdf_pages:
    pdf_pages.savefig(fig)







In [None]:
# @title #Calculate statistics between conditions for average_peak_distance (raw data)

import pandas as pd
from scipy.stats import ttest_ind
from numpy import std
import os

# Read the peak-distance table and keep only rows that have an Average_Peak_Distance value
file_path = os.path.join(Results_Folder, 'data_for_Average_Peak_Distance.csv')
df = pd.read_csv(file_path)
df = df.dropna(subset=['Average_Peak_Distance'])

# Pairs of conditions (control, mutant) to test against each other
comparisons = [
    ('Control pool', 'Mutation #34'),
    ('Control pool', 'Mutation #38'),
    ('Control single cell', 'Mutation #34'),
    ('Control single cell', 'Mutation #38')
]

# Map every condition present in the table to its Average_Peak_Distance values
condition_groups = {}
for condition in df['Condition'].unique():
    condition_groups[condition] = df.loc[df['Condition'] == condition, 'Average_Peak_Distance']

# Report how many values each condition contributes
for condition, data in condition_groups.items():
    print(f'{condition} length: {len(data)}')

# Accumulators filled by the comparison loop below
t_statistics, p_values, cohen_ds = [], [], []

def cohen_d(group_a, group_b):
    """
    Cohen's d effect size for two independent samples.

    The difference of the means (group_a - group_b) is divided by the pooled
    standard deviation, i.e. the two within-group sample variances (ddof=1)
    weighted by their degrees of freedom. Using the standard deviation of the
    concatenated samples instead would mix the between-group difference into
    the denominator and shrink the effect size.

    Parameters:
        group_a: First sample; must provide len(), .mean() and .var(ddof=1)
            (a pandas Series in this notebook).
        group_b: Second sample, same requirements.

    Returns:
        float: The effect size. NaN when either group has fewer than two
        values, because its sample variance is undefined.
    """
    n_a, n_b = len(group_a), len(group_b)
    if n_a < 2 or n_b < 2:
        return float('nan')

    mean_diff = group_a.mean() - group_b.mean()
    pooled_var = ((n_a - 1) * group_a.var(ddof=1) + (n_b - 1) * group_b.var(ddof=1)) / (n_a + n_b - 2)
    return mean_diff / pooled_var ** 0.5

# Run every requested comparison; a pair is skipped when either condition is absent from the table
comparison_labels = []
for cond_a, cond_b in comparisons:
    if cond_a not in condition_groups or cond_b not in condition_groups:
        print(f'Skipping comparison {cond_a} vs {cond_b} due to missing data.')
        continue

    group_a = condition_groups[cond_a]
    group_b = condition_groups[cond_b]

    # Independent two-sample t-test (scipy defaults) and effect size
    t_stat, p_value = ttest_ind(group_a, group_b)
    d_value = cohen_d(group_a, group_b)

    comparison_labels.append(f'{cond_a} vs {cond_b}')
    t_statistics.append(t_stat)
    p_values.append(p_value)
    cohen_ds.append(d_value)

# One row per comparison that was actually run
results_df = pd.DataFrame({
    'Comparison': comparison_labels,
    'T-statistic': t_statistics,
    'P-value': p_values,
    "Cohen's d": cohen_ds
})

# All float columns are written with 5 decimals, so p-values below 0.000005 show up as 0.00000 in the CSV
results_csv_path = os.path.join(Results_Folder, 't_test_Average_Peak_Distance_raw.csv')
results_df.to_csv(results_csv_path, index=False, float_format='%.5f')

# Echo the table and where it was written
print('T-test results:')
print(results_df)
print(f'Results exported to: {results_csv_path}')





---


# **Part 3: Data Balancing**



---



In [None]:
# @title ##Check the number of tracks per condition per repeat for plotting fwhm

import os
import matplotlib.pyplot as plt
import pandas as pd # Import pandas

# Path of the FWHM table inside Results_Folder
data_for_fwhm = os.path.join(Results_Folder, 'data_for_Average_fwhm.csv')
# The loaded table is also the input of the balancing cell further down, so keep the name stable
data_for_df= pd.read_csv(data_for_fwhm) # Read the CSV file into a DataFrame

def count_tracks_by_condition_and_repeat(df, Results_Folder, condition_col='Condition', repeat_col='Repeat',
                                         track_id_col='Unique_ID', pdf_name='Track_Counts_Histogram_fwhm.pdf'):
    """
    Counts the number of unique tracks for each combination of condition and repeat in the given DataFrame,
    draws the counts as an annotated stacked bar chart and saves the chart as a PDF.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the data.
    Results_Folder (str): The folder the PDF is written to (it must already exist).
    condition_col (str): The name of the column representing the condition. Default is 'Condition'.
    repeat_col (str): The name of the column representing the repeat. Default is 'Repeat'.
    track_id_col (str): The name of the column representing the track ID. Default is 'Unique_ID'.
    pdf_name (str): File name of the saved PDF inside Results_Folder.
        Default is 'Track_Counts_Histogram_fwhm.pdf'.

    Returns:
    pandas.DataFrame: One row per (condition, repeat) pair with the unique-track count in 'Number_of_Tracks'.
    """
    track_counts_df = (
        df.groupby([condition_col, repeat_col])[track_id_col]
        .nunique()
        .reset_index()
        .rename(columns={track_id_col: 'Number_of_Tracks'})
    )

    # Conditions as rows, repeats as columns; combinations that do not occur count as 0
    pivot_df = track_counts_df.pivot(index=condition_col, columns=repeat_col, values='Number_of_Tracks').fillna(0)

    # Plotting
    fig, ax = plt.subplots(figsize=(12, 6))
    pivot_df.plot(kind='bar', stacked=True, ax=ax)
    ax.set_xlabel('Condition')
    ax.set_ylabel('Number of Tracks')
    ax.set_title('Stacked Histogram of Track Counts per Condition and Repeat')
    ax.legend(title=repeat_col)

    # Keep horizontal grid lines hidden (the earlier call that switched them on
    # was cancelled immediately, so it has been dropped)
    ax.yaxis.grid(False)

    # Write the count in the middle of every bar segment
    for bar in ax.patches:
        ax.text(bar.get_x() + bar.get_width() / 2,
                bar.get_y() + bar.get_height() / 2,
                int(bar.get_height()),
                ha='center', va='center', color='black', fontweight='bold', fontsize=8)

    # Save this figure explicitly rather than whichever figure happens to be current
    pdf_file = os.path.join(Results_Folder, pdf_name)
    fig.savefig(pdf_file, bbox_inches='tight')
    print(f"Saved histogram to {pdf_file}")

    plt.show()

    return track_counts_df


# Make sure the QC folder exists; exist_ok=True replaces the separate exists() check,
# so nothing can go wrong between checking and creating
qc_folder = os.path.join(Results_Folder, "QC")
os.makedirs(qc_folder, exist_ok=True)

# Count tracks in the FWHM table and save the chart into the QC folder
result_df = count_tracks_by_condition_and_repeat(data_for_df, qc_folder)





In [None]:
# @title ##Run this cell to downsample and balance your fwhm dataset

!pip install tqdm  # Install the tqdm module
from tqdm import tqdm  # Import the tqdm function
import pandas as pd
import numpy as np
import os  # Import os to handle file paths

def balance_dataset_by_condition_and_repeat(df, condition_col='Condition', repeat_col='Repeat', random_seed=None):
    """
    Balances the dataset by downsampling rows for each (Condition, Repeat) group
    to match the smallest group size.

    Every group is sampled without replacement using ``random_state=random_seed``.
    The groups are sampled one by one instead of through ``groupby(...).apply``:
    applying a function to the grouping columns is deprecated in pandas 2.2 and
    newer pandas removes those columns from the result, which would drop the
    condition and repeat columns from the balanced table.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the data.
    condition_col (str): The name of the column representing the condition.
    repeat_col (str): The name of the column representing the repeat.
    random_seed (int, optional): The seed for the random number generator. Default is None.

    Returns:
    pandas.DataFrame: A new DataFrame with balanced row counts across (Condition, Repeat) groups,
    ordered by group and with a fresh 0..n-1 index. An empty input gives an empty result.
    """
    # Kept from the original implementation: also seeds NumPy's global generator
    np.random.seed(random_seed)

    group_cols = [condition_col, repeat_col]

    # Count the number of rows per (Condition, Repeat) combination
    group_counts = df.groupby(group_cols).size()
    min_row_count = group_counts.min()  # Find the smallest group size

    print(f"Balancing to {min_row_count} rows per (Condition, Repeat)")

    # Sample each group separately; iterating the groupby keeps all columns
    sampled_groups = [
        group.sample(n=min_row_count, replace=False, random_state=random_seed)
        for _, group in df.groupby(group_cols)
    ]

    # pd.concat cannot take an empty list, so handle "no groups" explicitly
    if not sampled_groups:
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(sampled_groups).reset_index(drop=True)

def replace_inf_with_nan(df, df_name):
    """
    Turn every +inf / -inf entry of the DataFrame into NaN, reporting each affected column.

    The replacement is done in place, so the DataFrame passed in is modified.

    Args:
    df (pd.DataFrame): DataFrame whose infinite values are replaced.
    df_name (str): Label for the DataFrame, used only in the printed messages.

    Returns:
    pd.DataFrame: The same DataFrame object, now without infinite values.
    """
    is_pos_inf = df == np.inf
    is_neg_inf = df == -np.inf
    has_inf = is_pos_inf.any() | is_neg_inf.any()

    # One message per column that holds at least one infinite value
    for col in df.columns[has_inf].tolist():
        inf_count = (is_pos_inf[col] | is_neg_inf[col]).sum()
        print(f"Column '{col}' in {df_name} contains {inf_count} infinity values. Replacing with NaN.")

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    return df

def check_for_nans(df, df_name):
    """
    Report how many NaN values each column of the DataFrame holds.

    Infinite values are converted to NaN first (via replace_inf_with_nan, which
    modifies the DataFrame in place), so they are included in the counts.

    Args:
    df (pd.DataFrame): DataFrame to inspect.
    df_name (str): Label for the DataFrame, used only in the printed messages.
    """
    cleaned = replace_inf_with_nan(df, df_name)

    nan_columns = [col for col in cleaned.columns[cleaned.isna().any()].tolist()]

    # Nothing missing: say so and stop
    if not nan_columns:
        print(f"No NaN values found in {df_name}.")
        return

    for col in nan_columns:
        nan_count = cleaned[col].isna().sum()
        print(f"Column '{col}' in {df_name} contains {nan_count} NaN values.")

def save_dataframe_with_progress(df, path, desc="Saving", chunk_size=50000):
    """
    Save a DataFrame as an uncompressed CSV file while showing a tqdm progress bar.

    The header is written first, then the rows in slices of at most
    ``chunk_size`` rows. Rows are sliced with ``iloc`` because splitting a
    DataFrame with ``np.array_split`` relies on the deprecated
    ``DataFrame.swapaxes``.

    Args:
    df (pd.DataFrame): DataFrame to write (the index is not written).
    path (str): Destination file; an existing file is overwritten.
    desc (str): Label shown next to the progress bar.
    chunk_size (int): Maximum number of rows written per progress update.

    Raises:
    ValueError: If chunk_size is not a positive number.
    """
    chunk_size = int(chunk_size)
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    total_rows = len(df)

    with tqdm(total=total_rows, unit="rows", desc=desc) as pbar:
        # newline="" leaves line endings to the CSV writer, as pandas asks for file objects
        with open(path, "w", newline="") as f:
            df.head(0).to_csv(f, index=False)

            for start in range(0, total_rows, chunk_size):
                chunk = df.iloc[start:start + chunk_size]
                chunk.to_csv(f, header=False, index=False)
                pbar.update(len(chunk))

# Single seed for the balancing step; it is passed on below instead of repeating the literal
random_seed = 42

# Folder that receives the balanced table and its track-count chart
balanced_folder = os.path.join(Results_Folder, "Balanced_dataset")
os.makedirs(balanced_folder, exist_ok=True)

# Check how many tracks exist per condition and repeat BEFORE balancing
print(data_for_df.groupby(['Condition', 'Repeat'])['Unique_ID'].nunique())

# Run the balancing function
balanced_data_for_df = balance_dataset_by_condition_and_repeat(data_for_df, random_seed=random_seed)

# Check how many rows exist per (Condition, Repeat) AFTER balancing
print(balanced_data_for_df.groupby(['Condition', 'Repeat']).size())

result_df = count_tracks_by_condition_and_repeat(balanced_data_for_df, balanced_folder)

# check_for_nans also replaces infinite values with NaN in place, so this must run before saving
check_for_nans(balanced_data_for_df, "balanced_data_for_df")
save_dataframe_with_progress(balanced_data_for_df, os.path.join(balanced_folder, 'fwhm_balanced_dataset.csv'))


In [None]:
# @title #Load average_fwhm data and plot (balanced data)


import pandas as pd
from scipy.stats import ttest_ind
from numpy import std
import os
import numpy as np
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt

# Load the balanced FWHM table written by the balancing cell
file_path = os.path.join(Results_Folder, 'Balanced_dataset/fwhm_balanced_dataset.csv')
df = pd.read_csv(file_path)

# Output PDF for this figure
pdf_path = os.path.join(Results_Folder, 'average_fwhm_balanced.pdf')
print(pdf_path)

# Explicit figure/axes pair so both seaborn layers are drawn on the same axes
fig, ax = plt.subplots(figsize=(10, 5))

# Conditions sorted alphabetically; the same order is used for both layers so boxes and points line up
group_order = df['Condition'].sort_values().unique()

# Boxplot of the pooled data per condition
sns.boxplot(x='Condition', y='Average_fwhm', data=df,
            color='lightgray', order=group_order, ax=ax)

# Overlay the individual values, dodged and coloured by repeat
sns.stripplot(x='Condition', y='Average_fwhm', data=df,
              hue='Repeat', dodge=True, jitter=True,
              palette='magma', alpha=0.5, order=group_order, ax=ax)

ax.set_title('Average Full Width at Half Maximum (FWHM) by Condition and Repeat')

# Fixed y-range: values above 60 fall outside the visible area of this plot
ax.set_ylim(0, 60)

fig.tight_layout()

# The context manager closes the PDF even if saving fails, and the file is only
# opened once the figure has been built successfully.
with PdfPages(pdf_path) as pdf_pages:
    pdf_pages.savefig(fig)








In [None]:
# @title #Calculate statistics between conditions for Average_fwhm (balanced data)

import pandas as pd
from scipy.stats import ttest_ind
from numpy import std
import os

# Read the balanced FWHM table and keep only rows that have an Average_fwhm value
file_path = os.path.join(Results_Folder, 'Balanced_dataset/fwhm_balanced_dataset.csv')
df = pd.read_csv(file_path)
df = df.dropna(subset=['Average_fwhm'])

# Pairs of conditions (control, mutant) to test against each other
comparisons = [
    ('Control pool', 'Mutation #34'),
    ('Control pool', 'Mutation #38'),
    ('Control single cell', 'Mutation #34'),
    ('Control single cell', 'Mutation #38')
]

# Map every condition present in the table to its Average_fwhm values
condition_groups = {}
for condition in df['Condition'].unique():
    condition_groups[condition] = df.loc[df['Condition'] == condition, 'Average_fwhm']

# Report how many values each condition contributes
for condition, data in condition_groups.items():
    print(f'{condition} length: {len(data)}')

# Accumulators filled by the comparison loop below
t_statistics, p_values, cohen_ds = [], [], []

def cohen_d(group_a, group_b):
    """
    Cohen's d effect size for two independent samples.

    The difference of the means (group_a - group_b) is divided by the pooled
    standard deviation, i.e. the two within-group sample variances (ddof=1)
    weighted by their degrees of freedom. Using the standard deviation of the
    concatenated samples instead would mix the between-group difference into
    the denominator and shrink the effect size.

    Parameters:
        group_a: First sample; must provide len(), .mean() and .var(ddof=1)
            (a pandas Series in this notebook).
        group_b: Second sample, same requirements.

    Returns:
        float: The effect size. NaN when either group has fewer than two
        values, because its sample variance is undefined.
    """
    n_a, n_b = len(group_a), len(group_b)
    if n_a < 2 or n_b < 2:
        return float('nan')

    mean_diff = group_a.mean() - group_b.mean()
    pooled_var = ((n_a - 1) * group_a.var(ddof=1) + (n_b - 1) * group_b.var(ddof=1)) / (n_a + n_b - 2)
    return mean_diff / pooled_var ** 0.5

# Run every requested comparison; a pair is skipped when either condition is absent from the table
comparison_labels = []
for cond_a, cond_b in comparisons:
    if cond_a not in condition_groups or cond_b not in condition_groups:
        print(f'Skipping comparison {cond_a} vs {cond_b} due to missing data.')
        continue

    group_a = condition_groups[cond_a]
    group_b = condition_groups[cond_b]

    # Independent two-sample t-test (scipy defaults) and effect size
    t_stat, p_value = ttest_ind(group_a, group_b)
    d_value = cohen_d(group_a, group_b)

    comparison_labels.append(f'{cond_a} vs {cond_b}')
    t_statistics.append(t_stat)
    p_values.append(p_value)
    cohen_ds.append(d_value)

# One row per comparison that was actually run
results_df = pd.DataFrame({
    'Comparison': comparison_labels,
    'T-statistic': t_statistics,
    'P-value': p_values,
    "Cohen's d": cohen_ds
})

# All float columns are written with 5 decimals, so p-values below 0.000005 show up as 0.00000 in the CSV
results_csv_path = os.path.join(Results_Folder, 't_test_results_Average_fwhm_balanced.csv')
results_df.to_csv(results_csv_path, index=False, float_format='%.5f')

# Echo the table and where it was written
print('T-test results:')
print(results_df)
print(f'Results exported to: {results_csv_path}')


In [None]:
# @title ##Check the number of tracks per condition per repeat for plotting peak distances

import os
import matplotlib.pyplot as plt
import pandas as pd # Import pandas

# Path of the peak-distance table inside Results_Folder
data_for_peaks = os.path.join(Results_Folder, 'data_for_Average_Peak_Distance.csv')
# The loaded table is also the input of the peaks balancing cell further down, so keep the name stable
peaks_df= pd.read_csv(data_for_peaks) # Read the CSV file into a DataFrame

def count_tracks_by_condition_and_repeat(df, Results_Folder, condition_col='Condition', repeat_col='Repeat',
                                         track_id_col='Unique_ID', pdf_name='Track_Counts_Histogram_peaks.pdf'):
    """
    Counts the number of unique tracks for each combination of condition and repeat in the given DataFrame,
    draws the counts as an annotated stacked bar chart and saves the chart as a PDF.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the data.
    Results_Folder (str): The folder the PDF is written to (it must already exist).
    condition_col (str): The name of the column representing the condition. Default is 'Condition'.
    repeat_col (str): The name of the column representing the repeat. Default is 'Repeat'.
    track_id_col (str): The name of the column representing the track ID. Default is 'Unique_ID'.
    pdf_name (str): File name of the saved PDF inside Results_Folder.
        Default is 'Track_Counts_Histogram_peaks.pdf'.

    Returns:
    pandas.DataFrame: One row per (condition, repeat) pair with the unique-track count in 'Number_of_Tracks'.
    """
    track_counts_df = (
        df.groupby([condition_col, repeat_col])[track_id_col]
        .nunique()
        .reset_index()
        .rename(columns={track_id_col: 'Number_of_Tracks'})
    )

    # Conditions as rows, repeats as columns; combinations that do not occur count as 0
    pivot_df = track_counts_df.pivot(index=condition_col, columns=repeat_col, values='Number_of_Tracks').fillna(0)

    # Plotting
    fig, ax = plt.subplots(figsize=(12, 6))
    pivot_df.plot(kind='bar', stacked=True, ax=ax)
    ax.set_xlabel('Condition')
    ax.set_ylabel('Number of Tracks')
    ax.set_title('Stacked Histogram of Track Counts per Condition and Repeat')
    ax.legend(title=repeat_col)

    # Keep horizontal grid lines hidden (the earlier call that switched them on
    # was cancelled immediately, so it has been dropped)
    ax.yaxis.grid(False)

    # Write the count in the middle of every bar segment
    for bar in ax.patches:
        ax.text(bar.get_x() + bar.get_width() / 2,
                bar.get_y() + bar.get_height() / 2,
                int(bar.get_height()),
                ha='center', va='center', color='black', fontweight='bold', fontsize=8)

    # Save this figure explicitly rather than whichever figure happens to be current
    pdf_file = os.path.join(Results_Folder, pdf_name)
    fig.savefig(pdf_file, bbox_inches='tight')
    print(f"Saved histogram to {pdf_file}")

    plt.show()

    return track_counts_df


# Make sure the QC folder exists; exist_ok=True replaces the separate exists() check,
# so nothing can go wrong between checking and creating
qc_folder = os.path.join(Results_Folder, "QC")
os.makedirs(qc_folder, exist_ok=True)

# Count tracks in the peak-distance table and save the chart into the QC folder
result_df = count_tracks_by_condition_and_repeat(peaks_df, qc_folder)





In [None]:
# @title ##Run this cell to downsample and balance your peaks dataset

!pip install tqdm  # Install the tqdm module
from tqdm import tqdm  # Import the tqdm function
import pandas as pd
import numpy as np
import os  # Import os to handle file paths

def balance_dataset_by_condition_and_repeat(df, condition_col='Condition', repeat_col='Repeat', random_seed=None):
    """
    Balances the dataset by downsampling rows for each (Condition, Repeat) group
    to match the smallest group size.

    Every group is sampled without replacement using ``random_state=random_seed``.
    The groups are sampled one by one instead of through ``groupby(...).apply``:
    applying a function to the grouping columns is deprecated in pandas 2.2 and
    newer pandas removes those columns from the result, which would drop the
    condition and repeat columns from the balanced table.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the data.
    condition_col (str): The name of the column representing the condition.
    repeat_col (str): The name of the column representing the repeat.
    random_seed (int, optional): The seed for the random number generator. Default is None.

    Returns:
    pandas.DataFrame: A new DataFrame with balanced row counts across (Condition, Repeat) groups,
    ordered by group and with a fresh 0..n-1 index. An empty input gives an empty result.
    """
    # Kept from the original implementation: also seeds NumPy's global generator
    np.random.seed(random_seed)

    group_cols = [condition_col, repeat_col]

    # Count the number of rows per (Condition, Repeat) combination
    group_counts = df.groupby(group_cols).size()
    min_row_count = group_counts.min()  # Find the smallest group size

    print(f"Balancing to {min_row_count} rows per (Condition, Repeat)")

    # Sample each group separately; iterating the groupby keeps all columns
    sampled_groups = [
        group.sample(n=min_row_count, replace=False, random_state=random_seed)
        for _, group in df.groupby(group_cols)
    ]

    # pd.concat cannot take an empty list, so handle "no groups" explicitly
    if not sampled_groups:
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(sampled_groups).reset_index(drop=True)

def replace_inf_with_nan(df, df_name):
    """
    Turn every +inf / -inf entry of the DataFrame into NaN, reporting each affected column.

    The replacement is done in place, so the DataFrame passed in is modified.

    Args:
    df (pd.DataFrame): DataFrame whose infinite values are replaced.
    df_name (str): Label for the DataFrame, used only in the printed messages.

    Returns:
    pd.DataFrame: The same DataFrame object, now without infinite values.
    """
    is_pos_inf = df == np.inf
    is_neg_inf = df == -np.inf
    has_inf = is_pos_inf.any() | is_neg_inf.any()

    # One message per column that holds at least one infinite value
    for col in df.columns[has_inf].tolist():
        inf_count = (is_pos_inf[col] | is_neg_inf[col]).sum()
        print(f"Column '{col}' in {df_name} contains {inf_count} infinity values. Replacing with NaN.")

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    return df

def check_for_nans(df, df_name):
    """
    Report how many NaN values each column of the DataFrame holds.

    Infinite values are converted to NaN first (via replace_inf_with_nan, which
    modifies the DataFrame in place), so they are included in the counts.

    Args:
    df (pd.DataFrame): DataFrame to inspect.
    df_name (str): Label for the DataFrame, used only in the printed messages.
    """
    cleaned = replace_inf_with_nan(df, df_name)

    nan_columns = [col for col in cleaned.columns[cleaned.isna().any()].tolist()]

    # Nothing missing: say so and stop
    if not nan_columns:
        print(f"No NaN values found in {df_name}.")
        return

    for col in nan_columns:
        nan_count = cleaned[col].isna().sum()
        print(f"Column '{col}' in {df_name} contains {nan_count} NaN values.")

def save_dataframe_with_progress(df, path, desc="Saving", chunk_size=50000):
    """
    Save a DataFrame as an uncompressed CSV file while showing a tqdm progress bar.

    The header is written first, then the rows in slices of at most
    ``chunk_size`` rows. Rows are sliced with ``iloc`` because splitting a
    DataFrame with ``np.array_split`` relies on the deprecated
    ``DataFrame.swapaxes``.

    Args:
    df (pd.DataFrame): DataFrame to write (the index is not written).
    path (str): Destination file; an existing file is overwritten.
    desc (str): Label shown next to the progress bar.
    chunk_size (int): Maximum number of rows written per progress update.

    Raises:
    ValueError: If chunk_size is not a positive number.
    """
    chunk_size = int(chunk_size)
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    total_rows = len(df)

    with tqdm(total=total_rows, unit="rows", desc=desc) as pbar:
        # newline="" leaves line endings to the CSV writer, as pandas asks for file objects
        with open(path, "w", newline="") as f:
            df.head(0).to_csv(f, index=False)

            for start in range(0, total_rows, chunk_size):
                chunk = df.iloc[start:start + chunk_size]
                chunk.to_csv(f, header=False, index=False)
                pbar.update(len(chunk))

# Single seed for the balancing step; it is passed on below instead of repeating the literal
random_seed = 42

# Folder that receives the balanced table and its track-count chart
balanced_folder = os.path.join(Results_Folder, "Balanced_dataset")
os.makedirs(balanced_folder, exist_ok=True)

# Check how many tracks exist per condition and repeat BEFORE balancing
print(peaks_df.groupby(['Condition', 'Repeat'])['Unique_ID'].nunique())

# Run the balancing function
balanced_peaks_df = balance_dataset_by_condition_and_repeat(peaks_df, random_seed=random_seed)

# Check how many rows exist per (Condition, Repeat) AFTER balancing
print(balanced_peaks_df.groupby(['Condition', 'Repeat']).size())

result_df = count_tracks_by_condition_and_repeat(balanced_peaks_df, balanced_folder)

# check_for_nans also replaces infinite values with NaN in place, so this must run before saving
check_for_nans(balanced_peaks_df, "balanced_peaks_df")
save_dataframe_with_progress(balanced_peaks_df, os.path.join(balanced_folder, 'peaks_balanced_dataset.csv'))


In [None]:
# @title #Load peak_distances data and plot (balanced data)


import pandas as pd
from scipy.stats import ttest_ind
from numpy import std
import os
import numpy as np
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt

# Load the balanced peak-distance table written by the balancing cell
file_path = os.path.join(Results_Folder, 'Balanced_dataset/peaks_balanced_dataset.csv')
df = pd.read_csv(file_path)

# Output PDF for this figure
pdf_path = os.path.join(Results_Folder, 'average_peaks_balanced.pdf')
print(pdf_path)

# Explicit figure/axes pair so both seaborn layers are drawn on the same axes
fig, ax = plt.subplots(figsize=(10, 5))

# Conditions sorted alphabetically; the same order is used for both layers so boxes and points line up
group_order = df['Condition'].sort_values().unique()

# Boxplot of the pooled data per condition
sns.boxplot(x='Condition', y='Average_Peak_Distance', data=df,
            color='lightgray', order=group_order, ax=ax)

# Overlay the individual values, dodged and coloured by repeat
sns.stripplot(x='Condition', y='Average_Peak_Distance', data=df,
              hue='Repeat', dodge=True, jitter=True,
              palette='magma', alpha=0.5, order=group_order, ax=ax)

ax.set_title('Average Peak distance by Condition and Repeat')

fig.tight_layout()

# The context manager closes the PDF even if saving fails, and the file is only
# opened once the figure has been built successfully.
with PdfPages(pdf_path) as pdf_pages:
    pdf_pages.savefig(fig)


In [None]:
# @title #Calculate statistics between conditions for average_peak_distance balanced

import pandas as pd
from scipy.stats import ttest_ind
from numpy import std
import os

# Read the balanced peak-distance table and keep only rows that have an Average_Peak_Distance value
file_path = os.path.join(Results_Folder, 'Balanced_dataset/peaks_balanced_dataset.csv')
df = pd.read_csv(file_path)
df = df.dropna(subset=['Average_Peak_Distance'])

# Pairs of conditions (control, mutant) to test against each other
comparisons = [
    ('Control pool', 'Mutation #34'),
    ('Control pool', 'Mutation #38'),
    ('Control single cell', 'Mutation #34'),
    ('Control single cell', 'Mutation #38')
]

# Map every condition present in the table to its Average_Peak_Distance values
condition_groups = {}
for condition in df['Condition'].unique():
    condition_groups[condition] = df.loc[df['Condition'] == condition, 'Average_Peak_Distance']

# Report how many values each condition contributes
for condition, data in condition_groups.items():
    print(f'{condition} length: {len(data)}')

# Accumulators filled by the comparison loop below
t_statistics, p_values, cohen_ds = [], [], []

def cohen_d(group_a, group_b):
    """
    Cohen's d effect size for two independent samples.

    The difference of the means (group_a - group_b) is divided by the pooled
    standard deviation, i.e. the two within-group sample variances (ddof=1)
    weighted by their degrees of freedom. Using the standard deviation of the
    concatenated samples instead would mix the between-group difference into
    the denominator and shrink the effect size.

    Parameters:
        group_a: First sample; must provide len(), .mean() and .var(ddof=1)
            (a pandas Series in this notebook).
        group_b: Second sample, same requirements.

    Returns:
        float: The effect size. NaN when either group has fewer than two
        values, because its sample variance is undefined.
    """
    n_a, n_b = len(group_a), len(group_b)
    if n_a < 2 or n_b < 2:
        return float('nan')

    mean_diff = group_a.mean() - group_b.mean()
    pooled_var = ((n_a - 1) * group_a.var(ddof=1) + (n_b - 1) * group_b.var(ddof=1)) / (n_a + n_b - 2)
    return mean_diff / pooled_var ** 0.5

# Run every requested comparison; a pair is skipped when either condition is absent from the table
comparison_labels = []
for cond_a, cond_b in comparisons:
    if cond_a not in condition_groups or cond_b not in condition_groups:
        print(f'Skipping comparison {cond_a} vs {cond_b} due to missing data.')
        continue

    group_a = condition_groups[cond_a]
    group_b = condition_groups[cond_b]

    # Independent two-sample t-test (scipy defaults) and effect size
    t_stat, p_value = ttest_ind(group_a, group_b)
    d_value = cohen_d(group_a, group_b)

    comparison_labels.append(f'{cond_a} vs {cond_b}')
    t_statistics.append(t_stat)
    p_values.append(p_value)
    cohen_ds.append(d_value)

# One row per comparison that was actually run
results_df = pd.DataFrame({
    'Comparison': comparison_labels,
    'T-statistic': t_statistics,
    'P-value': p_values,
    "Cohen's d": cohen_ds
})

# All float columns are written with 5 decimals, so p-values below 0.000005 show up as 0.00000 in the CSV
results_csv_path = os.path.join(Results_Folder, 't_test_Average_Peak_Distance_balanced.csv')
results_df.to_csv(results_csv_path, index=False, float_format='%.5f')

# Echo the table and where it was written
print('T-test results:')
print(results_df)
print(f'Results exported to: {results_csv_path}')

In [None]:
# @title #Load average peak distance data, add Group information and plot

import pandas as pd
from scipy.stats import ttest_ind
from numpy import std
import os
import numpy as np
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt

# Path of the balanced peaks dataset inside the results folder
file_path = os.path.join(Results_Folder, 'Balanced_dataset/peaks_balanced_dataset.csv')

# User input for distance cutoff
distance_cutoff_frames = 6  # @param {type: "integer"}

df = pd.read_csv(file_path)

# Keep rows at or above the cutoff (rows with a NaN distance fail the
# comparison and are therefore excluded as well)
df_filt = df[df['Average_Peak_Distance'] >= distance_cutoff_frames]

# Output PDF; the cutoff is part of the file name
pdf_path = os.path.join(Results_Folder, f'average_peak_distance_balanced_(Distance >= {distance_cutoff_frames}).pdf')  # Updated file name
print(pdf_path)

# Create a new figure and axes
fig, ax = plt.subplots(figsize=(10, 5))

# Conditions in alphabetical order along the x-axis
group_order = df_filt['Condition'].sort_values().unique()

# Boxplot of the pooled data per condition
sns.boxplot(x='Condition', y='Average_Peak_Distance', data=df_filt,
            color='lightgray', order=group_order, ax=ax)

# Overlay the individual data points, coloured and dodged by repeat
sns.stripplot(x='Condition', y='Average_Peak_Distance', data=df_filt,
              hue='Repeat', dodge=True, jitter=True,
              palette='magma', alpha=0.5, order=group_order, ax=ax)

ax.set_title(f'Average Peak Distance by Condition and Repeat (Distance >= {distance_cutoff_frames})')  # Updated title
fig.tight_layout()

# Open the PDF only once the figure is complete and let the context manager
# close it, so a plotting error cannot leave an unclosed or truncated file.
with PdfPages(pdf_path) as pdf_pages:
    pdf_pages.savefig(fig)

In [None]:
# @title #Calculate statistics between conditions for average_peak_distance with cut-off from previous cell

import pandas as pd
from scipy.stats import ttest_ind
from numpy import std
import os

# Keep only the cutoff-filtered rows that have an Average_Peak_Distance value
df = df_filt.dropna(subset=['Average_Peak_Distance'])

# Condition pairs to test: each control against each mutation
comparisons = [
    (control, mutation)
    for control in ('Control pool', 'Control single cell')
    for mutation in ('Mutation #34', 'Mutation #38')
]

# Split Average_Peak_Distance into one Series per condition, keyed by the
# condition label in order of first appearance in df.
condition_groups = {}
for condition in df['Condition'].unique():
    in_condition = df['Condition'] == condition
    condition_groups[condition] = df.loc[in_condition, 'Average_Peak_Distance']

# Report the sample size of each group
for condition, data in condition_groups.items():
    print(f'{condition} length: {len(data)}')

# Accumulators for the t-statistics, p-values and Cohen's d values of the comparisons
t_statistics, p_values, cohen_ds = [], [], []

def cohen_d(group_a, group_b):
    """Return Cohen's d effect size for two independent samples.

    d = (mean_a - mean_b) / s_pooled, where s_pooled is the pooled
    within-group standard deviation:

        s_pooled = sqrt((SS_a + SS_b) / (n_a + n_b - 2))

    with SS_x the sum of squared deviations of each group from its own mean.
    The standard deviation of the concatenated samples must not be used here:
    it also contains the between-group difference and therefore shrinks d.

    Parameters:
        group_a, group_b: 1D pandas Series or numpy arrays of observations.
            Assumed to contain no NaN values.

    Returns:
        float: Cohen's d (positive when group_a has the larger mean), or NaN
        when it is undefined (fewer than three observations in total, or zero
        within-group variance).
    """
    n_a, n_b = len(group_a), len(group_b)
    mean_a, mean_b = group_a.mean(), group_b.mean()

    dof = n_a + n_b - 2
    if dof <= 0:
        return float('nan')

    # Sums of squares are used instead of per-group variances so that a group
    # with a single observation contributes 0 rather than NaN.
    ss_a = ((group_a - mean_a) ** 2).sum()
    ss_b = ((group_b - mean_b) ** 2).sum()
    pooled_std = ((ss_a + ss_b) / dof) ** 0.5

    if pooled_std == 0:
        return float('nan')
    return (mean_a - mean_b) / pooled_std

comparison_labels = []
for cond_a, cond_b in comparisons:
    # Guard clause: a comparison needs both conditions to be present in the data
    if cond_a not in condition_groups or cond_b not in condition_groups:
        print(f'Skipping comparison {cond_a} vs {cond_b} due to missing data.')
        continue

    group_a = condition_groups[cond_a]
    group_b = condition_groups[cond_b]

    # Independent two-sample t-test plus effect size for this pair
    t_stat, p_value = ttest_ind(group_a, group_b)
    d_value = cohen_d(group_a, group_b)

    comparison_labels.append(f'{cond_a} vs {cond_b}')
    t_statistics.append(t_stat)
    p_values.append(p_value)
    cohen_ds.append(d_value)

# One row per comparison that was actually run
result_columns = {
    'Comparison': comparison_labels,
    'T-statistic': t_statistics,
    'P-value': p_values,
    "Cohen's d": cohen_ds,
}
results_df = pd.DataFrame(result_columns)

# Write the table to the results folder; the distance cutoff is part of the
# file name and every float is written with 5 decimals
results_csv_name = f't_test_Average_Peak_Distance_balanced (Distance >= {distance_cutoff_frames}).csv'
results_csv_path = os.path.join(Results_Folder, results_csv_name)
results_df.to_csv(results_csv_path, index=False, float_format='%.5f')

# Show the table and where it was saved
print('T-test results:')
print(results_df)
print(f'Results exported to: {results_csv_path}')

In [None]:
# @title #Plot heatmaps (balanced, normalized, red)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# User inputs
Folder_path = os.path.join(Results_Folder, 'merged_Spots_with_NumPeaks.csv')

# Load data
merged_df_balanced = pd.read_csv(Folder_path)

# Fail early, with a readable message, if a column this cell relies on is absent.
# (A missing 'Condition' column cannot be merged back in from the same table.)
required_columns = ['Unique_ID', 'Condition', 'POSITION_T', 'MEAN_INTENSITY_CH2']
missing_columns = [col for col in required_columns if col not in merged_df_balanced.columns]
if missing_columns:
    raise KeyError(f"{Folder_path} is missing required column(s): {missing_columns}")

# Google Colab Form Inputs
min_NUMBER_SPOTS = False  # @param {type:"boolean"}
min_TRACK_DURATION = True  # @param {type:"boolean"}
min_value = 90  # @param {type:"number"}  # Enter the minimum value for the selected variable
num_tracks_to_select = 20  # @param {type: "integer"}

# Apply the minimum-value filter to exactly one of the two variables;
# if both or neither are ticked, the data is left unfiltered.
if min_NUMBER_SPOTS and not min_TRACK_DURATION:
    filtered_df = merged_df_balanced[merged_df_balanced['NUMBER_SPOTS'] >= min_value]
elif min_TRACK_DURATION and not min_NUMBER_SPOTS:
    filtered_df = merged_df_balanced[merged_df_balanced['TRACK_DURATION'] >= min_value]
else:
    print("Select only one variable to filter using the minimum value.")
    filtered_df = merged_df_balanced.copy()

# Save filtered dataframe for inspection (next to the input CSV; the shared
# Results_Folder setting is deliberately not rebound here)
filtered_df.to_csv(os.path.join(Results_Folder, 'filtered_df.csv'), index=False)

# NOTE: tracks are not sorted in this cell, so the first tracks in file order
# are the ones selected below.

# Select the first num_tracks_to_select unique tracks of every condition.
# drop_duplicates + groupby.head always yields a flat Series of Unique_IDs,
# also when only one condition is present.
selected_tracks = (
    filtered_df
    .dropna(subset=['Condition'])
    .drop_duplicates(subset=['Condition', 'Unique_ID'])
    .groupby('Condition')
    .head(num_tracks_to_select)['Unique_ID']
    .reset_index(drop=True)
)

# Keep every spot of the selected tracks; copy so the new column below is
# added to an independent frame instead of a view of filtered_df
filtered_tracks_df = filtered_df[filtered_df['Unique_ID'].isin(selected_tracks)].copy()

# Combined "Condition_Unique_ID" label: one heatmap column per track
filtered_tracks_df['Condition_Unique_ID'] = (
    filtered_tracks_df['Condition'].astype(str) + '_' + filtered_tracks_df['Unique_ID'].astype(str)
)

# Rows = time points (POSITION_T), columns = tracks, values = mean CH2 intensity
heatmap_data = (
    filtered_tracks_df
    .pivot_table(index='POSITION_T', columns='Condition_Unique_ID', values='MEAN_INTENSITY_CH2', aggfunc='mean')
)

# Normalize each column between 0 and 1 (before filling NaN values)
def normalize(series):
    """Min-max scale one heatmap column (one track) to the range [0, 1].

    Parameters:
        series (pandas.Series): Intensity values of one track; NaN marks
            time points where the track has no value.

    Returns:
        pandas.Series: (series - min) / (max - min), with NaN entries left as
        NaN. A track whose values are all identical is returned as zeros
        (instead of 0/0 = NaN) so that it is not mistaken for missing data.
    """
    lowest = series.min()
    span = series.max() - lowest
    if span == 0:
        # Constant track: subtracting the minimum gives 0 where data exists
        # and keeps NaN where it does not.
        return series - lowest
    return (series - lowest) / span


# Scale every track (column) separately; missing time points stay NaN
heatmap_data = heatmap_data.apply(normalize, axis=0)

# True where a track has no value at that time point
nan_mask = heatmap_data.isna()

plt.figure(figsize=(14, 10))
sns.set(style="white")  # Set background to white

# Work on a copy so the shared viridis colormap object is not modified;
# masked (NaN) cells are drawn in black.
cmap = plt.cm.viridis.copy()
cmap.set_bad('black')

ax = sns.heatmap(
    heatmap_data,
    mask=nan_mask,
    cmap=cmap,  # Use the modified colormap
    cbar_kws={'label': 'Normalized Mean Intensity (CH2)'},
    linewidths=0.0,
    linecolor='white'
)

# Show one x-axis label per condition instead of one per track.
# Column labels are "<Condition>_<Unique_ID>"; split on the first underscore.
conditions, track_ids = zip(*[col.split('_', 1) for col in heatmap_data.columns])
heatmap_data.columns = conditions
unique_conditions, condition_positions, condition_counts = np.unique(
    conditions, return_index=True, return_counts=True
)
# Heatmap column i spans [i, i + 1] on the x-axis, so a block starting at `pos`
# with `count` tracks is centred at pos + count / 2. Using the real count keeps
# the label centred when a condition has fewer than num_tracks_to_select tracks.
# Assumes the columns of one condition are adjacent — TODO confirm for
# condition names that are prefixes of each other.
centered_positions = [pos + count / 2 for pos, count in zip(condition_positions, condition_counts)]
ax.set_xticks(centered_positions)
ax.set_xticklabels(unique_conditions, rotation=0, ha='center', fontsize=10, fontweight='bold')

plt.xlabel('Condition')
plt.ylabel('Time (POSITION_T)')
plt.title('Normalized Heatmap of MEAN_INTENSITY_CH2 Over Time by Condition')

# Save the heatmap as PDF before showing it
plt.savefig(os.path.join(Results_Folder, 'heatmap_red_normalized.pdf'), bbox_inches='tight')

plt.show()

# One row per plotted track (its first spot), so the loops below do not have
# to re-filter the full per-spot table for every single track.
plotted_tracks = filtered_tracks_df.drop_duplicates(subset=['Condition', 'Unique_ID'])

# Number of tracks plotted per condition
for condition in unique_conditions:
    num_tracks = int((plotted_tracks['Condition'] == condition).sum())
    print(f"Number of tracks plotted for {condition}: {num_tracks}")

# TRACK_DURATION and Num_Peaks of each plotted track (values of the track's first row)
for condition in unique_conditions:
    tracks_in_condition = plotted_tracks[plotted_tracks['Condition'] == condition]

    print(f"\nTracks plotted for {condition}:")
    for track_id, track_duration, num_peaks in zip(
        tracks_in_condition['Unique_ID'],
        tracks_in_condition['TRACK_DURATION'],
        tracks_in_condition['Num_Peaks'],
    ):
        print(f"  Track ID: {track_id}, TRACK_DURATION: {track_duration}, Num_Peaks: {num_peaks}")


In [None]:
# @title #Plot heatmaps (balanced, not normalized, red)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# User inputs
Folder_path = os.path.join(Results_Folder, 'merged_Spots_with_NumPeaks.csv')

# Load data
merged_df_balanced = pd.read_csv(Folder_path)

# Fail early, with a readable message, if a column this cell relies on is absent.
# (A missing 'Condition' column cannot be merged back in from the same table.)
required_columns = ['Unique_ID', 'Condition', 'POSITION_T', 'MEAN_INTENSITY_CH2',
                    'TRACK_DURATION', 'Num_Peaks']
missing_columns = [col for col in required_columns if col not in merged_df_balanced.columns]
if missing_columns:
    raise KeyError(f"{Folder_path} is missing required column(s): {missing_columns}")

# Google Colab Form Inputs
min_NUMBER_SPOTS = False  # @param {type:"boolean"}
min_TRACK_DURATION = True  # @param {type:"boolean"}
min_value = 80  # @param {type:"number"}  # Enter the minimum value for the selected variable
num_tracks_to_select = 20  # @param {type: "integer"}

# Apply the minimum-value filter to exactly one of the two variables;
# if both or neither are ticked, the data is left unfiltered.
if min_NUMBER_SPOTS and not min_TRACK_DURATION:
    filtered_df = merged_df_balanced[merged_df_balanced['NUMBER_SPOTS'] >= min_value]
elif min_TRACK_DURATION and not min_NUMBER_SPOTS:
    filtered_df = merged_df_balanced[merged_df_balanced['TRACK_DURATION'] >= min_value]
else:
    print("Select only one variable to filter using the minimum value.")
    filtered_df = merged_df_balanced.copy()

# Save filtered dataframe for inspection (next to the input CSV; the shared
# Results_Folder setting is deliberately not rebound here)
filtered_df.to_csv(os.path.join(Results_Folder, 'filtered_df.csv'), index=False)

# Longest tracks first; ties are ordered by the larger Num_Peaks
filtered_df = filtered_df.sort_values(by=['TRACK_DURATION', 'Num_Peaks'], ascending=False)

# Select the first num_tracks_to_select unique tracks of every condition.
# drop_duplicates + groupby.head always yields a flat Series of Unique_IDs,
# also when only one condition is present.
selected_tracks = (
    filtered_df
    .dropna(subset=['Condition'])
    .drop_duplicates(subset=['Condition', 'Unique_ID'])
    .groupby('Condition')
    .head(num_tracks_to_select)['Unique_ID']
    .reset_index(drop=True)
)

# Keep every spot of the selected tracks; copy so the new column below is
# added to an independent frame instead of a view of filtered_df
filtered_tracks_df = filtered_df[filtered_df['Unique_ID'].isin(selected_tracks)].copy()

# Combined "Condition_Unique_ID" label: one heatmap column per track
filtered_tracks_df['Condition_Unique_ID'] = (
    filtered_tracks_df['Condition'].astype(str) + '_' + filtered_tracks_df['Unique_ID'].astype(str)
)

# Rows = time points (POSITION_T), columns = tracks, values = mean CH2 intensity
heatmap_data = (
    filtered_tracks_df
    .pivot_table(index='POSITION_T', columns='Condition_Unique_ID', values='MEAN_INTENSITY_CH2', aggfunc='mean')
)

# True where a track has no value at that time point
nan_mask = heatmap_data.isna()

plt.figure(figsize=(14, 10))
sns.set(style="white")  # Set background to white

# Work on a copy so the shared viridis colormap object is not modified;
# masked (NaN) cells are drawn in black.
cmap = plt.cm.viridis.copy()
cmap.set_bad('black')

ax = sns.heatmap(
    heatmap_data,
    mask=nan_mask,
    cmap=cmap,  # Use the modified colormap
    cbar_kws={'label': 'Mean Intensity (CH2)'},  # Removed 'Normalized' from label
    linewidths=0.0,
    linecolor='white'
)

# Show one x-axis label per condition instead of one per track.
# Column labels are "<Condition>_<Unique_ID>"; split on the first underscore.
conditions, track_ids = zip(*[col.split('_', 1) for col in heatmap_data.columns])
heatmap_data.columns = conditions
unique_conditions, condition_positions, condition_counts = np.unique(
    conditions, return_index=True, return_counts=True
)
# Heatmap column i spans [i, i + 1] on the x-axis, so a block starting at `pos`
# with `count` tracks is centred at pos + count / 2. Using the real count keeps
# the label centred when a condition has fewer than num_tracks_to_select tracks.
# Assumes the columns of one condition are adjacent — TODO confirm for
# condition names that are prefixes of each other.
centered_positions = [pos + count / 2 for pos, count in zip(condition_positions, condition_counts)]
ax.set_xticks(centered_positions)
ax.set_xticklabels(unique_conditions, rotation=0, ha='center', fontsize=10, fontweight='bold')

plt.xlabel('Condition')
plt.ylabel('Time (POSITION_T)')
plt.title('Heatmap of MEAN_INTENSITY_CH2 Over Time by Condition')  # Updated title

# Save the heatmap as PDF before showing it
plt.savefig(os.path.join(Results_Folder, 'heatmap_red_raw.pdf'), bbox_inches='tight')

plt.show()

# One row per plotted track (its first spot), so the loops below do not have
# to re-filter the full per-spot table for every single track.
plotted_tracks = filtered_tracks_df.drop_duplicates(subset=['Condition', 'Unique_ID'])

# Number of tracks plotted per condition
for condition in unique_conditions:
    num_tracks = int((plotted_tracks['Condition'] == condition).sum())
    print(f"Number of tracks plotted for {condition}: {num_tracks}")

# TRACK_DURATION and Num_Peaks of each plotted track (values of the track's first row)
for condition in unique_conditions:
    tracks_in_condition = plotted_tracks[plotted_tracks['Condition'] == condition]

    print(f"\nTracks plotted for {condition}:")
    for track_id, track_duration, num_peaks in zip(
        tracks_in_condition['Unique_ID'],
        tracks_in_condition['TRACK_DURATION'],
        tracks_in_condition['Num_Peaks'],
    ):
        print(f"  Track ID: {track_id}, TRACK_DURATION: {track_duration}, Num_Peaks: {num_peaks}")


In [None]:
# @title #Plot heatmaps (balanced, normalized, green)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# User inputs
Folder_path = os.path.join(Results_Folder, 'merged_Spots_with_NumPeaks.csv')

# Load data
merged_df_balanced = pd.read_csv(Folder_path)

# Fail early, with a readable message, if a column this cell relies on is absent.
# (A missing 'Condition' column cannot be merged back in from the same table.)
required_columns = ['Unique_ID', 'Condition', 'POSITION_T', 'MEAN_INTENSITY_CH1',
                    'TRACK_DURATION', 'Num_Peaks']
missing_columns = [col for col in required_columns if col not in merged_df_balanced.columns]
if missing_columns:
    raise KeyError(f"{Folder_path} is missing required column(s): {missing_columns}")

# Google Colab Form Inputs
min_NUMBER_SPOTS = False  # @param {type:"boolean"}
min_TRACK_DURATION = True  # @param {type:"boolean"}
min_value = 70  # @param {type:"number"}  # Enter the minimum value for the selected variable
num_tracks_to_select = 20  # @param {type: "integer"}

# Apply the minimum-value filter to exactly one of the two variables;
# if both or neither are ticked, the data is left unfiltered.
if min_NUMBER_SPOTS and not min_TRACK_DURATION:
    filtered_df = merged_df_balanced[merged_df_balanced['NUMBER_SPOTS'] >= min_value]
elif min_TRACK_DURATION and not min_NUMBER_SPOTS:
    filtered_df = merged_df_balanced[merged_df_balanced['TRACK_DURATION'] >= min_value]
else:
    print("Select only one variable to filter using the minimum value.")
    filtered_df = merged_df_balanced.copy()

# Save filtered dataframe for inspection (next to the input CSV; the shared
# Results_Folder setting is deliberately not rebound here)
filtered_df.to_csv(os.path.join(Results_Folder, 'filtered_df.csv'), index=False)

# Longest tracks first; ties are ordered by the larger Num_Peaks
filtered_df = filtered_df.sort_values(by=['TRACK_DURATION', 'Num_Peaks'], ascending=False)

# Select the first num_tracks_to_select unique tracks of every condition.
# drop_duplicates + groupby.head always yields a flat Series of Unique_IDs,
# also when only one condition is present.
selected_tracks = (
    filtered_df
    .dropna(subset=['Condition'])
    .drop_duplicates(subset=['Condition', 'Unique_ID'])
    .groupby('Condition')
    .head(num_tracks_to_select)['Unique_ID']
    .reset_index(drop=True)
)

# Keep every spot of the selected tracks; copy so the new column below is
# added to an independent frame instead of a view of filtered_df
filtered_tracks_df = filtered_df[filtered_df['Unique_ID'].isin(selected_tracks)].copy()

# Combined "Condition_Unique_ID" label: one heatmap column per track
filtered_tracks_df['Condition_Unique_ID'] = (
    filtered_tracks_df['Condition'].astype(str) + '_' + filtered_tracks_df['Unique_ID'].astype(str)
)

# Rows = time points (POSITION_T), columns = tracks, values = mean CH1 intensity
heatmap_data = (
    filtered_tracks_df
    .pivot_table(index='POSITION_T', columns='Condition_Unique_ID', values='MEAN_INTENSITY_CH1', aggfunc='mean')
)

# Normalize each column between 0 and 1 (before filling NaN values)
def normalize(series):
    """Min-max scale one heatmap column (one track) to the range [0, 1].

    Parameters:
        series (pandas.Series): Intensity values of one track; NaN marks
            time points where the track has no value.

    Returns:
        pandas.Series: (series - min) / (max - min), with NaN entries left as
        NaN. A track whose values are all identical is returned as zeros
        (instead of 0/0 = NaN) so that it is not mistaken for missing data.
    """
    lowest = series.min()
    span = series.max() - lowest
    if span == 0:
        # Constant track: subtracting the minimum gives 0 where data exists
        # and keeps NaN where it does not.
        return series - lowest
    return (series - lowest) / span


# Scale every track (column) separately; missing time points stay NaN
heatmap_data = heatmap_data.apply(normalize, axis=0)

# True where a track has no value at that time point
nan_mask = heatmap_data.isna()

plt.figure(figsize=(14, 10))
sns.set(style="white")  # Set background to white

# Work on a copy so the shared viridis colormap object is not modified;
# masked (NaN) cells are drawn in black.
cmap = plt.cm.viridis.copy()
cmap.set_bad('black')

ax = sns.heatmap(
    heatmap_data,
    mask=nan_mask,
    cmap=cmap,  # Use the modified colormap
    cbar_kws={'label': 'Normalized Mean Intensity (CH1)'},
    linewidths=0.0,
    linecolor='white'
)

# Show one x-axis label per condition instead of one per track.
# Column labels are "<Condition>_<Unique_ID>"; split on the first underscore.
conditions, track_ids = zip(*[col.split('_', 1) for col in heatmap_data.columns])
heatmap_data.columns = conditions
unique_conditions, condition_positions, condition_counts = np.unique(
    conditions, return_index=True, return_counts=True
)
# Heatmap column i spans [i, i + 1] on the x-axis, so a block starting at `pos`
# with `count` tracks is centred at pos + count / 2. Using the real count keeps
# the label centred when a condition has fewer than num_tracks_to_select tracks.
# Assumes the columns of one condition are adjacent — TODO confirm for
# condition names that are prefixes of each other.
centered_positions = [pos + count / 2 for pos, count in zip(condition_positions, condition_counts)]
ax.set_xticks(centered_positions)
ax.set_xticklabels(unique_conditions, rotation=0, ha='center', fontsize=10, fontweight='bold')

plt.xlabel('Condition')
plt.ylabel('Time (POSITION_T)')
plt.title('Normalized Heatmap of MEAN_INTENSITY_CH1 Over Time by Condition')

# Save the heatmap as PDF before showing it
plt.savefig(os.path.join(Results_Folder, 'heatmap_green_normalized.pdf'), bbox_inches='tight')

plt.show()

# One row per plotted track (its first spot), so the loops below do not have
# to re-filter the full per-spot table for every single track.
plotted_tracks = filtered_tracks_df.drop_duplicates(subset=['Condition', 'Unique_ID'])

# Number of tracks plotted per condition
for condition in unique_conditions:
    num_tracks = int((plotted_tracks['Condition'] == condition).sum())
    print(f"Number of tracks plotted for {condition}: {num_tracks}")

# TRACK_DURATION and Num_Peaks of each plotted track (values of the track's first row)
for condition in unique_conditions:
    tracks_in_condition = plotted_tracks[plotted_tracks['Condition'] == condition]

    print(f"\nTracks plotted for {condition}:")
    for track_id, track_duration, num_peaks in zip(
        tracks_in_condition['Unique_ID'],
        tracks_in_condition['TRACK_DURATION'],
        tracks_in_condition['Num_Peaks'],
    ):
        print(f"  Track ID: {track_id}, TRACK_DURATION: {track_duration}, Num_Peaks: {num_peaks}")


In [None]:
# @title #Plot % of tracks with peaks

# Import necessary libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np  # Added import for np

# Load the per-track peak table from the results folder
file_path = os.path.join(Results_Folder, 'peak_info_all.csv')
peak_info = pd.read_csv(file_path)

# --- Ensure 'Has_Peak' is properly mapped ---
# Convert to string, ignore case, and map to boolean; anything else becomes NaN
has_peak = peak_info['Has_Peak'].astype(str).str.upper().map({'TRUE': True, 'FALSE': False})

# Check for invalid values in 'Has_Peak'
if has_peak.isnull().any():
    print("Warning: Some 'Has_Peak' values could not be converted. Check the data.")

# Convert to numeric (1 for True, 0 for False or unconvertible). Comparing with
# True avoids calling fillna on an object column, which pandas deprecates.
peak_info['Has_Peak'] = has_peak.eq(True).astype(int)

# --- Compute the percentage of peaks grouped by Condition and Repeat ---
peak_percentages = peak_info.groupby(['Condition', 'Repeat'])['Has_Peak'].mean() * 100

# Convert to a DataFrame for plotting
peak_percentages = peak_percentages.reset_index()

# Check if there is data to plot
if peak_percentages.empty:
    print("No data to plot. Check if 'Condition' and 'Repeat' columns are correct.")
else:
    # Both figures are written to the QC sub-folder; create it if it is missing,
    # otherwise savefig fails on a fresh results folder.
    os.makedirs(os.path.join(Results_Folder, 'QC'), exist_ok=True)

    # --- Bar Chart ---
    plt.figure(figsize=(12, 6))
    sns.barplot(x='Condition', y='Has_Peak', hue='Repeat', data=peak_percentages)
    plt.ylabel('% of Tracks with Peaks')
    plt.xlabel('Condition')
    plt.title('Percentage of Tracks with Peaks by Condition and Repeat')
    plt.legend(title='Repeat', bbox_to_anchor=(1, 1))
    plt.xticks(rotation=45)
    # Save the bar chart as PDF
    plt.savefig(os.path.join(Results_Folder, 'QC/procentage_of_tracks_per_condition.pdf'), bbox_inches='tight')
    plt.show()

    # --- Heatmap ---
    # Repeat x Condition table; combinations without data are shown as 0
    plt.figure(figsize=(10, 6))
    pivot_table = peak_percentages.pivot(index="Repeat", columns="Condition", values="Has_Peak").fillna(0)
    sns.heatmap(pivot_table, annot=True, cmap="coolwarm", fmt=".1f", linewidths=0.5)
    plt.xlabel('Condition')
    plt.ylabel('Repeat')
    plt.title('Heatmap of % Tracks with Peaks by Condition and Repeat')
    # Save the heatmap as PDF
    plt.savefig(os.path.join(Results_Folder, 'QC/procentage_of_tracks_per_condition_heatmap.pdf'), bbox_inches='tight')
    plt.show()


In [None]:
# @title #Plot histogram of the track peak count in each condition

# Import necessary libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Load the per-spot table (with the Num_Peaks column) from the results folder
file_path = os.path.join(Results_Folder, 'merged_Spots_with_NumPeaks.csv')
merged_df_balanced = pd.read_csv(file_path)

# Ensure Num_Peaks is numeric; unparseable entries become NaN
merged_df_balanced['Num_Peaks'] = pd.to_numeric(merged_df_balanced['Num_Peaks'], errors='coerce')

# Drop NaN values in Num_Peaks
merged_df_balanced = merged_df_balanced.dropna(subset=['Num_Peaks'])

# Aggregate by Unique_ID: Taking the AVERAGE Num_Peaks for each Unique_ID within each Condition
agg_df = merged_df_balanced.groupby(['Unique_ID', 'Condition'], as_index=False)['Num_Peaks'].mean()

# Without any track there is no maximum to derive the x-ticks from; stop with
# a clear message instead of failing on int(NaN).
if agg_df.empty:
    raise ValueError("No tracks with a numeric Num_Peaks value were found; nothing to plot.")

# Get the max value for setting x-ticks, ensuring it's an integer
max_value = int(agg_df['Num_Peaks'].max())

# Create the figure
plt.figure(figsize=(12, 6))

# One line per condition: fraction of tracks at each Num_Peaks value
for condition in agg_df['Condition'].unique():
    subset = agg_df[agg_df['Condition'] == condition]

    # Compute count of each Num_Peaks value
    peak_counts = subset['Num_Peaks'].value_counts().sort_index()

    # Normalize so the values of one condition sum to 1
    peak_density = peak_counts / peak_counts.sum()

    # Plot as a line
    plt.plot(peak_density.index, peak_density.values, marker='o', linestyle='-', label=condition)

# Set x-ticks to show all integer values
plt.xticks(range(0, max_value + 1, 1))

# Labels and title
plt.xlabel('Average Number of Peaks per Unique_ID')
plt.ylabel('Density')
plt.title('Number of Peaks Distribution by Condition (Averaged per Unique_ID)')
plt.legend(title='Condition')

# The figure goes to the QC sub-folder; create it if it is missing,
# otherwise savefig fails on a fresh results folder.
os.makedirs(os.path.join(Results_Folder, 'QC'), exist_ok=True)
plt.savefig(os.path.join(Results_Folder, 'QC/no_of_peaks_histogram.pdf'), bbox_inches='tight')

# Show the plot
plt.show()



