# **Track analysis for csv files from TrackMate**


This notebook analyses the track tables (csv files) exported by TrackMate. Among other things, it reports:

*   Mean track speed
*   Mean division time

Written by Joanna Pylvänäinen

joanna.pylvanainen@abo.fi

In [None]:
# @title #Mount GDrive


# Colab-only helper module that exposes Google Drive to this runtime.
from google.colab import drive
# Mount the Drive at /content/drive.
# NOTE(review): Folder_path / Results_Folder presumably point below this mount point — confirm.
drive.mount('/content/drive')

!pip install fastdtw


In [None]:
# @title #Load useful functions


def normalize_to_01(series):
    """
    Min-max normalize a time series so that its values span 0 to 1.

    Parameters:
        series (numpy.ndarray): The time series to be normalized.

    Returns:
        numpy.ndarray: The normalized time series. An empty input, or a
        constant series (max == min), yields floating-point zeros with the
        same shape as the input.
    """
    # Local import: this cell is defined before the cell that runs
    # `import numpy as np`, so do not rely on the notebook-level name.
    import numpy as np

    # np.min / np.max raise ValueError on empty input; return an empty result instead.
    if np.size(series) == 0:
        return np.zeros_like(series, dtype=float)

    Imin = np.min(series)
    Imax = np.max(series)

    # A flat series has no range to scale by; return float zeros so the
    # dtype matches the normal (floating-point) return path.
    if Imax == Imin:
        return np.zeros_like(series, dtype=float)

    return (series - Imin) / (Imax - Imin)

# Function to normalize by maximum amplitude
def normalize_by_max_amplitude(series):
    """
    Scale a time series by its largest absolute value.

    Parameters:
        series (numpy.ndarray): The time series to be scaled.

    Returns:
        numpy.ndarray: `series` divided by max(|series|). An empty input or an
        all-zero series yields floating-point zeros of the same shape
        (instead of a division by zero).
    """
    # Local import: this cell is defined before the cell that runs
    # `import numpy as np`.
    import numpy as np

    # np.max raises ValueError on empty input.
    if np.size(series) == 0:
        return np.zeros_like(series, dtype=float)

    max_amplitude = np.max(np.abs(series))

    # All values are zero: dividing would produce NaN and a runtime warning.
    if max_amplitude == 0:
        return np.zeros_like(series, dtype=float)

    return series / max_amplitude

# **Part 1: Analysing track data**



In [None]:
# @title #Load tracks


import pandas as pd
import glob
import os
import numpy as np

Folder_path = ''  # @param {type: "string"}
Results_Folder = ""  # @param {type: "string"}


# Wells belonging to each condition, listed in repeat order (1..5).
# Built once, outside the file loop, because the mapping does not depend on the file.
condition_wells = {
    'Control pool': ['C02', 'C03', 'C04', 'C05', 'C06'],
    'Control single cell': ['C07', 'C08', 'C09', 'C10', 'C11'],
    'Mutation #34': ['F02', 'F03', 'F04', 'F05', 'F06'],
    'Mutation #38': ['F07', 'F08', 'F09', 'F10', 'F11'],
}

# Well -> condition label
well_to_condition = {
    well: condition
    for condition, wells in condition_wells.items()
    for well in wells
}

# Well -> repeat number within its condition
well_order = {
    well: position
    for wells in condition_wells.values()
    for position, well in enumerate(wells, start=1)
}

# Use glob to match the filename pattern; sort so the row order of the merged
# table (and therefore any later seeded sampling) is reproducible between runs.
track_files = sorted(glob.glob(Folder_path+'/*tracks*.csv'))
if not track_files:
    raise FileNotFoundError(
        f"No files matching '*tracks*.csv' were found in Folder_path='{Folder_path}'."
    )

# Collect one DataFrame per file
df_list = []

for filepath in track_files:
    # Extract well and FOV information from the filename:
    # the first three characters are the well, the second '_'-separated token is the FOV.
    filename = os.path.basename(filepath)
    file_name_without_ext = os.path.splitext(filename)[0].replace('_tracking-tracks', '')
    well_info = filename[0:3]
    name_parts = filename.split('_')
    if len(name_parts) < 2:
        raise ValueError(
            f"Cannot read the FOV from '{filename}': expected a name of the form '<well>_<fov>_...'."
        )
    fov_info = name_parts[1]

    # Read each CSV file into a DataFrame.
    # Rows 1-3 are skipped; presumably these are TrackMate's extra header rows — confirm.
    df = pd.read_csv(filepath, skiprows=[1, 2, 3])

    # Add well, FOV, and file name information as new columns to the DataFrame
    df['Well'] = well_info
    df['FOV'] = fov_info
    df['File Name'] = file_name_without_ext

    # The well is constant within a file, so condition and repeat are scalars.
    # Wells outside the plate layout are labelled 'Unknown' and get repeat '1'.
    df['Condition'] = well_to_condition.get(well_info, 'Unknown')
    df['Repeat'] = str(well_order.get(well_info, 1))

    # Create the 'Unique_ID' column (file name + TrackMate track ID)
    df['Unique_ID'] = file_name_without_ext + '_' + df['TRACK_ID'].astype(str)

    df_list.append(df)

# Concatenate all the DataFrames together
merged_df = pd.concat(df_list, ignore_index=True)

# Make sure the output folder exists before writing into it
if Results_Folder:
    os.makedirs(Results_Folder, exist_ok=True)

# Save the merged DataFrame to a CSV file
merged_df.to_csv(os.path.join(Results_Folder, 'merged_Tracks.csv'), index=False)





In [None]:
# @title #Plot and extract useful data

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import os

#print(merged_df.columns)
#print(merged_df.head())


# Per-well boxplots of the track measurements in merged_df.
# List of variables (merged_df columns) to plot
variables_to_plot = ["DIVISION_TIME_MEAN", "TRACK_MEAN_SPEED", "TOTAL_DISTANCE_TRAVELED", "TRACK_DURATION"]
#variables_to_plot = ["TRACK_MEAN_SPEED", "TOTAL_DISTANCE_TRAVELED", "TRACK_DURATION"]

# Output folder for the figures. The same joined path is used for creating the
# folder and for the PDF, so the two can never disagree.
plots_folder = os.path.join(Results_Folder, 'plots')
os.makedirs(plots_folder, exist_ok=True)
pdf_path = os.path.join(plots_folder, 'Boxplots_raw.pdf')

# Wells in alphabetical order (does not depend on the plotted variable)
group_order = merged_df['Well'].sort_values().unique()

# One subplot per variable. squeeze=False keeps `axes` a 2-D array even when
# only one variable is listed, so the loop below always works.
fig, axes = plt.subplots(len(variables_to_plot), 1, figsize=(10, 20), squeeze=False)

for ax, var in zip(axes[:, 0], variables_to_plot):
    # Extract the data for this variable
    data_for_var = merged_df[['Well', 'FOV', var]]

    # Save this data to a CSV file
    data_for_var.to_csv(os.path.join(Results_Folder, f"data_for_{var}.csv"), index=False)

    sns.boxplot(x='Well', y=var, data=merged_df, ax=ax, color='lightgray', order=group_order)  # Boxplot
    #sns.stripplot(x='Well', y=var, data=merged_df, ax=ax, hue='FOV', dodge=True, jitter=True, alpha=0.2)  # Individual data points

    ax.set_title(f"{var}")
    ax.set_xlabel('Well')
    ax.set_ylabel(var)

# Save the figure to a PDF. The PDF is opened only once the figure is complete
# and the context manager closes it even if saving fails.
plt.tight_layout()
with PdfPages(pdf_path) as pdf_pages:
    pdf_pages.savefig(fig)


In [None]:
# @title #Plot and extract useful data and pool conditions

#Filter_tracks = 0  # @param {type: "number"}

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

# Boxplots of the track measurements in merged_df, pooled per condition.
# List of variables (merged_df columns) to plot
variables_to_plot = ["DIVISION_TIME_MEAN", "TRACK_MEAN_SPEED", "TOTAL_DISTANCE_TRAVELED", "TRACK_DURATION"]
#variables_to_plot = ["TRACK_MEAN_SPEED", "TOTAL_DISTANCE_TRAVELED", "TRACK_DURATION"]

# Output folder for the figures; created here so this cell does not depend on
# an earlier plotting cell having been run.
plots_folder = os.path.join(Results_Folder, 'plots')
os.makedirs(plots_folder, exist_ok=True)
pdf_path = os.path.join(plots_folder, 'Boxplots_pooled_raw.pdf')

# Conditions in alphabetical order (does not depend on the plotted variable)
group_order = merged_df['Condition'].sort_values().unique()

# One subplot per variable. squeeze=False keeps `axes` a 2-D array even when
# only one variable is listed.
fig, axes = plt.subplots(len(variables_to_plot), 1, figsize=(10, 20), squeeze=False)

for ax, var in zip(axes[:, 0], variables_to_plot):
    # Extract the data for this variable
    data_for_var = merged_df[['Well', 'FOV', 'Condition', var]]

    # Save this data to a CSV file. This reuses the file name written by the
    # per-well plotting cell and adds the 'Condition' column, which the
    # statistics cells read.
    data_for_var.to_csv(os.path.join(Results_Folder, f"data_for_{var}.csv"), index=False)

    sns.boxplot(x='Condition', y=var, data=merged_df, ax=ax, color='lightgray', order=group_order)
    #sns.stripplot(x='Well', y=var, data=merged_df, ax=ax, hue='FOV', dodge=True, jitter=True, alpha=0.2)  # Individual data points

    ax.set_title(f"{var}")
    ax.set_xlabel('Condition')
    ax.set_ylabel(var)

# Save the figure to a PDF; the context manager closes the file even if saving fails.
plt.tight_layout()
with PdfPages(pdf_path) as pdf_pages:
    pdf_pages.savefig(fig)

In [None]:
# @title #Calculate statistics between conditions for DIVISION_TIME_MEAN

import pandas as pd
from scipy.stats import ttest_ind
from numpy import std
import os

# Read the exported DIVISION_TIME_MEAN table and keep only rows that have a value
file_path = os.path.join(Results_Folder, 'data_for_DIVISION_TIME_MEAN.csv')
df = pd.read_csv(file_path).dropna(subset=['DIVISION_TIME_MEAN'])

# Every control is compared against every mutant
comparisons = [
    (control, mutant)
    for control in ('Control pool', 'Control single cell')
    for mutant in ('Mutation #34', 'Mutation #38')
]

# DIVISION_TIME_MEAN values keyed by condition
condition_groups = {}
for condition_name in df['Condition'].unique():
    condition_groups[condition_name] = df.loc[df['Condition'] == condition_name, 'DIVISION_TIME_MEAN']

# Report the sample size of each condition
for condition_name, values in condition_groups.items():
    print(f'{condition_name} length: {len(values)}')

# Accumulators filled by the comparison loop
t_statistics, p_values, cohen_ds = [], [], []

def cohen_d(group_a, group_b):
    """
    Cohen's d effect size between two independent samples.

    The mean difference (a - b) is divided by the pooled standard deviation
    sqrt((SS_a + SS_b) / (n_a + n_b - 2)), where SS is each group's sum of
    squared deviations from its own mean. Taking the standard deviation of
    the concatenated samples instead would add the between-group difference
    to the denominator and shrink the effect size.

    Parameters:
        group_a, group_b (pandas.Series): The two samples.

    Returns:
        float: Cohen's d, or NaN when a group is empty, when there are fewer
        than three observations in total, or when the pooled SD is zero.
    """
    n_a, n_b = len(group_a), len(group_b)
    dof = n_a + n_b - 2
    if n_a == 0 or n_b == 0 or dof <= 0:
        return float('nan')

    mean_a, mean_b = group_a.mean(), group_b.mean()

    # Within-group sums of squares, pooled over both groups
    sum_sq = ((group_a - mean_a) ** 2).sum() + ((group_b - mean_b) ** 2).sum()
    pooled_std = (sum_sq / dof) ** 0.5

    # No within-group spread: the effect size is undefined
    if pooled_std == 0:
        return float('nan')

    return (mean_a - mean_b) / pooled_std

comparison_labels = []
for cond_a, cond_b in comparisons:
    # Both conditions must be present in the data to be compared
    if cond_a not in condition_groups or cond_b not in condition_groups:
        print(f'Skipping comparison {cond_a} vs {cond_b} due to missing data.')
        continue

    group_a = condition_groups[cond_a]
    group_b = condition_groups[cond_b]

    # Independent two-sample t-test and effect size
    t_stat, p_value = ttest_ind(group_a, group_b)
    d_value = cohen_d(group_a, group_b)

    comparison_labels.append(f'{cond_a} vs {cond_b}')
    t_statistics.append(t_stat)
    p_values.append(p_value)
    cohen_ds.append(d_value)

# One row per comparison
results_columns = {}
results_columns['Comparison'] = comparison_labels
results_columns['T-statistic'] = t_statistics
results_columns['P-value'] = p_values
results_columns["Cohen's d"] = cohen_ds
results_df = pd.DataFrame(results_columns)

# Write the table to CSV; all floats are written with 5 decimal places
results_csv_path = os.path.join(Results_Folder, 't_test_results_DIVISION_TIME_MEAN.csv')
results_df.to_csv(results_csv_path, index=False, float_format='%.5f')

# Show the table and where it was saved
print('T-test results:')
print(results_df)
print(f'Results exported to: {results_csv_path}')



In [None]:
# @title #Calculate statistics between conditions for TRACK_MEAN_SPEED

import pandas as pd
from scipy.stats import ttest_ind
from numpy import std
import os

# Read the exported TRACK_MEAN_SPEED table and keep only rows that have a value
file_path = os.path.join(Results_Folder, 'data_for_TRACK_MEAN_SPEED.csv')
df = pd.read_csv(file_path).dropna(subset=['TRACK_MEAN_SPEED'])

# Every control is compared against every mutant
comparisons = [
    (control, mutant)
    for control in ('Control pool', 'Control single cell')
    for mutant in ('Mutation #34', 'Mutation #38')
]

# TRACK_MEAN_SPEED values keyed by condition
condition_groups = {}
for condition_name in df['Condition'].unique():
    condition_groups[condition_name] = df.loc[df['Condition'] == condition_name, 'TRACK_MEAN_SPEED']

# Report the sample size of each condition
for condition_name, values in condition_groups.items():
    print(f'{condition_name} length: {len(values)}')

# Accumulators filled by the comparison loop
t_statistics, p_values, cohen_ds = [], [], []

def cohen_d(group_a, group_b):
    """
    Cohen's d effect size between two independent samples.

    The mean difference (a - b) is divided by the pooled standard deviation
    sqrt((SS_a + SS_b) / (n_a + n_b - 2)), where SS is each group's sum of
    squared deviations from its own mean. Taking the standard deviation of
    the concatenated samples instead would add the between-group difference
    to the denominator and shrink the effect size.

    Parameters:
        group_a, group_b (pandas.Series): The two samples.

    Returns:
        float: Cohen's d, or NaN when a group is empty, when there are fewer
        than three observations in total, or when the pooled SD is zero.
    """
    n_a, n_b = len(group_a), len(group_b)
    dof = n_a + n_b - 2
    if n_a == 0 or n_b == 0 or dof <= 0:
        return float('nan')

    mean_a, mean_b = group_a.mean(), group_b.mean()

    # Within-group sums of squares, pooled over both groups
    sum_sq = ((group_a - mean_a) ** 2).sum() + ((group_b - mean_b) ** 2).sum()
    pooled_std = (sum_sq / dof) ** 0.5

    # No within-group spread: the effect size is undefined
    if pooled_std == 0:
        return float('nan')

    return (mean_a - mean_b) / pooled_std

comparison_labels = []
for cond_a, cond_b in comparisons:
    # Both conditions must be present in the data to be compared
    if cond_a not in condition_groups or cond_b not in condition_groups:
        print(f'Skipping comparison {cond_a} vs {cond_b} due to missing data.')
        continue

    group_a = condition_groups[cond_a]
    group_b = condition_groups[cond_b]

    # Independent two-sample t-test and effect size
    t_stat, p_value = ttest_ind(group_a, group_b)
    d_value = cohen_d(group_a, group_b)

    comparison_labels.append(f'{cond_a} vs {cond_b}')
    t_statistics.append(t_stat)
    p_values.append(p_value)
    cohen_ds.append(d_value)

# One row per comparison
results_columns = {}
results_columns['Comparison'] = comparison_labels
results_columns['T-statistic'] = t_statistics
results_columns['P-value'] = p_values
results_columns["Cohen's d"] = cohen_ds
results_df = pd.DataFrame(results_columns)

# Write the table to CSV; all floats are written with 5 decimal places
results_csv_path = os.path.join(Results_Folder, 't_test_results_TRACK_MEAN_SPEED.csv')
results_df.to_csv(results_csv_path, index=False, float_format='%.5f')

# Show the table and where it was saved
print('T-test results:')
print(results_df)
print(f'Results exported to: {results_csv_path}')



In [None]:
# @title #Check the number of track per condition per repeats

def count_tracks_by_condition_and_repeat(df, Results_Folder, condition_col='Condition', repeat_col='Repeat', track_id_col='Unique_ID'):
    """
    Counts the number of unique tracks for each combination of condition and repeat in the given DataFrame,
    shows the counts as a stacked bar chart with one annotation per stack, and saves the chart as
    'Track_Counts_Histogram.pdf' in Results_Folder.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the data.
    Results_Folder (str): The folder where the PDF is saved. It is created if it does not exist.
    condition_col (str): The name of the column representing the condition. Default is 'Condition'.
    repeat_col (str): The name of the column representing the repeat. Default is 'Repeat'.
    track_id_col (str): The name of the column representing the track ID. Default is 'Unique_ID'.

    Returns:
    pandas.DataFrame: One row per (condition, repeat) with the count in 'Number_of_Tracks'.
    """
    track_counts_df = (
        df.groupby([condition_col, repeat_col])[track_id_col]
        .nunique()
        .reset_index()
        .rename(columns={track_id_col: 'Number_of_Tracks'})
    )

    # Pivot the data for plotting: conditions as rows, repeats as columns
    pivot_df = track_counts_df.pivot(index=condition_col, columns=repeat_col, values='Number_of_Tracks').fillna(0)

    # Nothing to draw (pandas refuses to plot an empty table)
    if pivot_df.empty:
        print("No tracks to count; skipping the histogram.")
        return track_counts_df

    # Plotting
    fig, ax = plt.subplots(figsize=(12, 6))
    pivot_df.plot(kind='bar', stacked=True, ax=ax)
    ax.set_xlabel('Condition')
    ax.set_ylabel('Number of Tracks')
    ax.set_title('Stacked Histogram of Track Counts per Condition and Repeat')
    ax.legend(title=repeat_col)

    # Hide horizontal grid lines
    ax.yaxis.grid(False)

    # Add number annotations on each stack; empty stacks (missing
    # condition/repeat combinations) are not labelled.
    for bar in ax.patches:
        height = bar.get_height()
        if height == 0:
            continue
        ax.text(bar.get_x() + bar.get_width() / 2,
                bar.get_y() + height / 2,
                str(int(height)),
                ha='center', va='center', color='black', fontweight='bold', fontsize=8)

    # Save the plot as a PDF, creating the output folder if needed
    if Results_Folder:
        os.makedirs(Results_Folder, exist_ok=True)
    pdf_file = os.path.join(Results_Folder, 'Track_Counts_Histogram.pdf')
    fig.savefig(pdf_file, bbox_inches='tight')
    print(f"Saved histogram to {pdf_file}")

    plt.show()

    return track_counts_df


# Folder that receives the quality-control output
qc_folder = os.path.join(Results_Folder, 'QC')
# exist_ok avoids the separate "does it exist?" check
os.makedirs(qc_folder, exist_ok=True)

# Count tracks per condition and repeat in the full (unbalanced) dataset
result_df = count_tracks_by_condition_and_repeat(merged_df, qc_folder)





In [None]:
# @title #Run this cell to downsample and balance your dataset

!pip install tqdm  # Install the tqdm module
from tqdm import tqdm # import the tqdm function
import gzip # import gzip to read and write gzipped file

def balance_dataset(df, condition_col='Condition', repeat_col='Repeat', track_id_col='Unique_ID', random_seed=None):
    """
    Balances the dataset by downsampling tracks for each condition and repeat combination.

    The smallest number of unique tracks found in any (condition, repeat) group
    is used as the target. That many track IDs are drawn at random from every
    group, and all rows belonging to the drawn tracks are kept. Sampling is done
    on track IDs rather than on rows, so the result is balanced in tracks even
    when a track occupies several rows.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the data.
    condition_col (str): The name of the column representing the condition.
    repeat_col (str): The name of the column representing the repeat.
    track_id_col (str): The name of the column representing the track ID.
    random_seed (int, optional): The seed for the random number generator. Default is None.

    Returns:
    pandas.DataFrame: A new DataFrame with balanced track counts, all original
    columns, and a fresh 0..n-1 index. Rows keep their original relative order.
    """
    group_keys = [condition_col, repeat_col]
    id_columns = group_keys + [track_id_col]

    # Group by condition and repeat, and find the minimum track count
    tracks_per_group = df.groupby(group_keys)[track_id_col].nunique()
    if tracks_per_group.empty:
        return df.iloc[0:0].copy()
    min_track_count = int(tracks_per_group.min())

    # One row per (condition, repeat, track); rows lacking any of the three
    # identifiers cannot be assigned to a group and are left out.
    unique_tracks = df[id_columns].dropna().drop_duplicates()

    # Draw the same number of tracks from every group. GroupBy.sample returns
    # rows of `unique_tracks` (grouping columns included), which avoids
    # groupby.apply and its changing treatment of the grouping columns.
    selected_tracks = unique_tracks.groupby(group_keys).sample(n=min_track_count, random_state=random_seed)

    # Keep every row of the selected tracks
    balanced_merged_tracks_df = df.merge(selected_tracks, on=id_columns, how='inner')

    return balanced_merged_tracks_df

def replace_inf_with_nan(df, df_name):
    """
    Turn every +inf / -inf entry of a DataFrame into NaN, reporting each
    affected column on stdout first.

    Args:
    df (pd.DataFrame): DataFrame to clean. It is modified in place.
    df_name (str): Label for the DataFrame, used in the printed messages.

    Returns:
    pd.DataFrame: The same DataFrame object, with infinities replaced by NaN.
    """
    # Boolean table marking the infinite entries, then a per-column tally
    is_infinite = (df == np.inf) | (df == -np.inf)
    infinite_per_column = is_infinite.sum()

    # One message per column that holds at least one infinity
    for col, inf_count in infinite_per_column.items():
        if inf_count > 0:
            print(f"Column '{col}' in {df_name} contains {inf_count} infinity values. Replacing with NaN.")

    # In-place replacement: the caller's DataFrame is updated
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    return df

def check_for_nans(df, df_name):
    """
    Report, column by column, how many NaN values a DataFrame contains.

    Infinite values are converted to NaN first (via replace_inf_with_nan, which
    updates the DataFrame in place), so they are included in the counts.

    Args:
    df (pd.DataFrame): DataFrame to be checked for NaN values.
    df_name (str): Label for the DataFrame, used in the printed messages.
    """
    cleaned = replace_inf_with_nan(df, df_name)

    # Number of NaNs per column, restricted to columns that have any
    nan_per_column = cleaned.isna().sum()
    columns_with_nans = nan_per_column[nan_per_column > 0]

    if columns_with_nans.empty:
        print(f"No NaN values found in {df_name}.")
        return

    for col, nan_count in columns_with_nans.items():
        print(f"Column '{col}' in {df_name} contains {nan_count} NaN values.")

def save_dataframe_with_progress(df, path, desc="Saving", chunk_size=50000):
    """
    Save a DataFrame as a gzip-compressed CSV while showing a progress bar.

    The header is written once, then the rows are written in consecutive
    slices of at most `chunk_size` rows; the bar advances after each slice.

    Args:
    df (pd.DataFrame): DataFrame to save. The index is not written.
    path (str): Destination file; it is written through gzip.
    desc (str): Label shown next to the progress bar.
    chunk_size (int): Maximum number of rows written per step. Must be positive.

    Raises:
    ValueError: If chunk_size is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    total_rows = len(df)

    # Create a tqdm instance for progress tracking
    with tqdm(total=total_rows, unit="rows", desc=desc) as pbar:
        # Open the file for writing with gzip compression
        with gzip.open(path, "wt") as f:
            # Write the header once at the beginning
            df.head(0).to_csv(f, index=False)

            # Positional slicing replaces np.array_split, whose use on a
            # DataFrame is deprecated in recent pandas.
            for start in range(0, total_rows, chunk_size):
                chunk = df.iloc[start:start + chunk_size]
                chunk.to_csv(f, header=False, index=False)
                pbar.update(len(chunk))

# Fixed seed so the downsampling is reproducible
random_seed = 42

# Folder that receives everything derived from the balanced dataset
balanced_folder = os.path.join(Results_Folder, 'Balanced_dataset')
os.makedirs(balanced_folder, exist_ok=True)

# Downsample, then re-count the tracks to confirm the groups are balanced
balanced_merged_tracks_df = balance_dataset(merged_df, random_seed=random_seed)
result_df = count_tracks_by_condition_and_repeat(balanced_merged_tracks_df, balanced_folder)

# Report NaN / infinite values, then store the balanced table
check_for_nans(balanced_merged_tracks_df, "balanced_merged_tracks_df")
save_dataframe_with_progress(balanced_merged_tracks_df, os.path.join(balanced_folder, 'merged_Tracks_balanced_dataset.csv.gz'))

In [None]:
# @title #Plot and extract useful data and pool conditions (balanced)

#Filter_tracks = 0  # @param {type: "number"}

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

# Boxplots of the balanced dataset, pooled per condition.
# Output folder for the figures; created here so this cell does not depend on
# an earlier plotting cell having been run.
plots_folder = os.path.join(Results_Folder, 'plots')
os.makedirs(plots_folder, exist_ok=True)
pdf_path = os.path.join(plots_folder, 'Boxplots_pooled_balanced.pdf')

# List of variables (balanced_merged_tracks_df columns) to plot
variables_to_plot = ["DIVISION_TIME_MEAN", "TRACK_MEAN_SPEED", "TOTAL_DISTANCE_TRAVELED", "TRACK_DURATION"]
#variables_to_plot = ["TRACK_MEAN_SPEED", "TOTAL_DISTANCE_TRAVELED", "TRACK_DURATION"]

# Conditions in alphabetical order (does not depend on the plotted variable)
group_order = balanced_merged_tracks_df['Condition'].sort_values().unique()

# One subplot per variable. squeeze=False keeps `axes` a 2-D array even when
# only one variable is listed.
fig, axes = plt.subplots(len(variables_to_plot), 1, figsize=(10, 20), squeeze=False)

for ax, var in zip(axes[:, 0], variables_to_plot):
    # Extract the data for this variable
    data_for_var = balanced_merged_tracks_df[['Well', 'FOV', 'Condition', var]]

    # Save this data to a CSV file (read back by the "_balanced" statistics cells)
    data_for_var.to_csv(os.path.join(Results_Folder, f"data_for_{var}_balanced.csv"), index=False)

    sns.boxplot(x='Condition', y=var, data=balanced_merged_tracks_df, ax=ax, color='lightgray', order=group_order)

    ax.set_title(f"{var}")  # Set title for each subplot
    ax.set_xlabel('Condition')
    ax.set_ylabel(var)

# Save the figure to a PDF once, after the loop; the context manager closes
# the file even if saving fails.
plt.tight_layout()
with PdfPages(pdf_path) as pdf_pages:
    pdf_pages.savefig(fig)

In [None]:
# @title #Plot and extract useful data and pool conditions with repeats (balanced)

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import os

# Boxplots of the balanced dataset per condition, with the individual tracks
# overlaid and coloured by repeat.
# List of variables (balanced_merged_tracks_df columns) to plot
variables_to_plot = ["DIVISION_TIME_MEAN", "TRACK_MEAN_SPEED", "TOTAL_DISTANCE_TRAVELED", "TRACK_DURATION"]

# Single output PDF, inside the 'plots' folder. Only one PdfPages object is
# created, so no second, never-closed PDF is left behind in Results_Folder.
plots_folder = os.path.join(Results_Folder, 'plots')
os.makedirs(plots_folder, exist_ok=True)
pdf_path = os.path.join(plots_folder, 'Boxplots_pooled_balanced_repeat.pdf')

# Conditions in alphabetical order (does not depend on the plotted variable)
group_order = balanced_merged_tracks_df['Condition'].sort_values().unique()

# One subplot per variable. squeeze=False keeps `axes` a 2-D array even when
# only one variable is listed.
fig, axes = plt.subplots(len(variables_to_plot), 1, figsize=(12, 24), squeeze=False)

# Loop over variables and plot
for ax, var in zip(axes[:, 0], variables_to_plot):
    # Create the boxplot for pooled data
    sns.boxplot(
        x='Condition',
        y=var,
        data=balanced_merged_tracks_df,
        ax=ax,
        color='lightgray',
        order=group_order
    )

    # Overlay with a stripplot showing individual repeats
    sns.stripplot(
        x='Condition',
        y=var,
        hue='Repeat',
        data=balanced_merged_tracks_df,
        ax=ax,
        dodge=True,
        jitter=True,
        palette='magma',
        alpha=0.5,
        order=group_order
    )

    # Customize plot labels and legend (legend placed outside the axes)
    ax.set_title(f"{var}")
    ax.set_xlabel('Condition')
    ax.set_ylabel(var)
    ax.legend(title='Repeat', bbox_to_anchor=(1.05, 1), loc='upper left')

# Adjust layout and save to PDF; the context manager closes the file even if saving fails.
plt.tight_layout()
with PdfPages(pdf_path) as pdf_pages:
    pdf_pages.savefig(fig)

In [None]:
# @title #Calculate statistics between conditions for DIVISION_TIME_MEAN_balanced

import pandas as pd
from scipy.stats import ttest_ind
from numpy import std
import os

# Read the exported balanced DIVISION_TIME_MEAN table and keep only rows that have a value
file_path = os.path.join(Results_Folder, 'data_for_DIVISION_TIME_MEAN_balanced.csv')
df = pd.read_csv(file_path).dropna(subset=['DIVISION_TIME_MEAN'])

# Every control is compared against every mutant
comparisons = [
    (control, mutant)
    for control in ('Control pool', 'Control single cell')
    for mutant in ('Mutation #34', 'Mutation #38')
]

# DIVISION_TIME_MEAN values keyed by condition
condition_groups = {}
for condition_name in df['Condition'].unique():
    condition_groups[condition_name] = df.loc[df['Condition'] == condition_name, 'DIVISION_TIME_MEAN']

# Report the sample size of each condition
for condition_name, values in condition_groups.items():
    print(f'{condition_name} length: {len(values)}')

# Accumulators filled by the comparison loop
t_statistics, p_values, cohen_ds = [], [], []

def cohen_d(group_a, group_b):
    """
    Cohen's d effect size between two independent samples.

    The mean difference (a - b) is divided by the pooled standard deviation
    sqrt((SS_a + SS_b) / (n_a + n_b - 2)), where SS is each group's sum of
    squared deviations from its own mean. Taking the standard deviation of
    the concatenated samples instead would add the between-group difference
    to the denominator and shrink the effect size.

    Parameters:
        group_a, group_b (pandas.Series): The two samples.

    Returns:
        float: Cohen's d, or NaN when a group is empty, when there are fewer
        than three observations in total, or when the pooled SD is zero.
    """
    n_a, n_b = len(group_a), len(group_b)
    dof = n_a + n_b - 2
    if n_a == 0 or n_b == 0 or dof <= 0:
        return float('nan')

    mean_a, mean_b = group_a.mean(), group_b.mean()

    # Within-group sums of squares, pooled over both groups
    sum_sq = ((group_a - mean_a) ** 2).sum() + ((group_b - mean_b) ** 2).sum()
    pooled_std = (sum_sq / dof) ** 0.5

    # No within-group spread: the effect size is undefined
    if pooled_std == 0:
        return float('nan')

    return (mean_a - mean_b) / pooled_std

comparison_labels = []
for cond_a, cond_b in comparisons:
    # Both conditions must be present in the data to be compared
    if cond_a not in condition_groups or cond_b not in condition_groups:
        print(f'Skipping comparison {cond_a} vs {cond_b} due to missing data.')
        continue

    group_a = condition_groups[cond_a]
    group_b = condition_groups[cond_b]

    # Independent two-sample t-test and effect size
    t_stat, p_value = ttest_ind(group_a, group_b)
    d_value = cohen_d(group_a, group_b)

    comparison_labels.append(f'{cond_a} vs {cond_b}')
    t_statistics.append(t_stat)
    p_values.append(p_value)
    cohen_ds.append(d_value)

# One row per comparison
results_columns = {}
results_columns['Comparison'] = comparison_labels
results_columns['T-statistic'] = t_statistics
results_columns['P-value'] = p_values
results_columns["Cohen's d"] = cohen_ds
results_df = pd.DataFrame(results_columns)

# Write the table to CSV; all floats are written with 5 decimal places
results_csv_path = os.path.join(Results_Folder, 't_test_results_DIVISION_TIME_MEAN_balanced.csv')
results_df.to_csv(results_csv_path, index=False, float_format='%.5f')

# Show the table and where it was saved
print('T-test results:')
print(results_df)
print(f'Results exported to: {results_csv_path}')







In [None]:
# @title #Calculate statistics between conditions for TRACK_MEAN_SPEED_balanced

# @title #Calculate statistics between conditions for TRACK_MEAN_SPEED

import pandas as pd
from scipy.stats import ttest_ind
from numpy import std
import os

# Read the exported balanced TRACK_MEAN_SPEED table and keep only rows that have a value
file_path = os.path.join(Results_Folder, 'data_for_TRACK_MEAN_SPEED_balanced.csv')
df = pd.read_csv(file_path).dropna(subset=['TRACK_MEAN_SPEED'])

# Every control is compared against every mutant
comparisons = [
    (control, mutant)
    for control in ('Control pool', 'Control single cell')
    for mutant in ('Mutation #34', 'Mutation #38')
]

# TRACK_MEAN_SPEED values keyed by condition
condition_groups = {}
for condition_name in df['Condition'].unique():
    condition_groups[condition_name] = df.loc[df['Condition'] == condition_name, 'TRACK_MEAN_SPEED']

# Report the sample size of each condition
for condition_name, values in condition_groups.items():
    print(f'{condition_name} length: {len(values)}')

# Accumulators filled by the comparison loop
t_statistics, p_values, cohen_ds = [], [], []

def cohen_d(group_a, group_b):
    """
    Cohen's d effect size between two independent samples.

    The mean difference (a - b) is divided by the pooled standard deviation
    sqrt((SS_a + SS_b) / (n_a + n_b - 2)), where SS is each group's sum of
    squared deviations from its own mean. Taking the standard deviation of
    the concatenated samples instead would add the between-group difference
    to the denominator and shrink the effect size.

    Parameters:
        group_a, group_b (pandas.Series): The two samples.

    Returns:
        float: Cohen's d, or NaN when a group is empty, when there are fewer
        than three observations in total, or when the pooled SD is zero.
    """
    n_a, n_b = len(group_a), len(group_b)
    dof = n_a + n_b - 2
    if n_a == 0 or n_b == 0 or dof <= 0:
        return float('nan')

    mean_a, mean_b = group_a.mean(), group_b.mean()

    # Within-group sums of squares, pooled over both groups
    sum_sq = ((group_a - mean_a) ** 2).sum() + ((group_b - mean_b) ** 2).sum()
    pooled_std = (sum_sq / dof) ** 0.5

    # No within-group spread: the effect size is undefined
    if pooled_std == 0:
        return float('nan')

    return (mean_a - mean_b) / pooled_std

comparison_labels = []
for cond_a, cond_b in comparisons:
    # Both conditions must be present in the data to be compared
    if cond_a not in condition_groups or cond_b not in condition_groups:
        print(f'Skipping comparison {cond_a} vs {cond_b} due to missing data.')
        continue

    group_a = condition_groups[cond_a]
    group_b = condition_groups[cond_b]

    # Independent two-sample t-test and effect size
    t_stat, p_value = ttest_ind(group_a, group_b)
    d_value = cohen_d(group_a, group_b)

    comparison_labels.append(f'{cond_a} vs {cond_b}')
    t_statistics.append(t_stat)
    p_values.append(p_value)
    cohen_ds.append(d_value)

# One row per comparison
results_columns = {}
results_columns['Comparison'] = comparison_labels
results_columns['T-statistic'] = t_statistics
results_columns['P-value'] = p_values
results_columns["Cohen's d"] = cohen_ds
results_df = pd.DataFrame(results_columns)

# Write the table to CSV; all floats are written with 5 decimal places
results_csv_path = os.path.join(Results_Folder, 't_test_results_TRACK_MEAN_SPEED_balanced.csv')
results_df.to_csv(results_csv_path, index=False, float_format='%.5f')

# Show the table and where it was saved
print('T-test results:')
print(results_df)
print(f'Results exported to: {results_csv_path}')


# READY!!!