# **PDAC CellTracksColab - Plot Arrest Profiles:**
---

<font size = 4>Notebook created by [Guillaume Jacquemet](https://cellmig.org/)


In [None]:
# @title #MIT License

# Display the MIT license text in the cell output when this cell is run.
print("""
**MIT License**

Copyright (c) 2023 Guillaume Jacquemet

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.""")

--------------------------------------------------------
# **Part 1: Prepare the session and load your data**
--------------------------------------------------------


## **1.1. Install key dependencies**
---
<font size = 4>

In [None]:
#@markdown ##Play to install
# Install the third-party packages through pip; -q keeps pip's output quiet.
!pip -q install pandas scikit-learn
!pip -q install plotly
# tqdm provides the progress bar used by save_dataframe_with_progress.
!pip -q install tqdm
!pip -q install gdown
# PyDrive is used by the download cell to fetch the dataset from Google Drive;
# -U asks pip to upgrade it if a version is already installed.
!pip -q install -U -q PyDrive

import ipywidgets as widgets
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import itertools
from matplotlib.gridspec import GridSpec
import requests



# Function to calculate Cohen's d
def cohen_d(group1, group2):
    """
    Compute Cohen's d effect size between two samples using the pooled standard deviation.

    Args:
    group1, group2: Samples exposing ``mean()`` and ``var(ddof=...)`` and supporting
        ``len()`` (e.g. pandas Series or NumPy arrays).

    Returns:
    float: (mean(group1) - mean(group2)) / sqrt(pooled sample variance).
    """
    diff = group1.mean() - group2.mean()
    n1, n2 = len(group1), len(group2)
    # Request the sample variance (ddof=1) explicitly: the pooled formula below
    # weights each variance by (n - 1). pandas already defaults to ddof=1, but
    # NumPy arrays default to ddof=0, which silently produced a different value
    # for the same data depending on the container type.
    var1 = group1.var(ddof=1)
    var2 = group2.var(ddof=1)
    pooled_var = ((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2)
    return diff / np.sqrt(pooled_var)

import requests


def save_dataframe_with_progress(df, path, desc="Saving", chunk_size=50000):
    """
    Save a DataFrame to a CSV file in row chunks while showing a progress bar.

    Args:
    df (pd.DataFrame): DataFrame to write. The index is not written.
    path (str): Destination CSV file; it is overwritten if it exists.
    desc (str): Label displayed next to the progress bar.
    chunk_size (int): Maximum number of rows written per chunk (must be >= 1).

    Raises:
    ValueError: If chunk_size is smaller than 1.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having already imported tqdm into the global namespace.
    from tqdm.notebook import tqdm

    chunk_size = int(chunk_size)
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    total_rows = len(df)

    with tqdm(total=total_rows, unit="rows", desc=desc) as pbar:
        # newline="" lets the csv writer control line endings itself, which
        # avoids blank lines between rows on platforms that translate "\n".
        with open(path, "w", newline="") as f:
            # Write the header once at the beginning
            df.head(0).to_csv(f, index=False)

            # Slice by position instead of np.array_split(df, ...): splitting a
            # DataFrame through NumPy relies on a deprecated pandas code path.
            for start in range(0, total_rows, chunk_size):
                chunk = df.iloc[start:start + chunk_size]
                chunk.to_csv(f, header=False, index=False)
                pbar.update(len(chunk))


def check_for_nans(df, df_name):
    """
    Report how many NaN values each column of a DataFrame contains.

    One line is printed per column that holds at least one NaN; if no column
    does, a single "no NaN" line is printed instead.

    Args:
    df (pd.DataFrame): Table to inspect.
    df_name (str): Label for the table, used only inside the printed messages.
    """
    has_missing = df.isna().any()
    flagged = df.columns[has_missing].tolist()

    # Guard clause: nothing to itemise when every column is complete.
    if not flagged:
        print(f"No NaN values found in {df_name}.")
        return

    for column in flagged:
        missing = df[column].isna().sum()
        print(f"Column '{column}' in {df_name} contains {missing} NaN values.")




## **1.2. Mount your Google Drive**
---
<font size = 4> To use this notebook on the data present in your Google Drive, you need to mount your Google Drive to this notebook.

<font size = 4> Play the cell below to mount your Google Drive and follow the instructions.

<font size = 4> Once this is done, your data are available in the **Files** tab at the top left of the notebook.

In [None]:
#@markdown ##Play the cell to connect your Google Drive to Colab

# google.colab is only importable inside a Colab runtime.
# NOTE: the name `drive` is rebound to a PyDrive client in the dataset download cell.
from google.colab import drive
# Mount the user's Google Drive at /gdrive.
drive.mount('/gdrive')
# IPython magic: make the mounted Drive the current working directory.
%cd /gdrive



## **1.3. Download and load the dataset**
---



In [None]:
#@markdown ##Download your dataset


import os
import re
import glob
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np
import requests
import zipfile

import gdown

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download the dataset based on its Google Drive file ID.
# The local path is defined once so the download step and the loading step
# below can never point at different files.
file_id = '1GAoZxiQbQ85pgW-Y3PvcGw8JbfZkkP32'
dataset_path = '/content/slow_tracks_count_adjusted.csv'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile(dataset_path)

#@markdown ###Provide the path to your Result folder

Results_Folder = ""  # @param {type: "string"}

if not Results_Folder:
    Results_Folder = '/content/Results'  # Default Results_Folder path if not defined

# exist_ok=True creates the folder when it is missing and does nothing when it
# already exists, without a separate (racy) existence check.
os.makedirs(Results_Folder, exist_ok=True)

# Print the location of the result folder
print(f"Result folder is located at: {Results_Folder}")

# Load the downloaded track-count table
print("Loading track table file....")
count_df = pd.read_csv(dataset_path, low_memory=False)

# Report any missing values so they are visible before filtering/plotting
check_for_nans(count_df, "count_df")

--------------------------------------------------------
# **Part 2: Filter and plot your data**
--------------------------------------------------------

In [None]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display

# @title ##Filter the data


# Module-level state shared between the filter callback and the plotting cell.
# (Names assigned at module scope are already global, so no `global` statement
# is required here.)
filtered_df = pd.DataFrame()

# Options ticked by the user, one list per category; filled by filter_dataframe.
selected_cells = []
selected_speeds = []
selected_ilbetas = []

# Function to summarize selected options into a string
def summarize_options(options):
    """Join the truthy entries of ``options`` into one underscore-separated string."""
    # filter(None, ...) discards falsy entries such as empty strings or None
    kept = filter(None, options)
    return "_".join(map(str, kept))

# Function to create a filename based on selected options
def create_filename(selected_cells, selected_speeds, selected_ilbetas):
    """
    Build a file-name stem describing the selected filter options.

    Each category is summarised with summarize_options, the three summaries are
    joined with underscores, and any spaces are replaced by underscores.
    """
    summaries = [
        summarize_options(category)
        for category in (selected_cells, selected_speeds, selected_ilbetas)
    ]
    stem = "_".join(summaries)
    return stem.replace(" ", "_")

# Columns that drive the filter widgets
FILTER_COLUMNS = ('Cells', 'Flow_speed', 'ILbeta')


def _strip_if_text(value):
    """Return ``value`` without surrounding whitespace when it is a string; otherwise unchanged."""
    return value.strip() if isinstance(value, str) else value


# Trim stray whitespace BEFORE the checkboxes are built. Previously the trimming
# only happened inside the button callback, so a checkbox created from a value
# such as "A " could never match the trimmed data. Non-string entries (numbers,
# NaN) are left untouched, so this also works on non-text columns.
for _filter_col in FILTER_COLUMNS:
    count_df[_filter_col] = count_df[_filter_col].map(_strip_if_text)

# Create checkboxes for each category. Descriptions are the string form of the
# unique values; the callback compares against the same string form.
cells_checkboxes = [widgets.Checkbox(value=False, description=str(cell)) for cell in count_df['Cells'].astype(str).unique()]
flow_speed_checkboxes = [widgets.Checkbox(value=False, description=str(speed)) for speed in count_df['Flow_speed'].astype(str).unique()]
ilbeta_checkboxes = [widgets.Checkbox(value=False, description=str(ilbeta)) for ilbeta in count_df['ILbeta'].astype(str).unique()]

# Function to filter dataframe and update global variables based on selected checkbox values
def filter_dataframe(button):
    """
    Button callback: keep the rows of count_df matching the ticked checkboxes.

    Updates the globals filtered_df, selected_cells, selected_speeds and
    selected_ilbetas, and prints a short summary of the selection.

    Args:
    button: The widget that triggered the callback (unused).
    """
    global filtered_df, selected_cells, selected_speeds, selected_ilbetas

    selected_cells = [box.description for box in cells_checkboxes if box.value]
    selected_speeds = [box.description for box in flow_speed_checkboxes if box.value]
    selected_ilbetas = [box.description for box in ilbeta_checkboxes if box.value]

    # Debugging output
    print("Selected Cells:", selected_cells)
    print("Selected Speeds:", selected_speeds)
    print("Selected ILbetas:", selected_ilbetas)
    print("Original DF length:", len(count_df))

    # Checkbox descriptions are strings, so compare against the string form of
    # each column; this keeps numeric or NaN values selectable.
    filtered_df = count_df[
        (count_df['Cells'].astype(str).isin(selected_cells)) &
        (count_df['Flow_speed'].astype(str).isin(selected_speeds)) &
        (count_df['ILbeta'].astype(str).isin(selected_ilbetas))
    ]

    # More debugging output
    print("Filtered DF length:", len(filtered_df))
    if len(filtered_df) == 0:
        print("No data matched the selected filters. Check filters and data for consistency.")
        print("Unique 'Cells' in DataFrame:", count_df['Cells'].unique())
        print("Unique 'Flow_speed' in DataFrame:", count_df['Flow_speed'].unique())
        print("Unique 'ILbeta' in DataFrame:", count_df['ILbeta'].unique())

    print("Done")

# Now call the filter function or trigger the button to filter the dataframe and see the output.


# Button that runs the filtering callback when clicked
filter_button = widgets.Button(description="Filter Dataframe")
filter_button.on_click(filter_dataframe)

# Build the panel: one caption followed by its row of checkboxes per category,
# with the button placed last.
filter_panel_rows = [
    widget
    for caption, boxes in (
        ('Select Cells:', cells_checkboxes),
        ('Select Flow Speed:', flow_speed_checkboxes),
        ('Select ILbeta:', ilbeta_checkboxes),
    )
    for widget in (widgets.Label(caption), widgets.HBox(boxes))
]
display(widgets.VBox(filter_panel_rows + [filter_button]))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import os

# @title #Plot selected conditions

# Check and create necessary directories
track_counts_dir = os.path.join(Results_Folder, 'Track_Counts')
os.makedirs(track_counts_dir, exist_ok=True)

filename = create_filename(selected_cells, selected_speeds, selected_ilbetas)

pdf_filepath = os.path.join(track_counts_dir, filename + '_plot.pdf')
filepath = os.path.join(track_counts_dir, filename + '_data.csv')

# Flow-speed segments annotated above the curves:
# (segment start, segment end, label, x position of the label)
flow_segments = [
    (0, 87, '300', 40),
    (88, 175, '200', 130),
    (176, 263, '100', 220),
    (264, 350, 'Wash', 310),
]

if filtered_df.empty:
    # Happens when the filter button has not been clicked yet or nothing matched;
    # without this guard the column lookup below fails with a KeyError.
    print("No data to plot. Run the 'Filter the data' cell, tick at least one option "
          "per category and click 'Filter Dataframe' first.")
else:
    # Save every plotted row once. The CSV used to be rewritten inside the loop
    # under the same name, so only the last Cells/ILbeta combination survived.
    filtered_df.to_csv(filepath, index=False)
    print(f"Plotted data saved to {filepath}")

    # Get unique combinations of 'Cells' and 'ILbeta'
    unique_cells_ilbeta = filtered_df[['Cells', 'ILbeta']].drop_duplicates()

    # Adjust figure size and layout
    fig, ax = plt.subplots(figsize=(12, 8))

    # One line (mean with standard-error band) per Cells/ILbeta combination
    for _, row in unique_cells_ilbeta.iterrows():
        cells, ilbeta = row['Cells'], row['ILbeta']
        combo_df = filtered_df[(filtered_df['Cells'] == cells) & (filtered_df['ILbeta'] == ilbeta)]

        # Draw explicitly on `ax` rather than on whichever axes is current
        sns.lineplot(data=combo_df, x='POSITION_T_REPEAT', y='Unique_ID_Rolling',
                     label=f"{cells}, {ilbeta}", errorbar="se", ax=ax)

    # Add 10% headroom above the data for the segment annotations
    current_ylim = ax.get_ylim()
    ax.set_ylim(current_ylim[0], current_ylim[1] * 1.1)

    # Add a horizontal bar and a label for each Flow_speed segment
    for seg_start, seg_end, seg_label, label_x in flow_segments:
        ax.hlines(y=current_ylim[1]*1.00, xmin=seg_start, xmax=seg_end,
                  colors='gray', linestyles='solid', lw=5)
        ax.text(label_x, current_ylim[1]*1.03, seg_label, horizontalalignment='center')

    ax.set_title('Track Count over Time')
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('Number of Tracks')

    # Place the legend outside the plot on the right
    ax.legend(title='Conditions', loc='center left', bbox_to_anchor=(1, 0.5))

    fig.tight_layout()

    # Save the plot as a PDF
    fig.savefig(pdf_filepath)
    plt.show()
    plt.close(fig)
    print(f"Plot saved to {pdf_filepath}")
