# Cellgroup

This is a library to cluster, analyse, and potentially forecast cell locations.

## Download Libraries and Repositories

In [None]:
pip install reportlab

In [None]:
# Fetch the cyoptics-clustering repository into the current working directory.
!git clone https://github.com/dvida/cyoptics-clustering.git

In [None]:
# Move into the cloned repository folder.
# NOTE(review): presumably needed so that `cyOPTICS` and `GradientClustering` can be imported below - confirm.
%cd cyoptics-clustering/

In [None]:
from __future__ import print_function, division, absolute_import

import numpy as np

# Cython init
import pyximport
pyximport.install(setup_args={'include_dirs':[np.get_include()]})
from cyOPTICS import runCyOPTICS

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d
import time
import random
import os
import math
import pandas as pd
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from pydrive.auth import GoogleAuth
from google.colab import files
import pickle
from tqdm import tqdm
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from PIL import Image
from tqdm import tqdm
from google.colab import drive
from pydrive.drive import GoogleDrive

from GradientClustering import gradientClustering, plotClusteringReachability, filterLargeClusters, \
    mergeSimilarClusters

## Data treatment  

Table T0001 has a problem with noise. The solution is to use the new algorithm; it is also possible to merge part of the table beforehand.

In [None]:
# Load the first table from the current working directory.
df1 = pd.read_csv('df1.csv')

In [None]:
# Drop every point that lies inside one of the rectangles below.
# Each entry is ((X low, X high), (Y low, Y high)); both bounds are inclusive.
excluded_regions = [
    ((0, 3000), (0, 3000)),
    ((2000, 6000), (3100, 3700)),
    ((4200, 6400), (3000, 6200)),
    ((5100, 6000), (0, 3000)),
    ((2000, 3000), (3000, 4000)),
]
for (x_low, x_high), (y_low, y_high) in excluded_regions:
    in_region = df1['X'].between(x_low, x_high) & df1['Y'].between(y_low, y_high)
    df1 = df1[~in_region]

# Overwrite the file that was loaded above with the filtered table
df1.to_csv('df1.csv', index=False)

In [None]:
# Coordinate-only copy of df1; this is the table passed to run_clustering for T01.
T1 = df1[['X', 'Y']].copy()

In [None]:
# NOTE(review): `df` is not assigned in any earlier cell visible in this notebook (it is first bound in
# the Google Drive loading loop further down), so this line raises NameError on a fresh
# Restart & Run All. Possibly `df1` was intended - confirm before relying on this export.
df.to_csv('file_name.csv')

## Cluster

### Optics Algorithm

The first part of the notebook aims to understand some particular values through an OPTICS algorithm.

To copy Copy of runCyoptics.ipynb

In [None]:
def runOPTICS(input_list, eps, min_pts):
    """ Run the Cython OPTICS implementation on the given points.

    This is a plain pass-through to ``runCyOPTICS``; the arguments are forwarded unchanged.

    Arguments:
        input_list: [ndarray] 2D numpy array with the input data, one datum per row
        eps: [float] epsilon parameter, the maximum distance between points
        min_pts: [int] minimum number of points in a cluster

    Return:
        point_list: [ndarray] 2D numpy array with one row per processed point. Its columns are:
            - processed: 0 for not processed, 1 for processed (all entries should be 1 on return)
            - reachability distance: -1 for the first points in a cluster, positive for all others
            - core distance: -1 for noise, positive otherwise (what counts as noise depends on the
                chosen eps and min_pts)
            - the input data columns, appended to the right
    """

    point_list = runCyOPTICS(input_list, eps, min_pts)
    return point_list

In [None]:
def plotPoints(points, ordered_list=[], clusters=[], title=''):
    """ Scatter-plot the input points and, optionally, the clusters found among them.

    Arguments:
        points: [ndarray] 2D array whose first two columns are the X and Y coordinates
        ordered_list: [ndarray] array indexed by the cluster indices; its columns 3 and 4 are read
            as the X and Y coordinates of the clustered points. Only used when clusters are given.
        clusters: [list] one collection of row indices into ordered_list per cluster
        title: [str] title of the plot

    Return:
        None, the figure is shown with plt.show()
    """

    # Plot all points
    plt.scatter(points[:, 0], points[:, 1], c='k', linewidth=0.2, edgecolor='w', facecolor=None)

    # Plot clusters, if any (length check instead of truthiness so array-like input also works)
    if clusters is not None and len(clusters) > 0:

        # Generate one color per cluster and randomize their order (so close clusters would have
        # significantly different colors)
        colors = cm.inferno(np.linspace(0.3, 1, len(clusters)))
        color_order = random.sample(range(len(colors)), len(colors))

        # Plot the clusters in 2D
        for color, cluster in zip(colors[color_order], clusters):
            members = ordered_list[cluster]

            # Pass the RGBA row through `color=` rather than `c=`: a 4-element `c` is read as
            # per-point values to colormap when the cluster happens to contain 4 points
            plt.scatter(members[:, 3], members[:, 4], color=color, linewidth=0.2, edgecolor='w')

    # Set the title
    plt.title(title)

    # Turn on the grid, set color to grey
    plt.gca().grid(color='0.5')

    # Set background color to black
    plt.gca().set_facecolor('black')

    # Set the ratio to the window size 1:1
    plt.gca().set_aspect('equal')
    plt.tight_layout()

    plt.show()

In [None]:
def saveResults(df, ordered_list, time, clusters=[], filename='results.csv'):
    """ Label the clustered points, keep only those rows and write them to a CSV file.

    Arguments:
        df: table (anything accepted by pd.DataFrame) whose row labels include the cluster indices
        ordered_list: [ndarray] array indexed by the cluster indices; its columns 3 and 4 are copied
            into the 'X' and 'Y' columns of the clustered rows
        time: value stored in the 'Time' column of every clustered row
        clusters: [list] one collection of row indices per cluster
        filename: [str] path of the CSV file to write

    Return:
        results: [DataFrame] the rows that belong to at least one cluster, with 'Labels' and 'Time'
            columns added. It is empty when no clusters are given.

    Labels are the positions of the clusters in `clusters` (0, 1, 2, ...), so they are unique and
    reproducible. When a row belongs to several clusters, the last one in the list wins.
    """
    # Work on a copy so that the caller's table is never modified
    results = pd.DataFrame(df).copy()

    # Create the columns up front: the dropna below needs 'Labels' even when there are no clusters
    results['Labels'] = np.nan
    results['Time'] = np.nan

    if clusters is not None and len(clusters) > 0:
        for label, cluster in enumerate(clusters):
            members = ordered_list[cluster]

            # NOTE(review): the same indices address rows of ordered_list and row labels of df. If
            # ordered_list is not in df's row order, the remaining columns of df no longer belong
            # to the X/Y written here - confirm against the OPTICS output.
            results.loc[cluster, 'X'] = members[:, 3]
            results.loc[cluster, 'Y'] = members[:, 4]

            results.loc[cluster, 'Labels'] = label
            results.loc[cluster, 'Time'] = time

    # Keep only the rows that were assigned to a cluster and save them
    results = results.dropna(subset=['Labels'])
    results.to_csv(filename, index=False)
    print("succesfully saved the results to the cyoptics-clustering folder")
    return results

In [None]:
def plot_csv(csv_file):
    """ Scatter-plot the points of a results CSV, one series per value of 'Labels'.

    Arguments:
        csv_file: path of a CSV file with 'X', 'Y' and 'Labels' columns

    Return:
        fig: the matplotlib figure that was drawn and shown
    """
    labelled_points = pd.read_csv(csv_file)
    per_label = labelled_points.groupby('Labels')

    fig, ax = plt.subplots()

    # One scatter series per label
    for cluster_label, members in per_label:
        ax.scatter(members['X'], members['Y'], label=cluster_label)

    plt.show()
    return fig

In [None]:
def plotPointsFromCSV(data, title=''):
    """ Plot labelled points from a results table, one colour per cluster label.

    Arguments:
        data: [DataFrame] table with 'X', 'Y' and 'Labels' columns
        title: [str] title of the plot

    Return:
        None, the figure is shown with plt.show()

    All points are first drawn in black, then the points of each label are drawn on top in a
    colour taken from the 'viridis' colormap. The axes are fixed to the range 0-6500.
    """
    # Extract X, Y, and Labels columns from the DataFrame
    points = data[['X', 'Y']].to_numpy()
    labels = data['Labels']

    # Find unique cluster labels
    unique_labels = labels.unique()

    # Create clusters based on unique labels
    clusters = [points[labels == label] for label in unique_labels]

    fig, ax = plt.subplots()

    # Plot all points in black
    ax.scatter(points[:, 0], points[:, 1], c='k', linewidth=0.2, edgecolor='w', facecolor=None)

    # Plot clusters using the 'viridis' colormap
    colors = cm.viridis(np.linspace(0, 1, len(unique_labels)))

    for color, cluster in zip(colors, clusters):
        # Pass the RGBA row through `color=` rather than `c=`: a 4-element `c` is read as
        # per-point values to colormap when the cluster happens to contain 4 points
        ax.scatter(cluster[:, 0], cluster[:, 1], color=color, linewidth=0.2, edgecolor='w')

    # Set the title
    ax.set_title(title)

    # Set the X and Y limits to ensure a fixed size of 6500x6500 units
    ax.set_xlim(0, 6500)
    ax.set_ylim(0, 6500)

    # Turn on the grid, set color to grey
    ax.grid(color='0.5')

    # Set background color to black
    ax.set_facecolor('black')

    # Set the ratio to the window size 1:1
    ax.set_aspect('equal')
    plt.tight_layout()

    # Show the plot
    plt.show()

In [None]:
def save_plot_and_show(plot_function, filename, *args, **kwargs):
    """
    Build a figure with plot_function and save it to filename.

    Parameters:
    - plot_function: Callable that creates the plot and returns an object with a
      savefig(filename) method (for example a matplotlib figure).
    - filename: The name of the file to save the plot.
    - *args, **kwargs: Forwarded unchanged to plot_function.

    This function does not display anything itself; any display is done by plot_function.
    """
    # Build the figure and write it out in one step
    plot_function(*args, **kwargs).savefig(filename)

In [None]:
def run_clustering(input_data, min_points, epsilon, w, max_points_ratio, cluster_similarity_threshold, t=150):
    """ Run the OPTICS + gradient clustering pipeline and plot every intermediate step.

    Arguments:
        input_data: table exposing `.values` (e.g. a DataFrame of X/Y coordinates)
        min_points: passed to runOPTICS and gradientClustering
        epsilon: passed to runOPTICS
        w: passed to gradientClustering
        max_points_ratio: passed to filterLargeClusters
        cluster_similarity_threshold: passed to mergeSimilarClusters
        t: passed to gradientClustering

    Return:
        (filtered_clusters, ordered_list): the clusters left after filtering and merging, and the
            array returned by runOPTICS
    """
    points = input_data.values

    # Show the raw input
    plotPoints(points, title='Input data')

    # Time the OPTICS ordering
    started = time.process_time()
    ordered_list = runOPTICS(points, epsilon, min_points)
    print('Total time for processing', time.process_time() - started, 's')

    # Column 1 of the ordering holds the reachability distances
    reachability = ordered_list[:, 1]
    plotClusteringReachability(reachability)

    # Gradient clustering, then removal of very large clusters
    clusters = gradientClustering(reachability, min_points, t, w)
    filtered_clusters = filterLargeClusters(clusters, len(ordered_list), max_points_ratio)
    plotClusteringReachability(reachability, filtered_clusters)

    # Merge clusters whose intersection is large relative to their total size
    filtered_clusters = mergeSimilarClusters(filtered_clusters, cluster_similarity_threshold)
    plotClusteringReachability(reachability, filtered_clusters)

    # Final picture
    plotPoints(points, ordered_list, filtered_clusters, title='Final results')
    return filtered_clusters, ordered_list

In [None]:
def run_clustering_without_plots(input_data, min_points, epsilon, w, max_points_ratio, cluster_similarity_threshold, t=150):
    """ Run the OPTICS + gradient clustering pipeline without drawing any figure.

    Same steps as run_clustering, minus the plots. The processing time of the OPTICS ordering is
    still printed.

    Arguments:
        input_data: table exposing `.values` (e.g. a DataFrame of X/Y coordinates)
        min_points: passed to runOPTICS and gradientClustering
        epsilon: passed to runOPTICS
        w: passed to gradientClustering
        max_points_ratio: passed to filterLargeClusters
        cluster_similarity_threshold: passed to mergeSimilarClusters
        t: passed to gradientClustering

    Return:
        (filtered_clusters, ordered_list): the clusters left after filtering and merging, and the
            array returned by runOPTICS
    """
    input_data = input_data.values

    # No plot of the input data here: this variant is meant to produce no figures at all

    t1 = time.process_time()

    # Run OPTICS ordering
    ordered_list = runOPTICS(input_data, epsilon, min_points)

    print('Total time for processing', time.process_time() - t1, 's')

    # Do the gradient clustering
    clusters = gradientClustering(ordered_list[:, 1], min_points, t, w)

    # Remove very large clusters
    filtered_clusters = filterLargeClusters(clusters, len(ordered_list), max_points_ratio)

    # Merge similar clusters by looking at the ratio of their intersection and their total number
    filtered_clusters = mergeSimilarClusters(filtered_clusters, cluster_similarity_threshold)

    return filtered_clusters, ordered_list

### Obtain parameter for clusters

In [None]:
# Import values from Github

import pandas as pd

measurement_columns = ['Area', 'X', 'Y', 'IntDen']

# Read one table per reference time point
df02, df10, df20, df30, df40, df60, df80, df100, df118 = (
    pd.read_csv(csv_name)
    for csv_name in (
        'df2.csv', 'df10.csv', 'df20.csv', 'df30.csv', 'df40.csv',
        'df60.csv', 'df80.csv', 'df100.csv', 'df118.csv',
    )
)

# Keep only the measurement columns of every table
df02, df10, df20, df30, df40, df60, df80, df100, df118 = (
    table[measurement_columns].copy()
    for table in (df02, df10, df20, df30, df40, df60, df80, df100, df118)
)

# Coordinate-only tables used as clustering input.
# Note kept from the original T10 line: there was no ROI.zip file for Point 1 by the original
# macro provided.
T02, T10, T20, T30, T40, T60, T80, T100, T118 = (
    table[['X', 'Y']].copy()
    for table in (df02, df10, df20, df30, df40, df60, df80, df100, df118)
)

In [None]:
# Re-read the 60 table with all of its columns; this replaces the four-column df60 built above.
df60 = pd.read_csv('df60.csv')

In [None]:
# Rebuild the coordinate-only table from the df60 currently in memory.
T60 = df60[['X', 'Y']].copy()

The following sections showcase just a few examples.

#### Selected Examples

##### T01

In [None]:
# Clustering parameters chosen for T01
clusters, coord = run_clustering(
    T1,
    min_points=325,
    epsilon=260,
    w=10,
    max_points_ratio=0.8,
    cluster_similarity_threshold=0.5,
)

##### T02

In [None]:
# Clustering parameters chosen for T02
clusters, coord = run_clustering(
    T02,
    min_points=400,
    epsilon=260,
    w=20,
    max_points_ratio=0.8,
    cluster_similarity_threshold=0.7,
)

##### T10

In [None]:
# Clustering parameters chosen for T10
clusters, coord = run_clustering(
    T10,
    min_points=200,
    epsilon=300,
    w=5.5,
    max_points_ratio=0.6,
    cluster_similarity_threshold=0.6,
)

##### T20

In [None]:
# Clustering parameters chosen for T20
clusters, coord = run_clustering(
    T20,
    min_points=150,
    epsilon=300,
    w=5.25,
    max_points_ratio=0.6,
    cluster_similarity_threshold=0.5,
)

##### T30

In [None]:
# Clustering parameters chosen for T30
clusters, coord = run_clustering(
    T30,
    min_points=75,
    epsilon=300,
    w=4.75,
    max_points_ratio=0.6,
    cluster_similarity_threshold=0.4,
)

##### T40

In [None]:
# Clustering parameters chosen for T40
clusters, coord = run_clustering(
    T40,
    min_points=40,
    epsilon=300,
    w=4.75,
    max_points_ratio=0.6,
    cluster_similarity_threshold=0.4,
)

##### T60

In [None]:
# Clustering parameters chosen for T60
clusters, coord = run_clustering(
    T60,
    min_points=25,
    epsilon=250,
    w=4.5,
    max_points_ratio=0.8,
    cluster_similarity_threshold=0.1,
)

##### T80

In [None]:
# Clustering parameters chosen for T80
clusters, coord = run_clustering(
    T80,
    min_points=25,
    epsilon=250,
    w=4.4,
    max_points_ratio=0.8,
    cluster_similarity_threshold=0.3,
)

##### T100

In [None]:
# Clustering parameters chosen for T100
clusters, coord = run_clustering(
    T100,
    min_points=7,
    epsilon=200,
    w=3.5,
    max_points_ratio=0.8,
    cluster_similarity_threshold=0.1,
)

##### T118

In [None]:
# Clustering parameters chosen for T118
clusters, coord = run_clustering(
    T118,
    min_points=3,
    epsilon=200,
    w=3,
    max_points_ratio=0.8,
    cluster_similarity_threshold=0.1,
)

### generalise the analysis to all the values

Here are the results obtained in the previous section.

#### Old Algorithm

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# w used for each reference time point in the selected examples
t_data = np.array([1, 2, 10, 20, 30, 40, 60, 80, 100, 118])
w_data = np.array([10, 20, 5.5, 5.25, 4.75, 4.75, 4.5, 4.4, 3.5, 3])

# Piecewise-linear ('slinear') interpolant through the reference values
interp_func = interp1d(t_data, w_data, kind='slinear')

# Sampling grid: max(t) points between the first and the last time point
t_fine = np.linspace(t_data.min(), t_data.max(), t_data.max())
w_fine = interp_func(t_fine)

# Reference values against the interpolated curve
fig, ax = plt.subplots()
ax.scatter(t_data, w_data, label='Data')
ax.plot(t_fine, w_fine, label='Interpolated Curve', color='red')
ax.set_xlabel('t')
ax.set_ylabel('w')
ax.legend()
plt.show()

In [None]:
# max_points_ratio used for each reference time point in the selected examples
t_data = np.array([1, 2, 10, 20, 30, 40, 60, 80, 100, 118])
max_points_ratio_data = np.array([0.8, 0.8, 0.6, 0.6, 0.6, 0.6, 0.8, 0.8, 0.8, 0.8])

# Piecewise-linear ('slinear') interpolant through the reference values
interp_func = interp1d(t_data, max_points_ratio_data, kind='slinear')

# Sampling grid: max(t) points between the first and the last time point
t_fine = np.linspace(t_data.min(), t_data.max(), t_data.max())
max_points_ratio_fine = interp_func(t_fine)

# Reference values against the interpolated curve
fig, ax = plt.subplots()
ax.scatter(t_data, max_points_ratio_data, label='Data')
ax.plot(t_fine, max_points_ratio_fine, label='Interpolated Curve', color='red')
ax.set_xlabel('t')
ax.set_ylabel('max_points_ratio_data')
ax.legend()
plt.show()


In [None]:
# cluster_similarity_threshold for each reference time point.
# NOTE(review): the value at t=60 is 0.4 here, but the T60 example above was run with 0.1 - confirm
# which one is intended.
t_data = np.array([1, 2, 10, 20, 30, 40, 60, 80, 100, 118])
cluster_similarity_threshold_data = np.array([0.5, 0.7, 0.6, 0.5, 0.4, 0.4, 0.4, 0.3, 0.1, 0.1])

# Piecewise-linear ('slinear') interpolant through the reference values
interp_func = interp1d(t_data, cluster_similarity_threshold_data, kind='slinear')

# Sampling grid: max(t) points between the first and the last time point
t_fine = np.linspace(t_data.min(), t_data.max(), t_data.max())
cluster_similarity_threshold_fine = interp_func(t_fine)

# Reference values against the interpolated curve
fig, ax = plt.subplots()
ax.scatter(t_data, cluster_similarity_threshold_data, label='Data')
ax.plot(t_fine, cluster_similarity_threshold_fine, label='Interpolated Curve', color='red')
ax.set_xlabel('t')
ax.set_ylabel('cluster_similarity_threshold_data')
ax.legend()
plt.show()

In [None]:
# min_points used for each reference time point in the selected examples
t_data = np.array([1, 2, 10, 20, 30, 40, 60, 80, 100, 118])
min_points_data = np.array([325, 400, 200, 150, 75, 40, 25, 25, 7, 3])

# Piecewise-linear ('slinear') interpolant through the reference values
interp_func = interp1d(t_data, min_points_data, kind='slinear')

# Sampling grid: max(t) points between the first and the last time point
t_fine = np.linspace(t_data.min(), t_data.max(), t_data.max())
min_points_fine = interp_func(t_fine)

# Reference values against the interpolated curve
fig, ax = plt.subplots()
ax.scatter(t_data, min_points_data, label='Data')
ax.plot(t_fine, min_points_fine, label='Interpolated Curve', color='red')
ax.set_xlabel('t')
ax.set_ylabel('min_points_data')
ax.legend()
plt.show()

In [None]:
# epsilon used for each reference time point in the selected examples
t_data = np.array([1, 2, 10, 20, 30, 40, 60, 80, 100, 118])
epsilon_data = np.array([260, 260, 300, 300, 300, 300, 250, 250, 200, 200])

# Piecewise-linear ('slinear') interpolant through the reference values
interp_func = interp1d(t_data, epsilon_data, kind='slinear')

# Sampling grid: max(t) points between the first and the last time point
t_fine = np.linspace(t_data.min(), t_data.max(), t_data.max())
epsilon_fine = interp_func(t_fine)

# Reference values against the interpolated curve
fig, ax = plt.subplots()
ax.scatter(t_data, epsilon_data, label='Data')
ax.plot(t_fine, epsilon_fine, label='Interpolated Curve', color='red')
ax.set_xlabel('t')
ax.set_ylabel('epsilon_data')
ax.legend()
plt.show()

In [None]:
# Reference values of p over time
# NOTE(review): presumably the `p` argument of the cluster-merging functions below - confirm.
t_data = np.array([0, 40, 48, 65, 72, 90, 97, 98, 100, 118])
p_data = np.array([0, 0, 200, 300, 400, 500, 700, 850, 850, 1000])

# Piecewise-linear ('slinear') interpolant through the reference values
interp_func = interp1d(t_data, p_data, kind='slinear')

# Sampling grid: max(t) points between the first and the last time point.
# NOTE(review): t starts at 0 here, so 118 samples over 0..118 do not fall on integer t
# (119 samples would) - confirm how p_fine is indexed downstream.
t_fine = np.linspace(t_data.min(), t_data.max(), t_data.max())
p_fine = interp_func(t_fine)

# Reference values against the interpolated curve
fig, ax = plt.subplots()
ax.scatter(t_data, p_data, label='Data')
ax.plot(t_fine, p_fine, label='Interpolated Curve', color='red')
ax.set_xlabel('t')
ax.set_ylabel('p_data')
ax.legend()
plt.show()

In [None]:
# Reference values of s over time
t_data = np.array([0, 1, 118])
s_data = np.array([-750, 4000, 4000])

# Piecewise-linear ('slinear') interpolant through the reference values
interp_func = interp1d(t_data, s_data, kind='slinear')

# Sampling grid: max(t) points between the first and the last time point.
# NOTE(review): t starts at 0 here, so 118 samples over 0..118 do not fall on integer t
# (119 samples would) - confirm how s_fine is indexed downstream.
t_fine = np.linspace(t_data.min(), t_data.max(), t_data.max())
s_fine = interp_func(t_fine)

# Reference values against the interpolated curve
fig, ax = plt.subplots()
ax.scatter(t_data, s_data, label='Data')
ax.plot(t_fine, s_fine, label='Interpolated Curve', color='red')
ax.set_xlabel('t')
ax.set_ylabel('s_data')
ax.legend()
plt.show()

#### New Algorithm

In [None]:
import matplotlib.pyplot as plt

# Reference values of w per time point (new algorithm)
t_data = np.array([1, 20, 60, 80, 100, 118])
w_data = np.array([4.8, 4, 2, 1, 0.05, 0.01])

# Piecewise-linear ('slinear') interpolant through the reference values
interp_func = interp1d(t_data, w_data, kind='slinear')

# Sampling grid: max(t) points between the first and the last time point
t_fine = np.linspace(t_data.min(), t_data.max(), t_data.max())
w_fine_new = interp_func(t_fine)

# Reference values against the interpolated curve
fig, ax = plt.subplots()
ax.scatter(t_data, w_data, label='Data')
ax.plot(t_fine, w_fine_new, label='Interpolated Curve', color='red')
ax.set_xlabel('t')
ax.set_ylabel('w')
ax.legend()
plt.show()

In [None]:
# Reference values of max_points_ratio per time point (new algorithm)
t_data = np.array([1, 20, 60, 80, 100, 118])
max_points_ratio_data = np.array([0.6, 0.6, 0.8, 0.8, 0.8, 0.9])

# Piecewise-linear ('slinear') interpolant through the reference values
interp_func = interp1d(t_data, max_points_ratio_data, kind='slinear')

# Sampling grid: max(t) points between the first and the last time point
t_fine = np.linspace(t_data.min(), t_data.max(), t_data.max())
max_points_ratio_fine_new = interp_func(t_fine)

# Reference values against the interpolated curve
fig, ax = plt.subplots()
ax.scatter(t_data, max_points_ratio_data, label='Data')
ax.plot(t_fine, max_points_ratio_fine_new, label='Interpolated Curve', color='red')
ax.set_xlabel('t')
ax.set_ylabel('max_points_ratio_data')
ax.legend()
plt.show()


In [None]:
# Reference values of cluster_similarity_threshold per time point (new algorithm)
t_data = np.array([1, 20, 60, 80, 100, 118])
cluster_similarity_threshold_data = np.array([0.6, 0.5, 0.03, 0.02, 0.02, 0.02])

# Piecewise-linear ('slinear') interpolant through the reference values
interp_func = interp1d(t_data, cluster_similarity_threshold_data, kind='slinear')

# Sampling grid: max(t) points between the first and the last time point
t_fine = np.linspace(t_data.min(), t_data.max(), t_data.max())
cluster_similarity_threshold_fine_new = interp_func(t_fine)

# Reference values against the interpolated curve
fig, ax = plt.subplots()
ax.scatter(t_data, cluster_similarity_threshold_data, label='Data')
ax.plot(t_fine, cluster_similarity_threshold_fine_new, label='Interpolated Curve', color='red')
ax.set_xlabel('t')
ax.set_ylabel('cluster_similarity_threshold_data')
ax.legend()
plt.show()

In [None]:
# Reference values of min_points per time point (new algorithm)
t_data = np.array([1, 20, 60, 80, 100, 118])
min_points_data = np.array([100, 95, 50, 15, 5, 3])

# Piecewise-linear ('slinear') interpolant through the reference values
interp_func = interp1d(t_data, min_points_data, kind='slinear')

# Sampling grid: max(t) points between the first and the last time point
t_fine = np.linspace(t_data.min(), t_data.max(), t_data.max())
min_points_fine_new = interp_func(t_fine)

# Reference values against the interpolated curve
fig, ax = plt.subplots()
ax.scatter(t_data, min_points_data, label='Data')
ax.plot(t_fine, min_points_fine_new, label='Interpolated Curve', color='red')
ax.set_xlabel('t')
ax.set_ylabel('min_points_data')
ax.legend()
plt.show()

In [None]:
# Reference values of epsilon per time point (new algorithm)
t_data = np.array([1, 20, 60, 80, 100, 118])
epsilon_data = np.array([500, 450, 300, 300, 250, 250])

# Piecewise-linear ('slinear') interpolant through the reference values
interp_func = interp1d(t_data, epsilon_data, kind='slinear')

# Sampling grid: max(t) points between the first and the last time point
t_fine = np.linspace(t_data.min(), t_data.max(), t_data.max())
epsilon_fine_new = interp_func(t_fine)

# Reference values against the interpolated curve
fig, ax = plt.subplots()
ax.scatter(t_data, epsilon_data, label='Data')
ax.plot(t_fine, epsilon_fine_new, label='Interpolated Curve', color='red')
ax.set_xlabel('t')
ax.set_ylabel('epsilon_data')
ax.legend()
plt.show()

#### cluster all tables from Google Drive

##### importing values from drive

In [None]:
# These names are also imported in the import cell near the top of the notebook; they are
# repeated here, presumably so this section can be run on its own.
from google.colab import drive
import os
import pandas as pd
from tqdm import tqdm

# Mount Google Drive
# The CSV folder used in the next cell is read from below this mount point.
drive.mount('/content/drive')

In [None]:
# Folder in Google Drive that holds the raw CSV tables
folder_path = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/Original_Algorithm/Raw CSV'

# Names of all CSV files in that folder, in alphabetical order
csv_files = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))

# Read every file into a DataFrame, keeping the order of csv_files
csv_data = []
for csv_file in tqdm(csv_files, desc="Processing CSV files"):
    df = pd.read_csv(os.path.join(folder_path, csv_file))
    csv_data.append(df)


Note: the table that corresponds to df1 would sit at position `csv_files[117]`, so the order is inverted. For this reason, we can invert this one list of tables rather than inverting the six parameter lists, which is the other possibility.

In [None]:
# Coordinate-only copy of every loaded table, in the same order as csv_data
t_data = [table[['X', 'Y']].copy() for table in tqdm(csv_data)]

The DataFrame is at index 0. So for df1, the position is 0. It's in order.

##### Increase robustness merging clusters together

In [None]:
def delete_outliers(data, L_max=1000):
    """ Remove the points that lie too far from the centre of their own cluster.

    For every value of 'Labels', the cluster centre is the mean X and mean Y of its points. A point
    is kept only when it is within L_max of that centre in X and also within L_max in Y.

    Arguments:
        data: [DataFrame] table with 'X', 'Y' and 'Labels' columns
        L_max: maximum allowed distance from the cluster centre along each axis (default 1000)

    Return:
        [DataFrame] the remaining rows with their original index values. Rows are grouped by label
            in order of first appearance; rows without a label are kept and come first.
    """
    labels = data['Labels']
    unique_labels = labels.unique()

    # Nothing to filter
    if len(unique_labels) == 0:
        return data

    # Rows without a label belong to no cluster and are kept as they are
    unlabelled = data[labels.isna()]
    pieces = [unlabelled] if len(unlabelled) > 0 else []

    for cluster_label in unique_labels:
        # Get the rows corresponding to the current cluster
        cluster = data[labels == cluster_label]

        # Cluster centre, computed before any point of this cluster is removed
        X_cluster_location = cluster['X'].mean()
        Y_cluster_location = cluster['Y'].mean()

        # Keep the points within L_max of the centre along both axes
        within_x = (cluster['X'] - X_cluster_location).abs() <= L_max
        within_y = (cluster['Y'] - Y_cluster_location).abs() <= L_max
        pieces.append(cluster[within_x & within_y])

    # Single concatenation instead of rebuilding the whole table once per cluster
    return pd.concat(pieces)


In [None]:
from scipy.stats import gmean
import pandas as pd

def data_cluster(cluster):
    """ Summarise the points of one cluster in a one-row DataFrame.

    Arguments:
        cluster: [DataFrame] rows of a single cluster with 'IntDen', 'Area', 'X', 'Y' and 'Labels'
            columns

    Return:
        [DataFrame] one row with the columns
            - Ari_light_intensity / Geo_light_intensity: arithmetic / geometric mean of 'IntDen'
            - Ari_cell_size / Geo_cell_size: arithmetic / geometric mean of 'Area'
            - Cluster_dimension: sum of 'Area'
            - N_cell: number of rows
            - X_cluster_location / Y_cluster_location: mean of 'X' / 'Y'
            - Labels: mean of 'Labels'
    """
    n_cells = len(cluster)

    def arithmetic_mean(column):
        # Plain sum over the rows divided by the number of rows
        return sum(cluster[column]) / n_cells

    summary = {
        'Ari_light_intensity': [arithmetic_mean('IntDen')],
        'Geo_light_intensity': [gmean(cluster['IntDen'])],
        'Ari_cell_size': [arithmetic_mean('Area')],
        'Geo_cell_size': [gmean(cluster['Area'])],
        'Cluster_dimension': [sum(cluster['Area'])],
        'N_cell': [n_cells],
        'X_cluster_location': [arithmetic_mean('X')],
        'Y_cluster_location': [arithmetic_mean('Y')],
        'Labels': [arithmetic_mean('Labels')],
    }

    return pd.DataFrame(summary)

In [None]:
def merge_clusters_and_list(cluster_df, p=0):
    """ Repeatedly merge cluster summaries whose centres are close to each other.

    Arguments:
        cluster_df: [DataFrame] one row per cluster with the columns, in this order:
            Ari_light_intensity, Geo_light_intensity, Ari_cell_size, Geo_cell_size,
            Cluster_dimension, N_cell, X_cluster_location, Y_cluster_location, Labels.
            It must have a default 0..n-1 index and it is modified in place.
        p: amount subtracted from the base threshold of 1000 along X and along Y

    Return:
        cluster_list: [list] flat list of label pairs [old_0, new_0, old_1, new_1, ...]; every
            merge appends the label that disappears followed by the label that is kept
        cluster_df: the same DataFrame, with every merged pair replaced by a single combined row
    """
    cluster_list = []
    T_x = 1000 - p
    T_y = 1000 - p
    i = len(cluster_df) - 1

    while i >= 0:
        j = i + 1  # Only compare row i with the rows after it

        while j < len(cluster_df):
            row_i = cluster_df.iloc[i]
            row_j = cluster_df.iloc[j]

            distance_x, distance_y = calculate_distance(
                row_i['X_cluster_location'], row_i['Y_cluster_location'],
                row_j['X_cluster_location'], row_j['Y_cluster_location'])

            if distance_x < T_x and distance_y < T_y and i != j:
                # Record the pair: the label of row i is replaced by the label of row j
                cluster_list.append(row_i['Labels'])
                cluster_list.append(row_j['Labels'])

                n_i = row_i['N_cell']
                n_j = row_j['N_cell']
                new_N_cell = n_i + n_j

                def weighted_mean(column):
                    # Mean of the two clusters, weighted by their number of cells
                    return (row_i[column] * n_i + row_j[column] * n_j) / new_N_cell

                def weighted_geometric_mean(column):
                    # Same weighting applied to the logarithms, then mapped back
                    log_mean = (np.log(row_i[column]) * n_i + np.log(row_j[column]) * n_j) / new_N_cell
                    return np.exp(log_mean)

                # New row data, in the column order of cluster_df
                new_row = [
                    weighted_mean('Ari_light_intensity'),
                    weighted_geometric_mean('Geo_light_intensity'),
                    # Cell size is averaged from the cell-size column (not from the light intensity)
                    weighted_mean('Ari_cell_size'),
                    weighted_geometric_mean('Geo_cell_size'),
                    # Cluster_dimension is a total, so the two totals are added
                    row_i['Cluster_dimension'] + row_j['Cluster_dimension'],
                    new_N_cell,
                    weighted_mean('X_cluster_location'),
                    weighted_mean('Y_cluster_location'),
                    row_j['Labels'],
                ]

                # Append the merged row, then remove the two rows it replaces
                cluster_df.loc[len(cluster_df)] = new_row
                cluster_df.drop([i, j], inplace=True)
                cluster_df.reset_index(drop=True, inplace=True)

                # Position i now holds a different row: compare it with everything after it again
                j = i + 1
            else:
                j += 1

        i -= 1

    return cluster_list, cluster_df

In [None]:
def change_labels(cluster_list, data):
    """ Relabel points according to a flat list of (old label, new label) pairs.

    Arguments:
        cluster_list: [list] flat list [old_0, new_0, old_1, new_1, ...]; the pairs are applied in
            order, so a later pair can relabel the result of an earlier one
        data: [DataFrame] table with a 'Labels' column; it is modified in place

    Return:
        data: the same DataFrame object
    """
    for start in range(0, len(cluster_list), 2):
        old_label, new_label = cluster_list[start], cluster_list[start + 1]
        data.loc[data['Labels'] == old_label, 'Labels'] = new_label
    return data

In [None]:
def listing_clusters(dataframe):
    """ Split a labelled table into one DataFrame per cluster.

    Arguments:
        dataframe: [DataFrame] table with a 'Labels' column

    Return:
        [list] one DataFrame per distinct value of 'Labels', in order of first appearance, each
            holding the rows that carry that label
    """
    labels = dataframe['Labels'].unique()
    return [dataframe[dataframe['Labels'] == cluster_value] for cluster_value in labels]

In [None]:
def merge_cluster_single_points(data, p=0):
    """ Merge nearby clusters and apply the resulting labels to the single points.

    Arguments:
        data: [DataFrame] labelled points; it is passed to listing_clusters, data_cluster and
            change_labels
        p: forwarded to merge_clusters_and_list

    Return:
        (data, merged_cluster): the relabelled points returned by change_labels, and the table of
            merged cluster summaries returned by merge_clusters_and_list
    """
    # One summary row per cluster, stacked into a single table
    summaries = [data_cluster(points) for points in listing_clusters(data)]
    combined_df = pd.concat(summaries, ignore_index=True)

    # Merge the summaries, then propagate the merged labels to the points
    merged_pairs, merged_cluster = merge_clusters_and_list(combined_df, p)
    relabelled = change_labels(merged_pairs, data)
    return relabelled, merged_cluster

In [None]:
def run_clustering_no_plots(input_data, min_points=150, epsilon=500, t=150, w=5, max_points_ratio=0.5, cluster_similarity_threshold=0.8):
    """ Run the OPTICS + gradient clustering pipeline without plots or printed output.

    Arguments:
        input_data: table exposing `.values` (e.g. a DataFrame of X/Y coordinates)
        min_points: passed to runOPTICS and gradientClustering
        epsilon: passed to runOPTICS
        t: passed to gradientClustering
        w: passed to gradientClustering
        max_points_ratio: passed to filterLargeClusters
        cluster_similarity_threshold: passed to mergeSimilarClusters

    Return:
        (filtered_clusters, ordered_list): the clusters left after filtering and merging, and the
            array returned by runOPTICS
    """
    points = input_data.values

    # OPTICS ordering; column 1 holds the reachability distances
    ordered_list = runOPTICS(points, epsilon, min_points)
    reachability = ordered_list[:, 1]

    # Gradient clustering, removal of very large clusters, then merging of similar clusters
    candidate_clusters = gradientClustering(reachability, min_points, t, w)
    kept_clusters = filterLargeClusters(candidate_clusters, len(ordered_list), max_points_ratio)
    merged_clusters = mergeSimilarClusters(kept_clusters, cluster_similarity_threshold)

    return merged_clusters, ordered_list

In [None]:
def saveResults(df, ordered_list, time, clusters=None, filename='results.csv'):
  """Attach cluster labels to the points, write them to CSV, and return them.

  Parameters
  ----------
  df : DataFrame-like
      Point table; it is copied, so the caller's object is never modified.
  ordered_list : array
      OPTICS ordering. Columns 3 and 4 are written to 'X' and 'Y'
      (presumably the point coordinates - TODO confirm against runOPTICS).
  time : scalar
      Value stored in the 'Time' column of every labelled row. Note that the
      parameter name shadows the ``time`` module inside this function.
  clusters : sequence of index lists, optional
      Each entry selects rows of ``ordered_list`` (positionally) and rows of
      the point table (by index label). ``None`` or empty means no clusters.
  filename : str
      Destination CSV path.

  Returns
  -------
  pandas.DataFrame
      Only the rows that received a label. With no clusters this is an empty
      frame that still has 'Labels' and 'Time' columns (this case used to
      fail with ``KeyError`` on the missing 'Labels' column).
  """
  results = pd.DataFrame(df).copy()

  # Create the output columns up front so the dropna below always works.
  results['Labels'] = np.nan
  results['Time'] = np.nan

  # len() instead of truthiness so array-like input is not ambiguous.
  n_clusters = 0 if clusters is None else len(clusters)
  if n_clusters:
    # Random permutation of 0..n-1: every cluster gets a distinct integer id.
    # (Ids used to be derived from colormap colours, which can repeat when
    # there are more clusters than distinct colours.)
    label_ids = random.sample(range(n_clusters), n_clusters)
    for label_id, cluster in zip(label_ids, clusters):
      results.loc[cluster, 'X'] = ordered_list[cluster][:, 3]
      results.loc[cluster, 'Y'] = ordered_list[cluster][:, 4]
      results.loc[cluster, 'Labels'] = label_id
      results.loc[cluster, 'Time'] = time

  # Keep only the points that belong to some cluster.
  results = results.dropna(subset=['Labels'])
  results.to_csv(filename, index=False)
  return results

#### Example with some points

##### T02

In [None]:
# NOTE(review): this cell sits under the "T02" heading but clusters T10 with the same
# arguments as the T10 section further down - confirm whether T02 was intended.
clusters, coord = run_clustering_no_plots(T10, min_points=200, epsilon=300, w=5.5, max_points_ratio=0.6, cluster_similarity_threshold=0.6)

In [None]:
df10 = saveResults(df10, coord, 10, clusters, 'Results_010.csv')

In [None]:
plotPointsFromCSV(df10)

In [None]:
df02 = delete_outliers(df02)

This approach provides information about both the clusters and the individual points. Moreover, clusters and their individual points share the same labels.

##### T10

In [None]:
clusters, coord = run_clustering_no_plots(T10, min_points=200, epsilon=300, w=5.5, max_points_ratio=0.6, cluster_similarity_threshold=0.6)

In [None]:
df10 = saveResults(df10, coord, 10, clusters, 'Results_010.csv')

In [None]:
plotPointsFromCSV(df10)

In [None]:
df10 = delete_outliers(df10)

In [None]:
plotPointsFromCSV(df10)

##### T40

In [None]:
# NOTE(review): the other timepoints call run_clustering_no_plots; run_clustering is not
# defined in this part of the notebook - confirm it is the intended variant.
clusters, coord = run_clustering(T40, min_points=40, epsilon=300, w=4.75, max_points_ratio=0.6, cluster_similarity_threshold=0.4)

In [None]:
df40 = saveResults(df40, coord, 40, clusters, 'Results_040.csv')

In [None]:
plotPointsFromCSV(df40)

In [None]:
df40

In [None]:
cluster_list = listing_clusters(df40)

In [None]:
  # Step-by-step version of what merge_cluster_single_points does internally:
  # summarise each per-label frame with data_cluster and collect the results.
  stor = []
  for i in range(len(cluster_list)):
    stor.append(data_cluster(cluster_list[i]))

  # Combine the dataframes in stor into a single dataframe
  combined_df = pd.concat(stor, ignore_index=True)


In [None]:
cluster_list, merged_cluster = merge_clusters_and_list(combined_df)

In [None]:
data = change_labels(cluster_list, df40)

In [None]:
plotPointsFromCSV(data)

##### T60

In [None]:
# Cluster timepoint 60 without plots; arguments are listed one per line for readability.
clusters, coord = run_clustering_no_plots(
    T60,
    min_points=25,
    epsilon=250,
    w=4.5,
    max_points_ratio=0.8,
    cluster_similarity_threshold=0.1,
)

In [None]:
df60 = saveResults(df60, coord, 60, clusters, 'Results_060.csv')

In [None]:
plotPointsFromCSV(df60)

In [None]:
df60_new, merged_cluster = merge_cluster_single_points(df60, p =300)

In [None]:
plotPointsFromCSV(df60_new)

#### Generalisation

In [None]:
# Per-timepoint pipeline over every frame in csv_data:
# run_clustering_no_plots -> saveResults -> delete_outliers -> merge_cluster_single_points.
# NOTE(review): assumes t_data and every *_fine sequence have at least len(csv_data) entries.
df_list = []
cluster_list = []
for i in tqdm(range(len((csv_data)))):
  clusters, coord = run_clustering_no_plots(t_data[i], min_points=min_points_fine[i], epsilon=epsilon_fine[i], w=w_fine[i], max_points_ratio=max_points_ratio_fine[i], cluster_similarity_threshold = cluster_similarity_threshold_fine[i])
  # Time and the CSV file name are 1-based (list position + 1).
  df_tmp = saveResults(csv_data[i], coord, int(i + 1), clusters, 'Results_'+ str(i + 1) +'.csv')
  df_tmp = delete_outliers(df_tmp)
  df_tmp, merged_cluster = merge_cluster_single_points(df_tmp, p =p_fine[i])
  df_list.append(df_tmp)
  merged_cluster['Time'] = int(i + 1)
  cluster_list.append(merged_cluster)

We can save the values so they won't get lost if something goes wrong.

If I want to save all the files together

In [None]:
# Specify the file path where you want to save the list
# NOTE(review): absolute Google Drive paths - Drive must already be mounted at /content/drive.
file_path_df = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/Original_Algorithm/df_list.pkl'
file_path_cluster = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/Original_Algorithm/cluster_list.pkl'
# Open the file in binary write mode and save the list using pickle.dump
with open(file_path_df, 'wb') as file:
    pickle.dump(df_list, file)

with open(file_path_cluster, 'wb') as file:
    pickle.dump(cluster_list, file)

If I want to take my file back

In [None]:
# Specify the same file path where you saved the list
file_path_df = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/Original_Algorithm/df_list.pkl'
file_path_cluster = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/Original_Algorithm/cluster_list.pkl'


# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load pickles that this notebook wrote itself.
# Load the per-timepoint point tables
with open(file_path_df, 'rb') as file:
    df_list = pickle.load(file)

# Load the per-timepoint cluster summaries
with open(file_path_cluster, 'rb') as file:
    cluster_list = pickle.load(file)


If I want to save the files independently

In [None]:
# Saves only cluster_list, to the same file the combined save cell writes.
file_path_cluster = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/Original_Algorithm/cluster_list.pkl'
# Open the file in binary write mode and save the list using pickle.dump

with open(file_path_cluster, 'wb') as file:
    pickle.dump(cluster_list, file)

#### Link clusters together

##### Used functions

Now it is time to work with the files to link the clusters together

In [None]:
import math

def calculate_distance_euclide(x1, y1, x2, y2):
    """Return the straight-line (Euclidean) distance between (x1, y1) and (x2, y2)."""
    dx, dy = x2 - x1, y2 - y1
    return math.sqrt(dx**2 + dy**2)


In [None]:
def calculate_distance(x1, y1, x2, y2):
    """Return the absolute per-axis separations ``(|x2 - x1|, |y2 - y1|)``."""
    return abs(x2 - x1), abs(y2 - y1)

In [None]:
def link_clusters(far_time_point, recent_time_point, linked_clusters, p = 0, s = 0):
    """Give each cluster of a new timepoint the label of its nearest earlier cluster.

    Parameters
    ----------
    far_time_point : pandas.DataFrame
        Clusters of the earlier timepoint. Needs the columns
        'X_cluster_location', 'Y_cluster_location', 'N_cell' and 'Labels'.
    recent_time_point : pandas.DataFrame
        Clusters of the new timepoint, same columns.
    linked_clusters : pandas.DataFrame
        All clusters linked so far; its largest 'Labels' value is the base for
        labels given to clusters that match nothing.
    p : number
        Reduces the maximum linking distance, which is ``1000 - p``.
    s : number
        Reduces the maximum allowed difference in 'N_cell', which is ``5000 - s``.

    Returns
    -------
    (linked_clusters, recent_time_point, swap_list)
        ``linked_clusters`` with the relabelled new clusters appended, the
        relabelled new clusters (index reset to 0..n-1), and a flat list
        ``[old, -new, old, -new, ...]`` for ``swapping_labels``.

    Notes
    -----
    * None of the inputs is modified.
    * Each earlier cluster can be claimed at most once; new clusters are
      processed in row order and take the closest eligible earlier cluster
      (first one on ties).
    * Every new label in ``swap_list`` is negated, including the fresh labels of
      unmatched clusters. They cannot then be mistaken for an old label by a
      later pair while the pairs are applied in sequence; ``swapping_labels``
      restores the sign with ``abs``.
    """
    linked_clusters = linked_clusters.copy()
    # Rows are addressed by position below, so force a 0..n-1 index; with any
    # other index the label assignments would hit the wrong rows or add rows.
    far_time_point = far_time_point.copy().reset_index(drop=True)
    recent_time_point = recent_time_point.copy().reset_index(drop=True)

    max_distance = 1000 - p
    max_cell_difference = 5000 - s

    swap_list = []
    k = 1  # offset for the next fresh label
    for i in range(len(recent_time_point)):
        row = recent_time_point.iloc[i]
        old_label = row['Labels']

        closest = None
        if len(far_time_point) > 0:
            far_x = far_time_point['X_cluster_location'].to_numpy(dtype=float)
            far_y = far_time_point['Y_cluster_location'].to_numpy(dtype=float)
            far_n = far_time_point['N_cell'].to_numpy(dtype=float)
            distances = np.sqrt((far_x - row['X_cluster_location']) ** 2
                                + (far_y - row['Y_cluster_location']) ** 2)
            eligible = (distances < max_distance) & (np.abs(far_n - row['N_cell']) < max_cell_difference)
            if eligible.any():
                # argmin returns the first minimum, matching a strict '<' scan.
                closest = int(np.where(eligible, distances, np.inf).argmin())

        swap_list.append(old_label)
        if closest is not None:
            swap_list.append(-far_time_point.iloc[closest]['Labels'])
            recent_time_point.loc[i, 'Labels'] = far_time_point.loc[closest, 'Labels']
            # The matched earlier cluster is no longer available.
            far_time_point = far_time_point.drop(index=closest).reset_index(drop=True)
        else:
            recent_time_point.loc[i, 'Labels'] = (max(linked_clusters['Labels']) + k)
            k = k + 1
            swap_list.append(-recent_time_point.iloc[i]['Labels'])

    linked_clusters = pd.concat([linked_clusters, recent_time_point], axis=0, ignore_index=True)

    return linked_clusters, recent_time_point, swap_list

In [None]:
def swapping_labels(df_all, df_new, swap_list):
    """Relabel ``df_new`` with ``swap_list`` and append it to ``df_all``.

    ``swap_list`` is the flat (old, new) list produced by ``link_clusters``;
    new labels may be negative there, so the sign is removed after relabelling.
    Neither input frame is modified. Returns the concatenated frame with a
    fresh 0..n-1 index.
    """
    accumulated = df_all.copy()
    relabelled = change_labels(swap_list, df_new.copy())

    # Negation followed by abs leaves every label non-negative.
    relabelled['Labels'] = abs(-relabelled['Labels'])

    return pd.concat([accumulated, relabelled], axis=0, ignore_index=True)

In [None]:
def change_labels(cluster_list, data):
    """Return a relabelled copy of ``data`` from a flat list of (old, new) pairs.

    Parameters
    ----------
    cluster_list : sequence
        Flat sequence ``[old_0, new_0, old_1, new_1, ...]``. It is not
        modified; a trailing entry without a partner is ignored (it used to
        raise ``IndexError``).
    data : pandas.DataFrame
        Frame with a 'Labels' column; the caller's frame is left untouched.

    Notes
    -----
    * Old labels are matched on their string form, so ``3`` and ``3.0`` are
      different labels.
    * Pairs are applied one after another. A new label of 0 is written as the
      placeholder -1000 until all pairs are done, so that a later pair whose
      old label is 0 cannot pick those rows up again. A real label of -1000
      would therefore also end up as 0.
    """
    zero_placeholder = -1000
    data = data.copy()
    for old_label, new_label in zip(cluster_list[0::2], cluster_list[1::2]):
        if new_label == 0:
            new_label = zero_placeholder
        data.loc[data['Labels'].astype(str) == str(old_label), 'Labels'] = new_label

    data.loc[data['Labels'] == zero_placeholder, 'Labels'] = 0
    return data

In [None]:
def cluster_display_plots_in_colab(data):
    """Show one cluster-centre plot per distinct value in ``data['Time']``.

    The same function is defined again further down the notebook.
    """
    for timepoint in data['Time'].unique():
        # Rows belonging to the current timepoint only
        sub_data = data[data['Time'] == timepoint]

        # cluster_same_colour opens its own figure and shows it. Creating another
        # figure here first only produced an extra empty plot per timepoint.
        cluster_same_colour(sub_data, title=f"Timepoint {timepoint}")

        # Harmless if the figure has already been shown.
        plt.show()

In [None]:
def display_png_plots(data):
    """Show one point plot per distinct value in ``data['Time']``."""
    for timepoint in data['Time'].unique():
        # Rows belonging to the current timepoint only
        sub_data = data[data['Time'] == timepoint]

        # plotPointsFromCSV_same_colour creates the figure itself. Creating another
        # figure here first only produced an extra empty plot per timepoint.
        plotPointsFromCSV_same_colour(sub_data, title=f"Timepoint {timepoint}")

        # Display the current figure in the Colab notebook
        plt.show()

In [None]:
def plotPointsFromCSV_same_colour(data, title=''):
    """Scatter the 'X'/'Y' points of ``data`` coloured by 'Labels'; return the figure.

    Colours come from a 100-entry "Set1" palette assigned to labels in order of
    first appearance; labels beyond the 100th are drawn in black. The axes are
    fixed to 0..6500 in both directions with a black background.
    """
    coords = data[['X', 'Y']].to_numpy()
    point_labels = data['Labels']
    distinct_labels = point_labels.unique()

    # zip stops at the shorter sequence, so at most 100 labels get a palette colour.
    colour_for = dict(zip(distinct_labels, sns.color_palette("Set1", n_colors=100)))

    fig, ax = plt.subplots()

    # Base layer: every point in black with a thin white outline.
    ax.scatter(coords[:, 0], coords[:, 1], c='k', linewidth=0.2, edgecolor='w', facecolor=None)

    # One coloured layer (and legend entry) per label.
    for current in distinct_labels:
        members = coords[point_labels == current]
        ax.scatter(members[:, 0], members[:, 1], color=colour_for.get(current, 'k'),
                   linewidth=0.2, edgecolor='w', label=str(current))

    ax.set_title(title)

    # Fixed 6500 x 6500 viewport so all timepoints are directly comparable.
    ax.set_xlim(0, 6500)
    ax.set_ylim(0, 6500)

    ax.grid(color='0.5')
    ax.set_facecolor('black')
    ax.set_aspect('equal')

    # Legend sits just outside the top-right corner of the axes.
    ax.legend(title='Labels', loc='upper right', bbox_to_anchor=(1.2, 1.0))

    return fig


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plotPointsFromCSV_same_colour_unique(data, title=''):
    """Scatter the 'X'/'Y' points of ``data`` coloured by 'Labels'; return the figure.

    The "Set1" palette is requested with exactly one entry per distinct label.
    The axes are fixed to 0..6500 in both directions with a black background.
    """
    coords = data[['X', 'Y']].to_numpy()
    point_labels = data['Labels']
    distinct_labels = point_labels.unique()

    # One palette entry per label, paired in order of first appearance.
    colour_for = dict(zip(distinct_labels, sns.color_palette("Set1", n_colors=len(distinct_labels))))

    fig, ax = plt.subplots()

    # Base layer: every point in black with a thin white outline.
    ax.scatter(coords[:, 0], coords[:, 1], c='k', linewidth=0.2, edgecolor='w', facecolor=None)

    # One coloured layer (and legend entry) per label.
    for current in distinct_labels:
        members = coords[point_labels == current]
        ax.scatter(members[:, 0], members[:, 1], color=colour_for.get(current, 'k'),
                   linewidth=0.2, edgecolor='w', label=str(current))

    ax.set_title(title)

    # Fixed 6500 x 6500 viewport so all timepoints are directly comparable.
    ax.set_xlim(0, 6500)
    ax.set_ylim(0, 6500)

    ax.grid(color='0.5')
    ax.set_facecolor('black')
    ax.set_aspect('equal')

    # Legend sits just outside the top-right corner of the axes.
    ax.legend(title='Labels', loc='upper right', bbox_to_anchor=(1.2, 1.0))

    return fig


In [None]:
def cluster_same_colour(data, title=''):
    """Plot cluster centres coloured by 'Labels' and show the figure.

    Reads 'X_cluster_location' / 'Y_cluster_location'. Colours come from a
    10-entry "Set1" palette in order of first appearance; labels beyond the
    10th are drawn in black. Nothing is returned.
    """
    centres = data[['X_cluster_location', 'Y_cluster_location']].to_numpy()
    centre_labels = data['Labels']
    distinct_labels = centre_labels.unique()

    # zip stops at the shorter sequence, so at most 10 labels get a palette colour.
    colour_for = dict(zip(distinct_labels, sns.color_palette("Set1", n_colors=10)))

    fig, ax = plt.subplots()

    # Base layer: every centre in black with a thin white outline.
    ax.scatter(centres[:, 0], centres[:, 1], c='k', linewidth=0.2, edgecolor='w', facecolor=None)

    # One coloured layer (and legend entry) per label.
    for current in distinct_labels:
        members = centres[centre_labels == current]
        ax.scatter(members[:, 0], members[:, 1], color=colour_for.get(current, 'k'),
                   linewidth=0.2, edgecolor='w', label=str(current))

    ax.set_title(title)

    # Fixed 6500 x 6500 viewport so all timepoints are directly comparable.
    ax.set_xlim(0, 6500)
    ax.set_ylim(0, 6500)

    ax.grid(color='0.5')
    ax.set_facecolor('black')
    ax.set_aspect('equal')

    # Legend sits just outside the top-right corner of the axes.
    ax.legend(title='Labels', loc='upper right', bbox_to_anchor=(1.2, 1.0))

    plt.tight_layout()
    plt.show()

In [None]:
def cluster_display_plots_in_colab(data):
    """Show one cluster-centre plot per distinct value in ``data['Time']``.

    This repeats a definition that appears earlier in the notebook.
    """
    for timepoint in data['Time'].unique():
        # Rows belonging to the current timepoint only
        sub_data = data[data['Time'] == timepoint]

        # cluster_same_colour opens its own figure and shows it. Creating another
        # figure here first only produced an extra empty plot per timepoint.
        cluster_same_colour(sub_data, title=f"Timepoint {timepoint}")

        # Harmless if the figure has already been shown.
        plt.show()


def display_plots_in_colab(data):
    """Show one point plot per distinct value in ``data['Time']``.

    Does the same as ``display_png_plots`` defined earlier in the notebook.
    """
    for timepoint in data['Time'].unique():
        # Rows belonging to the current timepoint only
        sub_data = data[data['Time'] == timepoint]

        # plotPointsFromCSV_same_colour creates the figure itself. Creating another
        # figure here first only produced an extra empty plot per timepoint.
        plotPointsFromCSV_same_colour(sub_data, title=f"Timepoint {timepoint}")

        # Display the current figure in Colab
        plt.show()

perform Linking values

##### Iteration for Original_Algorithm

In [None]:
################################################################################################
# Read the linked-cluster table and the combined point table from Drive (original algorithm).
clusters_ori = pd.read_csv('/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/Original_Algorithm/linked_clusters.csv')
df_ori = pd.read_csv('/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/Original_Algorithm/df_all.csv')

In [None]:
# Specify the same file path where you saved the list
file_path_df_or = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/Original_Algorithm/df_list.pkl'
file_path_cluster_or = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/Original_Algorithm/cluster_list.pkl'

# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load pickles that this notebook wrote itself.
# Load the per-timepoint point tables
with open(file_path_df_or, 'rb') as file:
    df_list = pickle.load(file)

# Load the per-timepoint cluster summaries
with open(file_path_cluster_or, 'rb') as file:
    cluster_list = pickle.load(file)

In [None]:
# Start from the first timepoint, then link every later timepoint to the one before it.
linked_clusters = cluster_list[0].copy()
df_all = df_list[0].copy()
new_old_time_point = cluster_list[0].copy()
# NOTE(review): p is taken at position i+1 but s at position i - confirm this offset is intended.
for i in tqdm(range(len(df_list) - 1)):
  linked_clusters, new_old_time_point, swap_list = link_clusters(
      new_old_time_point,
      cluster_list[i + 1],
      linked_clusters,
      p_fine[i+1],
      s_fine[i],
  )
  df_all = swapping_labels(df_all, df_list[i + 1], swap_list)

It may be better to save the new results

In [None]:
# Save the linked cluster table to Google Drive
linked_clusters.to_csv('/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/Original_Algorithm/linked_clusters_v2.csv', index=False)

# Save the combined, relabelled point table to Google Drive
df_all.to_csv('/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/Original_Algorithm/df_all_v2.csv', index=False)


#### Iteration for New_algorithm

This code incorporates the segmentations coming from New_algorithm. As fewer cells were considered (mostly those in the 4 main clusters), it may be easier to have constant values, as well as to study the main clusters over time. A comparison between multiple algorithms can be useful for understanding several points.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Define the folder path in your Google Drive containing the CSV files
folder_path = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/Raw CSV'

# List all CSV files in the folder and sort them alphabetically
# NOTE(review): list position is later used as the timepoint number, so the file names
# must sort lexicographically in time order (e.g. zero-padded numbers) - TODO confirm.
csv_files_new = sorted([csv_file for csv_file in os.listdir(folder_path) if csv_file.endswith('.csv')])

csv_data_new = []  # List to store dataframes

for csv_file_new in tqdm(csv_files_new, desc="Processing CSV files"):
    df = pd.read_csv(os.path.join(folder_path, csv_file_new))
    csv_data_new.append(df)

In [None]:
# Coordinate-only copy of every new-algorithm frame (input for the clustering step).
# The loop length comes from csv_data_new itself; it used to come from csv_data,
# which belongs to the original-algorithm data and may have a different length.
t_data_new = []
for i in tqdm(range(len(csv_data_new))):
  t_data_new.append(csv_data_new[i][['X', 'Y']].copy())

In [None]:
# Same per-timepoint pipeline as for the original algorithm, applied to csv_data_new.
# NOTE(review): the merge step uses p_fine, not a *_new variant like the other parameters - confirm.
# NOTE(review): the 'Results_<n>.csv' names are the same ones the original-algorithm loop
# writes, so those files are overwritten here.
df_list_new = []
cluster_list_new = []
for i in tqdm(range(len((csv_data_new)))):
  clusters, coord = run_clustering_no_plots(t_data_new[i], min_points=min_points_fine_new[i], epsilon=epsilon_fine_new[i], w=w_fine_new[i], max_points_ratio=max_points_ratio_fine_new[i], cluster_similarity_threshold = cluster_similarity_threshold_fine_new[i])
  df_tmp = saveResults(csv_data_new[i], coord, int(i + 1), clusters, 'Results_'+ str(i + 1) +'.csv')
  df_tmp = delete_outliers(df_tmp)
  df_tmp, merged_cluster = merge_cluster_single_points(df_tmp, p =p_fine[i])
  df_list_new.append(df_tmp)
  merged_cluster['Time'] = int(i + 1)
  cluster_list_new.append(merged_cluster)

To save the list

Here there is a previous part that has been considered. In this case the purpose of this part is

In [None]:
# Specify the file path where you want to save the list
# NOTE(review): absolute Google Drive paths - Drive must already be mounted at /content/drive.
file_path_df_new = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/df_list_new.pkl'
file_path_cluster_new = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/cluster_list_new.pkl'
# Open the file in binary write mode and save the list using pickle.dump
with open(file_path_df_new, 'wb') as file:
    pickle.dump(df_list_new, file)

with open(file_path_cluster_new, 'wb') as file:
    pickle.dump(cluster_list_new, file)

To take the list that has been saved

In [None]:
# Specify the same file path where you saved the list
file_path_df_new = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/df_list_new.pkl'
file_path_cluster_new = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/cluster_list_new.pkl'

# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load pickles that this notebook wrote itself.
# Load the per-timepoint point tables
with open(file_path_df_new, 'rb') as file:
    df_list_new = pickle.load(file)

# Load the per-timepoint cluster summaries
with open(file_path_cluster_new, 'rb') as file:
    cluster_list_new = pickle.load(file)

In [None]:
# Start from the first timepoint, then link every later timepoint to the one before it.
linked_clusters_new = cluster_list_new[0].copy()
df_all_new = df_list_new[0].copy()
new_old_time_point = cluster_list_new[0].copy()
# NOTE(review): p is taken at position i+1 but s at position i, and both come from the
# p_fine / s_fine lists rather than *_new variants - confirm.
for i in tqdm(range(len(df_list_new) - 1)):
  linked_clusters_new, new_old_time_point, swap_list = link_clusters(
      new_old_time_point,
      cluster_list_new[i + 1],
      linked_clusters_new,
      p_fine[i+1],
      s_fine[i],
  )
  df_all_new = swapping_labels(df_all_new, df_list_new[i + 1], swap_list)

In [None]:
# Save the linked cluster table to Google Drive
linked_clusters_new.to_csv('/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/linked_clusters_new.csv', index=False)

# Save the combined, relabelled point table to Google Drive
df_all_new.to_csv('/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/df_all_new.csv', index=False)

In [None]:
# Reload the two tables from the same paths the previous cell writes to,
# so the linking step does not have to be re-run.
linked_clusters_new = pd.read_csv('/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/linked_clusters_new.csv')
df_all_new = pd.read_csv('/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/df_all_new.csv')

###### Code that I have to put above. It's the "right one"

## Analyse

### General plot analysis

#### Used functions

Obtain a PDF from values

You can download .png files

In [None]:
import os
import matplotlib.pyplot as plt
import seaborn as sns

def create_png_plots(data, output_folder):
    """Write one PNG per distinct 'Time' value of ``data`` into ``output_folder``.

    The folder is created when it does not exist. Files are named
    ``timepoint_<value>.png`` and rendered at 300 dpi with
    ``plotPointsFromCSV_same_colour``.
    """
    folder_missing = not os.path.exists(output_folder)
    if folder_missing:
        os.makedirs(output_folder)

    for timepoint in data['Time'].unique():
        rows_at_timepoint = data[data['Time'] == timepoint]
        fig = plotPointsFromCSV_same_colour(rows_at_timepoint, title=f"Timepoint {timepoint}")

        fig.savefig(os.path.join(output_folder, f"timepoint_{timepoint}.png"), dpi=300)

        # Release the figure; otherwise every timepoint keeps one open.
        plt.close(fig)


In [None]:
def merge_pngs_to_pdf(gdrive_folder, output_pdf_name):
    """Combine every PNG in a Drive folder into one PDF, one image per page.

    Parameters
    ----------
    gdrive_folder : str
        Folder with the PNG files, joined onto '/content/drive/My Drive'.
        An absolute path replaces that prefix (``os.path.join`` semantics).
    output_pdf_name : str
        Output file, joined onto '/content'; an absolute path is used as is.

    Returns
    -------
    str or None
        Path of the written PDF, or ``None`` when the folder has no PNG files.

    Notes
    -----
    Pages are ordered by file name with embedded numbers compared numerically,
    so ``timepoint_2.png`` comes before ``timepoint_10.png``. (The page order
    used to be whatever order the directory listing returned.)
    """
    import re  # local import: only needed for the sort key below

    def _natural_key(path):
        # re.split with a capturing group alternates text, digits, text, ...
        # so the odd positions are always the digit runs.
        parts = re.split(r'(\d+)', os.path.basename(path))
        return [int(part) if position % 2 else part for position, part in enumerate(parts)]

    # Mount Google Drive to access the folder with PNG files
    drive.mount('/content/drive')

    folder_path = os.path.join('/content/drive/My Drive', gdrive_folder)

    png_files = sorted(
        (os.path.join(folder_path, filename) for filename in os.listdir(folder_path) if filename.endswith('.png')),
        key=_natural_key,
    )

    if not png_files:
        print("No PNG files found in the folder.")
        return

    pdf_path = os.path.join('/content', output_pdf_name)
    c = canvas.Canvas(pdf_path, pagesize=letter)

    for png_file in tqdm(png_files, desc="Merging to PDF"):
        # Only the pixel size is needed; close the file handle right away.
        with Image.open(png_file) as img:
            img_width, img_height = img.size
        # Shrink by a fixed factor of 3.5 (floor division, result is a float).
        img_width //= 3.5
        img_height //= 3.5
        # Small left margin; the image is anchored at the bottom of the page.
        x_offset = 20
        c.drawImage(png_file, x_offset, 0, width=img_width, height=img_height)

        # Finish this page; the next image starts on a new one.
        c.showPage()

    c.save()

    return pdf_path



In [None]:
def adjust_clusters_disappeared(data, threshold = 70):
  """Fold a cluster into a lower-numbered cluster that sits at nearly the same place.

  For every label, the mean 'X'/'Y' position over all rows is compared with
  the means of the labels below it, in ascending order. The first one whose
  mean differs by less than ``threshold`` on both axes becomes the new label.

  Parameters
  ----------
  data : pandas.DataFrame
      Needs 'Labels', 'X' and 'Y' columns; a relabelled copy is returned and
      the input is left untouched.
  threshold : number
      Maximum per-axis difference between two cluster means for a merge.

  Notes
  -----
  Labels do not have to be contiguous: integers below a label that are not
  labels themselves are skipped (they used to raise ``KeyError``).
  """
  data = data.copy()

  # Mean position of every label, indexed by label.
  cluster_means = data.groupby('Labels')[['X', 'Y']].mean()

  # label -> smaller label it should be replaced with
  min_label_for_cluster = {}

  for label, row in cluster_means.iterrows():
      x_mean = row['X']
      y_mean = row['Y']

      label = int(label)

      for smaller_label in range(label):
          # NOTE(review): label 1 is never used as a merge target; the reason
          # is not visible in this notebook section - confirm it is intended.
          if smaller_label == 1:
              continue
          # Skip integers that are not actual labels.
          if smaller_label not in cluster_means.index:
              continue
          smaller_x_mean, smaller_y_mean = cluster_means.loc[smaller_label]
          if abs(x_mean - smaller_x_mean) < threshold and abs(y_mean - smaller_y_mean) < threshold:
              min_label_for_cluster[label] = smaller_label
              break

  # Apply the replacements in ascending label order.
  for label, min_label in min_label_for_cluster.items():
      data.loc[data['Labels'] == label, 'Labels'] = min_label
  return data
# Now 'df' contains updated labels based on the smallest label within the threshold difference

In [None]:
def order_labels(df):
  """Return a copy of ``df`` whose 'Labels' are renumbered 0, 1, 2, ... by sorted rank."""
  relabelled = df.copy()

  # Smallest existing label -> 0, next -> 1, and so on.
  ranked = sorted(relabelled['Labels'].unique())
  rank_of = dict(zip(ranked, range(len(ranked))))

  relabelled['Labels'] = relabelled['Labels'].map(rank_of)
  return relabelled

In [None]:
from scipy.stats import gmean
import pandas as pd

def data_cluster_updated(cluster):
    """Summarise one cluster (a frame of cells) as a single ``pandas.Series``.

    Reads the columns 'IntDen', 'Area', 'X', 'Y' and 'Labels'. The result
    holds the geometric means of 'IntDen' and 'Area', the summed 'Area', the
    number of rows, the arithmetic mean position, and the label of the first
    row (stored under 'Cluster_Labels').
    """
    n_cells = len(cluster)
    summary = {
        'Geo_light_intensity': gmean(cluster['IntDen']),
        'Geo_cell_size': gmean(cluster['Area']),
        'Cluster_dimension': sum(cluster['Area']),
        'N_cell': n_cells,
        'X': sum(cluster['X']) / n_cells,
        'Y': sum(cluster['Y']) / n_cells,
        # All rows are expected to share one label; the first one is used.
        'Cluster_Labels': cluster['Labels'].iloc[0],
    }
    return pd.Series(summary)


#### New Algorithm

In [None]:
# NOTE(review): in this part of the notebook df_all_new_updated is only loaded in a later
# cell (pd.read_csv of df_all_new_updated_v2.csv); on a fresh kernel this cell fails
# unless it is defined earlier - confirm and reorder if needed.
df = df_all_new_updated.copy()
# Keep only the four main clusters.
selected_labels = [0, 1, 2, 3]
df = df[df['Labels'].isin(selected_labels)]

data = df
output_folder_png = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/clusters/N=0123'
create_png_plots(data, output_folder_png)

In [None]:
# Both arguments are absolute paths. merge_pngs_to_pdf joins them onto
# '/content/drive/My Drive' and '/content' with os.path.join, which drops the
# prefix when the second part is absolute, so these paths are used unchanged.
gdrive_folder = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/clusters/N=0123'
output_pdf_name = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/clusters/N=0123/new_model.pdf'

result_pdf_path = merge_pngs_to_pdf(gdrive_folder, output_pdf_name)

print(f"PDF saved at: {result_pdf_path}")

##### Fixing some values due to code

Note that the values here are very close to each other; there are only minimal differences.

In [None]:
df_all_new = adjust_clusters_disappeared(df_all_new)

In [None]:
data = df_all_new
output_folder_png = '/content/drive/MyDrive/Academic Work/Utkarsha Khanal/CSV_files/New_algorithm/png_files_updated'
create_png_plots(data, output_folder_png)

In [None]:
# NOTE(review): the PNGs are read from 'png_files_updated' but the PDF is written into
# 'png_files' - confirm that folder exists and that this is the intended location.
gdrive_folder = '/content/drive/MyDrive/Academic Work/Utkarsha Khanal/CSV_files/New_algorithm/png_files_updated'
output_pdf_name = '/content/drive/MyDrive/Academic Work/Utkarsha Khanal/CSV_files/New_algorithm/png_files/new_model_updated.pdf'

result_pdf_path = merge_pngs_to_pdf(gdrive_folder, output_pdf_name)

print(f"PDF saved at: {result_pdf_path}")

In [None]:
# NOTE(review): `proviamo` is not defined anywhere in this part of the notebook;
# presumably df_all_new was meant - confirm before running on a fresh kernel.
output_folder_png = '/content/drive/MyDrive/Academic Work/Utkarsha Khanal/CSV_files/New_algorithm/png_files_updated_to_delete'
create_png_plots(proviamo, output_folder_png)

In [None]:
gdrive_folder = '/content/drive/MyDrive/Academic Work/Utkarsha Khanal/CSV_files/New_algorithm/png_files_updated_to_delete'
output_pdf_name = '/content/drive/MyDrive/Academic Work/Utkarsha Khanal/CSV_files/New_algorithm/png_files_updated_to_delete/new_model_updated_to_delete.pdf'

result_pdf_path = merge_pngs_to_pdf(gdrive_folder, output_pdf_name)

print(f"PDF saved at: {result_pdf_path}")

perform final adjustments

In [None]:
df_all_new = order_labels(df_all_new)

In [None]:
# Define replacement dictionary
replacement_dict = {35: 3, 97:3, 24:3, 106:3, 30:1, 22:1, 84:1, 95:1, 103:2, 28:2, 129:0, 39:0}

# Replace values in the 'label' column using the dictionary
df_all_new['Labels'] = df_all_new['Labels'].replace(replacement_dict)


In [None]:
df_all_new = order_labels(df_all_new)

Here we can save our values

In [None]:
# NOTE(review): `proviamo` is not defined anywhere in this part of the notebook;
# presumably df_all_new was meant - confirm before running on a fresh kernel.
output_folder_png = '/content/drive/MyDrive/Academic Work/Utkarsha Khanal/CSV_files/New_algorithm/png_files_updated_completev2'
create_png_plots(proviamo, output_folder_png)

In [None]:
gdrive_folder = '/content/drive/MyDrive/Academic Work/Utkarsha Khanal/CSV_files/New_algorithm/png_files_updated_completev2'
output_pdf_name = '/content/drive/MyDrive/Academic Work/Utkarsha Khanal/CSV_files/New_algorithm/png_files_updated_completev2/new_model_updated_complete.pdf'

result_pdf_path = merge_pngs_to_pdf(gdrive_folder, output_pdf_name)

print(f"PDF saved at: {result_pdf_path}")

In [None]:
# For each of the four main clusters (labels 0-3): write one PNG per timepoint
# into that cluster's folder, then bundle the folder's PNGs into one PDF.
for i in range(4):
  selected_label = i
  selected_cluster = df_all_new_updated[df_all_new_updated['Labels'] == selected_label]

  output_folder_png = '/content/drive/MyDrive/Academic Work/Utkarsha Khanal/CSV_files/New_algorithm/clusters/N='+ str(i)
  create_png_plots(selected_cluster, output_folder_png)

  # Same folder as output_folder_png; the PDF is written inside it.
  gdrive_folder = '/content/drive/MyDrive/Academic Work/Utkarsha Khanal/CSV_files/New_algorithm/clusters/N='+ str(i)
  output_pdf_name = '/content/drive/MyDrive/Academic Work/Utkarsha Khanal/CSV_files/New_algorithm/clusters/N='+ str(i) + '/complete_pdf_N=' + str(i) + '.pdf'

  result_pdf_path = merge_pngs_to_pdf(gdrive_folder, output_pdf_name)


Since the data frame is the same, it is also possible to select just one particular cluster and see how it develops.

after we have saved the final value we can also create a new cluster from it. So we can analyse our data

In [None]:
import pandas as pd

In [None]:
df_all_new_updated = pd.read_csv('/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/df_all_new_updated_v2.csv')

Obtain updated clusters

In [None]:
# Arithmetic mean of every column per (label, timepoint)
cluster_partial_1 = df_all_new_updated.groupby(['Labels', 'Time']).mean().reset_index()

# Custom per-cluster summary (geometric means, total area, cell count, ...)
# per (label, timepoint)
cluster_partial_2 = df_all_new_updated.copy()
cluster_partial_2 = cluster_partial_2.groupby(['Labels', 'Time']).apply(data_cluster_updated).reset_index()

# Inner join of the two summaries on (label, timepoint). Columns present in both,
# such as X and Y, get the suffixes _x (first table) and _y (second table).
# The merge used to reference result_df / result_df1, which are not defined here.
merged_df = pd.merge(cluster_partial_1, cluster_partial_2, how='inner', on=['Labels', 'Time'])


In [None]:
# NOTE(review): unlike the other to_csv calls in this notebook, no index=False is passed,
# so the row index is written as an extra unnamed first column - confirm this is wanted.
merged_df.to_csv('/content/drive/MyDrive/Academic Work/Utkarsha Khanal/CSV_files/New_algorithm/cluster_new_updated_v2.csv')

In [None]:
cluster_new_updated = pd.read_csv('/content/drive/MyDrive/Academic Work/Utkarsha Khanal/CSV_files/New_algorithm/cluster_new_updated_v2.csv')

###### Location

In [None]:
# Trajectory of the four main clusters: position (two axes) against time.
df = cluster_new_updated.copy()
selected_labels = [0, 1, 2, 3]
df = df[df['Labels'].isin(selected_labels)]

# Get unique labels from the DataFrame
unique_labels = df['Labels'].unique()

# Create a 3D plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Plot each label's X_x and Y_y positions over time
# (X_x / Y_y are presumably the suffixed columns created by the merge that built
# this table - TODO confirm)
for label in unique_labels:
    label_data = df[df['Labels'] == label]

    ax.plot(label_data['Time'], label_data['X_x'], label_data['Y_y'], label=label)

# Set labels and title
ax.set_xlabel('Time')
ax.set_ylabel('X_x')
ax.set_zlabel('Y_y')
plt.title('3D Plot of Position vs. Time for Each Label')
plt.legend()

# Show the plot
plt.show()


###### Correlation

In [None]:
# Calculate Pearson correlation
correlation_matrix_pearson = clustered_df.corr()

# Calculate Spearman correlation
correlation_matrix_spearman = clustered_df.corr(method='spearman')

# Calculate Kendall correlation
correlation_matrix_kendall = clustered_df.corr(method='kendall')

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Heat map of the Pearson correlation matrix computed in the previous cell.
correlation_matrix = correlation_matrix_pearson

fig, ax = plt.subplots(figsize=(8, 6))

# Pin the colour scale to the full correlation range [-1, 1] so the diverging
# 'coolwarm' map is centred on zero and matches the fixed colour-bar ticks;
# without vmin/vmax the scale follows the data range and colours lose their sign meaning.
cax = ax.matshow(correlation_matrix, cmap='coolwarm', vmin=-1, vmax=1)

# Colour bar with fixed ticks across the correlation range.
cbar = fig.colorbar(cax)
cbar.set_label('Correlation', rotation=270, labelpad=15)
cbar.set_ticks([-1, -0.5, 0, 0.5, 1])
cbar.set_ticklabels([-1, -0.5, 0, 0.5, 1])

# Label both axes with the column names of the matrix.
tick_positions = range(len(correlation_matrix.columns))
plt.xticks(tick_positions, correlation_matrix.columns, rotation=90)
plt.yticks(tick_positions, correlation_matrix.columns)
plt.title('Correlation Matrix')

plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Heat map of the Spearman correlation matrix computed earlier.
correlation_matrix = correlation_matrix_spearman

fig, ax = plt.subplots(figsize=(8, 6))

# Pin the colour scale to the full correlation range [-1, 1] so the diverging
# 'coolwarm' map is centred on zero and matches the fixed colour-bar ticks;
# without vmin/vmax the scale follows the data range and colours lose their sign meaning.
cax = ax.matshow(correlation_matrix, cmap='coolwarm', vmin=-1, vmax=1)

# Colour bar with fixed ticks across the correlation range.
cbar = fig.colorbar(cax)
cbar.set_label('Correlation', rotation=270, labelpad=15)
cbar.set_ticks([-1, -0.5, 0, 0.5, 1])
cbar.set_ticklabels([-1, -0.5, 0, 0.5, 1])

# Label both axes with the column names of the matrix.
tick_positions = range(len(correlation_matrix.columns))
plt.xticks(tick_positions, correlation_matrix.columns, rotation=90)
plt.yticks(tick_positions, correlation_matrix.columns)
plt.title('Correlation Matrix')

plt.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Heat map of the Kendall correlation matrix computed earlier.
correlation_matrix = correlation_matrix_kendall

fig, ax = plt.subplots(figsize=(8, 6))

# Pin the colour scale to the full correlation range [-1, 1] so the diverging
# 'coolwarm' map is centred on zero and matches the fixed colour-bar ticks;
# without vmin/vmax the scale follows the data range and colours lose their sign meaning.
cax = ax.matshow(correlation_matrix, cmap='coolwarm', vmin=-1, vmax=1)

# Colour bar with fixed ticks across the correlation range.
cbar = fig.colorbar(cax)
cbar.set_label('Correlation', rotation=270, labelpad=15)
cbar.set_ticks([-1, -0.5, 0, 0.5, 1])
cbar.set_ticklabels([-1, -0.5, 0, 0.5, 1])

# Label both axes with the column names of the matrix.
tick_positions = range(len(correlation_matrix.columns))
plt.xticks(tick_positions, correlation_matrix.columns, rotation=90)
plt.yticks(tick_positions, correlation_matrix.columns)
plt.title('Correlation Matrix')

plt.show()

From this graph it is possible to understand some key components.

In [None]:
# Columns kept for the correlation analysis of the per-cluster table.
# 'Mean' used to be listed twice, which produced a duplicated column (and a
# duplicated row/column in every correlation matrix built from this frame).
# NOTE(review): if the second 'Mean' was a typo for another measurement, add that column here.
selected_columns = cluster_new_updated[['Area', 'Geo_light_intensity', 'IntDen', 'Mean', 'N_cell']]

# Independent copy so later edits to clustered_df never touch cluster_new_updated.
clustered_df = selected_columns.copy()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Kendall correlation between the columns of clustered_df.
correlation_matrix = clustered_df.corr(method='kendall')

fig, ax = plt.subplots(figsize=(8, 6))

# Pin the colour scale to the full correlation range [-1, 1] so the diverging
# 'coolwarm' map is centred on zero and matches the fixed colour-bar ticks;
# without vmin/vmax the scale follows the data range and colours lose their sign meaning.
cax = ax.matshow(correlation_matrix, cmap='coolwarm', vmin=-1, vmax=1)

# Colour bar with fixed ticks across the correlation range.
cbar = fig.colorbar(cax)
cbar.set_label('Correlation', rotation=270, labelpad=15)
cbar.set_ticks([-1, -0.5, 0, 0.5, 1])
cbar.set_ticklabels([-1, -0.5, 0, 0.5, 1])

# Label both axes with the column names of the matrix.
tick_positions = range(len(correlation_matrix.columns))
plt.xticks(tick_positions, correlation_matrix.columns, rotation=90)
plt.yticks(tick_positions, correlation_matrix.columns)
plt.title('Correlation Matrix')

plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Kendall correlation between the columns of clustered_df.
correlation_matrix = clustered_df.corr(method='kendall')

fig, ax = plt.subplots(figsize=(8, 6))

# Pin the colour scale to the full correlation range [-1, 1] so the diverging
# 'coolwarm' map is centred on zero and matches the fixed colour-bar ticks;
# without vmin/vmax the scale follows the data range and colours lose their sign meaning.
cax = ax.matshow(correlation_matrix, cmap='coolwarm', vmin=-1, vmax=1)

# Colour bar with fixed ticks across the correlation range.
cbar = fig.colorbar(cax)
cbar.set_label('Correlation', rotation=270, labelpad=15)
cbar.set_ticks([-1, -0.5, 0, 0.5, 1])
cbar.set_ticklabels([-1, -0.5, 0, 0.5, 1])

# Label both axes with the column names of the matrix.
n_columns = len(correlation_matrix.columns)
plt.xticks(range(n_columns), correlation_matrix.columns, rotation=90)
plt.yticks(range(n_columns), correlation_matrix.columns)
plt.title('Correlation Matrix')

# Write each coefficient (2 decimals) in the centre of its cell;
# matshow puts column j on the x axis and row i on the y axis.
for i in range(n_columns):
    for j in range(n_columns):
        text = f"{correlation_matrix.iloc[i, j]:.2f}"
        ax.text(j, i, text, ha='center', va='center', color='black', fontsize=10)

plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Kendall correlation between the columns of clustered_df.
correlation_matrix = clustered_df.corr(method='kendall')

fig, ax = plt.subplots(figsize=(8, 6))

# Pin the colour scale to the full correlation range [-1, 1] so the diverging
# 'coolwarm' map is centred on zero and matches the fixed colour-bar ticks;
# without vmin/vmax the scale follows the data range and colours lose their sign meaning.
cax = ax.matshow(correlation_matrix, cmap='coolwarm', vmin=-1, vmax=1)

# Colour bar with fixed ticks across the correlation range.
cbar = fig.colorbar(cax)
cbar.set_label('Correlation', rotation=270, labelpad=15)
cbar.set_ticks([-1, -0.5, 0, 0.5, 1])
cbar.set_ticklabels([-1, -0.5, 0, 0.5, 1])

n_columns = len(correlation_matrix.columns)

# matshow draws x ticks on top by default; move them to the bottom edge.
ax.xaxis.set_ticks_position('bottom')
ax.set_xticks(range(n_columns))
ax.set_xticklabels(correlation_matrix.columns, rotation=90)

# Column names on the left edge.
ax.set_yticks(range(n_columns))
ax.set_yticklabels(correlation_matrix.columns)

# Write each coefficient (2 decimals) in the centre of its cell;
# matshow puts column j on the x axis and row i on the y axis.
for i in range(n_columns):
    for j in range(n_columns):
        text = f"{correlation_matrix.iloc[i, j]:.2f}"
        ax.text(j, i, text, ha='center', va='center', color='black', fontsize=10)

plt.title('Correlation Matrix')
plt.show()


In [None]:
# Example - Major - Minor
# Correlation between two individual columns under each of the three estimators.
# NOTE(review): the column subset assigned to clustered_df in the selection cell
# above does not list 'Major' or 'Minor' -- confirm which table this example should read.

Column1 = 'Major'
Column2 = 'Minor'

# Each coefficient is requested with its own method name. Previously all three
# calls passed method='spearman', so the three printed values were identical.
correlation_pearson = clustered_df[Column1].corr(clustered_df[Column2], method='pearson')

correlation_spearman = clustered_df[Column1].corr(clustered_df[Column2], method='spearman')

correlation_kendall = clustered_df[Column1].corr(clustered_df[Column2], method='kendall')

# Printed in the order: Pearson, Spearman, Kendall
print(correlation_pearson, correlation_spearman, correlation_kendall)

###### Plotting graphs

As seen in the cluster data, some values are missing. This is mostly due to the segmentation algorithm and/or to particular values that were excluded during clustering. For this reason, one possibility is to use an ML system to fill these gaps. It may be hard to determine the number of cells or their location; however, ML algorithms could also be used to address this, for example by creating synthetic data.

Some of this work may be carried out in another notebook, as it can require more effort than expected and several iterations.

###### Geo_light_intensity

In [None]:
import numpy as np
import pandas as pd
from scipy.interpolate import CubicSpline

# Resample 'Geo_light_intensity' over time for every cluster with a cubic spline.
# Input : clustered_df with columns 'Labels', 'Time' and 'Geo_light_intensity'.
# Output: Geo_light_intensity_df with columns 'Time', 'Geo_light_intensity', 'Labels'.
# NOTE(review): read top to bottom, clustered_df was last assigned as a column
# subset without 'Labels'/'Time' (correlation section) -- confirm it has been
# reset to the full cluster table before this cell runs.

unique_labels = clustered_df['Labels'].unique()

# Per-cluster results are collected here and concatenated once at the end,
# instead of re-copying a growing DataFrame on every iteration.
label_frames = []

for label in unique_labels:
    label_data = clustered_df[clustered_df['Labels'] == label]

    # A spline needs at least two samples; smaller clusters are skipped.
    if len(label_data) < 2:
        continue

    t_data = label_data['Time']
    w_data = label_data['Geo_light_intensity']

    # CubicSpline requires strictly increasing x values -- assumes each
    # cluster's rows are already ordered by Time without repeats (TODO confirm).
    spline = CubicSpline(t_data, w_data)

    # Even grid over the observed time range; the number of grid points is
    # int(max Time), unchanged from the original analysis.
    t_fine = np.linspace(min(t_data), max(t_data), int(max(t_data)))
    w_fine = spline(t_fine)

    label_results = pd.DataFrame({'Time': t_fine, 'Geo_light_intensity': w_fine})
    label_results['Labels'] = label
    label_frames.append(label_results)

# Same result as appending inside the loop; empty frame when no cluster qualified.
Geo_light_intensity_df = pd.concat(label_frames) if label_frames else pd.DataFrame()


In [None]:
import itertools

import matplotlib.pyplot as plt
import pandas as pd

# Latest samples first, restricted to the four clusters being compared.
selected_labels = [0, 1, 2, 3]
df_sorted = Geo_light_intensity_df.sort_values(by='Time', ascending=False)
selected_clusters = df_sorted.loc[df_sorted['Labels'].isin(selected_labels)]

# Endless iterator over the tab10 palette: one distinct colour per cluster.
color_cycle = itertools.cycle(plt.cm.tab10.colors)

plt.figure(figsize=(10, 6))

# One curve per cluster, each paired with the next colour of the cycle.
for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters.loc[selected_clusters['Labels'] == cluster_label]
    plt.plot(cluster_data['Time'], cluster_data['Geo_light_intensity'],
             color=color, label=f'Cluster {cluster_label}')

plt.title('Geo_light_intensity Over Time for Labels 0, 1, 2, and 3')
plt.xlabel('Time')
plt.ylabel('Geo_light_intensity')
plt.grid(True)
plt.legend()
plt.show()


###### Geo_cell_size

In [None]:
import numpy as np
import pandas as pd
from scipy.interpolate import CubicSpline

# Resample 'Geo_cell_size' over time for every cluster with a cubic spline.
# Input : clustered_df with columns 'Labels', 'Time' and 'Geo_cell_size'.
# Output: Geo_cell_size_df with columns 'Time', 'Geo_cell_size', 'Labels'.
# NOTE(review): read top to bottom, clustered_df was last assigned as a column
# subset without 'Labels'/'Time' (correlation section) -- confirm it has been
# reset to the full cluster table before this cell runs.

unique_labels = clustered_df['Labels'].unique()

# Per-cluster results are collected here and concatenated once at the end,
# instead of re-copying a growing DataFrame on every iteration.
label_frames = []

for label in unique_labels:
    label_data = clustered_df[clustered_df['Labels'] == label]

    # A spline needs at least two samples; smaller clusters are skipped.
    if len(label_data) < 2:
        continue

    t_data = label_data['Time']
    w_data = label_data['Geo_cell_size']

    # CubicSpline requires strictly increasing x values -- assumes each
    # cluster's rows are already ordered by Time without repeats (TODO confirm).
    spline = CubicSpline(t_data, w_data)

    # Even grid over the observed time range; the number of grid points is
    # int(max Time), unchanged from the original analysis.
    t_fine = np.linspace(min(t_data), max(t_data), int(max(t_data)))
    w_fine = spline(t_fine)

    label_results = pd.DataFrame({'Time': t_fine, 'Geo_cell_size': w_fine})
    label_results['Labels'] = label
    label_frames.append(label_results)

# Same result as appending inside the loop; empty frame when no cluster qualified.
Geo_cell_size_df = pd.concat(label_frames) if label_frames else pd.DataFrame()


In [None]:
import itertools

import matplotlib.pyplot as plt
import pandas as pd

# Latest samples first, restricted to the four clusters being compared.
selected_labels = [0, 1, 2, 3]
df_sorted = Geo_cell_size_df.sort_values(by='Time', ascending=False)
selected_clusters = df_sorted.loc[df_sorted['Labels'].isin(selected_labels)]

# Endless iterator over the tab10 palette: one distinct colour per cluster.
color_cycle = itertools.cycle(plt.cm.tab10.colors)

plt.figure(figsize=(10, 6))

# One curve per cluster, each paired with the next colour of the cycle.
for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters.loc[selected_clusters['Labels'] == cluster_label]
    plt.plot(cluster_data['Time'], cluster_data['Geo_cell_size'],
             color=color, label=f'Cluster {cluster_label}')

plt.title('Geo_cell_size Over Time for Labels 0, 1, 2, and 3')
plt.xlabel('Time')
plt.ylabel('Geo_cell_size')
plt.grid(True)
plt.legend()
plt.show()


###### Cluster_dimension

In [None]:
import numpy as np
import pandas as pd
from scipy.interpolate import CubicSpline

# Resample 'Cluster_dimension' over time for every cluster with a cubic spline.
# Input : clustered_df with columns 'Labels', 'Time' and 'Cluster_dimension'.
# Output: Cluster_dimension_df with columns 'Time', 'Cluster_dimension', 'Labels'.
# NOTE(review): read top to bottom, clustered_df was last assigned as a column
# subset without 'Labels'/'Time' (correlation section) -- confirm it has been
# reset to the full cluster table before this cell runs.

unique_labels = clustered_df['Labels'].unique()

# Per-cluster results are collected here and concatenated once at the end,
# instead of re-copying a growing DataFrame on every iteration.
label_frames = []

for label in unique_labels:
    label_data = clustered_df[clustered_df['Labels'] == label]

    # A spline needs at least two samples; smaller clusters are skipped.
    if len(label_data) < 2:
        continue

    t_data = label_data['Time']
    w_data = label_data['Cluster_dimension']

    # CubicSpline requires strictly increasing x values -- assumes each
    # cluster's rows are already ordered by Time without repeats (TODO confirm).
    spline = CubicSpline(t_data, w_data)

    # Even grid over the observed time range; the number of grid points is
    # int(max Time), unchanged from the original analysis.
    t_fine = np.linspace(min(t_data), max(t_data), int(max(t_data)))
    w_fine = spline(t_fine)

    label_results = pd.DataFrame({'Time': t_fine, 'Cluster_dimension': w_fine})
    label_results['Labels'] = label
    label_frames.append(label_results)

# Same result as appending inside the loop; empty frame when no cluster qualified.
Cluster_dimension_df = pd.concat(label_frames) if label_frames else pd.DataFrame()

In [None]:
import itertools

import matplotlib.pyplot as plt
import pandas as pd

# Latest samples first, restricted to the four clusters being compared.
selected_labels = [0, 1, 2, 3]
df_sorted = Cluster_dimension_df.sort_values(by='Time', ascending=False)
selected_clusters = df_sorted.loc[df_sorted['Labels'].isin(selected_labels)]

# Endless iterator over the tab10 palette: one distinct colour per cluster.
color_cycle = itertools.cycle(plt.cm.tab10.colors)

plt.figure(figsize=(10, 6))

# One curve per cluster, each paired with the next colour of the cycle.
for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters.loc[selected_clusters['Labels'] == cluster_label]
    plt.plot(cluster_data['Time'], cluster_data['Cluster_dimension'],
             color=color, label=f'Cluster {cluster_label}')

plt.title('Cluster_dimension Over Time for Labels 0, 1, 2, and 3')
plt.xlabel('Time')
plt.ylabel('Cluster_dimension')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
import itertools

import matplotlib.pyplot as plt
import pandas as pd

# Latest samples first, restricted to the four clusters being compared.
selected_labels = [0, 1, 2, 3]
df_sorted = Cluster_dimension_df.sort_values(by='Time', ascending=False)
selected_clusters = df_sorted.loc[df_sorted['Labels'].isin(selected_labels)]

# Endless iterator over the tab10 palette: one distinct colour per cluster.
color_cycle = itertools.cycle(plt.cm.tab10.colors)

# Number of consecutive samples averaged by the rolling mean.
window_size = 5

plt.figure(figsize=(10, 6))

# Only the smoothed curve is drawn for each cluster.
# NOTE(review): rows are in descending Time order, so each rolling mean covers
# the current sample and the four samples that are later in time -- confirm intended.
for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters.loc[selected_clusters['Labels'] == cluster_label]
    moving_avg = cluster_data['Cluster_dimension'].rolling(window=window_size).mean()
    plt.plot(cluster_data['Time'], moving_avg,
             color=color, label=f'Cluster {cluster_label} (Moving Avg)')

plt.title('Moving Average of Cluster_dimension Over Time for Labels 0, 1, 2, and 3')
plt.xlabel('Time')
plt.ylabel('Moving Average of Cluster_dimension')
plt.grid(True)
plt.legend()
plt.show()


In [None]:
import itertools

import matplotlib.pyplot as plt
import pandas as pd

# Latest samples first, restricted to the four clusters being compared.
selected_labels = [0, 1, 2, 3]
df_sorted = Cluster_dimension_df.sort_values(by='Time', ascending=False)
selected_clusters = df_sorted.loc[df_sorted['Labels'].isin(selected_labels)]

# Endless iterator over the tab10 palette: one distinct colour per cluster.
color_cycle = itertools.cycle(plt.cm.tab10.colors)

plt.figure(figsize=(10, 6))

# Per cluster: solid line for the resampled values, dashed line (same colour)
# for their 5-sample rolling mean.
# NOTE(review): rows are in descending Time order, so each rolling mean covers
# the current sample and the four samples that are later in time -- confirm intended.
for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters.loc[selected_clusters['Labels'] == cluster_label]
    plt.plot(cluster_data['Time'], cluster_data['Cluster_dimension'],
             color=color, label=f'Cluster {cluster_label}')

    moving_average = cluster_data['Cluster_dimension'].rolling(window=5).mean()
    plt.plot(cluster_data['Time'], moving_average,
             color=color, linestyle='--', label=f'Moving Avg {cluster_label}')

plt.title('Cluster_dimension Over Time for Labels 0, 1, 2, and 3')
plt.xlabel('Time')
plt.ylabel('Cluster_dimension')
plt.grid(True)
plt.legend()

plt.show()


###### N_cell

In [None]:
# Rebind clustered_df to an independent copy of the full reloaded cluster table;
# the spline cells below read 'Labels', 'Time' and the measurement columns from it.
clustered_df = cluster_new_updated.copy()

In [None]:
import numpy as np
import pandas as pd
from scipy.interpolate import CubicSpline

# Resample 'N_cell' over time for every cluster with a cubic spline.
# Input : clustered_df with columns 'Labels', 'Time' and 'N_cell'.
# Output: N_cell_df with columns 'Time', 'N_cell', 'Labels'.

unique_labels = clustered_df['Labels'].unique()

# Per-cluster results are collected here and concatenated once at the end,
# instead of re-copying a growing DataFrame on every iteration.
label_frames = []

for label in unique_labels:
    label_data = clustered_df[clustered_df['Labels'] == label]

    # A spline needs at least two samples; smaller clusters are skipped.
    if len(label_data) < 2:
        continue

    t_data = label_data['Time']
    w_data = label_data['N_cell']

    # CubicSpline requires strictly increasing x values -- assumes each
    # cluster's rows are already ordered by Time without repeats (TODO confirm).
    spline = CubicSpline(t_data, w_data)

    # Even grid over the observed time range; the number of grid points is
    # int(max Time), unchanged from the original analysis.
    t_fine = np.linspace(min(t_data), max(t_data), int(max(t_data)))
    w_fine = spline(t_fine)

    label_results = pd.DataFrame({'Time': t_fine, 'N_cell': w_fine})
    label_results['Labels'] = label
    label_frames.append(label_results)

# Same result as appending inside the loop; empty frame when no cluster qualified.
N_cell_df = pd.concat(label_frames) if label_frames else pd.DataFrame()


In [None]:
import itertools

import matplotlib.pyplot as plt
import pandas as pd

# Latest samples first, restricted to the four clusters being compared.
selected_labels = [0, 1, 2, 3]
df_sorted = N_cell_df.sort_values(by='Time', ascending=False)
selected_clusters = df_sorted.loc[df_sorted['Labels'].isin(selected_labels)]

# Endless iterator over the tab10 palette: one distinct colour per cluster.
color_cycle = itertools.cycle(plt.cm.tab10.colors)

plt.figure(figsize=(10, 6))

# One curve per cluster, each paired with the next colour of the cycle.
for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters.loc[selected_clusters['Labels'] == cluster_label]
    plt.plot(cluster_data['Time'], cluster_data['N_cell'],
             color=color, label=f'Cluster {cluster_label}')

plt.title('N_cell Over Time for Labels 0, 1, 2, and 3')
plt.xlabel('Time')
plt.ylabel('N_cell')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
import itertools

import matplotlib.pyplot as plt
import pandas as pd

# Latest samples first, restricted to the four clusters being compared.
selected_labels = [0, 1, 2, 3]
df_sorted = N_cell_df.sort_values(by='Time', ascending=False)
selected_clusters = df_sorted.loc[df_sorted['Labels'].isin(selected_labels)]

# Endless iterator over the tab10 palette: one distinct colour per cluster.
color_cycle = itertools.cycle(plt.cm.tab10.colors)

plt.figure(figsize=(10, 6))

# One curve per cluster, each paired with the next colour of the cycle.
for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters.loc[selected_clusters['Labels'] == cluster_label]
    plt.plot(cluster_data['Time'], cluster_data['N_cell'],
             color=color, label=f'Cluster {cluster_label}')

plt.title('N_cell Over Time for Labels 0, 1, 2, and 3')
plt.xlabel('Time')
plt.ylabel('N_cell')
plt.grid(True)
plt.legend()

# Logarithmic y axis: exponential growth in cell count shows up as a straight line.
plt.yscale('log')

plt.show()


In [None]:
import itertools

import matplotlib.pyplot as plt
import pandas as pd

# Latest samples first, restricted to the four clusters being compared.
selected_labels = [0, 1, 2, 3]
df_sorted = N_cell_df.sort_values(by='Time', ascending=False)
selected_clusters = df_sorted.loc[df_sorted['Labels'].isin(selected_labels)]

# Endless iterator over the tab10 palette: one distinct colour per cluster.
color_cycle = itertools.cycle(plt.cm.tab10.colors)

plt.figure(figsize=(10, 6))

# Per cluster: solid line for the resampled values, dashed line (same colour)
# for their 5-sample rolling mean.
# NOTE(review): rows are in descending Time order, so each rolling mean covers
# the current sample and the four samples that are later in time -- confirm intended.
for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters.loc[selected_clusters['Labels'] == cluster_label]
    plt.plot(cluster_data['Time'], cluster_data['N_cell'],
             color=color, label=f'Cluster {cluster_label}')

    moving_average = cluster_data['N_cell'].rolling(window=5).mean()
    plt.plot(cluster_data['Time'], moving_average,
             color=color, linestyle='--', label=f'Moving Avg {cluster_label}')

plt.title('N_cell Over Time for Labels 0, 1, 2, and 3')
plt.xlabel('Time')
plt.ylabel('N_cell')
plt.grid(True)
plt.legend()

# Logarithmic y axis: exponential growth in cell count shows up as a straight line.
plt.yscale('log')

plt.show()


We can understand the closest exponential function achievable

In [None]:
import itertools

import matplotlib.pyplot as plt
import pandas as pd

# Latest samples first, restricted to the four clusters being compared.
selected_labels = [0, 1, 2, 3]
df_sorted = N_cell_df.sort_values(by='Time', ascending=False)
selected_clusters = df_sorted.loc[df_sorted['Labels'].isin(selected_labels)]

# Endless iterator over the tab10 palette: one distinct colour per cluster.
color_cycle = itertools.cycle(plt.cm.tab10.colors)

# Number of consecutive samples averaged by the rolling mean.
window_size = 5

plt.figure(figsize=(10, 6))

# Only the smoothed curve is drawn for each cluster.
# NOTE(review): rows are in descending Time order, so each rolling mean covers
# the current sample and the four samples that are later in time -- confirm intended.
for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters.loc[selected_clusters['Labels'] == cluster_label]
    moving_avg = cluster_data['N_cell'].rolling(window=window_size).mean()
    plt.plot(cluster_data['Time'], moving_avg,
             color=color, label=f'Cluster {cluster_label} (Moving Avg)')

plt.title('Moving Average of N_cell Over Time for Labels 0, 1, 2, and 3')
plt.xlabel('Time')
plt.ylabel('N_cell')
plt.grid(True)
plt.legend()
plt.show()


###### Mean

In [None]:
import numpy as np
import pandas as pd
from scipy.interpolate import CubicSpline

# Resample 'Mean' over time for every cluster with a cubic spline.
# Input : clustered_df with columns 'Labels', 'Time' and 'Mean'.
# Output: Mean_df with columns 'Time', 'Mean', 'Labels'.

unique_labels = clustered_df['Labels'].unique()

# Per-cluster results are collected here and concatenated once at the end,
# instead of re-copying a growing DataFrame on every iteration.
label_frames = []

for label in unique_labels:
    label_data = clustered_df[clustered_df['Labels'] == label]

    # A spline needs at least two samples; smaller clusters are skipped.
    if len(label_data) < 2:
        continue

    t_data = label_data['Time']
    w_data = label_data['Mean']

    # CubicSpline requires strictly increasing x values -- assumes each
    # cluster's rows are already ordered by Time without repeats (TODO confirm).
    spline = CubicSpline(t_data, w_data)

    # Even grid over the observed time range; the number of grid points is
    # int(max Time), unchanged from the original analysis.
    t_fine = np.linspace(min(t_data), max(t_data), int(max(t_data)))
    w_fine = spline(t_fine)

    label_results = pd.DataFrame({'Time': t_fine, 'Mean': w_fine})
    label_results['Labels'] = label
    label_frames.append(label_results)

# Same result as appending inside the loop; empty frame when no cluster qualified.
Mean_df = pd.concat(label_frames) if label_frames else pd.DataFrame()

In [None]:
import itertools

import matplotlib.pyplot as plt
import pandas as pd

# Latest samples first, restricted to the four clusters being compared.
selected_labels = [0, 1, 2, 3]
df_sorted = Mean_df.sort_values(by='Time', ascending=False)
selected_clusters = df_sorted.loc[df_sorted['Labels'].isin(selected_labels)]

# Endless iterator over the tab10 palette: one distinct colour per cluster.
color_cycle = itertools.cycle(plt.cm.tab10.colors)

plt.figure(figsize=(10, 6))

# Per cluster: solid line for the resampled values, dashed line (same colour)
# for their 5-sample rolling mean.
# NOTE(review): rows are in descending Time order, so each rolling mean covers
# the current sample and the four samples that are later in time -- confirm intended.
for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters.loc[selected_clusters['Labels'] == cluster_label]
    plt.plot(cluster_data['Time'], cluster_data['Mean'],
             color=color, label=f'Cluster {cluster_label}')

    moving_average = cluster_data['Mean'].rolling(window=5).mean()
    plt.plot(cluster_data['Time'], moving_average,
             color=color, linestyle='--', label=f'Moving Avg {cluster_label}')

plt.title('Mean Over Time for Labels 0, 1, 2, and 3')
plt.xlabel('Time')
plt.ylabel('Mean')
plt.grid(True)
plt.legend()

plt.show()

###### IntDen

In [None]:
import numpy as np
import pandas as pd
from scipy.interpolate import CubicSpline

# Resample 'IntDen' over time for every cluster with a cubic spline.
# Input : clustered_df with columns 'Labels', 'Time' and 'IntDen'.
# Output: IntDen_df with columns 'Time', 'IntDen', 'Labels'.

unique_labels = clustered_df['Labels'].unique()

# Per-cluster results are collected here and concatenated once at the end,
# instead of re-copying a growing DataFrame on every iteration.
label_frames = []

for label in unique_labels:
    label_data = clustered_df[clustered_df['Labels'] == label]

    # A spline needs at least two samples; smaller clusters are skipped.
    if len(label_data) < 2:
        continue

    t_data = label_data['Time']
    w_data = label_data['IntDen']

    # CubicSpline requires strictly increasing x values -- assumes each
    # cluster's rows are already ordered by Time without repeats (TODO confirm).
    spline = CubicSpline(t_data, w_data)

    # Even grid over the observed time range; the number of grid points is
    # int(max Time), unchanged from the original analysis.
    t_fine = np.linspace(min(t_data), max(t_data), int(max(t_data)))
    w_fine = spline(t_fine)

    label_results = pd.DataFrame({'Time': t_fine, 'IntDen': w_fine})
    label_results['Labels'] = label
    label_frames.append(label_results)

# Same result as appending inside the loop; empty frame when no cluster qualified.
IntDen_df = pd.concat(label_frames) if label_frames else pd.DataFrame()

In [None]:
import itertools

import matplotlib.pyplot as plt
import pandas as pd

# Latest samples first, restricted to the four clusters being compared.
selected_labels = [0, 1, 2, 3]
df_sorted = IntDen_df.sort_values(by='Time', ascending=False)
selected_clusters = df_sorted.loc[df_sorted['Labels'].isin(selected_labels)]

# Endless iterator over the tab10 palette: one distinct colour per cluster.
color_cycle = itertools.cycle(plt.cm.tab10.colors)

plt.figure(figsize=(10, 6))

# Per cluster: solid line for the resampled values, dashed line (same colour)
# for their 5-sample rolling mean.
# NOTE(review): rows are in descending Time order, so each rolling mean covers
# the current sample and the four samples that are later in time -- confirm intended.
for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters.loc[selected_clusters['Labels'] == cluster_label]
    plt.plot(cluster_data['Time'], cluster_data['IntDen'],
             color=color, label=f'Cluster {cluster_label}')

    moving_average = cluster_data['IntDen'].rolling(window=5).mean()
    plt.plot(cluster_data['Time'], moving_average,
             color=color, linestyle='--', label=f'Moving Avg {cluster_label}')

plt.title('IntDen Over Time for Labels 0, 1, 2, and 3')
plt.xlabel('Time')
plt.ylabel('IntDen')
plt.grid(True)
plt.legend()

plt.show()

###### StdDev

In [None]:
import numpy as np
import pandas as pd
from scipy.interpolate import CubicSpline

# Resample 'StdDev' over time for every cluster with a cubic spline.
# Input : clustered_df with columns 'Labels', 'Time' and 'StdDev'.
# Output: StdDev_df with columns 'Time', 'StdDev', 'Labels'.

unique_labels = clustered_df['Labels'].unique()

# Per-cluster results are collected here and concatenated once at the end,
# instead of re-copying a growing DataFrame on every iteration.
label_frames = []

for label in unique_labels:
    label_data = clustered_df[clustered_df['Labels'] == label]

    # A spline needs at least two samples; smaller clusters are skipped.
    if len(label_data) < 2:
        continue

    t_data = label_data['Time']
    w_data = label_data['StdDev']

    # CubicSpline requires strictly increasing x values -- assumes each
    # cluster's rows are already ordered by Time without repeats (TODO confirm).
    spline = CubicSpline(t_data, w_data)

    # Even grid over the observed time range; the number of grid points is
    # int(max Time), unchanged from the original analysis.
    t_fine = np.linspace(min(t_data), max(t_data), int(max(t_data)))
    w_fine = spline(t_fine)

    label_results = pd.DataFrame({'Time': t_fine, 'StdDev': w_fine})
    label_results['Labels'] = label
    label_frames.append(label_results)

# Same result as appending inside the loop; empty frame when no cluster qualified.
StdDev_df = pd.concat(label_frames) if label_frames else pd.DataFrame()

In [None]:
import itertools

import matplotlib.pyplot as plt
import pandas as pd

# Latest samples first, restricted to the four clusters being compared.
selected_labels = [0, 1, 2, 3]
df_sorted = StdDev_df.sort_values(by='Time', ascending=False)
selected_clusters = df_sorted.loc[df_sorted['Labels'].isin(selected_labels)]

# Endless iterator over the tab10 palette: one distinct colour per cluster.
color_cycle = itertools.cycle(plt.cm.tab10.colors)

plt.figure(figsize=(10, 6))

# Per cluster: solid line for the resampled values, dashed line (same colour)
# for their 5-sample rolling mean.
# NOTE(review): rows are in descending Time order, so each rolling mean covers
# the current sample and the four samples that are later in time -- confirm intended.
for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters.loc[selected_clusters['Labels'] == cluster_label]
    plt.plot(cluster_data['Time'], cluster_data['StdDev'],
             color=color, label=f'Cluster {cluster_label}')

    moving_average = cluster_data['StdDev'].rolling(window=5).mean()
    plt.plot(cluster_data['Time'], moving_average,
             color=color, linestyle='--', label=f'Moving Avg {cluster_label}')

plt.title('StdDev Over Time for Labels 0, 1, 2, and 3')
plt.xlabel('Time')
plt.ylabel('StdDev')
plt.grid(True)
plt.legend()

plt.show()

###### Area

In [None]:
import numpy as np
import pandas as pd
from scipy.interpolate import CubicSpline

# Resample 'Area' over time for every cluster with a cubic spline.
# Input : clustered_df with columns 'Labels', 'Time' and 'Area'.
# Output: Area_df with columns 'Time', 'Area', 'Labels'.

unique_labels = clustered_df['Labels'].unique()

# Per-cluster results are collected here and concatenated once at the end,
# instead of re-copying a growing DataFrame on every iteration.
label_frames = []

for label in unique_labels:
    label_data = clustered_df[clustered_df['Labels'] == label]

    # A spline needs at least two samples; smaller clusters are skipped.
    if len(label_data) < 2:
        continue

    t_data = label_data['Time']
    w_data = label_data['Area']

    # CubicSpline requires strictly increasing x values -- assumes each
    # cluster's rows are already ordered by Time without repeats (TODO confirm).
    spline = CubicSpline(t_data, w_data)

    # Even grid over the observed time range; the number of grid points is
    # int(max Time), unchanged from the original analysis.
    t_fine = np.linspace(min(t_data), max(t_data), int(max(t_data)))
    w_fine = spline(t_fine)

    label_results = pd.DataFrame({'Time': t_fine, 'Area': w_fine})
    label_results['Labels'] = label
    label_frames.append(label_results)

# Same result as appending inside the loop; empty frame when no cluster qualified.
Area_df = pd.concat(label_frames) if label_frames else pd.DataFrame()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import itertools

# Interpolated areas ordered from the latest to the earliest time
df_sorted = Area_df.sort_values(by='Time', ascending=False)

# Only these cluster labels are drawn
selected_labels = [0, 1, 2, 3]
selected_clusters = df_sorted[df_sorted['Labels'].isin(selected_labels)]

# Endless supply of tab10 colours, one per cluster
color_cycle = itertools.cycle(plt.cm.tab10.colors)

plt.figure(figsize=(10, 6))

# One solid line (values) and one dashed line (5-sample moving average) per cluster, sharing a colour
for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters[selected_clusters['Labels'] == cluster_label]
    moving_average = cluster_data['Area'].rolling(window=5).mean()

    plt.plot(cluster_data['Time'], cluster_data['Area'], label=f'Cluster {cluster_label}', color=color)
    plt.plot(cluster_data['Time'], moving_average, label=f'Moving Avg {cluster_label}', linestyle='--', color=color)

plt.title('Area Over Time for Labels 0, 1, 2, and 3')
plt.xlabel('Time')
plt.ylabel('Area')
plt.grid(True)
plt.legend()

plt.show()

##### Analysis of a particular cluster in space

###### Functions used

In [None]:
import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np

def plot_distance_vs_cell_location(data, window_size=2, save_path='cumulative_and_derivative_plot.png'):
    """
    Plot the cumulative number of cells against distance from the barycenter, and its derivative.

    Args:
        data (pd.DataFrame): Table with numeric 'X' and 'Y' columns. It is not modified.
        window_size (int): Unused; kept so that existing calls passing it keep working.
        save_path (str): File the figure is written to before it is shown.

    Returns:
        pd.DataFrame: Copy of `data` sorted by distance, with the added columns
        'Distance', 'Cumulative_Count' and 'Derivative'.
    """
    df = data.copy()

    # Barycenter (centroid) of all points
    barycenter_x = df['X'].mean()
    barycenter_y = df['Y'].mean()

    # Euclidean distance of every point to the barycenter, computed column-wise
    df['Distance'] = np.hypot(df['X'] - barycenter_x, df['Y'] - barycenter_y)

    # After sorting, the row position is the number of cells at or below that distance
    df = df.sort_values(by='Distance')
    df['Cumulative_Count'] = np.arange(1, len(df) + 1)

    if len(df) >= 2:
        # NOTE: cells at exactly the same distance give a zero spacing, so np.gradient
        # produces inf/NaN at those positions.
        df['Derivative'] = np.gradient(
            df['Cumulative_Count'].to_numpy(dtype=float),
            df['Distance'].to_numpy(dtype=float),
        )
    else:
        # np.gradient needs at least two samples
        df['Derivative'] = np.nan

    # Two panels: cumulative distribution (left) and its derivative (right)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    ax1.plot(df['Distance'], df['Cumulative_Count'], marker='o', linestyle='-', color='b')
    ax1.set_xlabel('Distance from Barycenter')
    ax1.set_ylabel('Number of Cells (Cumulative)')
    ax1.set_title('Cumulative Distribution of Cell Distances from Barycenter')

    ax2.plot(df['Distance'], df['Derivative'], marker='o', linestyle='-', color='r')
    ax2.set_xlabel('Distance from Barycenter')
    ax2.set_ylabel('Derivative (Change in Cumulative Count)')
    ax2.set_title('Derivative of Cumulative Distribution')

    plt.savefig(save_path)
    plt.show()

    # Returned so that callers such as `df_see = plot_distance_vs_cell_location(...)` get the table
    return df

# Example usage:
# Assuming you have a DataFrame 'data' with columns 'X' and 'Y', representing cell coordinates.
# The function itself computes the 'Distance', 'Cumulative_Count', and 'Derivative' columns.
# plot_distance_vs_cell_location(data)


In [None]:
import matplotlib.pyplot as plt
import math

def plot_distance_vs_value(data):
    """
    Scatter-plot 'IntDen' against each point's distance to the barycenter of all points.

    Args:
        data (pd.DataFrame): Table with numeric 'X', 'Y' and 'IntDen' columns. It is not modified.

    Returns:
        None. The figure is shown with plt.show().
    """
    df = data.copy()

    # Barycenter (centroid) of all points
    barycenter_x = df['X'].mean()
    barycenter_y = df['Y'].mean()

    # Column-wise Euclidean distance (replaces a per-row Python lambda)
    df['Distance'] = np.hypot(df['X'] - barycenter_x, df['Y'] - barycenter_y)

    plt.scatter(df['Distance'], df['IntDen'], alpha=0.7)
    plt.xlabel('Distance to Barycenter')
    plt.ylabel('IntDen')
    plt.title('IntDen vs. Distance to Barycenter')
    plt.grid(True)

    plt.show()

# Example usage:
# plot_distance_vs_value(your_data)

In [None]:
import matplotlib.pyplot as plt
import math

def plot_distance_vs_value_with_histogram(data, bins=10):
    """
    Draw a histogram of the distances between each point and the barycenter of all points.

    Args:
        data (pd.DataFrame): Table with numeric 'X' and 'Y' columns. It is not modified.
        bins: Passed unchanged to plt.hist as its `bins` argument.

    Returns:
        None. The figure is shown with plt.show().
    """
    df = data.copy()

    # Barycenter (centroid) of all points
    barycenter_x = df['X'].mean()
    barycenter_y = df['Y'].mean()

    # Column-wise Euclidean distance (replaces a per-row Python lambda)
    df['Distance'] = np.hypot(df['X'] - barycenter_x, df['Y'] - barycenter_y)

    plt.hist(df['Distance'], bins=bins, edgecolor='k', alpha=0.7)
    plt.xlabel('Distance to Barycenter')
    plt.ylabel('Frequency')
    plt.title('Distance to Barycenter Histogram')
    plt.grid(True)

    plt.show()


In [None]:
import matplotlib.pyplot as plt
import math

def plot_moving_average(data, window_size=3500):
    """
    Plot the moving average of 'IntDen' over points ordered by distance to the barycenter.

    Args:
        data (pd.DataFrame): Table with numeric 'X', 'Y' and 'IntDen' columns. It is not modified.
        window_size (int): Number of consecutive points (in distance order) averaged together.
            The first `window_size - 1` points have no average, so a window larger than the
            table gives an empty curve.

    Returns:
        None. The figure is shown with plt.show().
    """
    df = data.copy()

    # Barycenter (centroid) of all points
    barycenter_x = df['X'].mean()
    barycenter_y = df['Y'].mean()

    # Column-wise Euclidean distance (replaces a per-row Python lambda)
    df['Distance'] = np.hypot(df['X'] - barycenter_x, df['Y'] - barycenter_y)

    # Order by distance so the rolling window runs outward from the barycenter
    plot_df = df[['Distance', 'IntDen']].sort_values(by='Distance')

    # Moving average of 'IntDen' over `window_size` consecutive points
    plot_df['Moving_Average'] = plot_df['IntDen'].rolling(window=window_size).mean()

    plt.plot(plot_df['Distance'], plot_df['Moving_Average'], linestyle='--', label='Moving Average')
    plt.xlabel('Distance to Barycenter')
    plt.ylabel('Moving Average of IntDen')
    plt.title(f'Moving Average of IntDen vs. Distance to Barycenter (Window Size {window_size})')
    plt.grid(True)
    plt.legend()

    plt.show()

In [None]:
import math
import matplotlib.pyplot as plt

def plot_values_based_on_the_area_in_space(data, d_value=7.5):
    """
    Plot the summed 'IntDen' per unit area in concentric rings around the barycenter.

    Points are grouped into rings of width `d_value`; for each ring the sum of 'IntDen'
    is divided by the ring's area and plotted against the ring's mid radius.

    Args:
        data (pd.DataFrame): Table with numeric 'X', 'Y' and 'IntDen' columns. It is not modified.
        d_value (float): Radial width of each ring.

    Returns:
        pd.DataFrame: One row per input point, sorted by distance, with the columns
        'Distance', 'IntDen', 'Label' (ring index, starting at 1), 'New_Distance'
        (ring mid radius), 'Area' (ring area), 'Sum_IntDen' and 'Ratio'.
    """
    # Work on a copy so the caller's DataFrame does not silently gain a 'Distance' column
    df = data.copy()

    # Barycenter (centroid) of all points
    Barycenter_x = df['X'].mean()
    Barycenter_y = df['Y'].mean()

    # Euclidean distance of every point to the barycenter
    df['Distance'] = ((df['X'] - Barycenter_x)**2 + (df['Y'] - Barycenter_y)**2).apply(math.sqrt)

    # Keep only what the plot needs, ordered by distance
    plot_df = df[['Distance', 'IntDen']].sort_values('Distance')

    # Ring k covers (k-1)*d_value < distance <= k*d_value. A point exactly on the barycenter
    # (distance 0) would get index 0 and a negative ring area, so it goes to the innermost ring.
    plot_df['Label'] = plot_df['Distance'].apply(lambda distance: max(1, math.ceil(distance / d_value)))
    plot_df['New_Distance'] = plot_df['Label']*d_value - d_value / 2

    # Ring area: pi * (outer radius^2 - inner radius^2)
    plot_df['Area'] = (math.pi * (plot_df['Label']**2 - (plot_df['Label'] - 1)**2))*d_value * d_value
    plot_df['Sum_IntDen'] = plot_df.groupby('Label')['IntDen'].transform('sum')
    plot_df['Ratio'] = plot_df['Sum_IntDen'] / plot_df['Area']

    plt.plot(plot_df['New_Distance'], plot_df['Ratio'])
    plt.xlabel('New Distance')
    plt.ylabel('Ratio')
    plt.title('Ratio vs New Distance')
    plt.show()

    return plot_df

# Example usage:
# plot_values_based_on_the_area_in_space(selected_df)

In [None]:
import os
import re
import cv2
from google.colab import drive

def natural_sort_key(s):
    """
    Build a sort key that orders embedded numbers numerically ('f2' before 'f10').

    The string is split into alternating non-digit and digit runs; digit runs become ints
    and every other piece is lower-cased.
    """
    key = []
    for piece in re.split(r'(\d+)', s):
        if piece.isdigit():
            key.append(int(piece))
        else:
            key.append(piece.lower())
    return key

def create_video_from_pngs(folder_path, output_video_path, frame_size=(1920, 1080), frame_rate=10):
    """
    Create a video from the PNG files in a folder, in natural filename order.

    Args:
        folder_path (str): The path to the folder containing PNG files.
        output_video_path (str): The path where the output video will be saved.
        frame_size (tuple): The frame size of the output video (width, height).
        frame_rate (int): The frame rate of the output video.

    Returns:
        None

    Side effects:
        Writes the video file and then calls drive.flush_and_unmount(), so Google Drive
        is no longer mounted when the function returns.
    """
    # Sort the PNG files by their filenames using natural sorting
    png_files = sorted(
        (os.path.join(folder_path, filename) for filename in os.listdir(folder_path) if filename.endswith('.png')),
        key=natural_sort_key,
    )

    # Initialize the video writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # You can use other codecs like 'XVID' or 'MJPG'
    out = cv2.VideoWriter(output_video_path, fourcc, frame_rate, frame_size)

    frames_written = 0
    try:
        for png_file in png_files:
            frame = cv2.imread(png_file)
            if frame is None:
                # cv2.imread reports an unreadable file by returning None; cv2.resize would raise on it
                print(f"Skipping unreadable image '{png_file}'.")
                continue
            out.write(cv2.resize(frame, frame_size))
            frames_written += 1
    finally:
        # Always release the writer so the output file is closed even if a frame fails
        out.release()

    # Flush pending writes to Google Drive and unmount it
    drive.flush_and_unmount()

    if frames_written:
        print(f"Video '{output_video_path}' created successfully.")
    else:
        print(f"No readable PNG files found in '{folder_path}'; video '{output_video_path}' contains no frames.")

In [None]:
import pandas as pd
import math
import matplotlib.pyplot as plt

def plot_distance_vs_moving_average_with_labels(data, labels=[0, 1, 2, 3], window_size=2, interruption_point=None):
    """
    Plot the moving average of 'IntDen' against distance to the barycenter of each point's own label.

    Args:
        data (pd.DataFrame): Table with 'X', 'Y', 'IntDen' and 'Labels' columns. It is not modified.
        labels (list): Labels to include; rows with any other label are ignored. The default
            list is only read, never modified.
        window_size (int): Number of consecutive points (in distance order) averaged together.
        interruption_point (float or None): If given, only points closer than this distance are kept.

    Returns:
        None. The figure is shown with plt.show().
    """
    # Filter first and copy the result, so adding 'Distance' never writes into a slice of `data`
    df = data[data['Labels'].isin(labels)].copy()

    # Barycenter of each row's own label, aligned row by row (replaces a per-row dict lookup)
    barycenter = df.groupby('Labels')[['X', 'Y']].transform('mean')
    df['Distance'] = np.hypot(df['X'] - barycenter['X'], df['Y'] - barycenter['Y'])

    if interruption_point is not None:
        # Keep only the points closer than the interruption point
        df = df[df['Distance'] < interruption_point]

    # Order by distance so the rolling window runs outward from the barycenter
    plot_df = df[['Distance', 'IntDen']].sort_values(by='Distance')

    # Moving average of 'IntDen' over `window_size` consecutive points
    plot_df['Moving_Average'] = plot_df['IntDen'].rolling(window=window_size).mean()

    # Plot only the moving average
    plt.plot(plot_df['Distance'], plot_df['Moving_Average'], linestyle='--', label='Moving Average')
    plt.xlabel('Distance to Barycenter')
    plt.ylabel('Moving Average of IntDen')
    plt.title('Moving Average of IntDen vs. Distance to Barycenter')
    plt.grid(True)
    plt.legend()

    plt.show()

# Example usage:
# Assuming you have a DataFrame 'df' with columns 'X', 'Y', 'IntDen', and 'Labels'
# plot_distance_vs_moving_average_with_labels(df, labels=[0, 1, 2, 3], window_size=2, interruption_point=10)

In [None]:
import pandas as pd
import math

def calculate_correlation(data, labels=[0, 1, 2, 3], interruption_point=None):
    """
    Correlation between 'IntDen' and distance to the barycenter, pooled over the given labels.

    Each point's distance is measured to the barycenter of its own label; the correlation
    is then computed once over all selected points together.

    Args:
        data (pd.DataFrame): Table with 'X', 'Y', 'IntDen' and 'Labels' columns. It is not modified.
        labels (list): Labels to include. The default list is only read, never modified.
        interruption_point (float or None): If given, only points closer than this distance are used.

    Returns:
        float: Result of Series.corr between 'Distance' and 'IntDen' (NaN if it is undefined).
    """
    # Filter first and copy the result, so adding 'Distance' never writes into a slice of `data`
    df = data[data['Labels'].isin(labels)].copy()

    # Barycenter of each row's own label, aligned row by row (replaces a per-row dict lookup)
    barycenter = df.groupby('Labels')[['X', 'Y']].transform('mean')
    df['Distance'] = np.hypot(df['X'] - barycenter['X'], df['Y'] - barycenter['Y'])

    if interruption_point is not None:
        # Keep only the points closer than the interruption point
        df = df[df['Distance'] < interruption_point]

    return df['Distance'].corr(df['IntDen'])

# Example usage:
# Assuming you have a DataFrame 'df' with columns 'X', 'Y', 'IntDen', and 'Labels'
# correlation = calculate_correlation(df, labels=[0, 1, 2, 3], interruption_point=10)
# print("Correlation between Distance and IntDen:", correlation)


In [None]:
def calculate_correlation_single(data, labels=[0, 1, 2, 3], interruption_point=None):
    """
    Per-label correlation between 'IntDen' and distance to that label's barycenter.

    Args:
        data (pd.DataFrame): Table with 'X', 'Y', 'IntDen' and 'Labels' columns. It is not modified.
        labels (list): Labels to evaluate. The default list is only read, never modified.
        interruption_point (float or None): If given, only points closer than this distance are used.

    Returns:
        dict: Maps each label that has at least one row to the result of Series.corr
        (NaN if it is undefined). Labels with no rows are omitted.
    """
    correlations = {}

    for label in labels:
        label_df = data[data['Labels'] == label]

        if label_df.empty:
            continue  # Skip labels with no data

        # Distance to this label's barycenter, kept as a separate Series so that nothing is
        # assigned onto the filtered slice (which triggered SettingWithCopyWarning)
        distance = np.hypot(label_df['X'] - label_df['X'].mean(), label_df['Y'] - label_df['Y'].mean())
        values = label_df['IntDen']

        if interruption_point is not None:
            # Keep only the points closer than the interruption point
            within = distance < interruption_point
            distance = distance[within]
            values = values[within]

        correlations[label] = distance.corr(values)

    return correlations


In [None]:
def calculate_correlation_single_Area(data, labels=[0, 1, 2, 3], interruption_point=None):
    """
    Per-label correlation between 'Area' and distance to that label's barycenter.

    Args:
        data (pd.DataFrame): Table with 'X', 'Y', 'Area' and 'Labels' columns. It is not modified.
        labels (list): Labels to evaluate. The default list is only read, never modified.
        interruption_point (float or None): If given, only points closer than this distance are used.

    Returns:
        dict: Maps each label that has at least one row to the result of Series.corr
        (NaN if it is undefined). Labels with no rows are omitted.
    """
    correlations = {}

    for label in labels:
        label_df = data[data['Labels'] == label]

        if label_df.empty:
            continue  # Skip labels with no data

        # Distance to this label's barycenter, kept as a separate Series so that nothing is
        # assigned onto the filtered slice (which triggered SettingWithCopyWarning)
        distance = np.hypot(label_df['X'] - label_df['X'].mean(), label_df['Y'] - label_df['Y'].mean())
        values = label_df['Area']

        if interruption_point is not None:
            # Keep only the points closer than the interruption point
            within = distance < interruption_point
            distance = distance[within]
            values = values[within]

        # Correlation between distance and 'Area' for the current label
        correlations[label] = distance.corr(values)

    return correlations

###### Process

In [None]:
from scipy import stats
import matplotlib.pyplot as plt

# Values whose distribution is inspected
data = df_all_new_updated['IntDen']

plt.figure(figsize=(12, 6))

# One probability plot per candidate distribution, in the first two slots of a 2x3 grid
for slot, dist_name, panel_title in [
    (231, "norm", "Normal Distribution"),
    (232, "expon", "Exponential Distribution"),
]:
    plt.subplot(slot)
    stats.probplot(data, dist=dist_name, plot=plt)
    plt.title(panel_title)

plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming 'points_cluster_0_time_5' is your DataFrame with the 'IntDen' column

# Plot the Gaussian distribution using seaborn
sns.histplot(points_cluster_0_time_5['IntDen'], kde=True, bins=20, color='blue')
plt.xlabel('IntDen')
plt.ylabel('Frequency')
plt.title('Gaussian Distribution of IntDen')
plt.grid(True)
plt.show()


In [None]:
plot_distance_vs_value_with_moving_average(points_cluster_0_time_5, 3)

In [None]:
plot_distance_vs_value(selected_df)

In [None]:
plot_moving_average(selected_df)

In [None]:
folder_id = '/content/drive/MyDrive/Academic Work/Utkarsha Khanal/CSV_files/New_algorithm/clusters/N=0123'
output_video_path = '/content/drive/MyDrive/Academic Work/Utkarsha Khanal/CSV_files/New_algorithm/clusters/N=0123/video.mp4'
create_video_from_pngs(folder_id,output_video_path)

In [None]:
visual = plot_values_based_on_the_area_in_space(selected_df)

In [None]:
from scipy import stats
import matplotlib.pyplot as plt

# Values whose distribution is inspected
data = visual['IntDen']

plt.figure(figsize=(12, 6))

# One probability plot per candidate distribution, in the first two slots of a 2x3 grid
for slot, dist_name, panel_title in [
    (231, "norm", "Normal Distribution"),
    (232, "expon", "Exponential Distribution"),
]:
    plt.subplot(slot)
    stats.probplot(data, dist=dist_name, plot=plt)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
f0 = plot_distance_histogram_with_custom_ticks(df_distance_C0_T5)

In [None]:
plot_distance_vs_cell_location(df_distance_C0_T5)

In [None]:
f1

In [None]:
f1 = plot_distance_histogram_with_custom_ticks(df_distance_C1_T5)

In [None]:
plot_distance_vs_cell_location(df_distance_C1_T5)

In [None]:
plot_distance_vs_cell_location(df_distance_C2_T5)

In [None]:
f2

In [None]:
f2 = plot_distance_histogram_with_custom_ticks(df_distance_C2_T5)

In [None]:
df_see = plot_distance_vs_cell_location(df_distance_C3_T5)

In [None]:
plot_distance_histogram_with_custom_ticks(df_distance_C3_T5)

In [None]:
first_d = plot_distance_histogram_with_custom_ticks(df_distance_C3_T5)

In [None]:
first_d

In [None]:
import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np

def plot_distance_histogram_with_custom_ticks(data, bin_width=7.5):
    """
    Histogram of distances to the barycenter; returns where the first empty bin starts.

    Args:
        data (pd.DataFrame): Table with numeric 'X' and 'Y' columns. It is not modified.
        bin_width (float): Target width of a histogram bin. The number of bins is
            int(max distance / bin_width), but never less than 1.

    Returns:
        The left edge of the first bin that contains no points, or None if every bin is populated.
    """
    df = data.copy()

    # Barycenter (centroid) of all points
    barycenter_x = df['X'].mean()
    barycenter_y = df['Y'].mean()

    # Column-wise Euclidean distance (replaces a per-row Python lambda)
    df['Distance'] = np.hypot(df['X'] - barycenter_x, df['Y'] - barycenter_y)

    # int(max / bin_width) is 0 for clusters narrower than one bin, and plt.hist rejects bins=0
    custom_ticks = max(1, int(df['Distance'].max() / bin_width))

    plt.figure(figsize=(8, 6))
    n, bins, patches = plt.hist(df['Distance'], bins=custom_ticks, color='b', alpha=0.7)
    plt.xlabel('Distance from Barycenter')
    plt.ylabel('Frequency')
    plt.title('Histogram of Cell Distances from Barycenter')

    # First bin whose count is zero (threshold kept from the original: count <= 1e-6)
    first_zero_bin = None
    empty_bins = np.flatnonzero(np.asarray(n) <= 1e-6)
    if empty_bins.size:
        first_zero_bin = bins[empty_bins[0]]

    plt.show()
    return first_zero_bin


In [None]:
plot_distance_vs_moving_average(points_cluster_0_time_5, 20)

Study other time points

In [None]:
points_cluster_0_time_25 = df_all_new_updated[(df_all_new_updated['Time'] == 25) & (df_all_new_updated['Labels'] == 0)]

In [None]:
plot_distance_vs_moving_average(points_cluster_0_time_25, 20)

In [None]:
points_cluster_0_time_50 = df_all_new_updated[(df_all_new_updated['Time'] == 50) & (df_all_new_updated['Labels'] == 0)]

In [None]:
plot_distance_vs_moving_average(points_cluster_0_time_50, 20)

In [None]:
points_cluster_0_time_75 = df_all_new_updated[(df_all_new_updated['Time'] == 75) & (df_all_new_updated['Labels'] == 0)]

In [None]:
plot_distance_vs_moving_average(points_cluster_0_time_75, 20)

In [None]:
points_cluster_0_time_100 = df_all_new_updated[(df_all_new_updated['Time'] == 100) & (df_all_new_updated['Labels'] == 0)]

In [None]:
plot_distance_vs_moving_average(points_cluster_0_time_100, 20)

Instead of looking at one cluster at one time point, we can pool all the points and compute each point's distance to the barycenter of its own cluster. With many more points, the range of distances is covered more densely.

In [None]:
plot_distance_vs_moving_average_with_labels(df_all_new_updated, window_size = 20)

In [None]:
plot_distance_vs_moving_average_with_labels(df_all_new_updated, window_size = 20, interruption_point = 1000)

The same analysis, one cluster at a time

In [None]:
plot_distance_vs_moving_average_with_labels(df_all_new_updated, labels = [0], window_size = 20)

In [None]:
plot_distance_vs_moving_average_with_labels(df_all_new_updated, labels = [1], window_size = 20)

In [None]:
plot_distance_vs_moving_average_with_labels(df_all_new_updated, labels = [2], window_size = 20)

In [None]:
plot_distance_vs_moving_average_with_labels(df_all_new_updated, labels = [3], window_size = 20)

After this analysis on the intensity, it may be useful to understand how the values are correlated. We can study the correlation considering single clusters or all the values taken together

###### Correlation with IntDen

In [None]:
# Per-cluster correlation between distance to the barycenter and IntDen.
# `correlations` was not assigned anywhere above, so compute it here before building the table.
correlations = calculate_correlation_single(df_all_new_updated)
correlation_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['Correlation'])

In [None]:
correlation_df.loc['Total'] = calculate_correlation(df_all_new_updated)

In [None]:
# Create a heat map
sns.heatmap(correlation_df, annot=True, cmap='coolwarm', fmt=".2f")
plt.xlabel('Labels')
plt.ylabel('Correlations')
plt.title('Correlation Heat Map')
plt.show()

Another useful question is whether cell area depends on the distance from the barycenter, i.e. whether any correlation between area and distance can be identified.

In [None]:
# Per-cluster correlation between distance to the barycenter and Area.
# `area_correlations` was not assigned anywhere above, so compute it here before displaying it.
area_correlations = calculate_correlation_single_Area(df_all_new_updated)
area_correlations

##### Additional Analysis

In [None]:
import matplotlib.pyplot as plt

# Assuming df_all_new_updated is your DataFrame
data = df_all_new_updated['IntDen']

# Create a histogram
plt.hist(data, bins=10, color='blue', edgecolor='black')

# Add labels and a title
plt.xlabel('IntDen')
plt.ylabel('Frequency')
plt.title('Histogram of IntDen')

# Show the plot
plt.show()


In [None]:
# Select specific columns from the DataFrame
# Measurement columns used for the cluster-level analysis. 'Mean' used to be listed twice,
# which produced two columns with the same name (so `frame['Mean']` returned a DataFrame
# and the column was counted twice by anything fitted on the table).
measurement_columns = ['Area', 'Geo_light_intensity', 'IntDen', 'Mean', 'N_cell']
selected_columns = cluster_new_updated[measurement_columns + ['Labels']]
selected_columns2 = cluster_new_updated[measurement_columns]

# Independent copies: measurements with and without the cluster label
clustered_df = selected_columns.copy()
measurement_data = selected_columns2.copy()

In [None]:
clustered_df_L0 = clustered_df[clustered_df['Labels'] == 0]
clustered_df_L1 = clustered_df[clustered_df['Labels'] == 1]
clustered_df_L2 = clustered_df[clustered_df['Labels'] == 2]
clustered_df_L3 = clustered_df[clustered_df['Labels'] == 3]

In [None]:
from sklearn.decomposition import PCA

pca = PCA()
luminescence_pca = pca.fit_transform(clustered_df_L0)

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# PCA over the measurement columns of all clusters. 'Labels' is a cluster id, not a
# measurement, so it is excluded from the fit.
pca = PCA()
luminescence_pca = pca.fit_transform(clustered_df.drop(columns='Labels'))

# Scatter plot of the first two principal components
plt.figure(figsize=(8, 6))  # Adjust the figure size as needed
plt.scatter(luminescence_pca[:, 0], luminescence_pca[:, 1], alpha=0.5)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA Scatter Plot')

# Show the plot
plt.show()


In [None]:
pip install scanpy

In [None]:
import scanpy as sc
# Perform PCA with optional data normalization
import scanpy as sc
import pandas as pd

# Assuming you have a DataFrame named measurement_data
# Convert your DataFrame to an AnnData object
adata = sc.AnnData(X=measurement_data.values)
#sc.pp.normalize_total(adata, target_sum=1e4)

# Perform PCA with optional data normalization
sc.tl.pca(adata)

# Visualize PCA results
sc.pl.pca(adata)



In [None]:
adata = sc.AnnData(X=measurement_data.values)

# Perform PCA with optional data normalization
sc.tl.pca(adata)

# Visualize PCA results
sc.pl.pca(adata)


In [None]:
!pip install leidenalg

In [None]:
import scanpy as sc

# Assuming you have an AnnData object named 'adata'

# Calculate neighbors
sc.pp.neighbors(adata, n_pcs=6)

# Perform clustering using the Leiden algorithm
sc.tl.leiden(adata, resolution=1.0)  # You can adjust the resolution parameter

# UMAP embedding
sc.tl.umap(adata)

# Visualize the UMAP plot with cluster coloring
sc.pl.umap(adata, color='leiden')


In [None]:
pip install pysal

###### Spatial Statistics

In [None]:
import geopandas as gpd
from shapely.geometry import Point
import libpysal
import esda
import matplotlib.pyplot as plt

# Spatial autocorrelation (Moran's I) of IntDen over the point positions.

# .copy() so that attaching the geometry column does not write into a slice of df_all_new_updated
selected_columns_points = df_all_new_updated[['X', 'Y', 'IntDen']].copy()

# Create a GeoDataFrame with Point geometries
geometry = [Point(x, y) for x, y in zip(selected_columns_points['X'], selected_columns_points['Y'])]
gdf = gpd.GeoDataFrame(selected_columns_points, geometry=geometry)

# Queen contiguity is defined for polygons that share an edge or a vertex, so it gives no
# usable neighbours for a point pattern. Use k-nearest-neighbour weights instead
# (k=3, the same choice as in the Geary's C cells of this notebook).
w = libpysal.weights.KNN.from_dataframe(gdf, k=3)

# Extract the 'IntDen' column for Moran's I analysis
y = selected_columns_points['IntDen']

# Perform Moran's I analysis
moran = esda.Moran(y, w)

# Moran's I statistic, its expected value, and the permutation-based p-value
moran_I = moran.I
moran_EI = moran.EI
moran_p_value = moran.p_sim



In [None]:

import matplotlib.pyplot as plt
from splot.esda import plot_moran

# Plot Moran scatterplot
plot_moran(moran, zstandard=True, figsize=(10, 8))
plt.show()

In [None]:
!pip install pykrige

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import distance

# Sample locations and measured values taken from df1
n_points = len(df1)
X = df1['X']  # X coordinates
Y = df1['Y']  # Y coordinates
temperature = df1['IntDen'].values  # IntDen values (variable name kept for the code below)

# Prediction grid spanning the bounding box of the data. The previous fixed 0..20 grid did
# not follow the coordinate range of the samples.
x_grid, y_grid = np.meshgrid(
    np.linspace(X.min(), X.max(), 150),
    np.linspace(Y.min(), Y.max(), 150),
)

# Define the IDW interpolation function
def idw_interpolation(x, y, X, Y, values, power=2, chunk_size=2048):
    """
    Inverse-distance-weighted interpolation of `values` at the target points (x, y).

    Args:
        x, y: Target coordinates (any shape; flattened internally, both the same size).
        X, Y: 1-D coordinates of the known samples.
        values: 1-D values at the known samples, same length as X and Y.
        power (float): Exponent applied to the distance in the weight 1 / distance**power.
        chunk_size (int): Number of target points processed at once, which bounds the size
            of the temporary distance matrix to chunk_size x len(X).

    Returns:
        np.ndarray: 1-D array with one interpolated value per target point, in flattened order.
    """
    targets = np.column_stack((np.ravel(x), np.ravel(y)))
    sources = np.column_stack((X, Y))
    values = np.asarray(values, dtype=float)

    interpolated_values = np.empty(len(targets), dtype=float)
    for start in range(0, len(targets), chunk_size):
        stop = start + chunk_size
        distances = distance.cdist(targets[start:stop], sources)

        # A target lying exactly on a sample has distance 0 -> infinite weight -> NaN result
        coincident = distances == 0
        with np.errstate(divide='ignore'):
            weights = 1.0 / (distances**power)

        # For those targets use only the coincident sample(s): weight 1 there, 0 elsewhere
        exact_rows = coincident.any(axis=1)
        weights[exact_rows] = coincident[exact_rows]

        interpolated_values[start:stop] = (weights @ values) / weights.sum(axis=1)

    return interpolated_values

# Perform the IDW interpolation
z_grid = idw_interpolation(x_grid, y_grid, X, Y, temperature)

# Reshape the grid and predicted values
z_grid = z_grid.reshape(x_grid.shape)

# Create a contour plot
plt.figure(figsize=(10, 8))
plt.contourf(x_grid, y_grid, z_grid, levels=100, cmap="viridis")
plt.colorbar(label="IntDen")
plt.scatter(X, Y, c=temperature, cmap="coolwarm", edgecolors="k", s=100)
plt.xlabel("X")
plt.ylabel("Y")
plt.title("IntDen Interpolation (IDW)")
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import distance

# Sample locations and measured values taken from df2
n_points = len(df2)
X = df2['X']  # X coordinates
Y = df2['Y']  # Y coordinates
temperature = df2['IntDen'].values  # IntDen values (variable name kept for the code below)

# Prediction grid spanning the bounding box of the data, instead of a fixed 0..20 range
# that does not follow the coordinate range of the samples.
x_grid, y_grid = np.meshgrid(
    np.linspace(X.min(), X.max(), 150),
    np.linspace(Y.min(), Y.max(), 150),
)

# Define the IDW interpolation function
def idw_interpolation(x, y, X, Y, values, power=2, chunk_size=2048):
    """
    Inverse-distance-weighted interpolation of `values` at the target points (x, y).

    Args:
        x, y: Target coordinates (any shape; flattened internally, both the same size).
        X, Y: 1-D coordinates of the known samples.
        values: 1-D values at the known samples, same length as X and Y.
        power (float): Exponent applied to the distance in the weight 1 / distance**power.
        chunk_size (int): Number of target points processed at once, which bounds the size
            of the temporary distance matrix to chunk_size x len(X).

    Returns:
        np.ndarray: 1-D array with one interpolated value per target point, in flattened order.
    """
    targets = np.column_stack((np.ravel(x), np.ravel(y)))
    sources = np.column_stack((X, Y))
    values = np.asarray(values, dtype=float)

    interpolated_values = np.empty(len(targets), dtype=float)
    for start in range(0, len(targets), chunk_size):
        stop = start + chunk_size
        distances = distance.cdist(targets[start:stop], sources)

        # A target lying exactly on a sample has distance 0 -> infinite weight -> NaN result
        coincident = distances == 0
        with np.errstate(divide='ignore'):
            weights = 1.0 / (distances**power)

        # For those targets use only the coincident sample(s): weight 1 there, 0 elsewhere
        exact_rows = coincident.any(axis=1)
        weights[exact_rows] = coincident[exact_rows]

        interpolated_values[start:stop] = (weights @ values) / weights.sum(axis=1)

    return interpolated_values

# Perform the IDW interpolation
z_grid = idw_interpolation(x_grid, y_grid, X, Y, temperature)

# Reshape the grid and predicted values
z_grid = z_grid.reshape(x_grid.shape)

# Create a contour plot
plt.figure(figsize=(10, 8))
plt.contourf(x_grid, y_grid, z_grid, levels=100, cmap="viridis")
plt.colorbar(label="IntDen")
plt.scatter(X, Y, c=temperature, cmap="coolwarm", edgecolors="k", s=100)
plt.xlabel("X")
plt.ylabel("Y")
plt.title("IntDen Interpolation (IDW)")
plt.show()


In [None]:
import spaghetti
import esda
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import libpysal

# Sample dataset with X, Y, and IntDen columns
# Example data (replace with your actual data):
# df1 = pd.DataFrame({
#     'X': [1, 2, 3, 4, 5],
#     'Y': [2, 3, 4, 5, 6],
#     'IntDen': [10, 15, 20, 25, 30]
# })

# Create a GeoDataFrame from your DataFrame with Point geometries
geometry = [Point(xy) for xy in zip(df1['X'], df1['Y'])]
gdf = gpd.GeoDataFrame(df1, geometry=geometry)

# Create a spatial weights matrix (k-nearest neighbors with k=3, for example)
w = libpysal.weights.KNN.from_dataframe(gdf, k=3)

# Calculate Geary's C for spatial autocorrelation
gearys_c = esda.geary.Geary(gdf['IntDen'], w)

# Access Geary's C statistic and p-value
gearys_c_statistic = gearys_c.C
p_value = gearys_c.p_sim

print("Geary's C:", gearys_c_statistic)
print("p-value:", p_value)


In [None]:
import spaghetti
import esda
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import libpysal
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Sample dataset with X, Y, and IntDen columns
# Example data (replace with your actual data):
# df1 = pd.DataFrame({
#     'X': [1, 2, 3, 4, 5],
#     'Y': [2, 3, 4, 5, 6],
#     'IntDen': [10, 15, 20, 25, 30]
# })

import warnings

# Create a GeoDataFrame from your DataFrame with Point geometries
geometry = [Point(xy) for xy in zip(df1['X'], df1['Y'])]
gdf = gpd.GeoDataFrame(df1, geometry=geometry)

# The disconnected-components warning is raised while the weights are built, so the filter
# has to be active around that call. catch_warnings restores the previous filters afterwards
# instead of silencing every UserWarning for the rest of the notebook.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=UserWarning)

    # Spatial weights matrix: each point's 3 nearest neighbours
    w = libpysal.weights.KNN.from_dataframe(gdf, k=3)

    # Calculate Geary's C for spatial autocorrelation
    gearys_c = esda.geary.Geary(gdf['IntDen'], w)

# Access Geary's C statistic and p-value
gearys_c_statistic = gearys_c.C
p_value = gearys_c.p_sim

# Create a Moran scatterplot
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))

# Calculate the spatial lag of IntDen
lag_intden = libpysal.weights.lag_spatial(w, gdf['IntDen'])

# Scatterplot
plt.scatter(gdf['IntDen'], lag_intden, color='b', alpha=0.5)

# Add labels and title
plt.xlabel("IntDen")
plt.ylabel("Spatial Lag of IntDen")
plt.title("Moran Scatterplot (Geary's C)")

# Show the plot
plt.show()



##### Forecast

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Gillespie simulation of a pure-decay process: the population loses one unit per event and
# events occur at rate decay_rate * population.
# NOTE(review): `selected_clusters` is reassigned by several plotting cells above; confirm it
# holds a table with an 'N_cell' column and a row at Time == 92 when this cell runs.

# Define parameters
start_time = 92
initial_population = selected_clusters[(selected_clusters['Labels'] == 0) & (selected_clusters['Time'] == start_time)]['N_cell'].values[0]  # Initial population or value
decay_rate = 1.000000001603458         # Rate at which the population decays
simulation_time = 26    # Total simulation time
end_time = start_time + simulation_time

# Seeded generator so the trajectory is the same on every run
rng = np.random.default_rng(0)

# Initialize time and population arrays
time_points = [start_time]
population_values = [initial_population]

# Gillespie algorithm
while time_points[-1] < end_time and population_values[-1] > 0:
    # Total event rate in the current state
    event_rate = decay_rate * population_values[-1]

    # Waiting time to the next event is exponentially distributed with mean 1 / event_rate
    time_to_next_event = rng.exponential(1 / event_rate)
    next_time = time_points[-1] + time_to_next_event

    if next_time >= end_time:
        # No further event inside the horizon: hold the current population until the end
        # instead of recording an event that happens after the simulation window
        time_points.append(end_time)
        population_values.append(population_values[-1])
        break

    time_points.append(next_time)
    population_values.append(population_values[-1] - 1)

# The population is constant between events, so draw it as a step function
plt.step(time_points, population_values, where='post')
plt.xlabel('Time')
plt.ylabel('Population')
plt.title('Gillespie Algorithm Simulation')
plt.show()


The result is not very convincing, because we already know from the data how the value changes over time: it is still above 0 at least until time 118.

In [None]:
N_t1 = selected_clusters[(selected_clusters['Labels'] == 0) & (selected_clusters['Time'] == 92)]['N_cell'].values[0]
N_t2 = 1
decay_rate = math.log(N_t1 / N_t2) / (118 - 92)

In [None]:
decay_rate

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Gillespie simulation of a pure-decay process, using the `decay_rate` computed in the cell
# above: the population loses one unit per event and events occur at rate decay_rate * population.
# NOTE(review): `selected_clusters` is reassigned by several plotting cells above; confirm it
# holds a table with an 'N_cell' column and a row at Time == 92 when this cell runs.

# Define parameters
start_time = 92
initial_population = selected_clusters[(selected_clusters['Labels'] == 0) & (selected_clusters['Time'] == start_time)]['N_cell'].values[0]  # Initial population or value
simulation_time = 26    # Total simulation time
end_time = start_time + simulation_time

# Seeded generator so the trajectory is the same on every run
rng = np.random.default_rng(0)

# Initialize time and population arrays
time_points = [start_time]
population_values = [initial_population]

# Gillespie algorithm
while time_points[-1] < end_time and population_values[-1] > 0:
    # Total event rate in the current state
    event_rate = decay_rate * population_values[-1]

    # Waiting time to the next event is exponentially distributed with mean 1 / event_rate
    time_to_next_event = rng.exponential(1 / event_rate)
    next_time = time_points[-1] + time_to_next_event

    if next_time >= end_time:
        # No further event inside the horizon: hold the current population until the end
        # instead of recording an event that happens after the simulation window
        time_points.append(end_time)
        population_values.append(population_values[-1])
        break

    time_points.append(next_time)
    population_values.append(population_values[-1] - 1)

# The population is constant between events, so draw it as a step function
plt.step(time_points, population_values, where='post')
plt.xlabel('Time')
plt.ylabel('Population')
plt.title('Gillespie Algorithm Simulation')
plt.show()

In [None]:
# Display decay_rate converted to a built-in float
float(decay_rate)

In [None]:
from cayenne.simulation import Simulation
# Model passed to cayenne as text: one reaction r1 (N_cell => B) with rate
# constant k1, starting from N_cell = 300 and B = 0.
# NOTE(review): k1 and the initial N_cell are hard-coded inside the string
# rather than taken from decay_rate / the data — confirm these are intended.
model_str = """
        const compartment comp1;
        comp1 = 1.0; # volume of compartment

        r1: N_cell => B; k1;

        k1 = 0.1000001603458;
        chem_flag = false;

        N_cell = 300;
        B = 0;
    """
# Build the simulation object from the model text.
sim = Simulation.load_model(model_str, "ModelString")
# Run the simulation: 5 repetitions up to t = 26 with the "tau_adaptive" algorithm
sim.simulate(max_t=26, max_iter=1000, n_rep=5, algorithm="tau_adaptive")
# Plot the simulation results with the default settings.
sim.plot()

In [None]:
# Plot the simulation results restricted to the N_cell species
sim.plot(species_names = ["N_cell"])

#### Original Algorithm

In [None]:
# Keep independent copies of the current tables under the "_or" names so that
# later edits to them do not touch linked_clusters / df_all.
linked_clusters_or = linked_clusters.copy()
df_all_or = df_all.copy()

We can display the results

In [None]:
# One Drive folder is used both as the PNG output directory and as the input
# directory of the PDF merge.
gdrive_folder = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/Original_Algorithm/png_files'
output_folder_png = gdrive_folder
create_png_plots(df_all_or, output_folder_png)

# The merged PDF is stored next to the PNG files.
output_pdf_name = f'{gdrive_folder}/new_model.pdf'
result_pdf_path = merge_pngs_to_pdf(gdrive_folder, output_pdf_name)

print(f"PDF saved at: {result_pdf_path}")

In [None]:
# Keep only the rows of df_all that belong to clusters 0-3 (as a separate copy).
selected_labels = [0, 1, 2, 3]
in_selected_labels = df_all['Labels'].isin(selected_labels)
df = df_all.loc[in_selected_labels].copy()

# Write the plots for the filtered rows to the N=0123 folder.
data = df
output_folder_png = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/clusters/N=0123'
create_png_plots(data, output_folder_png)

In [None]:
# Merge the PNG files of the N=0123 folder into a PDF stored in the same folder.
gdrive_folder = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/clusters/N=0123'
output_pdf_name = f'{gdrive_folder}/new_model.pdf'

result_pdf_path = merge_pngs_to_pdf(gdrive_folder, output_pdf_name)
print(f"PDF saved at: {result_pdf_path}")

##### Fixing some values due to code

Notice that the values here are very close; there are only minimal differences.

In [None]:
# Replace df_all_or with the output of adjust_clusters_disappeared.
# NOTE(review): the input name is overwritten, so re-running this cell applies
# the adjustment to already-adjusted data — confirm that this is harmless.
df_all_or = adjust_clusters_disappeared(df_all_or)

In [None]:
# One Drive folder is used both as the PNG output directory and as the input
# directory of the PDF merge.
gdrive_folder = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/Original_Algorithm/png_files_v2'
output_folder_png = gdrive_folder
create_png_plots(df_all_or, output_folder_png)

# The merged PDF is stored next to the PNG files.
output_pdf_name = f'{gdrive_folder}/new_model.pdf'
result_pdf_path = merge_pngs_to_pdf(gdrive_folder, output_pdf_name)

print(f"PDF saved at: {result_pdf_path}")

Perform final adjustments.

In [None]:
# Replace df_all_or with the output of order_labels.
df_all_or = order_labels(df_all_or)

In [None]:
# One Drive folder receives the PNG files, the merged PDF and the video.
gdrive_folder = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/Original_Algorithm/png_files_v2_ordered'
output_folder_png = gdrive_folder
create_png_plots(df_all_or, output_folder_png)

# Merge the PNG files into a PDF stored in the same folder.
output_pdf_name = f'{gdrive_folder}/new_model.pdf'
result_pdf_path = merge_pngs_to_pdf(gdrive_folder, output_pdf_name)

# Build a video from the same PNG files.
folder_id = gdrive_folder
output_video_path = f'{gdrive_folder}/video.mp4'
create_video_from_pngs(folder_id, output_video_path)

print(f"PDF saved at: {result_pdf_path}")

In [None]:
# Define replacement dictionary: old label -> label it is rewritten to (0, 1, 2 or 3)
replacement_dict = {35: 3, 97:3, 24:3, 106:3, 30:1, 22:1, 84:1, 95:1, 103:2, 28:2, 129:0, 39:0}

# Replace values in the 'Labels' column using the dictionary; labels that are
# not keys of the dictionary are left unchanged. The column is overwritten in df_all_new.
df_all_new['Labels'] = df_all_new['Labels'].replace(replacement_dict)


In [None]:
# Replace df_all_new with the output of order_labels.
df_all_new = order_labels(df_all_new)

In [None]:
# One Drive folder is used both as the PNG output directory and as the input
# directory of the PDF merge.
gdrive_folder = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/Original_Algorithm/png_files_v3'
output_folder_png = gdrive_folder

# NOTE(review): this plots df_all_or, while the two cells right before it
# modify df_all_new — confirm which frame the v3 plots are meant to show.
create_png_plots(df_all_or, output_folder_png)

# The merged PDF is stored next to the PNG files.
output_pdf_name = f'{gdrive_folder}/new_model.pdf'
result_pdf_path = merge_pngs_to_pdf(gdrive_folder, output_pdf_name)

print(f"PDF saved at: {result_pdf_path}")

Study the development of all the clusters

In [None]:
# Parent folder holding one "N=<label>" sub-folder per cluster.
clusters_root = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/clusters'

# Clusters 0..3: write the plots of each one and merge them into its own PDF.
for i in range(4):
    selected_label = i
    selected_cluster = df_all_new_updated.loc[df_all_new_updated['Labels'] == selected_label]

    # The same folder receives the PNG files and is the input of the PDF merge.
    gdrive_folder = f'{clusters_root}/N={i}'
    output_folder_png = gdrive_folder
    create_png_plots(selected_cluster, output_folder_png)

    output_pdf_name = f'{gdrive_folder}/complete_pdf_N={i}.pdf'
    result_pdf_path = merge_pngs_to_pdf(gdrive_folder, output_pdf_name)


Since it is the same data frame, it is also possible to select a single cluster and see how it develops.

After saving the final values, we can also build a new cluster table from them, so that we can analyse our data.

In [None]:
# Load the saved df_all_new_updated table (file version v2) from Google Drive
df_all_new_updated = pd.read_csv('/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/df_all_new_updated_v2.csv')

Obtain updated clusters

In [None]:
# First partial table: mean of every column per ('Labels', 'Time') group.
cluster_partial_1 = df_all_new_updated.groupby(['Labels', 'Time']).mean().reset_index()

# Second partial table: the custom aggregation `data_cluster_updated`, applied
# to the same ('Labels', 'Time') groups on a copy of the data.
cluster_partial_2 = df_all_new_updated.copy()
cluster_partial_2 = cluster_partial_2.groupby(['Labels', 'Time']).apply(data_cluster_updated).reset_index()

# Merge the two partial tables computed above on 'Labels' and 'Time' using an
# inner join (the previous version merged `result_df` / `result_df1`, which
# this cell never defines, and left both partial tables unused).
merged_df = pd.merge(cluster_partial_1, cluster_partial_2, how='inner', on=['Labels', 'Time'])


In [None]:
# Save the merged cluster table. index=False keeps the meaningless row index
# out of the file, so reading it back does not add an 'Unnamed: 0' column
# (which would otherwise leak into the correlation analysis).
merged_df.to_csv('/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/cluster_new_updated_v2.csv', index=False)

In [None]:
# Load the saved per-cluster table (file version v2) from Google Drive
cluster_new_updated = pd.read_csv('/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/cluster_new_updated_v2.csv')

###### Location

In [None]:
# Restrict a copy of the cluster table to clusters 0-3.
selected_labels = [0, 1, 2, 3]
in_selected_labels = cluster_new_updated['Labels'].isin(selected_labels)
df = cluster_new_updated.loc[in_selected_labels].copy()

# Labels actually present, in order of first appearance.
unique_labels = df['Labels'].unique()

# Figure with a single 3D axes: Time on x, X_x on y, Y_y on z.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# One trajectory per label.
for label in unique_labels:
    label_data = df.loc[df['Labels'] == label]
    ax.plot(label_data['Time'], label_data['X_x'], label_data['Y_y'], label=label)

ax.set_xlabel('Time')
ax.set_ylabel('X_x')
ax.set_zlabel('Y_y')
ax.set_title('3D Plot of Position vs. Time for Each Label')
ax.legend()

plt.show()


###### Correlation

In [None]:
# Correlation matrices of clustered_df for the three methods, computed in the
# order Pearson, Spearman, Kendall ('pearson' is also the default of .corr()).
correlation_matrix_pearson, correlation_matrix_spearman, correlation_matrix_kendall = (
    clustered_df.corr(method=corr_method)
    for corr_method in ('pearson', 'spearman', 'kendall')
)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Matrix drawn in this cell: the Pearson correlations.
correlation_matrix = correlation_matrix_pearson
column_names = correlation_matrix.columns
tick_positions = range(len(column_names))

fig, ax = plt.subplots(figsize=(8, 6))

# Heat-map of the matrix; the returned image drives the colorbar.
cax = ax.matshow(correlation_matrix, cmap='coolwarm')

# Colorbar with fixed ticks over the full correlation range [-1, 1].
cbar = fig.colorbar(cax)
cbar.set_label('Correlation', rotation=270, labelpad=15)
colorbar_ticks = [-1, -0.5, 0, 0.5, 1]
cbar.set_ticks(colorbar_ticks)
cbar.set_ticklabels(colorbar_ticks)

# Label both axes with the column names (vertical text on the x axis).
ax.set_xticks(tick_positions)
ax.set_xticklabels(column_names, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(column_names)
ax.set_title('Correlation Matrix')

plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Matrix drawn in this cell: the Spearman correlations.
correlation_matrix = correlation_matrix_spearman
column_names = correlation_matrix.columns
tick_positions = range(len(column_names))

fig, ax = plt.subplots(figsize=(8, 6))

# Heat-map of the matrix; the returned image drives the colorbar.
cax = ax.matshow(correlation_matrix, cmap='coolwarm')

# Colorbar with fixed ticks over the full correlation range [-1, 1].
cbar = fig.colorbar(cax)
cbar.set_label('Correlation', rotation=270, labelpad=15)
colorbar_ticks = [-1, -0.5, 0, 0.5, 1]
cbar.set_ticks(colorbar_ticks)
cbar.set_ticklabels(colorbar_ticks)

# Label both axes with the column names (vertical text on the x axis).
ax.set_xticks(tick_positions)
ax.set_xticklabels(column_names, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(column_names)
ax.set_title('Correlation Matrix')

plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Matrix drawn in this cell: the Kendall correlations.
correlation_matrix = correlation_matrix_kendall
column_names = correlation_matrix.columns
tick_positions = range(len(column_names))

fig, ax = plt.subplots(figsize=(8, 6))

# Heat-map of the matrix; the returned image drives the colorbar.
cax = ax.matshow(correlation_matrix, cmap='coolwarm')

# Colorbar with fixed ticks over the full correlation range [-1, 1].
cbar = fig.colorbar(cax)
cbar.set_label('Correlation', rotation=270, labelpad=15)
colorbar_ticks = [-1, -0.5, 0, 0.5, 1]
cbar.set_ticks(colorbar_ticks)
cbar.set_ticklabels(colorbar_ticks)

# Label both axes with the column names (vertical text on the x axis).
ax.set_xticks(tick_positions)
ax.set_xticklabels(column_names, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(column_names)
ax.set_title('Correlation Matrix')

plt.show()

From this graph it is possible to understand some key components.

In [None]:
# Select the columns used for the correlation analysis. Each column is listed
# exactly once: repeating a name (as 'Mean' was before) produces two columns
# with the same label, which duplicates a row/column of every correlation
# matrix and makes clustered_df['Mean'] return a DataFrame instead of a Series.
selected_columns = cluster_new_updated[['Area', 'Geo_light_intensity', 'IntDen', 'Mean', 'N_cell']]

# Work on an independent copy so later changes do not touch cluster_new_updated.
clustered_df = selected_columns.copy()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Kendall correlations between the columns of clustered_df.
correlation_matrix = clustered_df.corr(method='kendall')
column_names = correlation_matrix.columns
tick_positions = range(len(column_names))

fig, ax = plt.subplots(figsize=(8, 6))

# Heat-map of the matrix; the returned image drives the colorbar.
cax = ax.matshow(correlation_matrix, cmap='coolwarm')

# Colorbar with fixed ticks over the full correlation range [-1, 1].
cbar = fig.colorbar(cax)
cbar.set_label('Correlation', rotation=270, labelpad=15)
colorbar_ticks = [-1, -0.5, 0, 0.5, 1]
cbar.set_ticks(colorbar_ticks)
cbar.set_ticklabels(colorbar_ticks)

# Label both axes with the column names (vertical text on the x axis).
ax.set_xticks(tick_positions)
ax.set_xticklabels(column_names, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(column_names)
ax.set_title('Correlation Matrix')

plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Kendall correlations between the columns of clustered_df.
correlation_matrix = clustered_df.corr(method='kendall')
column_names = correlation_matrix.columns
tick_positions = range(len(column_names))

fig, ax = plt.subplots(figsize=(8, 6))

# Heat-map of the matrix; the returned image drives the colorbar.
cax = ax.matshow(correlation_matrix, cmap='coolwarm')

# Colorbar with fixed ticks over the full correlation range [-1, 1].
cbar = fig.colorbar(cax)
cbar.set_label('Correlation', rotation=270, labelpad=15)
colorbar_ticks = [-1, -0.5, 0, 0.5, 1]
cbar.set_ticks(colorbar_ticks)
cbar.set_ticklabels(colorbar_ticks)

# Label both axes with the column names (vertical text on the x axis).
ax.set_xticks(tick_positions)
ax.set_xticklabels(column_names, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(column_names)
ax.set_title('Correlation Matrix')

# Write every coefficient, rounded to two decimals, in the centre of its cell
# (row index i is the y position, column index j the x position).
for i in tick_positions:
    for j in tick_positions:
        coefficient = correlation_matrix.iloc[i, j]
        text = f"{coefficient:.2f}"
        ax.text(j, i, text, ha='center', va='center', color='black', fontsize=10)

plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Kendall correlations between the columns of clustered_df.
correlation_matrix = clustered_df.corr(method='kendall')
column_names = correlation_matrix.columns
tick_positions = range(len(column_names))

fig, ax = plt.subplots(figsize=(8, 6))

# Heat-map of the matrix; the returned image drives the colorbar.
cax = ax.matshow(correlation_matrix, cmap='coolwarm')

# Colorbar with fixed ticks over the full correlation range [-1, 1].
cbar = fig.colorbar(cax)
cbar.set_label('Correlation', rotation=270, labelpad=15)
colorbar_ticks = [-1, -0.5, 0, 0.5, 1]
cbar.set_ticks(colorbar_ticks)
cbar.set_ticklabels(colorbar_ticks)

# Column names along the bottom edge (vertical text) ...
ax.xaxis.set_ticks_position('bottom')
ax.set_xticks(tick_positions)
ax.set_xticklabels(column_names, rotation=90)

# ... and along the left edge.
ax.set_yticks(tick_positions)
ax.set_yticklabels(column_names)

# Write every coefficient, rounded to two decimals, in the centre of its cell
# (row index i is the y position, column index j the x position).
for i in tick_positions:
    for j in tick_positions:
        coefficient = correlation_matrix.iloc[i, j]
        text = f"{coefficient:.2f}"
        ax.text(j, i, text, ha='center', va='center', color='black', fontsize=10)

ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# Example - Major - Minor

Column1 = 'Major'
Column2 = 'Minor'

# Correlation between the two columns under each of the three methods. Each
# variable now uses the method it is named after (previously all three calls
# passed method='spearman', so the three printed numbers were always equal).
correlation_pearson = clustered_df[Column1].corr(clustered_df[Column2], method='pearson')

correlation_spearman = clustered_df[Column1].corr(clustered_df[Column2], method='spearman')

correlation_kendall = clustered_df[Column1].corr(clustered_df[Column2], method='kendall')

# Print the Pearson, Spearman and Kendall correlation values, in that order
print(correlation_pearson, correlation_spearman, correlation_kendall)

###### Plotting graphs

As seen in the clusters, some values are missing. This is mostly due to the segmentation algorithm and/or to particular values that were excluded during clustering. The number of cells, or their location, may therefore be hard to recover directly. One possibility is to use an ML algorithm to address this problem by creating synthetic data.

Part of this work may live in another notebook, as it can take longer than expected and require several iterations.

###### Geo_light_intensity

In [None]:
import numpy as np
from scipy.interpolate import CubicSpline
import pandas as pd

# Uses the DataFrame 'clustered_df' with columns 'Labels', 'Time' and the
# column named below; one cubic spline is fitted per label.
value_column = 'Geo_light_intensity'

# Get unique labels
unique_labels = clustered_df['Labels'].unique()

# Interpolated frames are collected here and concatenated once after the loop
# (cheaper than growing a DataFrame with concat on every iteration).
label_frames = []

# Loop through each unique label and apply the spline fitting
for label in unique_labels:
    # CubicSpline rejects NaN values and needs strictly increasing x, so drop
    # incomplete rows, keep one row per time point and order the rows by time.
    label_data = (
        clustered_df[clustered_df['Labels'] == label]
        .dropna(subset=['Time', value_column])
        .drop_duplicates(subset='Time')
        .sort_values('Time')
    )

    # Skip clusters with less than 2 usable points
    if len(label_data) < 2:
        continue

    # Extract Time and Value columns
    t_data = label_data['Time']
    w_data = label_data[value_column]

    # Create a cubic spline object
    spline = CubicSpline(t_data, w_data)

    # Evaluation grid over the observed time range; the number of points is
    # the integer part of the largest time value, as in the original cell.
    t_fine = np.linspace(min(t_data), max(t_data), int(max(t_data)))
    w_fine = spline(t_fine)

    # Create a new DataFrame for the current label's results
    label_results = pd.DataFrame({'Time': t_fine, value_column: w_fine})

    # Add a 'Labels' column with the current label value
    label_results['Labels'] = label

    label_frames.append(label_results)

# Stack the per-label results; stays an empty DataFrame when no label qualified.
Geo_light_intensity_df = pd.concat(label_frames) if label_frames else pd.DataFrame()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import itertools

# Clusters drawn in this figure.
selected_labels = [0, 1, 2, 3]

# Latest time points first, then only the rows of the selected clusters.
df_sorted = Geo_light_intensity_df.sort_values(by='Time', ascending=False)
selected_clusters = df_sorted.loc[df_sorted['Labels'].isin(selected_labels)]

# Repeating iterator over the tab10 palette: one colour per cluster.
color_cycle = itertools.cycle(plt.cm.tab10.colors)

fig, ax = plt.subplots(figsize=(10, 6))

# One line per cluster, each paired with the next palette colour.
for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters.loc[selected_clusters['Labels'] == cluster_label]
    ax.plot(cluster_data['Time'], cluster_data['Geo_light_intensity'],
            label=f'Cluster {cluster_label}', color=color)

ax.set_xlabel('Time')
ax.set_ylabel('Geo_light_intensity')
ax.set_title('Geo_light_intensity Over Time for Labels 0, 1, 2, and 3')
ax.legend()
ax.grid(True)
plt.show()


###### Geo_cell_size

In [None]:
import numpy as np
from scipy.interpolate import CubicSpline
import pandas as pd

# Uses the DataFrame 'clustered_df' with columns 'Labels', 'Time' and the
# column named below; one cubic spline is fitted per label.
value_column = 'Geo_cell_size'

# Get unique labels
unique_labels = clustered_df['Labels'].unique()

# Interpolated frames are collected here and concatenated once after the loop
# (cheaper than growing a DataFrame with concat on every iteration).
label_frames = []

# Loop through each unique label and apply the spline fitting
for label in unique_labels:
    # CubicSpline rejects NaN values and needs strictly increasing x, so drop
    # incomplete rows, keep one row per time point and order the rows by time.
    label_data = (
        clustered_df[clustered_df['Labels'] == label]
        .dropna(subset=['Time', value_column])
        .drop_duplicates(subset='Time')
        .sort_values('Time')
    )

    # Skip clusters with less than 2 usable points
    if len(label_data) < 2:
        continue

    # Extract Time and Value columns
    t_data = label_data['Time']
    w_data = label_data[value_column]

    # Create a cubic spline object
    spline = CubicSpline(t_data, w_data)

    # Evaluation grid over the observed time range; the number of points is
    # the integer part of the largest time value, as in the original cell.
    t_fine = np.linspace(min(t_data), max(t_data), int(max(t_data)))
    w_fine = spline(t_fine)

    # Create a new DataFrame for the current label's results
    label_results = pd.DataFrame({'Time': t_fine, value_column: w_fine})

    # Add a 'Labels' column with the current label value
    label_results['Labels'] = label

    label_frames.append(label_results)

# Stack the per-label results; stays an empty DataFrame when no label qualified.
Geo_cell_size_df = pd.concat(label_frames) if label_frames else pd.DataFrame()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import itertools

# Clusters drawn in this figure.
selected_labels = [0, 1, 2, 3]

# Latest time points first, then only the rows of the selected clusters.
df_sorted = Geo_cell_size_df.sort_values(by='Time', ascending=False)
selected_clusters = df_sorted.loc[df_sorted['Labels'].isin(selected_labels)]

# Repeating iterator over the tab10 palette: one colour per cluster.
color_cycle = itertools.cycle(plt.cm.tab10.colors)

fig, ax = plt.subplots(figsize=(10, 6))

# One line per cluster, each paired with the next palette colour.
for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters.loc[selected_clusters['Labels'] == cluster_label]
    ax.plot(cluster_data['Time'], cluster_data['Geo_cell_size'],
            label=f'Cluster {cluster_label}', color=color)

ax.set_xlabel('Time')
ax.set_ylabel('Geo_cell_size')
ax.set_title('Geo_cell_size Over Time for Labels 0, 1, 2, and 3')
ax.legend()
ax.grid(True)
plt.show()


###### Cluster_dimension

In [None]:
import numpy as np
from scipy.interpolate import CubicSpline
import pandas as pd

# Uses the DataFrame 'clustered_df' with columns 'Labels', 'Time' and the
# column named below; one cubic spline is fitted per label.
value_column = 'Cluster_dimension'

# Get unique labels
unique_labels = clustered_df['Labels'].unique()

# Interpolated frames are collected here and concatenated once after the loop
# (cheaper than growing a DataFrame with concat on every iteration).
label_frames = []

# Loop through each unique label and apply the spline fitting
for label in unique_labels:
    # CubicSpline rejects NaN values and needs strictly increasing x, so drop
    # incomplete rows, keep one row per time point and order the rows by time.
    label_data = (
        clustered_df[clustered_df['Labels'] == label]
        .dropna(subset=['Time', value_column])
        .drop_duplicates(subset='Time')
        .sort_values('Time')
    )

    # Skip clusters with less than 2 usable points
    if len(label_data) < 2:
        continue

    # Extract Time and Value columns
    t_data = label_data['Time']
    w_data = label_data[value_column]

    # Create a cubic spline object
    spline = CubicSpline(t_data, w_data)

    # Evaluation grid over the observed time range; the number of points is
    # the integer part of the largest time value, as in the original cell.
    t_fine = np.linspace(min(t_data), max(t_data), int(max(t_data)))
    w_fine = spline(t_fine)

    # Create a new DataFrame for the current label's results
    label_results = pd.DataFrame({'Time': t_fine, value_column: w_fine})

    # Add a 'Labels' column with the current label value
    label_results['Labels'] = label

    label_frames.append(label_results)

# Stack the per-label results; stays an empty DataFrame when no label qualified.
Cluster_dimension_df = pd.concat(label_frames) if label_frames else pd.DataFrame()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import itertools

# Clusters drawn in this figure.
selected_labels = [0, 1, 2, 3]

# Latest time points first, then only the rows of the selected clusters.
df_sorted = Cluster_dimension_df.sort_values(by='Time', ascending=False)
selected_clusters = df_sorted.loc[df_sorted['Labels'].isin(selected_labels)]

# Repeating iterator over the tab10 palette: one colour per cluster.
color_cycle = itertools.cycle(plt.cm.tab10.colors)

fig, ax = plt.subplots(figsize=(10, 6))

# One line per cluster, each paired with the next palette colour.
for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters.loc[selected_clusters['Labels'] == cluster_label]
    ax.plot(cluster_data['Time'], cluster_data['Cluster_dimension'],
            label=f'Cluster {cluster_label}', color=color)

ax.set_xlabel('Time')
ax.set_ylabel('Cluster_dimension')
ax.set_title('Cluster_dimension Over Time for Labels 0, 1, 2, and 3')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import itertools

# Sort the DataFrame by "Time" in descending order
df_sorted = Cluster_dimension_df.sort_values(by='Time', ascending=False)

# Filter the DataFrame to include only labels 0, 1, 2, and 3
selected_labels = [0, 1, 2, 3]
selected_clusters = df_sorted[df_sorted['Labels'].isin(selected_labels)]

# Create a color cycle for plotting
color_cycle = itertools.cycle(plt.cm.tab10.colors)

# Number of consecutive time points averaged together.
window_size = 5

plt.figure(figsize=(10, 6))

# Iterate through the selected clusters and plot moving averages with different colors
for cluster_label in selected_labels:
    # rolling() follows row order, so put each cluster in chronological order
    # first: the window then covers the current and the previous points
    # (a trailing average) instead of looking ahead in time.
    cluster_data = selected_clusters[selected_clusters['Labels'] == cluster_label].sort_values('Time')

    color = next(color_cycle)

    # The first window_size - 1 points have no full window and come out as NaN.
    moving_avg = cluster_data['Cluster_dimension'].rolling(window=window_size).mean()

    plt.plot(cluster_data['Time'], moving_avg, label=f'Cluster {cluster_label} (Moving Avg)', color=color)

plt.xlabel('Time')
plt.ylabel('Moving Average of Cluster_dimension')
plt.title('Moving Average of Cluster_dimension Over Time for Labels 0, 1, 2, and 3')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import itertools

# Sort the DataFrame by "Time" in descending order
df_sorted = Cluster_dimension_df.sort_values(by='Time', ascending=False)

# Filter the DataFrame to include only labels 0, 1, 2, and 3
selected_labels = [0, 1, 2, 3]
selected_clusters = df_sorted[df_sorted['Labels'].isin(selected_labels)]

# Create a color cycle for plotting
color_cycle = itertools.cycle(plt.cm.tab10.colors)

plt.figure(figsize=(10, 6))

# Iterate through the selected clusters and plot them with different colors
for cluster_label in selected_labels:
    # rolling() follows row order, so put each cluster in chronological order
    # first: the 5-point window then covers the current and the previous
    # points (a trailing average) instead of looking ahead in time.
    cluster_data = selected_clusters[selected_clusters['Labels'] == cluster_label].sort_values('Time')

    color = next(color_cycle)

    # Raw curve (solid) ...
    plt.plot(cluster_data['Time'], cluster_data['Cluster_dimension'], label=f'Cluster {cluster_label}', color=color)

    # ... and its 5-point moving average (dashed, same colour); the first
    # four points have no full window and come out as NaN.
    moving_average = cluster_data['Cluster_dimension'].rolling(window=5).mean()
    plt.plot(cluster_data['Time'], moving_average, label=f'Moving Avg {cluster_label}', linestyle='--', color=color)

plt.xlabel('Time')
plt.ylabel('Cluster_dimension')
plt.title('Cluster_dimension Over Time for Labels 0, 1, 2, and 3')
plt.legend()
plt.grid(True)

plt.show()


###### N_cell

In [None]:
# Rebind clustered_df to an independent copy of the whole cluster_new_updated
# table (all of its columns, not a column subset).
clustered_df = cluster_new_updated.copy()

In [None]:
import numpy as np
from scipy.interpolate import CubicSpline
import pandas as pd

# Uses the DataFrame 'clustered_df' with columns 'Labels', 'Time' and the
# column named below; one cubic spline is fitted per label.
value_column = 'N_cell'

# Get unique labels
unique_labels = clustered_df['Labels'].unique()

# Interpolated frames are collected here and concatenated once after the loop
# (cheaper than growing a DataFrame with concat on every iteration).
label_frames = []

# Loop through each unique label and apply the spline fitting
for label in unique_labels:
    # CubicSpline rejects NaN values and needs strictly increasing x, so drop
    # incomplete rows, keep one row per time point and order the rows by time.
    label_data = (
        clustered_df[clustered_df['Labels'] == label]
        .dropna(subset=['Time', value_column])
        .drop_duplicates(subset='Time')
        .sort_values('Time')
    )

    # Skip clusters with less than 2 usable points
    if len(label_data) < 2:
        continue

    # Extract Time and Value columns
    t_data = label_data['Time']
    w_data = label_data[value_column]

    # Create a cubic spline object
    spline = CubicSpline(t_data, w_data)

    # Evaluation grid over the observed time range; the number of points is
    # the integer part of the largest time value, as in the original cell.
    t_fine = np.linspace(min(t_data), max(t_data), int(max(t_data)))
    w_fine = spline(t_fine)

    # Create a new DataFrame for the current label's results
    label_results = pd.DataFrame({'Time': t_fine, value_column: w_fine})

    # Add a 'Labels' column with the current label value
    label_results['Labels'] = label

    label_frames.append(label_results)

# Stack the per-label results; stays an empty DataFrame when no label qualified.
N_cell_df = pd.concat(label_frames) if label_frames else pd.DataFrame()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import itertools

# Clusters drawn in this figure.
selected_labels = [0, 1, 2, 3]

# Latest time points first, then only the rows of the selected clusters.
df_sorted = N_cell_df.sort_values(by='Time', ascending=False)
selected_clusters = df_sorted.loc[df_sorted['Labels'].isin(selected_labels)]

# Repeating iterator over the tab10 palette: one colour per cluster.
color_cycle = itertools.cycle(plt.cm.tab10.colors)

fig, ax = plt.subplots(figsize=(10, 6))

# One line per cluster, each paired with the next palette colour.
for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters.loc[selected_clusters['Labels'] == cluster_label]
    ax.plot(cluster_data['Time'], cluster_data['N_cell'],
            label=f'Cluster {cluster_label}', color=color)

ax.set_xlabel('Time')
ax.set_ylabel('N_cell')
ax.set_title('N_cell Over Time for Labels 0, 1, 2, and 3')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import itertools

# Clusters drawn in this figure.
selected_labels = [0, 1, 2, 3]

# Latest time points first, then only the rows of the selected clusters.
df_sorted = N_cell_df.sort_values(by='Time', ascending=False)
selected_clusters = df_sorted.loc[df_sorted['Labels'].isin(selected_labels)]

# Repeating iterator over the tab10 palette: one colour per cluster.
color_cycle = itertools.cycle(plt.cm.tab10.colors)

fig, ax = plt.subplots(figsize=(10, 6))

# One line per cluster, each paired with the next palette colour.
for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters.loc[selected_clusters['Labels'] == cluster_label]
    ax.plot(cluster_data['Time'], cluster_data['N_cell'],
            label=f'Cluster {cluster_label}', color=color)

ax.set_xlabel('Time')
ax.set_ylabel('N_cell')
ax.set_title('N_cell Over Time for Labels 0, 1, 2, and 3')
ax.legend()
ax.grid(True)

# Logarithmic y axis.
ax.set_yscale('log')

plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import itertools

# Sort the DataFrame by "Time" in descending order
df_sorted = N_cell_df.sort_values(by='Time', ascending=False)

# Filter the DataFrame to include only labels 0, 1, 2, and 3
selected_labels = [0, 1, 2, 3]
selected_clusters = df_sorted[df_sorted['Labels'].isin(selected_labels)]

# Create a color cycle for plotting
color_cycle = itertools.cycle(plt.cm.tab10.colors)

plt.figure(figsize=(10, 6))

# Iterate through the selected clusters and plot them with different colors
for cluster_label in selected_labels:
    # rolling() follows row order, so put each cluster in chronological order
    # first: the 5-point window then covers the current and the previous
    # points (a trailing average) instead of looking ahead in time.
    cluster_data = selected_clusters[selected_clusters['Labels'] == cluster_label].sort_values('Time')

    color = next(color_cycle)

    # Plot the original data
    plt.plot(cluster_data['Time'], cluster_data['N_cell'], label=f'Cluster {cluster_label}', color=color)

    # 5-point moving average (dashed, same colour); the first four points have
    # no full window and come out as NaN.
    moving_average = cluster_data['N_cell'].rolling(window=5).mean()
    plt.plot(cluster_data['Time'], moving_average, label=f'Moving Avg {cluster_label}', linestyle='--', color=color)

plt.xlabel('Time')
plt.ylabel('N_cell')
plt.title('N_cell Over Time for Labels 0, 1, 2, and 3')
plt.legend()
plt.grid(True)

# Set the y-axis to log scale
plt.yscale('log')

plt.show()


We can look for the closest achievable exponential function.

In [None]:
import pandas as pd
import itertools
import matplotlib.pyplot as plt

# Sort the DataFrame by "Time" in descending order
df_sorted = N_cell_df.sort_values(by='Time', ascending=False)

# Filter the DataFrame to include only labels 0, 1, 2, and 3
selected_labels = [0, 1, 2, 3]
selected_clusters = df_sorted[df_sorted['Labels'].isin(selected_labels)]

# Create a color cycle for plotting
color_cycle = itertools.cycle(plt.cm.tab10.colors)

# Number of consecutive time points averaged together.
window_size = 5

plt.figure(figsize=(10, 6))

# Iterate through the selected clusters and plot them with different colors
for cluster_label in selected_labels:
    # rolling() follows row order, so put each cluster in chronological order
    # first: the window then covers the current and the previous points
    # (a trailing average) instead of looking ahead in time.
    cluster_data = selected_clusters[selected_clusters['Labels'] == cluster_label].sort_values('Time')

    color = next(color_cycle)

    # The first window_size - 1 points have no full window and come out as NaN.
    moving_avg = cluster_data['N_cell'].rolling(window=window_size).mean()

    plt.plot(cluster_data['Time'], moving_avg, label=f'Cluster {cluster_label} (Moving Avg)', color=color)

plt.xlabel('Time')
plt.ylabel('N_cell')
plt.title('Moving Average of N_cell Over Time for Labels 0, 1, 2, and 3')
plt.legend()
plt.grid(True)
plt.show()


###### Mean

In [None]:
import numpy as np
from scipy.interpolate import CubicSpline
import pandas as pd

# Uses the DataFrame 'clustered_df' with columns 'Labels', 'Time' and the
# column named below; one cubic spline is fitted per label.
value_column = 'Mean'

# Get unique labels
unique_labels = clustered_df['Labels'].unique()

# Interpolated frames are collected here and concatenated once after the loop
# (cheaper than growing a DataFrame with concat on every iteration).
label_frames = []

# Loop through each unique label and apply the spline fitting
for label in unique_labels:
    # CubicSpline rejects NaN values and needs strictly increasing x, so drop
    # incomplete rows, keep one row per time point and order the rows by time.
    label_data = (
        clustered_df[clustered_df['Labels'] == label]
        .dropna(subset=['Time', value_column])
        .drop_duplicates(subset='Time')
        .sort_values('Time')
    )

    # Skip clusters with less than 2 usable points
    if len(label_data) < 2:
        continue

    # Extract Time and Value columns
    t_data = label_data['Time']
    w_data = label_data[value_column]

    # Create a cubic spline object
    spline = CubicSpline(t_data, w_data)

    # Evaluation grid over the observed time range; the number of points is
    # the integer part of the largest time value, as in the original cell.
    t_fine = np.linspace(min(t_data), max(t_data), int(max(t_data)))
    w_fine = spline(t_fine)

    # Create a new DataFrame for the current label's results
    label_results = pd.DataFrame({'Time': t_fine, value_column: w_fine})

    # Add a 'Labels' column with the current label value
    label_results['Labels'] = label

    label_frames.append(label_results)

# Stack the per-label results; stays an empty DataFrame when no label qualified.
Mean_df = pd.concat(label_frames) if label_frames else pd.DataFrame()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import itertools

# Overlay, for clusters 0-3, the resampled 'Mean' curve and its 5-sample rolling mean.

# Latest time first; the rolling window below therefore runs over rows in descending time order
df_sorted = Mean_df.sort_values(by='Time', ascending=False)

# Restrict the plot to the clusters of interest
selected_labels = [0, 1, 2, 3]
selected_clusters = df_sorted[df_sorted['Labels'].isin(selected_labels)]

# One tab10 colour per cluster, shared by the curve and its moving average
color_cycle = itertools.cycle(plt.cm.tab10.colors)

fig, ax = plt.subplots(figsize=(10, 6))

for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters[selected_clusters['Labels'] == cluster_label]
    ax.plot(cluster_data['Time'], cluster_data['Mean'], label=f'Cluster {cluster_label}', color=color)

    # Dashed line: rolling mean over 5 consecutive rows
    moving_average = cluster_data['Mean'].rolling(window=5).mean()
    ax.plot(cluster_data['Time'], moving_average, label=f'Moving Avg {cluster_label}', linestyle='--', color=color)

ax.set_xlabel('Time')
ax.set_ylabel('Mean')
ax.set_title('Mean Over Time for Labels 0, 1, 2, and 3')
ax.legend()
ax.grid(True)

plt.show()

###### IntDen

In [None]:
import numpy as np
from scipy.interpolate import CubicSpline
import pandas as pd

# Resample the 'IntDen' time series of every cluster on a finer time grid with a cubic spline.
# Expects 'clustered_df' to provide the columns 'Labels', 'Time' and 'IntDen'.

# Get unique labels
unique_labels = clustered_df['Labels'].unique()

# One resampled frame per label; they are concatenated once after the loop so the
# accumulated result is not re-copied on every iteration.
intden_frames = []

# Loop through each unique label and apply the spline fitting
for label in unique_labels:
    # Rows of the current label, ordered by time: CubicSpline requires strictly increasing x.
    # NOTE(review): repeated 'Time' values within a label would still be rejected by CubicSpline.
    label_data = clustered_df[clustered_df['Labels'] == label].sort_values('Time')

    # A spline needs at least 2 points
    if len(label_data) < 2:
        continue

    t_data = label_data['Time']
    w_data = label_data['IntDen']

    spline = CubicSpline(t_data, w_data)

    # NOTE(review): the number of samples is int(max(t_data)), i.e. it depends on the last
    # time value rather than on the length of the time span — confirm this is intended.
    t_fine = np.linspace(min(t_data), max(t_data), int(max(t_data)))
    w_fine = spline(t_fine)

    label_results = pd.DataFrame({'Time': t_fine, 'IntDen': w_fine})
    label_results['Labels'] = label
    intden_frames.append(label_results)

# Same layout as before: columns 'Time', 'IntDen', 'Labels'; empty frame when no label qualified
IntDen_df = pd.concat(intden_frames) if intden_frames else pd.DataFrame()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import itertools

# Overlay, for clusters 0-3, the resampled 'IntDen' curve and its 5-sample rolling mean.

# Latest time first; the rolling window below therefore runs over rows in descending time order
df_sorted = IntDen_df.sort_values(by='Time', ascending=False)

# Restrict the plot to the clusters of interest
selected_labels = [0, 1, 2, 3]
selected_clusters = df_sorted[df_sorted['Labels'].isin(selected_labels)]

# One tab10 colour per cluster, shared by the curve and its moving average
color_cycle = itertools.cycle(plt.cm.tab10.colors)

fig, ax = plt.subplots(figsize=(10, 6))

for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters[selected_clusters['Labels'] == cluster_label]
    ax.plot(cluster_data['Time'], cluster_data['IntDen'], label=f'Cluster {cluster_label}', color=color)

    # Dashed line: rolling mean over 5 consecutive rows
    moving_average = cluster_data['IntDen'].rolling(window=5).mean()
    ax.plot(cluster_data['Time'], moving_average, label=f'Moving Avg {cluster_label}', linestyle='--', color=color)

ax.set_xlabel('Time')
ax.set_ylabel('IntDen')
ax.set_title('IntDen Over Time for Labels 0, 1, 2, and 3')
ax.legend()
ax.grid(True)

plt.show()

###### StdDev

In [None]:
import numpy as np
from scipy.interpolate import CubicSpline
import pandas as pd

# Resample the 'StdDev' time series of every cluster on a finer time grid with a cubic spline.
# Expects 'clustered_df' to provide the columns 'Labels', 'Time' and 'StdDev'.

# Get unique labels
unique_labels = clustered_df['Labels'].unique()

# One resampled frame per label; they are concatenated once after the loop so the
# accumulated result is not re-copied on every iteration.
stddev_frames = []

# Loop through each unique label and apply the spline fitting
for label in unique_labels:
    # Rows of the current label, ordered by time: CubicSpline requires strictly increasing x.
    # NOTE(review): repeated 'Time' values within a label would still be rejected by CubicSpline.
    label_data = clustered_df[clustered_df['Labels'] == label].sort_values('Time')

    # A spline needs at least 2 points
    if len(label_data) < 2:
        continue

    t_data = label_data['Time']
    w_data = label_data['StdDev']

    spline = CubicSpline(t_data, w_data)

    # NOTE(review): the number of samples is int(max(t_data)), i.e. it depends on the last
    # time value rather than on the length of the time span — confirm this is intended.
    t_fine = np.linspace(min(t_data), max(t_data), int(max(t_data)))
    w_fine = spline(t_fine)

    label_results = pd.DataFrame({'Time': t_fine, 'StdDev': w_fine})
    label_results['Labels'] = label
    stddev_frames.append(label_results)

# Same layout as before: columns 'Time', 'StdDev', 'Labels'; empty frame when no label qualified
StdDev_df = pd.concat(stddev_frames) if stddev_frames else pd.DataFrame()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import itertools

# Overlay, for clusters 0-3, the resampled 'StdDev' curve and its 5-sample rolling mean.

# Latest time first; the rolling window below therefore runs over rows in descending time order
df_sorted = StdDev_df.sort_values(by='Time', ascending=False)

# Restrict the plot to the clusters of interest
selected_labels = [0, 1, 2, 3]
selected_clusters = df_sorted[df_sorted['Labels'].isin(selected_labels)]

# One tab10 colour per cluster, shared by the curve and its moving average
color_cycle = itertools.cycle(plt.cm.tab10.colors)

fig, ax = plt.subplots(figsize=(10, 6))

for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters[selected_clusters['Labels'] == cluster_label]
    ax.plot(cluster_data['Time'], cluster_data['StdDev'], label=f'Cluster {cluster_label}', color=color)

    # Dashed line: rolling mean over 5 consecutive rows
    moving_average = cluster_data['StdDev'].rolling(window=5).mean()
    ax.plot(cluster_data['Time'], moving_average, label=f'Moving Avg {cluster_label}', linestyle='--', color=color)

ax.set_xlabel('Time')
ax.set_ylabel('StdDev')
ax.set_title('StdDev Over Time for Labels 0, 1, 2, and 3')
ax.legend()
ax.grid(True)

plt.show()

###### Area

In [None]:
import numpy as np
from scipy.interpolate import CubicSpline
import pandas as pd

# Resample the 'Area' time series of every cluster on a finer time grid with a cubic spline.
# Expects 'clustered_df' to provide the columns 'Labels', 'Time' and 'Area'.

# Get unique labels
unique_labels = clustered_df['Labels'].unique()

# One resampled frame per label; they are concatenated once after the loop so the
# accumulated result is not re-copied on every iteration.
area_frames = []

# Loop through each unique label and apply the spline fitting
for label in unique_labels:
    # Rows of the current label, ordered by time: CubicSpline requires strictly increasing x.
    # NOTE(review): repeated 'Time' values within a label would still be rejected by CubicSpline.
    label_data = clustered_df[clustered_df['Labels'] == label].sort_values('Time')

    # A spline needs at least 2 points
    if len(label_data) < 2:
        continue

    t_data = label_data['Time']
    w_data = label_data['Area']

    spline = CubicSpline(t_data, w_data)

    # NOTE(review): the number of samples is int(max(t_data)), i.e. it depends on the last
    # time value rather than on the length of the time span — confirm this is intended.
    t_fine = np.linspace(min(t_data), max(t_data), int(max(t_data)))
    w_fine = spline(t_fine)

    label_results = pd.DataFrame({'Time': t_fine, 'Area': w_fine})
    label_results['Labels'] = label
    area_frames.append(label_results)

# Same layout as before: columns 'Time', 'Area', 'Labels'; empty frame when no label qualified
Area_df = pd.concat(area_frames) if area_frames else pd.DataFrame()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import itertools

# Overlay, for clusters 0-3, the resampled 'Area' curve and its 5-sample rolling mean.

# Latest time first; the rolling window below therefore runs over rows in descending time order
df_sorted = Area_df.sort_values(by='Time', ascending=False)

# Restrict the plot to the clusters of interest
selected_labels = [0, 1, 2, 3]
selected_clusters = df_sorted[df_sorted['Labels'].isin(selected_labels)]

# One tab10 colour per cluster, shared by the curve and its moving average
color_cycle = itertools.cycle(plt.cm.tab10.colors)

fig, ax = plt.subplots(figsize=(10, 6))

for cluster_label, color in zip(selected_labels, color_cycle):
    cluster_data = selected_clusters[selected_clusters['Labels'] == cluster_label]
    ax.plot(cluster_data['Time'], cluster_data['Area'], label=f'Cluster {cluster_label}', color=color)

    # Dashed line: rolling mean over 5 consecutive rows
    moving_average = cluster_data['Area'].rolling(window=5).mean()
    ax.plot(cluster_data['Time'], moving_average, label=f'Moving Avg {cluster_label}', linestyle='--', color=color)

ax.set_xlabel('Time')
ax.set_ylabel('Area')
ax.set_title('Area Over Time for Labels 0, 1, 2, and 3')
ax.legend()
ax.grid(True)

plt.show()

##### Analysis of a particular cluster in space

###### Functions used

In [None]:
import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np

def plot_distance_vs_cell_location(data, window_size=2):
    """Plot how many cells lie within a given distance of the barycenter, and the derivative of that count.

    The barycenter is the mean of the 'X' and 'Y' columns. Cells are ordered by their
    Euclidean distance to it; the left panel shows the cumulative cell count against
    distance, the right panel its numerical derivative (np.gradient).

    Args:
        data: DataFrame with numeric 'X' and 'Y' columns, one row per cell. It is not modified.
        window_size: Unused; kept so that existing calls keep working.

    Returns:
        None. The figure is saved as 'cumulative_and_derivative_plot.png' in the current
        working directory and then shown.
    """
    # Vectorised Euclidean distance of every cell to the barycenter (centroid)
    distance = np.hypot(data['X'] - data['X'].mean(), data['Y'] - data['Y'].mean())

    # assign() returns a new frame, so the caller's data is left untouched
    df = data.assign(Distance=distance).sort_values(by='Distance')

    # After sorting, the k-th row is the k-th closest cell: its rank is the cumulative count
    df['Cumulative_Count'] = np.arange(1, len(df) + 1)

    # Change of the cumulative count per unit distance.
    # NOTE(review): cells at exactly the same distance give a zero spacing, for which
    # np.gradient produces inf/nan values.
    df['Derivative'] = np.gradient(df['Cumulative_Count'].to_numpy(), df['Distance'].to_numpy())

    # Two panels side by side: cumulative distribution and its derivative
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    ax1.plot(df['Distance'], df['Cumulative_Count'], marker='o', linestyle='-', color='b')
    ax1.set_xlabel('Distance from Barycenter')
    ax1.set_ylabel('Number of Cells (Cumulative)')
    ax1.set_title('Cumulative Distribution of Cell Distances from Barycenter')

    ax2.plot(df['Distance'], df['Derivative'], marker='o', linestyle='-', color='r')
    ax2.set_xlabel('Distance from Barycenter')
    ax2.set_ylabel('Derivative (Change in Cumulative Count)')
    ax2.set_title('Derivative of Cumulative Distribution')

    plt.savefig('cumulative_and_derivative_plot.png')
    plt.show()

# Example usage:
# 'data' only needs the cell coordinates in columns 'X' and 'Y'; the distance,
# cumulative count and derivative are computed inside the function.
# plot_distance_vs_cell_location(data)


In [None]:
import matplotlib.pyplot as plt
import math

def plot_distance_vs_value(data):
    """Scatter-plot 'IntDen' against each cell's distance to the barycenter.

    Args:
        data: DataFrame with numeric 'X', 'Y' and 'IntDen' columns. It is not modified.

    Returns:
        None. The figure is shown with plt.show().
    """
    # Barycenter (centroid) = mean position; the distance is computed for all rows at once
    # instead of row by row.
    distance = np.hypot(data['X'] - data['X'].mean(), data['Y'] - data['Y'].mean())

    plt.scatter(distance, data['IntDen'], alpha=0.7)
    plt.xlabel('Distance to Barycenter')
    plt.ylabel('IntDen')
    plt.title('IntDen vs. Distance to Barycenter')
    plt.grid(True)

    plt.show()

# Example usage:
# plot_distance_vs_value(your_data)

In [None]:
import matplotlib.pyplot as plt
import math

def plot_distance_vs_value_with_histogram(data, bins=10):
    """Plot a histogram of the cells' distances to the barycenter.

    Despite the name, only the distance distribution is drawn; no value column is used.

    Args:
        data: DataFrame with numeric 'X' and 'Y' columns. It is not modified.
        bins: Passed to plt.hist as the `bins` argument.

    Returns:
        None. The figure is shown with plt.show().
    """
    # Barycenter (centroid) = mean position; the distance is computed for all rows at once
    # instead of row by row.
    distance = np.hypot(data['X'] - data['X'].mean(), data['Y'] - data['Y'].mean())

    plt.hist(distance, bins=bins, edgecolor='k', alpha=0.7)
    plt.xlabel('Distance to Barycenter')
    plt.ylabel('Frequency')
    plt.title('Distance to Barycenter Histogram')
    plt.grid(True)

    plt.show()


In [None]:
import matplotlib.pyplot as plt
import math

def plot_moving_average(data, window_size=3500):
    """Plot the moving average of 'IntDen' over cells ordered by distance to the barycenter.

    Args:
        data: DataFrame with numeric 'X', 'Y' and 'IntDen' columns. It is not modified.
        window_size: Number of consecutive cells (in order of increasing distance) averaged
            per point. The first window_size - 1 points have no value, so nothing is drawn
            when the frame has fewer rows than window_size.

    Returns:
        None. The figure is shown with plt.show().
    """
    # Barycenter (centroid) = mean position; the distance is computed for all rows at once
    distance = np.hypot(data['X'] - data['X'].mean(), data['Y'] - data['Y'].mean())

    # Order the cells by distance so the rolling window runs outwards from the barycenter
    plot_df = data.assign(Distance=distance)[['Distance', 'IntDen']].sort_values(by='Distance')

    # Rolling mean of 'IntDen' over window_size consecutive cells
    moving_average = plot_df['IntDen'].rolling(window=window_size).mean()

    plt.plot(plot_df['Distance'], moving_average, linestyle='--', label='Moving Average')
    plt.xlabel('Distance to Barycenter')
    plt.ylabel('Moving Average of IntDen')
    plt.title(f'Moving Average of IntDen vs. Distance to Barycenter (Window Size {window_size})')
    plt.grid(True)
    plt.legend()

    plt.show()

In [None]:
def plot_values_based_on_the_area_in_space(data, window_size=3500):
    """Unfinished draft: build the distance-sorted ('Distance', 'IntDen') table and stop.

    Nothing is plotted or returned, and `window_size` is not used. A function with the
    same name is defined again in the following cell and replaces this one once that
    cell has run.

    Args:
        data: DataFrame with numeric 'X', 'Y' and 'IntDen' columns. It is not modified.
        window_size: Unused.

    Returns:
        None.
    """
    work = data.copy()

    # Barycenter (centroid) of all cells
    centre_x, centre_y = work['X'].mean(), work['Y'].mean()

    def _distance_to_centre(row):
        return math.sqrt((row['X'] - centre_x)**2 + (row['Y'] - centre_y)**2)

    # Euclidean distance of every cell to the barycenter
    work['Distance'] = work.apply(_distance_to_centre, axis=1)

    # Distance-sorted table the planned binning would start from
    plot_df = work[['Distance', 'IntDen']].sort_values(by='Distance')

    # Planned next steps (not implemented here):
    # 1. Label every point by its distance using a ring width d_value: a distance
    #    x < d_value gets label 1, d_value < x < 2*d_value gets label 2, and so on.
    # 2. Build a new DataFrame with three columns: the sum of IntDen over the rows sharing
    #    a label, the area of the corresponding region, written in the original note as
    #    pi*(label**2 - (label - 2)**2), and the ratio between the two.
    #    NOTE(review): the definition in the following cell uses (label - 1) instead of (label - 2).

In [None]:
import math
import matplotlib.pyplot as plt

def plot_values_based_on_the_area_in_space(data, d_value=7.5):
    """Plot the summed 'IntDen' per concentric ring around the barycenter, divided by the ring area.

    Cells are binned into rings of width `d_value` around the barycenter (mean 'X', mean 'Y').
    Ring k (k = 1, 2, ...) holds the cells with (k - 1) * d_value < distance <= k * d_value;
    a cell lying exactly on the barycenter is counted in ring 1.

    Args:
        data: DataFrame with numeric 'X', 'Y' and 'IntDen' columns. It is not modified;
            the computed distances are available in the returned frame.
        d_value: Ring width, in the same units as 'X' and 'Y'.

    Returns:
        DataFrame sorted by distance with the columns 'Distance', 'IntDen', 'Label'
        (ring number), 'New_Distance' (mid-radius of the ring), 'Area', 'Sum_IntDen'
        (total IntDen of the ring) and 'Ratio' (Sum_IntDen / Area).
    """
    # Vectorised Euclidean distance of every cell to the barycenter (centroid)
    distance = np.hypot(data['X'] - data['X'].mean(), data['Y'] - data['Y'].mean())

    # assign() works on a copy, so no 'Distance' column leaks into the caller's frame
    plot_df = data.assign(Distance=distance)[['Distance', 'IntDen']].sort_values('Distance')

    # Ring number; clipped at 1 because a distance of exactly 0 would otherwise fall into
    # a "ring 0" with negative area and negative mid-radius.
    plot_df['Label'] = np.ceil(plot_df['Distance'] / d_value).astype(int).clip(lower=1)
    plot_df['New_Distance'] = plot_df['Label']*d_value - d_value / 2

    # NOTE(review): the area is expressed in units of d_value**2 (ring radii are taken as
    # Label and Label - 1, not Label * d_value); this only rescales 'Ratio' by a constant.
    plot_df['Area'] = math.pi * (plot_df['Label']**2 - (plot_df['Label'] - 1)**2)
    plot_df['Sum_IntDen'] = plot_df.groupby('Label')['IntDen'].transform('sum')
    plot_df['Ratio'] = plot_df['Sum_IntDen'] / plot_df['Area']

    plt.plot(plot_df['New_Distance'], plot_df['Ratio'])
    plt.xlabel('New Distance')
    plt.ylabel('Ratio')
    plt.title('Ratio vs New Distance')
    plt.show()

    return plot_df

# Example usage:
# plot_values_based_on_the_area_in_space(selected_df)

In [None]:
import os
import re
import cv2
from google.colab import drive

def natural_sort_key(s):
    """Return a sort key that orders embedded numbers numerically ('f2' before 'f10').

    The string is split into alternating text and digit runs; text runs are lower-cased
    and digit runs are converted to int, e.g. 'Frame10.png' -> ['frame', 10, '.png'].

    Args:
        s: The string (here a file path) to build the key for.

    Returns:
        list alternating str and int elements, starting and ending with a str.
    """
    # re.split with a single capturing group returns text and matched digit runs in
    # alternation, so the odd positions are exactly the r'\d+' matches. Using the position
    # instead of str.isdigit() avoids int() failing on characters such as a superscript
    # two, which isdigit() accepts but r'\d' never matches.
    return [int(token) if position % 2 else token.lower()
            for position, token in enumerate(re.split(r'(\d+)', s))]

def create_video_from_pngs(folder_path, output_video_path, frame_size=(1920, 1080), frame_rate=10):
    """
    Create a video from PNG files in a Google Drive folder.

    The PNG files are written in natural filename order (see natural_sort_key), each
    resized to `frame_size`. Files that OpenCV cannot read are skipped with a message.
    After writing, Google Drive is flushed and unmounted, so later cells that read from
    the Drive mount need it to be mounted again.

    Args:
        folder_path (str): The path to the folder containing PNG files.
        output_video_path (str): The path where the output video will be saved.
        frame_size (tuple): The frame size of the output video (width, height).
        frame_rate (int): The frame rate of the output video.

    Returns:
        None

    Raises:
        RuntimeError: If the video writer cannot be opened for `output_video_path`.
    """
    # Sort the PNG files by their filenames using natural sorting
    png_files = sorted(
        (os.path.join(folder_path, filename)
         for filename in os.listdir(folder_path)
         if filename.endswith('.png')),
        key=natural_sort_key,
    )

    # Initialize the video writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # You can use other codecs like 'XVID' or 'MJPG'
    out = cv2.VideoWriter(output_video_path, fourcc, frame_rate, frame_size)
    if not out.isOpened():
        # Without this check every write would be silently dropped and success reported anyway
        out.release()
        raise RuntimeError(f"Could not open video writer for '{output_video_path}'.")

    try:
        for png_file in png_files:
            frame = cv2.imread(png_file)
            if frame is None:
                # cv2.imread returns None instead of raising when a file cannot be decoded
                print(f"Skipping unreadable image '{png_file}'.")
                continue
            out.write(cv2.resize(frame, frame_size))
    finally:
        # Release the writer even if a frame fails, so the output file is closed
        out.release()

    # Flush pending writes to Google Drive (this also unmounts it)
    drive.flush_and_unmount()

    print(f"Video '{output_video_path}' created successfully.")

In [None]:
import pandas as pd
import math
import matplotlib.pyplot as plt

def plot_distance_vs_moving_average_with_labels(data, labels=[0, 1, 2, 3], window_size=2, interruption_point=None):
    """Plot the moving average of 'IntDen' against the distance to each cell's own cluster barycenter.

    Rows of the selected clusters are pooled. Every row's distance is measured from the
    barycenter (mean 'X', mean 'Y') of the cluster it belongs to, the pooled rows are
    ordered by that distance and 'IntDen' is averaged over a rolling window.

    Args:
        data: DataFrame with 'X', 'Y', 'IntDen' and 'Labels' columns. It is not modified.
        labels: Cluster labels to include.
        window_size: Number of consecutive rows (by increasing distance) per average.
        interruption_point: If given, only rows with a distance strictly below it are kept.

    Returns:
        None. The figure is shown with plt.show().
    """
    # Keep the requested clusters; .copy() so adding a column never touches the caller's frame
    df = data[data['Labels'].isin(labels)].copy()

    # Per-row barycenter of the row's own cluster, then the vectorised Euclidean distance
    centroids = df.groupby('Labels')[['X', 'Y']].transform('mean')
    df['Distance'] = np.hypot(df['X'] - centroids['X'], df['Y'] - centroids['Y'])

    if interruption_point is not None:
        # Drop everything at or beyond the cut-off distance
        df = df[df['Distance'] < interruption_point]

    # Order by distance so the rolling window runs outwards from the barycenters
    plot_df = df[['Distance', 'IntDen']].sort_values(by='Distance')
    moving_average = plot_df['IntDen'].rolling(window=window_size).mean()

    # Plot only the moving average
    plt.plot(plot_df['Distance'], moving_average, linestyle='--', label='Moving Average')
    plt.xlabel('Distance to Barycenter')
    plt.ylabel('Moving Average of IntDen')
    plt.title('Moving Average of IntDen vs. Distance to Barycenter')
    plt.grid(True)
    plt.legend()

    plt.show()

# Example usage:
# Assuming you have a DataFrame 'df' with columns 'X', 'Y', 'IntDen', and 'Labels'
# plot_distance_vs_moving_average_with_labels(df, labels=[0, 1, 2, 3], window_size=2, interruption_point=10)

In [None]:
import pandas as pd
import math

def calculate_correlation(data, labels=[0, 1, 2, 3], interruption_point=None):
    """Correlate 'IntDen' with the distance to the cluster barycenter, pooled over clusters.

    Every row's distance is measured from the barycenter (mean 'X', mean 'Y') of the
    cluster it belongs to; the rows of all selected clusters are then pooled into a
    single correlation.

    Args:
        data: DataFrame with 'X', 'Y', 'IntDen' and 'Labels' columns. It is not modified.
        labels: Cluster labels to include.
        interruption_point: If given, only rows with a distance strictly below it are used.

    Returns:
        The correlation between 'Distance' and 'IntDen' as computed by Series.corr;
        NaN when no row matches `labels`.
    """
    # Keep the requested clusters; .copy() so adding a column never touches the caller's frame
    df = data[data['Labels'].isin(labels)].copy()

    if df.empty:
        # Nothing to correlate (previously this case failed while adding the distance column)
        return float('nan')

    # Per-row barycenter of the row's own cluster, then the vectorised Euclidean distance
    centroids = df.groupby('Labels')[['X', 'Y']].transform('mean')
    df['Distance'] = np.hypot(df['X'] - centroids['X'], df['Y'] - centroids['Y'])

    if interruption_point is not None:
        # Drop everything at or beyond the cut-off distance
        df = df[df['Distance'] < interruption_point]

    return df['Distance'].corr(df['IntDen'])

# Example usage:
# Assuming you have a DataFrame 'df' with columns 'X', 'Y', 'IntDen', and 'Labels'

# Example usage:
# Assuming you have a DataFrame 'df' with columns 'X', 'Y', 'IntDen', and 'Labels'
# correlation = calculate_correlation(df, labels=[0, 1, 2, 3], interruption_point=10)
# print("Correlation between Distance and IntDen:", correlation)


In [None]:
def calculate_correlation_single(data, labels=[0, 1, 2, 3], interruption_point=None):
    """Correlate 'IntDen' with the distance to the barycenter, separately for each cluster.

    Args:
        data: DataFrame with 'X', 'Y', 'IntDen' and 'Labels' columns. It is not modified.
        labels: Cluster labels to evaluate.
        interruption_point: If given, only rows with a distance strictly below it are used.

    Returns:
        dict mapping each label that has at least one row to the correlation between
        distance and 'IntDen' (as computed by Series.corr). Labels without rows are omitted.
    """
    correlations = {}

    for label in labels:
        label_df = data[data['Labels'] == label]

        if label_df.empty:
            continue  # Skip labels with no data

        # Vectorised distance to this cluster's barycenter, kept as a standalone Series so
        # no column is written into a slice of the input frame.
        distance = np.hypot(label_df['X'] - label_df['X'].mean(),
                            label_df['Y'] - label_df['Y'].mean())
        values = label_df['IntDen']

        if interruption_point is not None:
            # Drop everything at or beyond the cut-off distance
            within = distance < interruption_point
            distance, values = distance[within], values[within]

        correlations[label] = distance.corr(values)

    return correlations


In [None]:
def calculate_correlation_single_Area(data, labels=[0, 1, 2, 3], interruption_point=None):
    """Correlate 'Area' with the distance to the barycenter, separately for each cluster.

    Args:
        data: DataFrame with 'X', 'Y', 'Area' and 'Labels' columns. It is not modified.
        labels: Cluster labels to evaluate.
        interruption_point: If given, only rows with a distance strictly below it are used.

    Returns:
        dict mapping each label that has at least one row to the correlation between
        distance and 'Area' (as computed by Series.corr). Labels without rows are omitted.
    """
    correlations = {}

    for label in labels:
        label_df = data[data['Labels'] == label]

        if label_df.empty:
            continue  # Skip labels with no data

        # Vectorised distance to this cluster's barycenter, kept as a standalone Series so
        # no column is written into a slice of the input frame.
        distance = np.hypot(label_df['X'] - label_df['X'].mean(),
                            label_df['Y'] - label_df['Y'].mean())
        areas = label_df['Area']

        if interruption_point is not None:
            # Drop everything at or beyond the cut-off distance
            within = distance < interruption_point
            distance, areas = distance[within], areas[within]

        # Correlation between distance and 'Area' for the current label
        correlations[label] = distance.corr(areas)

    return correlations

###### Process

In [None]:
from scipy import stats
import matplotlib.pyplot as plt

# Probability plots of IntDen (all cells) against two reference distributions
data = df_all_new_updated['IntDen']

plt.figure(figsize=(12, 6))

# (position in a 2x3 subplot grid, scipy distribution name, panel title)
probplot_panels = [
    (231, "norm", "Normal Distribution"),
    (232, "expon", "Exponential Distribution"),
]
for panel_position, dist_name, panel_title in probplot_panels:
    plt.subplot(panel_position)
    stats.probplot(data, dist=dist_name, plot=plt)
    plt.title(panel_title)

plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Distribution of IntDen for 'points_cluster_0_time_5': 20-bin histogram with a kernel
# density estimate overlaid (kde=True).
ax = sns.histplot(points_cluster_0_time_5['IntDen'], kde=True, bins=20, color='blue')
ax.set_xlabel('IntDen')
ax.set_ylabel('Frequency')
ax.set_title('Gaussian Distribution of IntDen')
ax.grid(True)
plt.show()


In [None]:
plot_distance_vs_value_with_moving_average(points_cluster_0_time_5, 3)

In [None]:
plot_distance_vs_value(selected_df)

In [None]:
plot_moving_average(selected_df)

In [None]:
folder_id = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/clusters/N=0123'
output_video_path = '/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/clusters/N=0123/video.mp4'
create_video_from_pngs(folder_id,output_video_path)

In [None]:
visual = plot_values_based_on_the_area_in_space(selected_df)

In [None]:
from scipy import stats
import matplotlib.pyplot as plt

# Probability plots of the IntDen values returned in 'visual' against two reference distributions
data = visual['IntDen']

plt.figure(figsize=(12, 6))

# (position in a 2x3 subplot grid, scipy distribution name, panel title)
probplot_panels = [
    (231, "norm", "Normal Distribution"),
    (232, "expon", "Exponential Distribution"),
]
for panel_position, dist_name, panel_title in probplot_panels:
    plt.subplot(panel_position)
    stats.probplot(data, dist=dist_name, plot=plt)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# NOTE(review): within this section plot_distance_histogram_with_custom_ticks is only
# defined in a later cell; unless it is also defined earlier in the notebook, this call
# fails on a fresh kernel run from top to bottom — confirm and move the definition up.
f0 = plot_distance_histogram_with_custom_ticks(df_distance_C0_T5)

In [None]:
plot_distance_vs_cell_location(df_distance_C0_T5)

In [None]:
# NOTE(review): the only visible assignment of f1 is in the following cell; on a fresh
# kernel run from top to bottom this display presumably raises NameError — confirm.
f1

In [None]:
f1 = plot_distance_histogram_with_custom_ticks(df_distance_C1_T5)

In [None]:
plot_distance_vs_cell_location(df_distance_C1_T5)

In [None]:
plot_distance_vs_cell_location(df_distance_C2_T5)

In [None]:
# NOTE(review): the only visible assignment of f2 is in the following cell; on a fresh
# kernel run from top to bottom this display presumably raises NameError — confirm.
f2

In [None]:
f2 = plot_distance_histogram_with_custom_ticks(df_distance_C2_T5)

In [None]:
df_see = plot_distance_vs_cell_location(df_distance_C3_T5)

In [None]:
plot_distance_histogram_with_custom_ticks(df_distance_C3_T5)

In [None]:
first_d = plot_distance_histogram_with_custom_ticks(df_distance_C3_T5)

In [None]:
first_d

In [None]:
import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np

def plot_distance_histogram_with_custom_ticks(data, bin_width=7.5):
    """Histogram of cell distances to the barycenter; report where the first empty bin starts.

    The number of bins is the largest distance divided by `bin_width` (rounded down, at
    least 1), so each bin is roughly `bin_width` wide.

    Args:
        data: DataFrame with numeric 'X' and 'Y' columns. It is not modified.
        bin_width: Approximate width of one histogram bin, in the units of 'X' and 'Y'.

    Returns:
        The left edge of the first bin that contains no cells, or None when every bin
        is populated.
    """
    # Vectorised Euclidean distance of every cell to the barycenter (centroid)
    distance = np.hypot(data['X'] - data['X'].mean(), data['Y'] - data['Y'].mean())

    # At least one bin: a cluster narrower than bin_width would otherwise request bins=0,
    # which plt.hist rejects.
    n_bins = max(1, int(max(distance) / bin_width))

    plt.figure(figsize=(8, 6))
    n, bins, patches = plt.hist(distance, bins=n_bins, color='b', alpha=0.7)
    plt.xlabel('Distance from Barycenter')
    plt.ylabel('Frequency')
    plt.title('Histogram of Cell Distances from Barycenter')

    # Left edge of the first (numerically) empty bin, if there is one
    first_zero_bin = next((bins[i] for i, count in enumerate(n) if count <= 1e-6), None)

    plt.show()
    return first_zero_bin


In [None]:
plot_distance_vs_moving_average(points_cluster_0_time_5, 20)

Study the same cluster (label 0) at other time points

In [None]:
points_cluster_0_time_25 = df_all_new_updated[(df_all_new_updated['Time'] == 25) & (df_all_new_updated['Labels'] == 0)]

In [None]:
plot_distance_vs_moving_average(points_cluster_0_time_25, 20)

In [None]:
points_cluster_0_time_50 = df_all_new_updated[(df_all_new_updated['Time'] == 50) & (df_all_new_updated['Labels'] == 0)]

In [None]:
plot_distance_vs_moving_average(points_cluster_0_time_50, 20)

In [None]:
points_cluster_0_time_75 = df_all_new_updated[(df_all_new_updated['Time'] == 75) & (df_all_new_updated['Labels'] == 0)]

In [None]:
plot_distance_vs_moving_average(points_cluster_0_time_75, 20)

In [None]:
points_cluster_0_time_100 = df_all_new_updated[(df_all_new_updated['Time'] == 100) & (df_all_new_updated['Labels'] == 0)]

In [None]:
plot_distance_vs_moving_average(points_cluster_0_time_100, 20)

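Instead of looking at one time point at a time, all the values can be pooled: each cell's distance is measured from the barycenter of its own cluster, and the moving average is computed over all points together. Pooling several sets of points covers the range of distances more densely.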

In [None]:
plot_distance_vs_moving_average_with_labels(df_all_new_updated, window_size = 20)

In [None]:
plot_distance_vs_moving_average_with_labels(df_all_new_updated, window_size = 20, interruption_point = 1000)

The same analysis restricted to one cluster at a time

In [None]:
plot_distance_vs_moving_average_with_labels(df_all_new_updated, labels = [0], window_size = 20)

In [None]:
plot_distance_vs_moving_average_with_labels(df_all_new_updated, labels = [1], window_size = 20)

In [None]:
plot_distance_vs_moving_average_with_labels(df_all_new_updated, labels = [2], window_size = 20)

In [None]:
plot_distance_vs_moving_average_with_labels(df_all_new_updated, labels = [3], window_size = 20)

After this analysis on the intensity, it may be useful to understand how the values are correlated. We can study the correlation considering single clusters or all the values taken together

###### Correlation with IntDen

In [None]:
correlation_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['Correlation'])

In [None]:
correlation_df.loc['Total'] = calculate_correlation(df_all_new_updated)

In [None]:
# Heat map of the distance/IntDen correlations. correlation_df has one row per cluster
# label (plus 'Total') and a single 'Correlation' column, so the labels run along the
# y axis and the correlation column along the x axis.
sns.heatmap(correlation_df, annot=True, cmap='coolwarm', fmt=".2f")
plt.xlabel('Correlations')
plt.ylabel('Labels')
plt.title('Correlation Heat Map')
plt.show()

Another useful question is whether a cell's area is related to its distance from the barycenter, i.e. whether any correlation between the area and the distance can be identified.

In [None]:
area_correlations

##### Additional Analysis

In [None]:
import matplotlib.pyplot as plt

# Histogram (10 bins) of IntDen over all rows of df_all_new_updated
data = df_all_new_updated['IntDen']

fig, ax = plt.subplots()
ax.hist(data, bins=10, color='blue', edgecolor='black')

ax.set_xlabel('IntDen')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of IntDen')

plt.show()


In [None]:
# Measurement columns used for the multivariate analysis below.
# NOTE(review): 'Mean' is listed twice, so the resulting frames carry a duplicated
# 'Mean' column — presumably one entry was meant to be another measurement; confirm.
measurement_columns = ['Area', 'Geo_light_intensity', 'IntDen', 'Mean', 'N_cell', 'Mean']

# Views on cluster_new_updated with and without the cluster label
selected_columns = cluster_new_updated[measurement_columns + ['Labels']]
selected_columns2 = cluster_new_updated[measurement_columns]

# Independent copies used by the following cells
clustered_df = selected_columns.copy()
measurement_data = selected_columns2.copy()

In [None]:
# One sub-frame per cluster label 0-3
clustered_df_L0, clustered_df_L1, clustered_df_L2, clustered_df_L3 = (
    clustered_df[clustered_df['Labels'] == wanted_label] for wanted_label in range(4)
)

In [None]:
from sklearn.decomposition import PCA

# PCA with all components on the rows of cluster 0. Every column of clustered_df_L0 is
# used as a feature, including 'Labels' (constant within this subset).
# 'pca' and 'luminescence_pca' are assigned again by the next cell.
pca = PCA()
luminescence_pca = pca.fit_transform(clustered_df_L0)

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# PCA on the full clustered_df (all clusters, every column including 'Labels');
# this replaces the 'pca' / 'luminescence_pca' fitted on clustered_df_L0 above.
# NOTE(review): the columns are not standardised before the PCA — confirm this is intended.
pca = PCA()
luminescence_pca = pca.fit_transform(clustered_df)

# Scatter of the first two principal components
fig, ax = plt.subplots(figsize=(8, 6))  # Adjust the figure size as needed
ax.scatter(luminescence_pca[:, 0], luminescence_pca[:, 1], alpha=0.5)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA Scatter Plot')

plt.show()


In [None]:
pip install scanpy

In [None]:
import scanpy as sc
# Perform PCA with optional data normalization
import scanpy as sc
import pandas as pd

# Wrap the raw values of measurement_data in an AnnData object
# (rows become observations, columns become variables).
adata = sc.AnnData(X=measurement_data.values)
# Optional normalisation step, currently disabled:
#sc.pp.normalize_total(adata, target_sum=1e4)

# Run scanpy's PCA on the (un-normalised) matrix
sc.tl.pca(adata)

# Plot the PCA embedding
sc.pl.pca(adata)



In [None]:
# Rebuild `adata` from measurement_data (same statements as the previous cell,
# without the commented-out normalisation line).
adata = sc.AnnData(X=measurement_data.values)

# Run scanpy's PCA on the matrix
sc.tl.pca(adata)

# Plot the PCA embedding
sc.pl.pca(adata)


In [None]:
!pip install leidenalg

In [None]:
import scanpy as sc

# Uses the `adata` object built in the PCA cells above.

# Build the neighbourhood graph from the first 6 principal components.
# NOTE(review): n_pcs=6 equals the number of columns selected into measurement_data;
# confirm that the PCA step actually produced that many components.
sc.pp.neighbors(adata, n_pcs=6)

# Cluster the graph with the Leiden algorithm; `resolution` is the tunable parameter
sc.tl.leiden(adata, resolution=1.0)  # You can adjust the resolution parameter

# Compute the UMAP embedding
sc.tl.umap(adata)

# Plot the UMAP embedding coloured by the 'leiden' cluster assignment
sc.pl.umap(adata, color='leiden')


In [None]:
pip install pysal

###### Spatial Statistics

In [None]:
import geopandas as gpd
from shapely.geometry import Point
import libpysal
import esda
import matplotlib.pyplot as plt

# Point locations and the IntDen value attached to each of them
selected_columns_points = df_all_new_updated[['X', 'Y', 'IntDen']]

# One shapely Point per row, wrapped together with the columns in a GeoDataFrame
geometry = [Point(x, y) for x, y in zip(selected_columns_points['X'], selected_columns_points['Y'])]
gdf = gpd.GeoDataFrame(selected_columns_points, geometry=geometry)

# Spatial weights matrix (W) built with Queen contiguity.
# NOTE(review): Queen contiguity is documented for polygon geometries, while gdf
# holds Points - confirm the resulting weights are meaningful (the Geary cells
# further down use KNN weights instead).
w = libpysal.weights.Queen.from_dataframe(gdf)

# Variable tested for spatial autocorrelation
y = selected_columns_points['IntDen']

# Global Moran's I of IntDen under the weights w
moran = esda.Moran(y, w)

# Keep the attributes I, EI and p_sim of the result for later inspection
moran_I = moran.I
moran_EI = moran.EI
moran_p_value = moran.p_sim



In [None]:

import matplotlib.pyplot as plt
from splot.esda import plot_moran

# Draw splot's Moran plot for the `moran` result computed in the previous cell,
# with standardised values (zstandard=True)
plot_moran(moran, zstandard=True, figsize=(10, 8))
plt.show()

In [None]:
!pip install pykrige

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import distance

# Sample locations and the IntDen value measured at each of them (taken from df1)
n_points = len(df1)
X = df1['X']  # X coordinates
Y = df1['Y']  # Y coordinates
temperature = df1['IntDen'].values  # IntDen values (variable name kept from the original template)

# Prediction grid: 150 x 150 nodes spanning the bounding box of the samples, so the
# interpolated surface covers the same region as the scatter drawn on top of it
# (a fixed [0, 20] grid does not overlap coordinates that run into the thousands).
x_grid, y_grid = np.meshgrid(
    np.linspace(X.min(), X.max(), 150),
    np.linspace(Y.min(), Y.max(), 150),
)

# Define the IDW interpolation function
def idw_interpolation(x, y, X, Y, values, power=2):
    """Inverse-distance-weighted interpolation of `values` at the query points.

    Parameters
    ----------
    x, y : array-like
        Query coordinates (any matching shape, e.g. meshgrid output); they are flattened.
    X, Y : array-like, 1-D
        Coordinates of the known samples.
    values : array-like, 1-D
        Value observed at each sample, aligned with X and Y.
    power : float, default 2
        Exponent applied to the distance in the weight 1 / distance**power.

    Returns
    -------
    numpy.ndarray, 1-D
        One interpolated value per flattened query point. A query point that
        coincides with one or more samples gets the mean of those samples'
        values instead of the NaN produced by dividing by a zero distance.
    """
    query_points = np.column_stack((np.ravel(x), np.ravel(y)))
    sample_points = np.column_stack((X, Y))
    values = np.asarray(values, dtype=float)

    distances = distance.cdist(query_points, sample_points)

    # Zero distances give infinite weights; silence the warnings here and
    # overwrite the affected rows below.
    with np.errstate(divide='ignore', invalid='ignore'):
        weights = 1.0 / (distances**power)
        interpolated_values = np.sum(values * weights, axis=1) / np.sum(weights, axis=1)

    coincident = distances == 0
    rows_on_sample = coincident.any(axis=1)
    if rows_on_sample.any():
        hits = coincident[rows_on_sample]
        interpolated_values[rows_on_sample] = (hits * values).sum(axis=1) / hits.sum(axis=1)

    return interpolated_values

# Interpolate IntDen on the grid and give the flat result the grid's 2-D shape
z_grid = idw_interpolation(x_grid, y_grid, X, Y, temperature)
z_grid = z_grid.reshape(x_grid.shape)

# Filled contours of the interpolated surface with the measured samples on top
fig, ax = plt.subplots(figsize=(10, 8))
surface = ax.contourf(x_grid, y_grid, z_grid, levels=100, cmap="viridis")
fig.colorbar(surface, ax=ax, label="IntDen")
ax.scatter(X, Y, c=temperature, cmap="coolwarm", edgecolors="k", s=100)
ax.set_xlabel("X")
ax.set_ylabel("Y")
ax.set_title("IntDen Interpolation (IDW)")
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import distance

# Sample locations and the IntDen value measured at each of them (taken from df2)
n_points = len(df2)
X = df2['X']  # X coordinates
Y = df2['Y']  # Y coordinates
temperature = df2['IntDen'].values  # IntDen values (variable name kept from the original template)

# Prediction grid: 150 x 150 nodes spanning the bounding box of the samples, so the
# interpolated surface covers the same region as the scatter drawn on top of it
# (a fixed [0, 20] grid does not overlap coordinates that run into the thousands).
x_grid, y_grid = np.meshgrid(
    np.linspace(X.min(), X.max(), 150),
    np.linspace(Y.min(), Y.max(), 150),
)

# Define the IDW interpolation function
def idw_interpolation(x, y, X, Y, values, power=2):
    """Inverse-distance-weighted interpolation of `values` at the query points.

    Parameters
    ----------
    x, y : array-like
        Query coordinates (any matching shape, e.g. meshgrid output); they are flattened.
    X, Y : array-like, 1-D
        Coordinates of the known samples.
    values : array-like, 1-D
        Value observed at each sample, aligned with X and Y.
    power : float, default 2
        Exponent applied to the distance in the weight 1 / distance**power.

    Returns
    -------
    numpy.ndarray, 1-D
        One interpolated value per flattened query point. A query point that
        coincides with one or more samples gets the mean of those samples'
        values instead of the NaN produced by dividing by a zero distance.
    """
    query_points = np.column_stack((np.ravel(x), np.ravel(y)))
    sample_points = np.column_stack((X, Y))
    values = np.asarray(values, dtype=float)

    distances = distance.cdist(query_points, sample_points)

    # Zero distances give infinite weights; silence the warnings here and
    # overwrite the affected rows below.
    with np.errstate(divide='ignore', invalid='ignore'):
        weights = 1.0 / (distances**power)
        interpolated_values = np.sum(values * weights, axis=1) / np.sum(weights, axis=1)

    coincident = distances == 0
    rows_on_sample = coincident.any(axis=1)
    if rows_on_sample.any():
        hits = coincident[rows_on_sample]
        interpolated_values[rows_on_sample] = (hits * values).sum(axis=1) / hits.sum(axis=1)

    return interpolated_values

# Interpolate IntDen on the grid and give the flat result the grid's 2-D shape
z_grid = idw_interpolation(x_grid, y_grid, X, Y, temperature)
z_grid = z_grid.reshape(x_grid.shape)

# Filled contours of the interpolated surface with the measured samples on top
fig, ax = plt.subplots(figsize=(10, 8))
surface = ax.contourf(x_grid, y_grid, z_grid, levels=100, cmap="viridis")
fig.colorbar(surface, ax=ax, label="IntDen")
ax.scatter(X, Y, c=temperature, cmap="coolwarm", edgecolors="k", s=100)
ax.set_xlabel("X")
ax.set_ylabel("Y")
ax.set_title("IntDen Interpolation (IDW)")
plt.show()


In [None]:
import spaghetti
import esda
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import libpysal

# Sample dataset with X, Y, and IntDen columns
# Example data (replace with your actual data):
# df1 = pd.DataFrame({
#     'X': [1, 2, 3, 4, 5],
#     'Y': [2, 3, 4, 5, 6],
#     'IntDen': [10, 15, 20, 25, 30]
# })

# One shapely Point per row of df1, wrapped together with df1 in a GeoDataFrame
geometry = [Point(xy) for xy in zip(df1['X'], df1['Y'])]
gdf = gpd.GeoDataFrame(df1, geometry=geometry)

# Spatial weights: each point is linked to its 3 nearest neighbours (k=3)
w = libpysal.weights.KNN.from_dataframe(gdf, k=3)

# Geary's C of IntDen under the weights w
gearys_c = esda.geary.Geary(gdf['IntDen'], w)

# Keep the statistic (C) and the p_sim attribute of the result
gearys_c_statistic = gearys_c.C
p_value = gearys_c.p_sim

print("Geary's C:", gearys_c_statistic)
print("p-value:", p_value)


In [None]:
import spaghetti
import esda
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import libpysal
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Sample dataset with X, Y, and IntDen columns
# Example data (replace with your actual data):
# df1 = pd.DataFrame({
#     'X': [1, 2, 3, 4, 5],
#     'Y': [2, 3, 4, 5, 6],
#     'IntDen': [10, 15, 20, 25, 30]
# })

# One shapely Point per row of df1, wrapped together with df1 in a GeoDataFrame
geometry = list(map(Point, zip(df1['X'], df1['Y'])))
gdf = gpd.GeoDataFrame(df1, geometry=geometry)

# Spatial weights: each point is linked to its 3 nearest neighbours
w = libpysal.weights.KNN.from_dataframe(gdf, k=3)

# From here on, UserWarnings are silenced for the rest of the session
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Geary's C of IntDen, its statistic and the p_sim attribute of the result
gearys_c = esda.geary.Geary(gdf['IntDen'], w)
gearys_c_statistic, p_value = gearys_c.C, gearys_c.p_sim

# Spatial lag of IntDen under the weights w
lag_intden = libpysal.weights.lag_spatial(w, gdf['IntDen'])

# Scatter of each point's IntDen against the spatial lag of IntDen
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(gdf['IntDen'], lag_intden, color='b', alpha=0.5)
ax.set_xlabel("IntDen")
ax.set_ylabel("Spatial Lag of IntDen")
ax.set_title("Moran Scatterplot (Geary's C)")
plt.show()



##### Forecast

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Starting point: N_cell of cluster label 0 at Time 92
cluster0_at_start = selected_clusters[(selected_clusters['Labels'] == 0) & (selected_clusters['Time'] == 92)]
initial_population = cluster0_at_start['N_cell'].values[0]
decay_rate = 1.000000001603458  # per-individual decay rate used for the event rate
simulation_time = 26            # length of the simulated window

# Trajectory, starting at time 92 with the initial population
time_points = [92]
population_values = [initial_population]

# Stochastic decay: every event removes one individual, and the waiting time to
# the next event is exponential with rate decay_rate * current population.
end_time = 92 + simulation_time
while True:
    current_time = time_points[-1]
    current_population = population_values[-1]
    if not (current_time < end_time and current_population > 0):
        break

    event_rate = decay_rate * current_population
    time_to_next_event = np.random.exponential(1 / event_rate)

    time_points.append(current_time + time_to_next_event)
    population_values.append(current_population - 1)

# Population trajectory over time
fig, ax = plt.subplots()
ax.plot(time_points, population_values)
ax.set_xlabel('Time')
ax.set_ylabel('Population')
ax.set_title('Gillespie Algorithm Simulation')
plt.show()


The result is not very convincing, because we already know from the data how the value changes over time: it is still greater than 0 at least until time 118.

In [None]:
# Decay rate implied by an exponential decay N(t) = N_t1 * exp(-rate * (t - 92))
# that goes from N_t1 (N_cell of label 0 at Time 92) down to N_t2 = 1 at time 118:
# rate = ln(N_t1 / N_t2) / (118 - 92).
N_t1 = selected_clusters[(selected_clusters['Labels'] == 0) & (selected_clusters['Time'] == 92)]['N_cell'].values[0]
N_t2 = 1
decay_rate = math.log(N_t1 / N_t2) / (118 - 92)

In [None]:
decay_rate

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Same simulation as the earlier Gillespie cell, but using the `decay_rate`
# already present in the namespace (computed in the cell above) instead of a literal.
initial_population = selected_clusters[(selected_clusters['Labels'] == 0) & (selected_clusters['Time'] == 92)]['N_cell'].values[0]  # N_cell of label 0 at Time 92
decay_rate = decay_rate         # no-op: keeps the previously computed rate
simulation_time = 26    # Total simulation time

# Initialize time and population arrays
time_points = [92]  # Initialize with the starting time of 92
population_values = [initial_population]

# Gillespie algorithm: loop until the time window is exceeded or the population reaches 0
while time_points[-1] < 92 + simulation_time and population_values[-1] > 0:
    # Total event rate = per-individual rate * current population
    event_rate = decay_rate * population_values[-1]

    # Waiting time to the next event, drawn from an exponential with mean 1 / event_rate
    time_to_next_event = np.random.exponential(1 / event_rate)

    # Advance time and remove one individual
    time_points.append(time_points[-1] + time_to_next_event)
    population_values.append(population_values[-1] - 1)

# Plot the simulation results
plt.plot(time_points, population_values)
plt.xlabel('Time')
plt.ylabel('Population')
plt.title('Gillespie Algorithm Simulation')
plt.show()

In [None]:
float(decay_rate)

In [None]:
from cayenne.simulation import Simulation
# Model passed to cayenne as a string: a single reaction r1 (N_cell => B) with
# rate constant k1, starting from N_cell = 300 and B = 0.
# NOTE(review): k1 and the initial N_cell are literals inside the string rather than
# the decay_rate / N_cell values computed in earlier cells - confirm they are intended.
model_str = """
        const compartment comp1;
        comp1 = 1.0; # volume of compartment

        r1: N_cell => B; k1;

        k1 = 0.1000001603458;
        chem_flag = false;

        N_cell = 300;
        B = 0;
    """
sim = Simulation.load_model(model_str, "ModelString")
# Run 5 repetitions up to t = 26 (at most 1000 iterations each) with the
# "tau_adaptive" algorithm, then plot the result
sim.simulate(max_t=26, max_iter=1000, n_rep=5, algorithm="tau_adaptive")
sim.plot()

In [None]:
sim.plot(species_names = ["N_cell"])

#### Adjust clusters

In [None]:
import seaborn as sns

In [None]:
create_inline_plots(cleaned)

In [None]:

import os
import matplotlib.pyplot as plt
from IPython.display import display, Image
import pandas as pd

def create_inline_plots(data):
    """Display one plot per distinct 'Time' value of `data`, inline in the notebook.

    For every timepoint, the matching rows are handed to
    plotPointsFromCSV_same_colour (titled "Timepoint <t>"); the object it returns
    is displayed and then closed with plt.close to release it.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Time' column plus whatever the plotting helper needs.
    """
    timepoints = data['Time'].unique()
    for timepoint in timepoints:
        rows_at_timepoint = data.loc[data['Time'] == timepoint]
        figure = plotPointsFromCSV_same_colour(rows_at_timepoint, title=f"Timepoint {timepoint}")
        display(figure)
        plt.close(figure)

# Example usage:
# Assuming you have a DataFrame 'your_data' with columns 'Time' and other data columns
# create_inline_plots(your_data)


In [None]:
# Rows recorded at Time == 2, one frame per cluster label: a -> 1, b -> 0, c -> 2, d -> 3
a, b, c, d = (
    df_all_new_updated[(df_all_new_updated['Labels'] == label) & (df_all_new_updated['Time'] == 2)]
    for label in (1, 0, 2, 3)
)

In [None]:
plot_values_based_on_the_area_in_space(d)

In [None]:
plot_values_based_on_the_area_in_space(c)

In [None]:
plot_values_based_on_the_area_in_space(b)

In [None]:
plot_values_based_on_the_area_in_space(a)

In [None]:
removed = remove_values_based_on_the_area_in_space(df_adjusted)

In [None]:
removed = remove_values_after_zerobin(df_adjusted)

In [None]:
len(cleaned)

In [None]:
print(len(df_all_imp), len(cleaned))

In [None]:
cleaned = remove_values_after_zerobin(df_all_imp, T_max=16)

In [None]:
import pandas as pd
import math

def remove_values_after_zerobin(data, label_column='Labels', d_value=7.5, min_bin_value=100, T_max=None):
    """Drop, per timepoint and per cluster label, the points lying beyond the first empty distance bin.

    For every (Time, label) group the centroid of X/Y is computed and each
    point's Euclidean distance to it is stored in a 'Distance' column. The
    distances are binned into ``int(max_distance / d_value)`` equal-width bins
    (at least one, so tiny groups no longer fail). The left edge of the first
    empty bin whose left edge is >= ``min_bin_value`` is the cut-off: points
    farther away than it are removed. Groups without such a bin are kept whole.

    The binning uses numpy.histogram, so no figure is drawn.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs 'Time', 'X', 'Y' and `label_column`. It is not modified.
    label_column : str, default 'Labels'
        Column holding the cluster label.
    d_value : float, default 7.5
        Distance that sets the number of bins (max distance / d_value).
    min_bin_value : float, default 100
        Empty bins starting below this distance are ignored.
    T_max : optional
        When given, only timepoints <= T_max are filtered; rows with
        Time > T_max are appended unchanged (their 'Distance' is missing).

    Returns
    -------
    pandas.DataFrame
        The kept rows with a fresh 0..n-1 index; an empty DataFrame when
        there is nothing to return.
    """
    if T_max is not None:
        timepoints_to_process = data.loc[data['Time'] <= T_max, 'Time'].unique()
    else:
        timepoints_to_process = data['Time'].unique()

    # Pieces are collected and concatenated once at the end instead of growing
    # a DataFrame inside the loop.
    kept_groups = []
    for timepoint in timepoints_to_process:
        timepoint_data = data[data['Time'] == timepoint]

        for label in timepoint_data[label_column].unique():
            # Explicit copy: the 'Distance' column must not be written into a view of `data`
            label_data = timepoint_data[timepoint_data[label_column] == label].copy()

            offset_x = label_data['X'] - label_data['X'].mean()
            offset_y = label_data['Y'] - label_data['Y'].mean()
            label_data['Distance'] = np.sqrt(offset_x**2 + offset_y**2)

            # At least one bin: int(max / d_value) is 0 when every point is
            # closer than d_value to the centroid (e.g. a single-point group).
            n_bins = max(1, int(label_data['Distance'].max() / d_value))
            counts, edges = np.histogram(label_data['Distance'], bins=n_bins)

            empty_far_bins = np.flatnonzero((counts == 0) & (edges[:-1] >= min_bin_value))
            if empty_far_bins.size:
                first_zero_bin = edges[empty_far_bins[0]]
                label_data = label_data[label_data['Distance'] <= first_zero_bin]

            kept_groups.append(label_data)

    if T_max is not None:
        kept_groups.append(data[data['Time'] > T_max])

    if not kept_groups:
        return pd.DataFrame()
    return pd.concat(kept_groups, ignore_index=True)
# Disable the SettingWithCopyWarning for chained assignments
pd.options.mode.chained_assignment = None


In [None]:
import pandas as pd
import math

def remove_values_based_on_the_area_in_space(data, d_value=7.5, threshold=0.1, T_max=None):
    """Keep, per timepoint, the points whose ring has an IntDen-per-area ratio above `threshold`.

    For every timepoint the centroid of X/Y is computed, each point gets its
    Euclidean 'Distance' to it and a ring index 'Label' = ceil(Distance / d_value)
    (at least 1). Per ring, 'Area_space' = pi * (Label**2 - (Label - 1)**2),
    'Sum_IntDen' is the ring's total IntDen and 'Ratio' = Sum_IntDen / Area_space.
    Rows with Ratio <= threshold are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs 'Time', 'X', 'Y' and 'IntDen'. It is not modified.
    d_value : float, default 7.5
        Ring width.
    threshold : float, default 0.1
        Minimum Ratio (exclusive) a ring needs for its points to be kept.
    T_max : optional
        When given, only timepoints <= T_max are filtered; rows with
        Time > T_max are appended unchanged (without the derived columns).

    Returns
    -------
    pandas.DataFrame
        Kept rows (sorted by Distance within each timepoint) with the derived
        columns 'Distance', 'Label', 'New_Distance', 'Area_space', 'Sum_IntDen'
        and 'Ratio', and a fresh 0..n-1 index.
    """
    if T_max is not None:
        timepoints_to_process = data.loc[data['Time'] <= T_max, 'Time'].unique()
    else:
        timepoints_to_process = data['Time'].unique()

    kept_groups = []
    for timepoint in timepoints_to_process:
        # Select from `data` directly: the T_max subset only exists when T_max is
        # given, so relying on it raised NameError for T_max=None.
        timepoint_data = data[data['Time'] == timepoint].copy()

        offset_x = timepoint_data['X'] - timepoint_data['X'].mean()
        offset_y = timepoint_data['Y'] - timepoint_data['Y'].mean()
        timepoint_data['Distance'] = np.sqrt(offset_x**2 + offset_y**2)
        timepoint_data = timepoint_data.sort_values('Distance')

        # Ring index; a point exactly on the centroid (distance 0) belongs to the
        # innermost ring 1, not to a ring 0 that would have a negative area.
        ring = np.ceil(timepoint_data['Distance'] / d_value).astype(int).clip(lower=1)
        timepoint_data['Label'] = ring
        timepoint_data['New_Distance'] = ring * d_value - d_value / 2

        timepoint_data['Area_space'] = math.pi * (ring**2 - (ring - 1)**2)
        timepoint_data['Sum_IntDen'] = timepoint_data.groupby('Label')['IntDen'].transform('sum')
        timepoint_data['Ratio'] = timepoint_data['Sum_IntDen'] / timepoint_data['Area_space']

        kept_groups.append(timepoint_data[timepoint_data['Ratio'] > threshold])

    if T_max is not None:
        kept_groups.append(data[data['Time'] > T_max])

    if not kept_groups:
        return pd.DataFrame()
    return pd.concat(kept_groups, ignore_index=True)

# Disable the SettingWithCopyWarning for chained assignments
pd.options.mode.chained_assignment = None


In [None]:
import math
import matplotlib.pyplot as plt

def remove_values_based_on_the_area_in_space(data, d_value=7.5, threshold=0.9):
    """Plot the IntDen-per-ring-area profile around the centroid and keep the rows above `threshold`.

    Distances to the X/Y centroid are split into rings of width `d_value`
    ('Label' = ceil(Distance / d_value)). Each ring gets
    'Area_space' = pi * (Label**2 - (Label - 1)**2), 'Sum_IntDen' (total IntDen of
    the ring) and 'Ratio' = Sum_IntDen / Area_space. Ratio is plotted against the
    ring mid-distance 'New_Distance', then rows with Ratio <= threshold are dropped.

    Side effect: a 'Distance' column is written into the frame passed in.

    Returns the filtered frame, sorted by Distance, with the derived columns.
    """
    centroid_x, centroid_y = data['X'].mean(), data['Y'].mean()
    squared_offsets = (data['X'] - centroid_x)**2 + (data['Y'] - centroid_y)**2
    data['Distance'] = squared_offsets.apply(math.sqrt)

    # Each callable sees the columns created by the ones before it
    rings = data.sort_values('Distance').assign(
        Label=lambda frame: frame['Distance'].apply(lambda distance: math.ceil(distance / d_value)),
        New_Distance=lambda frame: frame['Label']*d_value - d_value / 2,
        Area_space=lambda frame: math.pi * (frame['Label']**2 - (frame['Label'] - 1)**2),
        Sum_IntDen=lambda frame: frame.groupby('Label')['IntDen'].transform('sum'),
        Ratio=lambda frame: frame['Sum_IntDen'] / frame['Area_space'],
    )

    plt.plot(rings['New_Distance'], rings['Ratio'])
    plt.xlabel('New Distance')
    plt.ylabel('Ratio')
    plt.title('Ratio vs New Distance')
    plt.show()

    return rings[rings['Ratio'] > threshold]

In [None]:
df_adjusted = df_adjusted[(df_adjusted['Time'] == 1) & (df_adjusted['Labels'] == 3)]

In [None]:
plot_values_based_on_the_area_in_space(df_adjusted)

In [None]:
import math
import matplotlib.pyplot as plt

def plot_values_based_on_the_area_in_space(data, d_value=7.5):
    """Plot total IntDen per ring area against the distance from the centroid.

    Distances to the X/Y centroid are split into rings of width `d_value`
    ('Label' = ceil(Distance / d_value)). Each ring gets
    'Area' = pi * (Label**2 - (Label - 1)**2), 'Sum_IntDen' (total IntDen of the
    ring) and 'Ratio' = Sum_IntDen / Area, and Ratio is plotted against the ring
    mid-distance 'New_Distance'.

    Side effect: a 'Distance' column is written into the frame passed in.

    Returns the plotting frame (Distance, IntDen and the derived columns),
    sorted by Distance.
    """
    centroid_x, centroid_y = data['X'].mean(), data['Y'].mean()
    squared_offsets = (data['X'] - centroid_x)**2 + (data['Y'] - centroid_y)**2
    data['Distance'] = squared_offsets.apply(math.sqrt)

    # Each callable sees the columns created by the ones before it
    plot_df = (
        data[['Distance', 'IntDen']]
        .sort_values('Distance')
        .assign(
            Label=lambda frame: frame['Distance'].apply(lambda distance: math.ceil(distance / d_value)),
            New_Distance=lambda frame: frame['Label']*d_value - d_value / 2,
            Area=lambda frame: math.pi * (frame['Label']**2 - (frame['Label'] - 1)**2),
            Sum_IntDen=lambda frame: frame.groupby('Label')['IntDen'].transform('sum'),
            Ratio=lambda frame: frame['Sum_IntDen'] / frame['Area'],
        )
    )

    plt.plot(plot_df['New_Distance'], plot_df['Ratio'])
    plt.xlabel('New Distance')
    plt.ylabel('Ratio')
    plt.title('Ratio vs New Distance')
    plt.show()

    return plot_df

# Example usage:
# plot_values_based_on_the_area_in_space(selected_df)

In [None]:
def plot_see_watch(data, folder_path):
    """Run the PNG -> PDF -> video pipeline for `data` inside `folder_path`.

    Calls, in order: create_png_plots(data, folder_path),
    merge_pngs_to_pdf(folder_path, folder_path + '/new_model.pdf') and
    create_video_from_pngs(folder_path, folder_path + '/video.mp4').
    The value returned by merge_pngs_to_pdf is not used. Returns None.
    """
    create_png_plots(data, folder_path)
    merge_pngs_to_pdf(folder_path, folder_path + '/new_model.pdf')
    create_video_from_pngs(folder_path, folder_path + '/video.mp4')
    return None

###### Used functions

In [None]:
import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np

def plot_distance_histogram_with_custom_ticks(data, d_value=7.5):
    """Plot the histogram of point distances from the centroid and return the first empty bin.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs 'X' and 'Y'. A copy is used, so `data` is not modified.
    d_value : float, default 7.5
        Distance that sets the number of bins: int(max distance / d_value),
        but never fewer than one bin.

    Returns
    -------
    The left edge of the first bin with a count of zero, or None when every
    bin holds at least one point.
    """
    df = data.copy()

    # Distance of every point to the centroid of X/Y
    squared_offsets = (df['X'] - df['X'].mean())**2 + (df['Y'] - df['Y'].mean())**2
    df['Distance'] = squared_offsets.apply(math.sqrt)

    # int(max / d_value) is 0 when every point is closer than d_value to the
    # centroid; a histogram needs at least one bin.
    custom_ticks = max(1, int(max(df['Distance']) / d_value))

    plt.figure(figsize=(8, 6))
    n, bins, patches = plt.hist(df['Distance'], bins=custom_ticks, color='b', alpha=0.7)
    plt.xlabel('Distance from Barycenter')
    plt.ylabel('Frequency')
    plt.title('Histogram of Cell Distances from Barycenter')

    # Left edge of the first bin whose count is (numerically) zero
    first_zero_bin = next((bins[i] for i, count in enumerate(n) if count <= 1e-6), None)

    plt.show()
    return first_zero_bin


In [None]:
def remove_values_after_zerobin(data, label_column='Labels', d_value=7.5, min_bin_value=100):
    """Per cluster label, drop the points lying beyond the first empty distance bin.

    NOTE: this name is defined again twice further down in this same cell, so
    after the cell has run this definition is no longer the one bound to the name.

    For every label the centroid of X/Y is computed, each point's distance to
    it is stored in 'Distance', and the distances are histogrammed with
    int(max distance / d_value) bins. The left edge of the first empty bin
    whose left edge is >= min_bin_value is the cut-off; labels without such a
    bin are kept whole. Returns the kept rows with a fresh index.

    NOTE(review): `custom_ticks` is 0 when the largest distance is below
    d_value - confirm plt.hist accepts bins=0 for such small clusters.
    """
    # Create an empty DataFrame to store the filtered data
    filtered_data = pd.DataFrame()

    # Iterate over unique labels in the 'label_column'
    for label in data[label_column].unique():
        # Get the data for the current label
        label_data = data[data[label_column] == label]

        # Calculate the barycenter (centroid) for the current label's data
        Barycenter_x = label_data['X'].mean()
        Barycenter_y = label_data['Y'].mean()

        # Calculate the distance for each data point in the current label's data
        label_data['Distance'] = label_data.apply(lambda row: math.sqrt((row['X'] - Barycenter_x)**2 + (row['Y'] - Barycenter_y)**2), axis=1)

        # Histogram of the distances; it is drawn on the current matplotlib figure
        custom_ticks = int(max(label_data['Distance']) / d_value)
        n, bins, patches = plt.hist(label_data['Distance'], bins=custom_ticks, color='b', alpha=0.7)

        # Find the first bin with a count of zero or close to zero and with a value of at least min_bin_value
        first_zero_bin = None
        for i, count in enumerate(n):
            if count <= 1e-6 and bins[i] >= min_bin_value:  # Adjust threshold and min_bin_value as needed
                first_zero_bin = bins[i]
                break

        # If there's no suitable first_zero_bin, append the original label_data to the filtered_data
        if first_zero_bin is None:
            filtered_data = pd.concat([filtered_data, label_data], ignore_index=True)
        else:
            # Filter the current label's data to keep only rows with 'Distance' less than or equal to first_zero_bin
            label_data_filtered = label_data[label_data['Distance'] <= first_zero_bin]
            filtered_data = pd.concat([filtered_data, label_data_filtered], ignore_index=True)

    return filtered_data
# Disable the SettingWithCopyWarning for chained assignments
pd.options.mode.chained_assignment = None




import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np

def remove_values_after_zerobin(data, label_column='Labels', d_value=7.5, min_bin_value=100):
    """Per cluster label, drop the points lying beyond the first empty distance bin.

    NOTE: this name is defined once more further down in this same cell, so
    after the cell has run this definition is no longer the one bound to the name.

    For every label the centroid of X/Y is computed, each point's distance to
    it is stored in 'Distance', and the distances are histogrammed (one new
    figure per label) with int(max distance / d_value) bins. The left edge of
    the first empty bin whose left edge is >= min_bin_value is the cut-off;
    labels without such a bin are kept whole. Returns the kept rows with a
    fresh index.

    NOTE(review): `custom_ticks` is 0 when the largest distance is below
    d_value - confirm plt.hist accepts bins=0 for such small clusters.
    """
    # Create an empty DataFrame to store the filtered data
    filtered_data = pd.DataFrame()

    # Iterate over unique labels in the 'label_column'
    for label in data[label_column].unique():
        # Get the data for the current label
        label_data = data[data[label_column] == label]

        # Calculate the barycenter (centroid) for the current label's data
        Barycenter_x = label_data['X'].mean()
        Barycenter_y = label_data['Y'].mean()

        # Calculate the distance for each data point in the current label's data
        label_data['Distance'] = label_data.apply(lambda row: math.sqrt((row['X'] - Barycenter_x)**2 + (row['Y'] - Barycenter_y)**2), axis=1)

        # Histogram of the distances, drawn on a new figure for each label
        custom_ticks = int(max(label_data['Distance']) / d_value)
        plt.figure(figsize=(8, 6))
        n, bins, patches = plt.hist(label_data['Distance'], bins=custom_ticks, color='b', alpha=0.7)

        # Find the first bin with a count of zero or close to zero and with a value of at least min_bin_value
        first_zero_bin = None
        for i, count in enumerate(n):
            if count <= 1e-6 and bins[i] >= min_bin_value:  # Adjust threshold and min_bin_value as needed
                first_zero_bin = bins[i]
                break

        # If there's no suitable first_zero_bin, append the original label_data to the filtered_data
        if first_zero_bin is None:
            filtered_data = pd.concat([filtered_data, label_data], ignore_index=True)
        else:
            # Filter the current label's data to keep only rows with 'Distance' less than or equal to first_zero_bin
            label_data_filtered = label_data[label_data['Distance'] <= first_zero_bin]
            filtered_data = pd.concat([filtered_data, label_data_filtered], ignore_index=True)

    return filtered_data
# Disable the SettingWithCopyWarning for chained assignments
pd.options.mode.chained_assignment = None


# Example usage:
# Assuming you have a DataFrame 'your_data' with columns 'X', 'Y', and 'Labels'
# filtered_data = remove_values_after_zerobin(your_data)


import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np

def remove_values_after_zerobin(data, d_value=7.5, min_bin_value=100):
    """Drop the points lying beyond the first empty distance bin around the centroid.

    The whole frame is treated as one group (no per-label or per-timepoint split).
    Each point's distance to the X/Y centroid is stored in 'Distance' and the
    distances are histogrammed on a new figure with int(max distance / d_value)
    bins, but never fewer than one. The left edge of the first empty bin whose
    left edge is >= min_bin_value is the cut-off.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs 'X' and 'Y'. A copy is used, so `data` is not modified.
    d_value : float, default 7.5
        Distance that sets the number of bins.
    min_bin_value : float, default 100
        Empty bins starting below this distance are ignored.

    Returns
    -------
    pandas.DataFrame
        The copy with the 'Distance' column, restricted to Distance <= cut-off
        when a cut-off exists, otherwise complete.
    """
    df = data.copy()

    # Distance of every point to the centroid of X/Y
    squared_offsets = (df['X'] - df['X'].mean())**2 + (df['Y'] - df['Y'].mean())**2
    df['Distance'] = squared_offsets.apply(math.sqrt)

    # int(max / d_value) is 0 when every point is closer than d_value to the
    # centroid; a histogram needs at least one bin.
    custom_ticks = max(1, int(max(df['Distance']) / d_value))

    plt.figure(figsize=(8, 6))
    n, bins, patches = plt.hist(df['Distance'], bins=custom_ticks, color='b', alpha=0.7)

    # Left edge of the first (numerically) empty bin starting at or beyond min_bin_value
    first_zero_bin = next(
        (bins[i] for i, count in enumerate(n) if count <= 1e-6 and bins[i] >= min_bin_value),
        None,
    )

    if first_zero_bin is None:
        return df
    return df[df['Distance'] <= first_zero_bin]

# Example usage:
# Assuming you have a DataFrame 'your_data' with columns 'X' and 'Y'
# filtered_data = remove_values_after_zerobin(your_data)

#### Data Imputation

##### Functions used

In [None]:
cluster_tmp_edit, df_all_imp =  impute_cells_and_cluster(cluster_tmp_edit, df_tmp)

In [None]:
import pandas as pd

In [None]:
df_adjusted = df_adjusted[df_adjusted['Time'] == 1]

In [None]:
df_adjusted = df_all_new_updated.copy()

In [None]:
import pandas as pd

In [None]:
 cluster_new_updated = pd.read_csv('/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/cluster_new_updated_v2.csv')

In [None]:
df_all_new_updated = pd.read_csv('/content/drive/MyDrive/Academic Work/Cellgroup/CSV_files/New_algorithm/df_all_new_updated_v2.csv')

In [None]:
# Independent copies of the loaded frames; cluster_tmp and cluster_tmp_edit both
# start out as copies of cluster_new_updated, df_tmp as a copy of df_all_new_updated.
cluster_tmp = cluster_new_updated.copy()
df_tmp = df_all_new_updated.copy()
cluster_tmp_edit = cluster_new_updated.copy()

This is mostly a trial run to understand what may happen in this case.

In [None]:
import pandas as pd

def fill_missing_values_in_time_series(cluster_tmp_edit):
    """Add a placeholder row for every integer timepoint a cluster skips.

    For each label in 'Labels', every integer time between the label's first
    and last 'Time' that has no row yet gets a new row holding only 'Labels'
    and 'Time'; all other columns of those rows are missing (NaN).

    Parameters
    ----------
    cluster_tmp_edit : pandas.DataFrame
        Needs 'Labels' and 'Time'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Original rows plus the placeholder rows, sorted by 'Time' with a stable
        sort (rows sharing a Time keep their relative order, original rows
        first) and a fresh 0..n-1 index. When nothing is missing no rows are
        added, so the column dtypes stay as they were.
    """
    # Collect all placeholder rows first and build one frame from them, instead
    # of concatenating onto an initially empty DataFrame inside the loop.
    missing_rows = []
    for label in cluster_tmp_edit['Labels'].unique():
        observed_times = cluster_tmp_edit.loc[cluster_tmp_edit['Labels'] == label, 'Time']
        first_time = int(observed_times.min())
        last_time = int(observed_times.max())
        already_present = set(observed_times.unique())

        # Ascending order makes the placeholder order deterministic
        missing_rows.extend(
            {'Labels': label, 'Time': time_value}
            for time_value in range(first_time, last_time + 1)
            if time_value not in already_present
        )

    if missing_rows:
        missing_data = pd.DataFrame(missing_rows, columns=['Labels', 'Time'])
        missing_data = missing_data.astype({'Labels': 'int64', 'Time': 'int64'})
        filled = pd.concat([cluster_tmp_edit, missing_data], ignore_index=True)
    else:
        filled = cluster_tmp_edit.copy()

    return filled.sort_values(by=['Time'], kind='mergesort').reset_index(drop=True)

# Example usage:
# cluster_tmp_edit = fill_missing_values_in_time_series(cluster_tmp_edit)


In [None]:
import pandas as pd
import numpy as np
from scipy.interpolate import CubicSpline

def interpolate_missing_values_with_spline(cluster_tmp_edit):
    """Fill the NaN cells of every cluster's time series with a per-cluster cubic spline.

    For each label in 'Labels' and each column other than 'Time' and 'Labels',
    a scipy CubicSpline is fitted to the (Time, value) pairs that are present
    for that label and evaluated at the 'Time' of the rows whose value is NaN.
    Cells that already hold a value are left exactly as they are.

    Parameters
    ----------
    cluster_tmp_edit : pandas.DataFrame
        Needs 'Labels' and 'Time' plus the columns to fill. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the gaps filled.

    Raises
    ------
    Any error raised by CubicSpline when a spline cannot be fitted to the
    known points of a (label, column) pair that has gaps.
    """
    # Columns come from the frame itself rather than from whichever per-label
    # subset happened to be processed last.
    value_columns = [name for name in cluster_tmp_edit.columns if name not in ('Time', 'Labels')]

    for label in cluster_tmp_edit['Labels'].unique():
        in_label = cluster_tmp_edit['Labels'] == label

        for column in value_columns:
            is_gap = in_label & cluster_tmp_edit[column].isna()
            if not is_gap.any():
                continue

            # Known points of this label, ordered by time as the spline requires
            known = cluster_tmp_edit.loc[in_label & ~is_gap, ['Time', column]].sort_values(by='Time')
            spline = CubicSpline(known['Time'], known[column])

            # Write only into the gaps; existing values are never re-evaluated
            cluster_tmp_edit.loc[is_gap, column] = spline(cluster_tmp_edit.loc[is_gap, 'Time'])

    return cluster_tmp_edit

# Example usage:
# cluster_tmp_edit = interpolate_missing_values_with_spline(cluster_tmp_edit)


In [None]:
import pandas as pd

def adjust_data_for_missing_times(df_tmp, cluster_tmp_edit, random_state=None):
    """Build per-cell rows for the time steps a cluster has but its cells lack.

    For every label in ``cluster_tmp_edit`` and every ``Time`` listed for that
    label there but absent from ``df_tmp``, the rows of the most recent earlier
    time at which the label has rows in ``df_tmp`` are copied and stamped with
    the missing time. The copy is then resized and shifted:

    * Size: with ``N`` the ``N_cell`` of the cluster at the missing time and
      ``N_prev`` its ``N_cell`` at the source time, all source rows are kept
      when ``N >= N_prev`` and ``N - N_prev`` further rows are drawn from them
      with replacement; when ``N < N_prev`` only ``N`` rows are drawn (with
      replacement).
    * Shift: every numeric column present in both tables, except the keys
      ``Time`` and ``Labels``, is moved by the difference of the cluster's
      value between the missing time and the source time. Differences that
      are NaN are not applied.

    Missing times that have no earlier rows for the label are skipped.

    Args:
        df_tmp: Per-cell table with at least ``Labels`` and ``Time`` columns.
            It is not modified.
        cluster_tmp_edit: Per-cluster table with ``Labels``, ``Time`` and
            ``N_cell`` columns. If several rows share a (label, time) pair the
            first one is used.
        random_state: Optional seed (int) or NumPy random state used for the
            row draws. ``None`` keeps pandas' default sampling behaviour.

    Returns:
        A new DataFrame holding only the generated rows, with the columns of
        ``df_tmp``. It is empty when nothing had to be generated.

    Raises:
        IndexError: If ``cluster_tmp_edit`` has no row for a label at the
            source time.
    """
    # One shared generator for all draws, so a seed does not repeat the same
    # draw pattern for every fragment.
    if isinstance(random_state, int):
        random_state = np.random.RandomState(random_state)

    # Only shared numeric measurement columns are shifted. 'Time' is set
    # explicitly and 'Labels' identifies the cluster; shifting either would
    # corrupt the generated rows.
    key_columns = ('Time', 'Labels')
    numeric_cell_columns = df_tmp.select_dtypes(include='number').columns
    numeric_cluster_columns = cluster_tmp_edit.select_dtypes(include='number').columns
    shift_columns = [
        column for column in numeric_cell_columns
        if column in numeric_cluster_columns and column not in key_columns
    ]

    fragments = []
    for label in cluster_tmp_edit['Labels'].unique():
        label_clusters = cluster_tmp_edit[cluster_tmp_edit['Labels'] == label]
        label_cells = df_tmp[df_tmp['Labels'] == label]
        observed_times = label_cells['Time'].unique()
        missing_times = set(label_clusters['Time'].unique()) - set(observed_times)

        for considered_time in sorted(missing_times):
            # Most recent earlier time at which this label still has cells.
            earlier_times = observed_times[observed_times < considered_time]
            if len(earlier_times) == 0:
                # Nothing to copy from; an open-ended backwards search would
                # never terminate here.
                continue
            previous_time = earlier_times.max()

            previous_cells = label_cells[label_cells['Time'] == previous_time]
            current_cluster = label_clusters[label_clusters['Time'] == considered_time]
            previous_cluster = label_clusters[label_clusters['Time'] == previous_time]

            n_target = int(current_cluster['N_cell'].iloc[0])
            n_previous = int(previous_cluster['N_cell'].iloc[0])

            if n_target >= n_previous:
                fragment = previous_cells
                n_extra = n_target - n_previous
                if n_extra > 0:
                    # Grow the cluster by re-drawing existing cells.
                    extra_cells = previous_cells.sample(
                        n=n_extra, replace=True, random_state=random_state)
                    fragment = pd.concat([previous_cells, extra_cells], ignore_index=True)
            else:
                fragment = previous_cells.sample(
                    n=n_target, replace=True, random_state=random_state)

            # reset_index returns a new frame, so df_tmp is never written to.
            fragment = fragment.reset_index(drop=True)
            if fragment.empty:
                continue
            fragment['Time'] = considered_time

            for column in shift_columns:
                shift = current_cluster[column].iloc[0] - previous_cluster[column].iloc[0]
                if pd.notna(shift):
                    # Plain column assignment: a chained-indexing "+=" would
                    # only modify a temporary copy.
                    fragment[column] = fragment[column] + shift

            fragments.append(fragment)

    if not fragments:
        # Keep the schema so callers can concatenate the result unconditionally.
        return df_tmp.iloc[0:0].copy()
    return pd.concat(fragments, ignore_index=True)

# Usage example:
# df_all_imp_result = adjust_data_for_missing_times(df_tmp, cluster_tmp_edit)



In [None]:
def impute_cells_and_cluster(cluster_tmp_edit, df_tmp, selected=None):
  """Complete the cluster table, then generate cell rows for the missing times.

  Steps, in order:
    1. ``fill_missing_values_in_time_series`` and
       ``interpolate_missing_values_with_spline`` are applied to the cluster
       table (presumably adding rows for absent time steps and interpolating
       their values; see those functions).
    2. If ``selected`` is non-empty, only clusters whose ``Labels`` value is in
       it are kept.
    3. ``N_cell`` is truncated to an integer and clamped at 0.
    4. ``adjust_data_for_missing_times`` generates the per-cell rows, which are
       concatenated in front of the original ``df_tmp`` rows.

  Args:
    cluster_tmp_edit: Per-cluster table with ``Labels`` and ``N_cell`` columns.
    df_tmp: Per-cell table.
    selected: Optional collection of labels to keep; ``None`` or an empty
      collection keeps every label.

  Returns:
    Tuple ``(cluster_tmp_edit, df_all_imp)``: the processed cluster table and
    the generated cell rows followed by the rows of ``df_tmp``.
  """
  cluster_tmp_edit = fill_missing_values_in_time_series(cluster_tmp_edit)
  cluster_tmp_edit = interpolate_missing_values_with_spline(cluster_tmp_edit)
  if selected is not None and len(selected) > 0:
    cluster_tmp_edit = cluster_tmp_edit[cluster_tmp_edit['Labels'].isin(selected)]
  # Copy before assigning: the frame may be a filtered slice (SettingWithCopy)
  # or the very object handed back by the previous step.
  cluster_tmp_edit = cluster_tmp_edit.copy()
  cluster_tmp_edit['N_cell'] = cluster_tmp_edit['N_cell'].apply(lambda x: max(0, int(x)))
  df_all_imp = adjust_data_for_missing_times(df_tmp, cluster_tmp_edit)
  df_all_imp = pd.concat([df_all_imp, df_tmp], ignore_index=True)
  return cluster_tmp_edit, df_all_imp

###### Deprecated scratch cells below: do not use. They are kept only as a fallback in case the functions above stop working.

In [None]:
import pandas as pd
import random

# Function to find common columns between two DataFrames
def find_common_columns(df1, df2):
    """Return the column labels present in both DataFrames, as a list in no particular order."""
    return list(set(df1.columns).intersection(df2.columns))

# Find unique labels in 'Labels' column of 'cluster_tmp_edit'
unique_labels = cluster_tmp_edit['Labels'].unique()

# Find common columns between df_tmp and cluster_tmp_edit
common_columns = find_common_columns(df_tmp, cluster_tmp_edit)

# Columns that get shifted: shared numeric columns, minus the keys. 'Time' is
# set explicitly below, so shifting it would move the copied rows to a wrong time.
numeric_columns = (set(df_tmp.select_dtypes(include='number').columns)
                   & set(cluster_tmp_edit.select_dtypes(include='number').columns))
shift_columns = [column for column in common_columns
                 if column in numeric_columns and column not in ('Time', 'Labels')]

# List to store DataFrame fragments (one per label and missing time)
rows_to_concat = []

for label in unique_labels:
    label_clusters = cluster_tmp_edit[cluster_tmp_edit['Labels'] == label]
    label_cells = df_tmp[df_tmp['Labels'] == label]

    # 'Time' values the cluster table has for this label but df_tmp lacks
    unique_times_clusters = label_clusters['Time'].unique()
    unique_times_df_tmp = label_cells['Time'].unique()
    missing_times = set(unique_times_clusters) - set(unique_times_df_tmp)

    for considered_time in missing_times:
        # Most recent earlier time at which this label has rows in df_tmp
        earlier_times = unique_times_df_tmp[unique_times_df_tmp < considered_time]
        if len(earlier_times) == 0:
            # Nothing to copy from; an open-ended backwards search never ends here
            continue
        previous_time = earlier_times.max()
        last_alive = considered_time - previous_time

        previous_rows = label_cells[label_cells['Time'] == previous_time]
        cluster_now = label_clusters[label_clusters['Time'] == considered_time]
        cluster_prev = label_clusters[label_clusters['Time'] == previous_time]

        # Cell counts of the cluster at the missing time and at the source time
        N = int(cluster_now['N_cell'].iloc[0])
        N_prev = int(cluster_prev['N_cell'].iloc[0])

        if N >= N_prev:
            # Keep every source row and draw the N - N_prev additional ones
            df_all_imp = previous_rows
            if N > N_prev:
                selected_rows = previous_rows.sample(n=(N - N_prev), replace=True)
                df_all_imp = pd.concat([df_all_imp, selected_rows], ignore_index=True)
        else:
            # The cluster shrank: draw only N rows
            df_all_imp = previous_rows.sample(n=N, replace=True)

        # reset_index returns a new frame, so df_tmp itself is never modified
        df_all_imp = df_all_imp.reset_index(drop=True)
        df_all_imp['Time'] = considered_time

        for common_column in shift_columns:
            cluster_difference = (cluster_now[common_column].iloc[0]
                                  - cluster_prev[common_column].iloc[0])
            if pd.notna(cluster_difference):
                # Direct column assignment; "df[mask][col] += x" only changes a temporary copy
                df_all_imp[common_column] = df_all_imp[common_column] + cluster_difference

        rows_to_concat.append(df_all_imp)

# Concatenate all fragments; fall back to an empty frame with the cell schema
if rows_to_concat:
    df_all_imp = pd.concat(rows_to_concat, ignore_index=True)
else:
    df_all_imp = df_tmp.iloc[0:0].copy()

# Now, df_all_imp contains the modified data with 'Time' values as considered_time and common columns adjusted for the same label and label-specific 'Time' values


In [None]:
import pandas as pd
import random

# Function to find common columns between two DataFrames
def find_common_columns(df1, df2):
    """List the column labels that df1 and df2 share (order is not defined)."""
    shared_labels = set(df1.columns).intersection(df2.columns)
    return [*shared_labels]

# Find unique labels in 'Labels' column of 'cluster_tmp_edit'
unique_labels = cluster_tmp_edit['Labels'].unique()

# Find common columns between df_tmp and cluster_tmp_edit
common_columns = find_common_columns(df_tmp, cluster_tmp_edit)

# Columns that get shifted: shared numeric columns, minus the keys. 'Time' is
# set explicitly below, so shifting it would move the copied rows to a wrong time.
numeric_columns = (set(df_tmp.select_dtypes(include='number').columns)
                   & set(cluster_tmp_edit.select_dtypes(include='number').columns))
shift_columns = [column for column in common_columns
                 if column in numeric_columns and column not in ('Time', 'Labels')]

# List to store DataFrame fragments (one per label and cluster time)
rows_to_concat = []

for label in unique_labels:
    label_clusters = cluster_tmp_edit[cluster_tmp_edit['Labels'] == label]
    label_cells = df_tmp[df_tmp['Labels'] == label]
    unique_times_clusters = label_clusters['Time'].unique()
    unique_times_df_tmp = label_cells['Time'].unique()

    # This variant rebuilds EVERY cluster time of the label (not only the
    # missing ones) from the closest earlier time that has rows in df_tmp.
    for considered_time in unique_times_clusters:
        earlier_times = unique_times_df_tmp[unique_times_df_tmp < considered_time]
        if len(earlier_times) == 0:
            # E.g. the first time of a label: no earlier rows exist, and an
            # open-ended backwards search would never end.
            continue
        previous_time = earlier_times.max()
        last_alive = considered_time - previous_time

        previous_rows = label_cells[label_cells['Time'] == previous_time]
        cluster_now = label_clusters[label_clusters['Time'] == considered_time]
        cluster_prev = label_clusters[label_clusters['Time'] == previous_time]

        # Cell counts of the cluster at the considered time and at the source time
        N = int(cluster_now['N_cell'].iloc[0])
        N_prev = int(cluster_prev['N_cell'].iloc[0])

        if N >= N_prev:
            # Keep every source row and repeat the first N - N_prev of them
            df_all_imp = previous_rows
            if N > N_prev:
                selected_rows = previous_rows.head(n=(N - N_prev))
                df_all_imp = pd.concat([df_all_imp, selected_rows], ignore_index=True)
        else:
            # The cluster shrank: keep only the first N rows
            df_all_imp = previous_rows.head(n=N)

        # reset_index returns a new frame, so df_tmp itself is never modified
        df_all_imp = df_all_imp.reset_index(drop=True)
        df_all_imp['Time'] = considered_time

        for common_column in shift_columns:
            cluster_difference = (cluster_now[common_column].iloc[0]
                                  - cluster_prev[common_column].iloc[0])
            if pd.notna(cluster_difference):
                # Direct column assignment; "df[mask][col] += x" only changes a temporary copy
                df_all_imp[common_column] = df_all_imp[common_column] + cluster_difference

        rows_to_concat.append(df_all_imp)

# Concatenate all fragments; fall back to an empty frame with the cell schema
if rows_to_concat:
    df_all_imp = pd.concat(rows_to_concat, ignore_index=True)
else:
    df_all_imp = df_tmp.iloc[0:0].copy()

# Now, df_all_imp contains the modified data with 'Time' values as considered_time and common columns adjusted for the same label and label-specific 'Time' values


In [None]:
import pandas as pd
import random

# Function to find common columns between two DataFrames
def find_common_columns(df1, df2):
    """Collect the column labels found in both df1 and df2; the list order is unspecified."""
    in_both = set(df1.columns).intersection(df2.columns)
    return list(in_both)

# Result frame; stays empty when no time step had to be generated
df_all_imp = pd.DataFrame()

# Find unique labels in 'Labels' column of 'cluster_tmp_edit'
unique_labels = cluster_tmp_edit['Labels'].unique()
# Find common columns between df_tmp and cluster_tmp_edit
common_columns = find_common_columns(df_tmp, cluster_tmp_edit)

# Columns that get shifted: shared numeric columns, minus the keys. 'Time' is
# set explicitly below, so shifting it would move the copied rows to a wrong time.
numeric_columns = (set(df_tmp.select_dtypes(include='number').columns)
                   & set(cluster_tmp_edit.select_dtypes(include='number').columns))
shift_columns = [column for column in common_columns
                 if column in numeric_columns and column not in ('Time', 'Labels')]

# Fragments are collected here and concatenated once. Concatenating the running
# result into every new fragment would duplicate all earlier rows and overwrite
# their 'Time'.
imputed_rows = []

for label in unique_labels:
    label_clusters = cluster_tmp_edit[cluster_tmp_edit['Labels'] == label]
    label_cells = df_tmp[df_tmp['Labels'] == label]

    # 'Time' values the cluster table has for this label but df_tmp lacks
    unique_times_clusters = label_clusters['Time'].unique()
    unique_times_df_tmp = label_cells['Time'].unique()
    missing_times = set(unique_times_clusters) - set(unique_times_df_tmp)

    for considered_time in missing_times:
        # Most recent earlier time at which this label has rows in df_tmp
        earlier_times = unique_times_df_tmp[unique_times_df_tmp < considered_time]
        if len(earlier_times) == 0:
            # Nothing to copy from; an open-ended backwards search never ends here
            continue
        previous_time = earlier_times.max()
        last_alive = considered_time - previous_time

        previous_rows = label_cells[label_cells['Time'] == previous_time]
        cluster_now = label_clusters[label_clusters['Time'] == considered_time]
        cluster_prev = label_clusters[label_clusters['Time'] == previous_time]

        # Cell counts of the cluster at the missing time and at the source time
        N = int(cluster_now['N_cell'].iloc[0])
        N_prev = int(cluster_prev['N_cell'].iloc[0])

        if N >= N_prev:
            # Keep every source row and repeat the first N - N_prev of them
            additional_rows = previous_rows
            if N > N_prev:
                selected_rows = previous_rows.head(n=(N - N_prev))
                additional_rows = pd.concat([additional_rows, selected_rows], ignore_index=True)
            fragment = additional_rows
        else:
            # The cluster shrank: keep only the first N rows
            diminished_rows = previous_rows.head(n=N)
            fragment = diminished_rows

        # reset_index returns a new frame, so df_tmp itself is never modified
        fragment = fragment.reset_index(drop=True)
        fragment['Time'] = considered_time

        # Apply the shift for every shared column, inside the column loop
        for common_column in shift_columns:
            cluster_difference = (cluster_now[common_column].iloc[0]
                                  - cluster_prev[common_column].iloc[0])
            if pd.notna(cluster_difference):
                fragment[common_column] = fragment[common_column] + cluster_difference

        imputed_rows.append(fragment)

if imputed_rows:
    df_all_imp = pd.concat(imputed_rows, ignore_index=True)

# Now, df_all_imp contains the modified data with 'Time' values as considered_time and common columns adjusted for the same label and label-specific 'Time' values


In [None]:
import pandas as pd
import random
def find_common_columns(df1, df2):
    """Return, as an unordered list, the column labels that both DataFrames contain."""
    return list(set(df1.columns).intersection(df2.columns))

# Early draft that only looks ONE time step back (considered_time - 1).
# The result is built from scratch here instead of relying on a df_all_imp
# left over from a previously executed cell.
unique_labels = cluster_tmp_edit['Labels'].unique()
common_columns = find_common_columns(df_tmp, cluster_tmp_edit)

# Columns that get shifted: shared numeric columns, minus the keys. 'Time' is
# set explicitly below, so shifting it would move the copied rows to a wrong time.
numeric_columns = (set(df_tmp.select_dtypes(include='number').columns)
                   & set(cluster_tmp_edit.select_dtypes(include='number').columns))
shift_columns = [column for column in common_columns
                 if column in numeric_columns and column not in ('Time', 'Labels')]

imputed_rows = []
for label in unique_labels:
    label_clusters = cluster_tmp_edit[cluster_tmp_edit['Labels'] == label]
    label_cells = df_tmp[df_tmp['Labels'] == label]
    unique_times_clusters = label_clusters['Time'].unique()
    unique_times_df_tmp = label_cells['Time'].unique()

    missing_times = set(unique_times_clusters) - set(unique_times_df_tmp)

    for considered_time in missing_times:
        previous_time = considered_time - 1
        previous_rows = label_cells[label_cells['Time'] == previous_time]
        cluster_now = label_clusters[label_clusters['Time'] == considered_time]
        cluster_prev = label_clusters[label_clusters['Time'] == previous_time]
        if previous_rows.empty or cluster_prev.empty:
            # The directly preceding step has no cells or no cluster row:
            # this draft has nothing to copy from.
            continue

        N = int(cluster_now['N_cell'].iloc[0])
        N_prev = int(cluster_prev['N_cell'].iloc[0])

        if N >= N_prev:
            # Keep every source row and repeat the first N - N_prev of them
            additional_rows = previous_rows
            if N > N_prev:
                selected_rows = previous_rows.head(n=(N - N_prev))
                additional_rows = pd.concat([additional_rows, selected_rows], ignore_index=True)
            fragment = additional_rows
        else:
            # The cluster shrank: keep only the first N rows
            diminished_rows = previous_rows.head(n=N)
            fragment = diminished_rows

        # reset_index returns a new frame, so df_tmp itself is never modified
        fragment = fragment.reset_index(drop=True)
        fragment['Time'] = considered_time

        # Apply the shift for every shared column; direct assignment, because
        # "df[mask][col] += x" only changes a temporary copy.
        for common_column in shift_columns:
            cluster_difference = (cluster_now[common_column].iloc[0]
                                  - cluster_prev[common_column].iloc[0])
            if pd.notna(cluster_difference):
                fragment[common_column] = fragment[common_column] + cluster_difference

        imputed_rows.append(fragment)

if imputed_rows:
    df_all_imp = pd.concat(imputed_rows, ignore_index=True)
else:
    df_all_imp = df_tmp.iloc[0:0].copy()

In [None]:
import pandas as pd
import random

# Function to find common columns between two DataFrames
def find_common_columns(df1, df2):
    """Give the column labels common to df1 and df2 as a list (no guaranteed order)."""
    overlap = set(df1.columns).intersection(df2.columns)
    return list(overlap)


# Find unique labels in 'Labels' column of 'cluster_tmp_edit'
unique_labels = cluster_tmp_edit['Labels'].unique()
# Find common columns between df_tmp and cluster_tmp_edit
common_columns = find_common_columns(df_tmp, cluster_tmp_edit)

# Columns that get shifted: shared numeric columns, minus the keys. 'Time' is
# set explicitly below, so shifting it would move the copied rows to a wrong time.
numeric_columns = (set(df_tmp.select_dtypes(include='number').columns)
                   & set(cluster_tmp_edit.select_dtypes(include='number').columns))
shift_columns = [column for column in common_columns
                 if column in numeric_columns and column not in ('Time', 'Labels')]

# One fragment per label and missing time
rows_to_concat = []
for label in unique_labels:
    label_clusters = cluster_tmp_edit[cluster_tmp_edit['Labels'] == label]
    label_cells = df_tmp[df_tmp['Labels'] == label]

    # Identify 'Time' values in the cluster table not present in 'df_tmp' for the current label
    unique_times_clusters = label_clusters['Time'].unique()
    unique_times_df_tmp = label_cells['Time'].unique()
    missing_times = set(unique_times_clusters) - set(unique_times_df_tmp)

    for considered_time in missing_times:
        # Most recent earlier time at which this label has rows in df_tmp
        earlier_times = unique_times_df_tmp[unique_times_df_tmp < considered_time]
        if len(earlier_times) == 0:
            # Nothing to copy from; an open-ended backwards search never ends here
            continue
        previous_time = earlier_times.max()
        last_alive = considered_time - previous_time

        previous_rows = label_cells[label_cells['Time'] == previous_time]
        cluster_now = label_clusters[label_clusters['Time'] == considered_time]
        cluster_prev = label_clusters[label_clusters['Time'] == previous_time]

        # Cell counts of the cluster at the missing time and at the source time
        N = int(cluster_now['N_cell'].iloc[0])
        N_prev = int(cluster_prev['N_cell'].iloc[0])

        if N >= N_prev:
            # Keep every source row and repeat the first N - N_prev of them
            df_all_imp = previous_rows
            if N > N_prev:
                selected_rows = previous_rows.head(n=(N - N_prev))
                df_all_imp = pd.concat([df_all_imp, selected_rows], ignore_index=True)
        else:
            # The cluster shrank: keep only the first N rows
            df_all_imp = previous_rows.head(n=N)

        # reset_index returns a new frame, so df_tmp itself is never modified
        df_all_imp = df_all_imp.reset_index(drop=True)
        df_all_imp['Time'] = considered_time

        # Apply the shift for every shared column, inside the column loop;
        # "df[mask][col] += x" would only change a temporary copy.
        for common_column in shift_columns:
            cluster_difference = (cluster_now[common_column].iloc[0]
                                  - cluster_prev[common_column].iloc[0])
            if pd.notna(cluster_difference):
                df_all_imp[common_column] = df_all_imp[common_column] + cluster_difference

        rows_to_concat.append(df_all_imp)

# Concatenate all fragments; fall back to an empty frame with the cell schema
if rows_to_concat:
    df_all_imp = pd.concat(rows_to_concat, ignore_index=True)
else:
    df_all_imp = df_tmp.iloc[0:0].copy()
# Now, df_all_imp contains the modified data with 'Time' values as considered_time and common columns adjusted for the same label and label-specific 'Time' values


In [None]:
# Labels of the clusters to keep.
selected = [0,1,2,3]
# Copy the filtered rows: later cells assign into cluster_tmp_edit, and writing
# into an uncopied slice triggers SettingWithCopyWarning.
cluster_tmp_edit = cluster_tmp_edit[cluster_tmp_edit['Labels'].isin(selected)].copy()

In [None]:
import pandas as pd
import numpy as np
from scipy.interpolate import CubicSpline

# Assumes a DataFrame named cluster_tmp_edit with 'Labels', 'Time', and other columns

# Find the unique labels in the 'Labels' column
unique_labels = cluster_tmp_edit['Labels'].unique()

# Cubic spline fitted for each (label, column) pair that has missing values
spline_functions = {}

# Every column except 'Time' is a candidate for interpolation. Taken from the
# table itself so the loop does not depend on a variable left over from a
# previous iteration.
value_columns = [column for column in cluster_tmp_edit.columns if column != 'Time']

for label in unique_labels:
    label_rows = cluster_tmp_edit['Labels'] == label

    # Sorted copy of this label's rows: CubicSpline needs increasing x values
    label_data = cluster_tmp_edit[label_rows].sort_values(by=['Time'])

    for column in value_columns:
        known = label_data[column].notna()
        if known.all():
            # Nothing to interpolate for this (label, column)
            continue
        if known.sum() < 2:
            # CubicSpline needs at least two known points; leave the gaps as NaN
            continue

        # Fit the spline on the known (time, value) pairs
        t_data = label_data.loc[known, 'Time']
        w_data = label_data.loc[known, column]
        spline = CubicSpline(t_data, w_data)
        spline_functions[(label, column)] = spline

        # Fill only the rows that are actually missing; known values stay untouched
        rows_to_fill = label_rows & cluster_tmp_edit[column].isna()
        t_missing = cluster_tmp_edit.loc[rows_to_fill, 'Time'].to_numpy()
        cluster_tmp_edit.loc[rows_to_fill, column] = spline(t_missing)

# Now, cluster_tmp_edit contains interpolated values for missing data

# Now, cluster_tmp_edit contains interpolated values for missing data


In [None]:
import pandas as pd

# Find all unique labels in the 'Labels' column
unique_labels = cluster_tmp_edit['Labels'].unique()

# One small frame of (label, missing time) rows per label. They are collected
# in a list and concatenated once instead of growing a DataFrame in the loop.
missing_frames = []

for label in unique_labels:
    # Rows of the current cluster and the time span they cover
    desired_cluster = cluster_tmp_edit[cluster_tmp_edit['Labels'] == label]
    start_time = desired_cluster['Time'].min()
    max_value = desired_cluster['Time'].max()

    # Every integer time step from the first to the last observation
    all_numbers = list(range(int(start_time), int(max_value) + 1))

    # Time steps of that span which the cluster has no row for
    unique_values = desired_cluster['Time'].unique()
    missing_numbers = sorted(set(all_numbers) - set(unique_values))

    if missing_numbers:
        missing_cluster_data = pd.DataFrame({'Labels': [label] * len(missing_numbers), 'Time': missing_numbers})
        missing_frames.append(missing_cluster_data)

if missing_frames:
    missing_data = pd.concat(missing_frames, ignore_index=True)
else:
    missing_data = pd.DataFrame(columns=['Labels', 'Time'])

# Convert 'Labels' and 'Time' columns to int64
missing_data = missing_data.astype({'Labels': 'int64', 'Time': 'int64'})

# Append the placeholder rows (all other columns become NaN) to the cluster table
if not missing_data.empty:
    cluster_tmp_edit = pd.concat([cluster_tmp_edit, missing_data], ignore_index=True)

# Sort by 'Time' and renumber the index
cluster_tmp_edit = cluster_tmp_edit.sort_values(by=['Time']).reset_index(drop=True)

