In [2]:
import pandas as pd
from pathlib import Path

# Location of the lifespan summary table, relative to the working directory.
csv_path = Path("data", "lifespan_summary.csv")

# Stop right away with an explicit message when the file is missing;
# otherwise load it into the frame used by the cells below.
if csv_path.exists():
    df = pd.read_csv(csv_path)
else:
    raise FileNotFoundError(f"{csv_path} not found.")

# Preview the first few rows.
df.head()

Unnamed: 0,Filename,TimeInfoMergedVideo,PlateHasDried,LifespanInFrames,LifespanInHours,Terbinafine
0,/20240924_piworm09_1,2.45,False,49500,330,True
1,/20240924_piworm09_2,3.3,False,63000,420,True
2,/20240924_piworm09_3,3.12,False,57600,384,True
3,/20240924_piworm09_4,2.48,True,50400,336,True
4,/20240924_piworm09_5,2.43,True,48900,326,True


In [3]:
# Largest LifespanInFrames value in the table, plus every row that reaches it.
frames_col = df['LifespanInFrames']
max_frames = frames_col.max()
print("Max LifespanInFrames:", max_frames)

# Boolean mask selects all rows tied for the maximum (there may be several).
is_longest = frames_col == max_frames
df.loc[is_longest]

Max LifespanInFrames: 129600


Unnamed: 0,Filename,TimeInfoMergedVideo,PlateHasDried,LifespanInFrames,LifespanInHours,Terbinafine
83,/20250205_piworm09_1,7.12,False,129600,864,True


In [5]:
# Build an empty (column-less) frame with one row per frame index,
# 0 .. max_frames - 1.
n = int(max_frames)  # cast to a plain Python int for the index length
df_max = pd.DataFrame(index=pd.RangeIndex(n))

# Sanity check: expect (n, 0).
df_max.shape

(129600, 0)

In [6]:
import re

def filename_to_id(fname: str) -> str:
    """Convert a recording filename into a compact sample identifier.

    A name of the form ``<8-digit date>_piworm<N>_<rep>`` (an optional leading
    ``/`` is ignored) becomes ``<date>_<NN>_<rep>``, where the worm number is
    left-padded with zeros to at least two digits. The pattern is anchored
    only at the start, so any text after the replicate number is dropped.

    Any other input gets a best-effort clean-up instead: the text ``piworm``
    is removed, leading slashes are stripped, and every run of consecutive
    underscores is collapsed to a single underscore.

    Parameters
    ----------
    fname : str
        Filename to convert. Non-string values are passed through ``str()``.

    Returns
    -------
    str
        The derived sample identifier.
    """
    s = str(fname).lstrip('/')  # remove leading slash(es) if present
    m = re.match(r'(?P<date>\d{8})_piworm(?P<worm>\d+)_(?P<rep>\d+)', s)
    if m:
        date = m.group('date')
        worm = m.group('worm').zfill(2)  # e.g. "9" -> "09"
        rep = m.group('rep')
        return f"{date}_{worm}_{rep}"
    # Fallback: remove "piworm" and any leading slashes, then collapse runs of
    # underscores. A regex is used because a single str.replace('__', '_')
    # pass leaves runs of three or more underscores only partly collapsed.
    s2 = s.replace('piworm', '').lstrip('/')
    return re.sub(r'_{2,}', '_', s2)

# Derive a SampleID for every row by converting its Filename value.
df['SampleID'] = df['Filename'].map(filename_to_id)

# Show the original and derived values side by side for the first rows.
check_cols = ['Filename', 'SampleID']
df[check_cols].head()

Unnamed: 0,Filename,SampleID
0,/20240924_piworm09_1,20240924_09_1
1,/20240924_piworm09_2,20240924_09_2
2,/20240924_piworm09_3,20240924_09_3
3,/20240924_piworm09_4,20240924_09_4
4,/20240924_piworm09_5,20240924_09_5
