In [2]:
import pandas as pd

# CSV read as input; the script uses its "text", "start", "end" and "file" columns.
INPUT_PATH = "../clustering/intermediate_data/clustered_embeddings.csv"
# Destination CSV: the filtered input rows plus start_sec, end_sec and progress.
OUTPUT_PATH = "./progress_added.csv"
# Progress is rounded to the nearest multiple of 1/NUM_BINS
# (e.g. 3 -> 0, 1/3, 2/3, 1). A value <= 0 disables rounding.
NUM_BINS = 3


def time_to_seconds(t):
    """Return the number of seconds encoded by an MM:SS or HH:MM:SS string.

    Every colon-separated field is converted with ``int`` first, so a
    non-numeric field raises ``ValueError`` regardless of the field count.
    A string with any number of fields other than two or three yields 0.
    """
    fields = [int(piece) for piece in t.split(":")]
    if len(fields) not in (2, 3):
        return 0

    # Base-60 accumulation: each step shifts the running total up one unit
    # (hours -> minutes -> seconds) before adding the next field.
    seconds = 0
    for value in fields:
        seconds = seconds * 60 + value
    return seconds

def extract_start_end_sec(row):
    """Return ``(start_sec, end_sec)`` for a row.

    ``row["start"]`` and ``row["end"]`` are each converted with
    ``time_to_seconds``, start first.
    """
    return tuple(time_to_seconds(row[key]) for key in ("start", "end"))


# Load the input and drop rows whose text is missing or blank.
# .copy() detaches the filtered frame so the column assignments below write
# to an independent DataFrame instead of a slice of the loaded one.
df = pd.read_csv(INPUT_PATH)
df = df[df["text"].notnull() & df["text"].str.strip().astype(bool)].copy()
print(f"Loaded {len(df)} rows")

# Convert the "start"/"end" timestamps to integer seconds, column-wise.
df["start_sec"] = df["start"].map(time_to_seconds)
df["end_sec"] = df["end"].map(time_to_seconds)

# A file's total duration is taken to be the largest end time among its rows.
durations = df.groupby("file")["end_sec"].max()
for file_name, total_duration in durations.items():
    print(f"File {file_name}: total duration {total_duration:.1f} sec")

# Broadcast each file's duration back onto its own rows. transform() keeps
# the original index, so every value stays attached to the right row even
# when the CSV is not sorted by file (a positional list built in groupby
# order would silently misalign in that case).
total_per_row = df.groupby("file")["end_sec"].transform("max")
# Mask non-positive durations so progress is NaN there rather than inf or a
# division-by-zero warning.
total_per_row = total_per_row.where(total_per_row > 0)

# Progress is the segment midpoint as a fraction of the file's duration.
center_time = (df["start_sec"] + df["end_sec"]) / 2
progress = center_time / total_per_row

if NUM_BINS > 0:
    # Snap to the nearest multiple of 1/NUM_BINS.
    bin_size = 1.0 / NUM_BINS
    progress = (progress / bin_size).round() * bin_size

df["progress"] = progress


df.to_csv(OUTPUT_PATH, index=False)
print(f"\nOK: saved {len(df)} rows with progress to {OUTPUT_PATH}")

Loaded 3474 rows
File 210.json: total duration 3648.0 sec
File 211.json: total duration 3242.0 sec
File 212.json: total duration 2447.0 sec
File 213.json: total duration 2759.0 sec
File 214.json: total duration 1626.0 sec
File 215.json: total duration 2193.0 sec
File 216.json: total duration 3062.0 sec
File 217.json: total duration 1761.0 sec
File 218.json: total duration 1574.0 sec
File 219.json: total duration 2783.0 sec
File 220.json: total duration 3593.0 sec
File 221.json: total duration 2433.0 sec
File 222.json: total duration 2152.0 sec
File 223.json: total duration 2094.0 sec
File 224.json: total duration 2461.0 sec
File 225.json: total duration 2490.0 sec

OK: saved 3474 rows with progress to ./progress_added.csv
