In [9]:
import os
import re
import pandas as pd
from pathlib import Path
from datetime import datetime

# Combine the data and remove sub-headers

In [24]:
# Merge TrackMate spot tables with a FIXED schema + SOURCE_FILE
from pathlib import Path
import pandas as pd
import os
from datetime import datetime

BASE = Path("..")                     # notebooks/ as CWD
RAW_DIR = BASE / "data" / "raw"
PROC_DIR = BASE / "data" / "processed"
PROC_DIR.mkdir(parents=True, exist_ok=True)
OUT_CSV = PROC_DIR / "combined_raw_data.csv"

# Canonical schema frozen from your sample (first header line)
CANONICAL_COLS = [
    "LABEL","ID","TRACK_ID","QUALITY","POSITION_X","POSITION_Y","POSITION_Z","POSITION_T",
    "FRAME","RADIUS","VISIBILITY","MANUAL_SPOT_COLOR","MEAN_INTENSITY_CH1","MEDIAN_INTENSITY_CH1",
    "MIN_INTENSITY_CH1","MAX_INTENSITY_CH1","TOTAL_INTENSITY_CH1","STD_INTENSITY_CH1","CONTRAST_CH1",
    "SNR_CH1","ELLIPSE_X0","ELLIPSE_Y0","ELLIPSE_MAJOR","ELLIPSE_MINOR","ELLIPSE_THETA",
    "ELLIPSE_ASPECTRATIO","AREA","PERIMETER","CIRCULARITY","SOLIDITY","SHAPE_INDEX"
]
OUTPUT_COLS = CANONICAL_COLS + ["SOURCE_FILE"]

def read_trackmate_csv(path: Path) -> pd.DataFrame:
    # Keep first header row; skip next 3 rows (friendly names, alt names, units)
    df = pd.read_csv(path, skiprows=[1, 2, 3])

    # Heuristic to drop in-body repeated sub-headers:
    numeric_candidates = [c for c in CANONICAL_COLS if c not in ("LABEL","MANUAL_SPOT_COLOR")]
    for c in numeric_candidates:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    if numeric_candidates:
        keep_cols = [c for c in numeric_candidates if c in df.columns]
        if keep_cols:
            mask_all_nan = df[keep_cols].isna().all(axis=1)
            df = df[~mask_all_nan].reset_index(drop=True)

    # Keep ONLY canonical columns; add missing ones as NA and order them
    for c in CANONICAL_COLS:
        if c not in df.columns:
            df[c] = pd.NA
    df = df[CANONICAL_COLS]

    # Append provenance
    df["SOURCE_FILE"] = path.name   # use str(path) if you prefer full path
    return df

def safe_write_csv(df: pd.DataFrame, dst: Path) -> Path:
    try:
        if dst.exists():
            os.remove(dst)
        df.to_csv(dst, index=False)
        return dst
    except PermissionError:
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        alt = dst.with_name(dst.stem + f"_{ts}" + dst.suffix)
        df.to_csv(alt, index=False)
        print(f"[WARN] PermissionError writing {dst}. Saved as: {alt}")
        return alt

# Collect and read CSVs
paths = sorted({p for ext in ("*.csv","*.CSV") for p in RAW_DIR.rglob(ext)})
if not paths:
    raise FileNotFoundError(f"No CSV files found under {RAW_DIR}")

frames, failed = [], []
for p in paths:
    try:
        frames.append(read_trackmate_csv(p))
    except Exception as e:
        failed.append((str(p), str(e)))

combined = pd.concat(frames, ignore_index=True)[OUTPUT_COLS] if frames else pd.DataFrame(columns=OUTPUT_COLS)

# Save
written_csv = safe_write_csv(combined, OUT_CSV)
print(f"Files read: {len(frames)} | Failed: {len(failed)}")
print(f"Combined shape: {combined.shape}")
print("Saved CSV to:", written_csv)
if failed:
    print("Example failures:", failed[:3])


Files read: 347 | Failed: 0
Combined shape: (62733, 32)
Saved CSV to: ..\data\processed\combined_raw_data.csv


In [25]:
# Display the first few rows to understand the structure and content.
print("\n--- First 5 Rows of the Dataset ---")
combined.head()


--- First 5 Rows of the Dataset ---


Unnamed: 0,LABEL,ID,TRACK_ID,QUALITY,POSITION_X,POSITION_Y,POSITION_Z,POSITION_T,FRAME,RADIUS,...,ELLIPSE_MAJOR,ELLIPSE_MINOR,ELLIPSE_THETA,ELLIPSE_ASPECTRATIO,AREA,PERIMETER,CIRCULARITY,SOLIDITY,SHAPE_INDEX,SOURCE_FILE
0,ID2945,2945,0,209.0,145.948467,95.621082,0.0,3.0,3,8.110457,...,10.486066,7.479732,1.179736,1.401931,206.652412,71.594489,0.50663,0.853061,4.980344,160727_k5_CFP_16-32_spots.csv
1,ID2946,2946,0,220.0,148.047821,94.433312,0.0,7.0,7,8.321153,...,13.582833,5.863332,0.908996,2.316572,217.528855,89.493111,0.341309,0.810313,6.067799,160727_k5_CFP_16-32_spots.csv
2,ID2947,2947,0,195.0,145.259303,91.481847,0.0,11.0,11,7.834106,...,12.629828,5.192997,1.017382,2.432088,192.809667,89.493111,0.302524,0.79918,6.445034,160727_k5_CFP_16-32_spots.csv
3,ID2948,2948,0,234.0,145.152217,91.656074,0.0,21.0,21,8.581834,...,12.06366,6.053673,0.832374,1.992784,231.3716,85.515639,0.397584,0.855576,5.621996,160727_k5_CFP_16-32_spots.csv
4,ID2951,2951,0,219.0,143.997185,89.188898,0.0,12.0,12,8.30222,...,13.837259,5.036212,1.023793,2.747553,216.540087,99.43679,0.275204,0.773852,6.757374,160727_k5_CFP_16-32_spots.csv


In [26]:
# Display basic descriptive statistics for numerical columns.
print("\n--- Descriptive Statistics for Numerical Columns ---")
combined.describe()


--- Descriptive Statistics for Numerical Columns ---


Unnamed: 0,ID,TRACK_ID,QUALITY,POSITION_X,POSITION_Y,POSITION_Z,POSITION_T,FRAME,RADIUS,VISIBILITY,...,ELLIPSE_Y0,ELLIPSE_MAJOR,ELLIPSE_MINOR,ELLIPSE_THETA,ELLIPSE_ASPECTRATIO,AREA,PERIMETER,CIRCULARITY,SOLIDITY,SHAPE_INDEX
count,62733.0,62733.0,62733.0,62733.0,62733.0,62733.0,62733.0,62733.0,62733.0,62733.0,...,62733.0,62733.0,62733.0,62733.0,62731.0,62733.0,62733.0,62733.0,62733.0,62733.0
mean,44793.245453,15.117227,177.698532,99.390484,100.097783,0.0,11.10186,11.10186,5.216079,1.0,...,0.000841,6.819664,4.294763,0.913445,1.625514,89.191319,47.793558,0.49271,0.878626,5.126711
std,26670.920014,15.171854,71.971663,59.471399,62.214191,0.0,7.894153,7.894153,1.087667,0.0,...,0.262692,1.982522,0.961046,1.192663,0.486106,38.24979,13.304966,0.089622,0.055402,0.561732
min,2932.0,0.0,1.0,0.994375,1.60455,0.0,0.0,0.0,0.280506,1.0,...,-3.080265,-0.0,0.0,-1.570679,1.002115,0.247192,1.988734,0.130887,0.459708,4.0
25%,14569.0,4.0,128.0,52.132938,54.212153,0.0,5.0,5.0,4.45683,1.0,...,-0.087661,5.418314,3.648226,0.10499,1.281984,62.402506,38.448933,0.438756,0.859259,4.743416
50%,46856.0,10.0,167.0,86.252984,85.515696,0.0,10.0,10.0,5.073308,1.0,...,-8.2e-05,6.434353,4.195993,0.87816,1.500188,80.859747,45.078061,0.508034,0.895238,4.973459
75%,67938.0,21.0,214.0,136.7132,130.784498,0.0,16.0,16.0,5.842186,1.0,...,0.087599,7.836629,4.824285,1.891467,1.828635,107.226131,54.358818,0.558505,0.915371,5.351719
max,88245.0,97.0,770.0,361.456676,387.71835,0.0,37.0,37.0,8.907281,1.0,...,3.365477,22.086081,9.499973,3.141457,5.883851,249.252887,136.560207,0.785398,1.0,9.798441


# Missing value

In [27]:
# Check for missing values in each column
print("\n--- Missing Values Before Cleaning ---")
print(combined.isnull().sum())



--- Missing Values Before Cleaning ---
LABEL                       0
ID                          0
TRACK_ID                    0
QUALITY                     0
POSITION_X                  0
POSITION_Y                  0
POSITION_Z                  0
POSITION_T                  0
FRAME                       0
RADIUS                      0
VISIBILITY                  0
MANUAL_SPOT_COLOR       62733
MEAN_INTENSITY_CH1          0
MEDIAN_INTENSITY_CH1        0
MIN_INTENSITY_CH1           0
MAX_INTENSITY_CH1           0
TOTAL_INTENSITY_CH1         0
STD_INTENSITY_CH1           2
CONTRAST_CH1                0
SNR_CH1                     2
ELLIPSE_X0                  0
ELLIPSE_Y0                  0
ELLIPSE_MAJOR               0
ELLIPSE_MINOR               0
ELLIPSE_THETA               0
ELLIPSE_ASPECTRATIO         2
AREA                        0
PERIMETER                   0
CIRCULARITY                 0
SOLIDITY                    0
SHAPE_INDEX                 0
SOURCE_FILE                 0


In [None]:
import re
import pandas as pd
from pathlib import Path

ROOT = Path("..")
INPUT  = ROOT / "data/processed/combined_raw_data.csv"
OUTPUT = ROOT / "data/processed/combined_cleaned.csv" 

df = pd.read_csv(INPUT)
print("Original shape:", df.shape)

df = df.drop(columns=["POSITION_Z"], errors="ignore")
na_rate = df.isna().mean()
to_drop_by_na = na_rate[na_rate > 0.99].index.tolist()


ch23_cols = [c for c in df.columns if re.search(r'(?:^|_)CH[23](?:_|$)', c)]

manual_drop = [c for c in ["MANUAL_SPOT_COLOR"] if c in df.columns]

to_drop = sorted(set(to_drop_by_na + ch23_cols + manual_drop))
print(f"Drop {len(to_drop)} columns:", to_drop)

df = df.drop(columns=to_drop, errors="ignore")


tiny_na_cols = [c for c in ["STD_INTENSITY_CH1", "SNR_CH1"] if c in df.columns]
if tiny_na_cols:
    na_rows_mask = df[tiny_na_cols].isna().any(axis=1)
    print(f"Rows to drop due to NA in {tiny_na_cols}: {int(na_rows_mask.sum())}")
    df = df.loc[~na_rows_mask].reset_index(drop=True)

print("New shape:", df.shape)

missing_pct = ((df.isna().sum()/len(df))*100).round(2).sort_values(ascending=False).head(20)
print("\nTop-20 missing % after cleaning:\n", missing_pct)

OUTPUT.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT, index=False)
print("Saved ->", OUTPUT.resolve())


Original shape: (62733, 32)
Drop 1 columns: ['MANUAL_SPOT_COLOR']
Rows to drop due to NA in ['STD_INTENSITY_CH1', 'SNR_CH1']: 2
New shape: (62731, 30)

Top-20 missing % after cleaning:
 LABEL                   0.0
ID                      0.0
SHAPE_INDEX             0.0
SOLIDITY                0.0
CIRCULARITY             0.0
PERIMETER               0.0
AREA                    0.0
ELLIPSE_ASPECTRATIO     0.0
ELLIPSE_THETA           0.0
ELLIPSE_MINOR           0.0
ELLIPSE_MAJOR           0.0
ELLIPSE_Y0              0.0
ELLIPSE_X0              0.0
SNR_CH1                 0.0
CONTRAST_CH1            0.0
STD_INTENSITY_CH1       0.0
TOTAL_INTENSITY_CH1     0.0
MAX_INTENSITY_CH1       0.0
MIN_INTENSITY_CH1       0.0
MEDIAN_INTENSITY_CH1    0.0
dtype: float64
Saved -> D:\Users\Yuhan_Li\Desktop\dataproject\MAST90107\data\processed\combined_cleaned.csv


In [29]:
# Double check the missing value
((df.isna().sum() / len(df) * 100).round(2)
 .sort_values(ascending=False).head(20))


LABEL                   0.0
STD_INTENSITY_CH1       0.0
SHAPE_INDEX             0.0
SOLIDITY                0.0
CIRCULARITY             0.0
PERIMETER               0.0
AREA                    0.0
ELLIPSE_ASPECTRATIO     0.0
ELLIPSE_THETA           0.0
ELLIPSE_MINOR           0.0
ELLIPSE_MAJOR           0.0
ELLIPSE_Y0              0.0
ELLIPSE_X0              0.0
SNR_CH1                 0.0
CONTRAST_CH1            0.0
TOTAL_INTENSITY_CH1     0.0
ID                      0.0
MAX_INTENSITY_CH1       0.0
MIN_INTENSITY_CH1       0.0
MEDIAN_INTENSITY_CH1    0.0
dtype: float64