# P0 - Data Gating & Scope Lock

## Setup Path + Output Dir

In [1]:
# P0.0 — Setup
import os, json
from pathlib import Path
import pandas as pd

# Kaggle dataset root (read-only input mount, as shown in the dataset browser).
IN_DIR = Path("/kaggle/input/driving-video-with-object-tracking")

# The label table is looked for in two formats.
LABEL_CSV = IN_DIR.joinpath("mot_labels.csv")
LABEL_PARQUET = IN_DIR.joinpath("mot_labels.parquet")

# Videos live under the BDD100K directory layout.
VIDEO_DIR = IN_DIR.joinpath("bdd100k_videos_train_00", "bdd100k", "videos", "train")

# Everything produced by phase P0 is written below this directory.
PREP_DIR = Path("/kaggle/working/preprocessed_v2")
P0_DIR = PREP_DIR.joinpath("P0")
P0_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a broken mount is visible immediately.
print("IN_DIR:", IN_DIR)
print("LABEL_CSV exists:", LABEL_CSV.exists(), LABEL_CSV)
print("LABEL_PARQUET exists:", LABEL_PARQUET.exists(), LABEL_PARQUET)
print("VIDEO_DIR exists:", VIDEO_DIR.exists(), VIDEO_DIR)
print("P0_DIR:", P0_DIR)

IN_DIR: /kaggle/input/driving-video-with-object-tracking
LABEL_CSV exists: True /kaggle/input/driving-video-with-object-tracking/mot_labels.csv
LABEL_PARQUET exists: True /kaggle/input/driving-video-with-object-tracking/mot_labels.parquet
VIDEO_DIR exists: True /kaggle/input/driving-video-with-object-tracking/bdd100k_videos_train_00/bdd100k/videos/train
P0_DIR: /kaggle/working/preprocessed_v2/P0


## Load Labels (prefer parquet) + Schema Peek

In [2]:
# P0.1 — Load labels (parquet is preferred, CSV is the fallback)
_label_readers = (
    ("parquet", LABEL_PARQUET, pd.read_parquet),
    ("csv", LABEL_CSV, pd.read_csv),
)
for label_source, _label_path, _read in _label_readers:
    if _label_path.exists():
        labels = _read(_label_path)
        break
else:
    # Neither file is present: nothing downstream can run.
    raise FileNotFoundError("mot_labels.csv/parquet tidak ditemukan di IN_DIR.")

print("Loaded labels from:", label_source)
print("Shape:", labels.shape)
print("Columns:", list(labels.columns))
labels.head()

Loaded labels from: parquet
Shape: (2890846, 13)
Columns: ['name', 'videoName', 'frameIndex', 'id', 'category', 'attributes.crowd', 'attributes.occluded', 'attributes.truncated', 'box2d.x1', 'box2d.x2', 'box2d.y1', 'box2d.y2', 'haveVideo']


Unnamed: 0,name,videoName,frameIndex,id,category,attributes.crowd,attributes.occluded,attributes.truncated,box2d.x1,box2d.x2,box2d.y1,box2d.y2,haveVideo
0,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89537,car,False,True,False,825.17321,1003.094688,355.011547,418.198614,True
1,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89538,car,False,True,False,484.295612,700.461894,346.69746,424.849885,True
2,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89539,pedestrian,False,True,False,645.588915,663.879908,338.383372,358.337182,True
3,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89540,car,False,False,False,120.969977,192.471132,359.168591,409.053118,True
4,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89541,car,False,False,False,251.501155,315.51963,354.180139,400.73903,True


## Detect Key Columns (video key + haveVideo)

In [3]:
# P0.2 — Detect required columns
# Case-insensitive lookup table: lower-cased column name -> original spelling.
cols = {c.lower(): c for c in labels.columns}

# Locate the haveVideo flag column (first matching candidate wins).
havevideo_col = next(
    (cols[cand] for cand in ("havevideo", "have_video", "hasvideo", "has_video") if cand in cols),
    None,
)
if havevideo_col is None:
    raise KeyError("Kolom haveVideo tidak ditemukan. Cek nama kolom di labels.")

# Locate the video identifier column.
# "name" is deliberately not a candidate: in this label table it holds the
# per-frame image filename (e.g. "<video>-0000001.jpg"), not a video id, and
# the candidate lists used by phases P1-P3 do not accept it either. Falling
# back to it would silently turn every frame into its own "video".
video_candidates = [
    "videoname", "video_name", "video", "videoid", "video_id",
    "videokey", "video_key", "filename", "file_name"
]
video_col = next((cols[cand] for cand in video_candidates if cand in cols), None)
if video_col is None:
    # Nothing matched: list the columns that merely contain "video" to help debugging.
    video_like = [c for c in labels.columns if "video" in c.lower()]
    raise KeyError(f"Kolom video id tidak ketemu. Kandidat mengandung 'video': {video_like}")

print("Using columns:")
print(" - haveVideo:", havevideo_col)
print(" - video:", video_col)

# Normalize haveVideo to a real boolean column.
hv = labels[havevideo_col]
if hv.dtype == bool:
    havevideo_use = havevideo_col
else:
    # Common encodings: 0/1, "True"/"False", yes/no. "1.0" covers 0/1 columns
    # that were parsed as floats (e.g. because the column contained a NaN).
    truthy = {"1", "1.0", "true", "yes", "y", "t"}
    labels["_haveVideo_bool"] = hv.astype(str).str.strip().str.lower().isin(truthy)
    havevideo_use = "_haveVideo_bool"

Using columns:
 - haveVideo: haveVideo
 - video: videoName


## Filter haveVideo == True → labels_v2

In [4]:
# P0.3 — Keep only the label rows whose haveVideo flag is True
_has_video = labels[havevideo_use].eq(True)
labels_v2 = labels[_has_video].copy()

print("Before:", labels.shape)
print("After (haveVideo==True):", labels_v2.shape)

# Persist the filtered table for the later phases.
labels_v2_path = P0_DIR.joinpath("labels_v2.parquet")
labels_v2.to_parquet(labels_v2_path, index=False)
print("Saved:", labels_v2_path)

Before: (2890846, 13)
After (haveVideo==True): (1922517, 13)
Saved: /kaggle/working/preprocessed_v2/P0/labels_v2.parquet


## Build video_list_v2 dari labels_v2

In [5]:
# P0.4 — Build video_list_v2: one row per distinct, non-null video key, sorted
_video_keys = (
    labels_v2[video_col]
    .dropna()
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
)
video_list_v2 = _video_keys.rename("video").to_frame()

video_list_path = P0_DIR.joinpath("video_list_v2.parquet")
video_list_v2.to_parquet(video_list_path, index=False)

print("Unique videos in labels_v2:", len(video_list_v2))
print("Saved:", video_list_path)
video_list_v2.head()

Unique videos in labels_v2: 961
Saved: /kaggle/working/preprocessed_v2/P0/video_list_v2.parquet


Unnamed: 0,video
0,0000f77c-6257be58
1,0000f77c-62c2a288
2,0000f77c-cb820c98
3,0001542f-5ce3cf52
4,0001542f-7c670be8


## Sanity Check: video file existence + missing_videos

In [6]:
# P0.5 — Check physical video files
if not VIDEO_DIR.exists():
    raise FileNotFoundError(f"VIDEO_DIR tidak ditemukan: {VIDEO_DIR}")

# Collect every file with a known video extension, ordered by path.
video_files = sorted(
    path
    for pattern in ("*.mp4", "*.mov", "*.avi", "*.mkv")
    for path in VIDEO_DIR.glob(pattern)
)
# Bare filenames (extension included) for constant-time membership tests.
video_files_set = {path.name for path in video_files}

print("Total video files found:", len(video_files))
print("Example files:", video_files[:5])

# Helper: translate a label-side video key into the filenames it could have on disk
def possible_names(v):
    """Return the candidate filenames for video key *v*.

    A key whose final path component already contains a dot is assumed to
    carry its extension, and only that component is returned. Otherwise the
    key (as given, stringified) is paired with each known video extension.
    """
    raw = str(v)
    basename = Path(raw).name
    if "." not in basename:
        return [raw + suffix for suffix in (".mp4", ".mov", ".avi", ".mkv")]
    return [basename]

# Videos the labels claim to have (haveVideo True) but whose file is absent on disk.
missing_videos = [
    v
    for v in video_list_v2["video"].tolist()
    if video_files_set.isdisjoint(possible_names(v))
]
# Every listed video is either found or missing, so the found count is the remainder.
found_videos = len(video_list_v2) - len(missing_videos)

print("Videos in labels_v2:", len(video_list_v2))
print("Found physically:", found_videos)
print("Missing physically:", len(missing_videos))

Total video files found: 1000
Example files: [PosixPath('/kaggle/input/driving-video-with-object-tracking/bdd100k_videos_train_00/bdd100k/videos/train/0000f77c-6257be58.mov'), PosixPath('/kaggle/input/driving-video-with-object-tracking/bdd100k_videos_train_00/bdd100k/videos/train/0000f77c-62c2a288.mov'), PosixPath('/kaggle/input/driving-video-with-object-tracking/bdd100k_videos_train_00/bdd100k/videos/train/0000f77c-cb820c98.mov'), PosixPath('/kaggle/input/driving-video-with-object-tracking/bdd100k_videos_train_00/bdd100k/videos/train/0001542f-5ce3cf52.mov'), PosixPath('/kaggle/input/driving-video-with-object-tracking/bdd100k_videos_train_00/bdd100k/videos/train/0001542f-7c670be8.mov')]
Videos in labels_v2: 961
Found physically: 961
Missing physically: 0


## Find videos_no_labels (file fisik ada, tapi tidak muncul di labels_v2)

In [7]:
# P0.6 — videos_no_labels
# Map each labelled video to a filename: the first candidate that exists on
# disk, or the first candidate overall when none of them exists.
label_video_filenames = set()
for v in video_list_v2["video"].tolist():
    names = possible_names(v)
    on_disk = next((n for n in names if n in video_files_set), names[0])
    label_video_filenames.add(on_disk)

# Files present on disk that no label row refers to.
videos_no_labels = sorted(video_files_set - label_video_filenames)

print("videos_no_labels (physical exists but not in labels_v2):", len(videos_no_labels))
print("Example:", videos_no_labels[:10])

videos_no_labels (physical exists but not in labels_v2): 39
Example: ['0004974f-05e1c285.mov', '002f8552-0cdd55c6.mov', '00495359-1d04dd8a.mov', '0060b445-5acc00ed.mov', '007eddfc-f8a80310.mov', '0081e3ea-cc69a1c4.mov', '00ca8821-17667a58.mov', '00cea101-293a30b5.mov', '00e100a8-a5e1ece9.mov', '00e5e793-22614772.mov']


## Save audit_report.json

In [8]:
# P0.7 — Save audit report
# Which label columns were used, and under which name after normalization.
_used_columns = {
    "video_col": video_col,
    "haveVideo_col_original": havevideo_col,
    "haveVideo_col_used": havevideo_use,
}

# Counts plus capped samples (first 50) of the two mismatch lists.
audit = {
    "input_dir": str(IN_DIR),
    "video_dir": str(VIDEO_DIR),
    "label_source": label_source,
    "n_labels_raw": len(labels),
    "n_labels_haveVideo_true": len(labels_v2),
    "n_unique_videos_labels_v2": len(video_list_v2),
    "n_video_files_physical": len(video_files_set),
    "n_missing_videos_physical": len(missing_videos),
    "n_videos_no_labels": len(videos_no_labels),
    "missing_videos_sample": missing_videos[:50],
    "videos_no_labels_sample": videos_no_labels[:50],
    "used_columns": _used_columns,
}

audit_path = P0_DIR.joinpath("audit_report.json")
audit_path.write_text(json.dumps(audit, indent=2))

print("Saved:", audit_path)
audit

Saved: /kaggle/working/preprocessed_v2/P0/audit_report.json


{'input_dir': '/kaggle/input/driving-video-with-object-tracking',
 'video_dir': '/kaggle/input/driving-video-with-object-tracking/bdd100k_videos_train_00/bdd100k/videos/train',
 'label_source': 'parquet',
 'n_labels_raw': 2890846,
 'n_labels_haveVideo_true': 1922517,
 'n_unique_videos_labels_v2': 961,
 'n_video_files_physical': 1000,
 'n_missing_videos_physical': 0,
 'n_videos_no_labels': 39,
 'missing_videos_sample': [],
 'videos_no_labels_sample': ['0004974f-05e1c285.mov',
  '002f8552-0cdd55c6.mov',
  '00495359-1d04dd8a.mov',
  '0060b445-5acc00ed.mov',
  '007eddfc-f8a80310.mov',
  '0081e3ea-cc69a1c4.mov',
  '00ca8821-17667a58.mov',
  '00cea101-293a30b5.mov',
  '00e100a8-a5e1ece9.mov',
  '00e5e793-22614772.mov',
  '0114bdd0-f317da84.mov',
  '01306b58-1c8ac4c9.mov',
  '013742f1-3a043a4e.mov',
  '01c231b4-1ebf841a.mov',
  '0204aad7-4ebd19fa.mov',
  '02065c11-0966f65d.mov',
  '022aca7c-9e9a3905.mov',
  '0252f2b5-741ac12c.mov',
  '025337f4-9404c4de.mov',
  '02537d58-3972c3e7.mov',
  '0255

# P1 - Frame Timeline Audit (untuk strategi “missing frame index”)

## Setup path + load labels_v2 + video_list_v2

In [9]:
# P1.0 — Setup
import os
from pathlib import Path
import pandas as pd
import numpy as np

# Output tree: P0 artefacts are read here, P1 artefacts are written here.
PREP_DIR = Path("/kaggle/working/preprocessed_v2")
P0_DIR = PREP_DIR.joinpath("P0")
P1_DIR = PREP_DIR.joinpath("P1")
P1_DIR.mkdir(parents=True, exist_ok=True)

# Same input locations as in P0.
IN_DIR = Path("/kaggle/input/driving-video-with-object-tracking")
VIDEO_DIR = IN_DIR.joinpath("bdd100k_videos_train_00", "bdd100k", "videos", "train")

# Reload the P0 outputs from disk so this phase can run on its own.
labels_v2_path = P0_DIR.joinpath("labels_v2.parquet")
video_list_path = P0_DIR.joinpath("video_list_v2.parquet")
labels_v2 = pd.read_parquet(labels_v2_path)
video_list_v2 = pd.read_parquet(video_list_path)

print("labels_v2:", labels_v2.shape, "from", labels_v2_path)
print("video_list_v2:", video_list_v2.shape, "from", video_list_path)
print("VIDEO_DIR exists:", VIDEO_DIR.exists(), VIDEO_DIR)
labels_v2.head()

labels_v2: (1922517, 13) from /kaggle/working/preprocessed_v2/P0/labels_v2.parquet
video_list_v2: (961, 1) from /kaggle/working/preprocessed_v2/P0/video_list_v2.parquet
VIDEO_DIR exists: True /kaggle/input/driving-video-with-object-tracking/bdd100k_videos_train_00/bdd100k/videos/train


Unnamed: 0,name,videoName,frameIndex,id,category,attributes.crowd,attributes.occluded,attributes.truncated,box2d.x1,box2d.x2,box2d.y1,box2d.y2,haveVideo
0,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89537,car,False,True,False,825.17321,1003.094688,355.011547,418.198614,True
1,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89538,car,False,True,False,484.295612,700.461894,346.69746,424.849885,True
2,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89539,pedestrian,False,True,False,645.588915,663.879908,338.383372,358.337182,True
3,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89540,car,False,False,False,120.969977,192.471132,359.168591,409.053118,True
4,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89541,car,False,False,False,251.501155,315.51963,354.180139,400.73903,True


## Detect kolom video dan frameIndex secara aman

In [10]:
# P1.1 — Detect key columns
# Case-insensitive lookup table: lower-cased column name -> original spelling.
cols = {c.lower(): c for c in labels_v2.columns}

# Video column (the P0 audit reported videoName for this dataset).
video_col = next(
    (
        cols[cand]
        for cand in (
            "videoname", "video_name", "video", "videoid", "video_id",
            "videokey", "video_key", "filename", "file_name",
        )
        if cand in cols
    ),
    None,
)
if video_col is None:
    video_like = [c for c in labels_v2.columns if "video" in c.lower()]
    raise KeyError(f"Kolom video tidak ketemu. Kandidat: {video_like}")

# Frame index column.
frame_col = next(
    (
        cols[cand]
        for cand in ("frameindex", "frame_index", "frame", "frameid", "frame_id")
        if cand in cols
    ),
    None,
)
if frame_col is None:
    frame_like = [c for c in labels_v2.columns if "frame" in c.lower()]
    raise KeyError(f"Kolom frameIndex tidak ketemu. Kandidat: {frame_like}")

print("Using:")
print(" - video_col:", video_col)
print(" - frame_col:", frame_col)

# Coerce the frame index to numbers; unparsable entries become NaN and are dropped.
labels_v2[frame_col] = pd.to_numeric(labels_v2[frame_col], errors="coerce")
bad_frame = labels_v2[frame_col].isna().sum()
print("NaN frameIndex after coercion:", bad_frame)
if bad_frame > 0:
    labels_v2 = labels_v2.dropna(subset=[frame_col]).copy()

# Cast unconditionally so the column is integer-typed whether or not any NaN
# had to be dropped (previously a clean float-typed column stayed float).
labels_v2[frame_col] = labels_v2[frame_col].astype(int)

Using:
 - video_col: videoName
 - frame_col: frameIndex
NaN frameIndex after coercion: 0


## Map video name → file path (handle ext .mov/.mp4)

In [11]:
# P1.2 — Resolve video file paths
# Index every video file found in VIDEO_DIR by its bare filename.
video_files = sorted(
    path
    for pattern in ("*.mov", "*.mp4", "*.avi", "*.mkv")
    for path in VIDEO_DIR.glob(pattern)
)
video_files_by_name = {path.name: path for path in video_files}


def resolve_video_path(vname: str) -> Path | None:
    """Return the on-disk path for video key *vname*, or ``None`` if absent.

    A key whose final path component contains a dot is looked up by that
    component as-is; otherwise each known extension is tried in turn.
    """
    key = str(vname)
    basename = Path(key).name
    if "." in basename:
        return video_files_by_name.get(basename)
    with_ext = (f"{key}{suffix}" for suffix in (".mov", ".mp4", ".avi", ".mkv"))
    return next(
        (video_files_by_name[name] for name in with_ext if name in video_files_by_name),
        None,
    )


# Attach the resolved path to every listed video (working on a copy).
video_list_v2 = video_list_v2.assign(
    video_path=video_list_v2["video"].apply(resolve_video_path)
)

_unresolved = video_list_v2["video_path"].isna()
missing_paths = _unresolved.sum()
print("Videos in list:", len(video_list_v2))
print("Missing resolved paths:", missing_paths)
if missing_paths > 0:
    # P0 reported zero missing files, so any miss here is a hard error.
    print("Example missing:", video_list_v2[_unresolved].head(10))
    raise RuntimeError("Ada video di video_list_v2 yang tidak ditemukan file fisiknya. Seharusnya 0 sesuai P0.")

Videos in list: 961
Missing resolved paths: 0


## Extract video_meta (n_frames, fps, width, height) via OpenCV

In [12]:
# P1.3 — Extract video metadata with OpenCV
import cv2
from time import time

records = []
t0 = time()


def _capture_prop(cap, prop_id, cast):
    """Read one VideoCapture property, mapping unusable readings to NaN.

    OpenCV reports 0 for a property the backend cannot provide, so both
    non-finite and non-positive readings are treated as "unknown". This keeps
    later ratio computations from dividing by zero.
    """
    raw = cap.get(prop_id)
    if not np.isfinite(raw) or raw <= 0:
        return np.nan
    return cast(raw)


for i, row in enumerate(video_list_v2.itertuples(index=False), 1):
    vname = row.video
    vpath = str(row.video_path)

    cap = cv2.VideoCapture(vpath)
    try:
        opened = bool(cap.isOpened())
        if opened:
            meta = {
                "n_frames": _capture_prop(cap, cv2.CAP_PROP_FRAME_COUNT, int),
                "fps": _capture_prop(cap, cv2.CAP_PROP_FPS, float),
                "width": _capture_prop(cap, cv2.CAP_PROP_FRAME_WIDTH, int),
                "height": _capture_prop(cap, cv2.CAP_PROP_FRAME_HEIGHT, int),
            }
        else:
            # Keep a row for unreadable files so they remain visible in the report.
            meta = dict.fromkeys(("n_frames", "fps", "width", "height"), np.nan)
    finally:
        # Release the handle on every path, including the not-opened one.
        cap.release()

    records.append({"video": vname, "video_path": vpath, "opened": opened, **meta})

    # Progress log every 50 videos (also emitted when the current video failed to open).
    if i % 50 == 0:
        print(f"[{i}/{len(video_list_v2)}] done. elapsed={(time()-t0):.1f}s")

video_meta = pd.DataFrame.from_records(records)
print("video_meta:", video_meta.shape)
print("opened false:", (~video_meta["opened"]).sum())
video_meta.head()

[50/961] done. elapsed=1.2s
[100/961] done. elapsed=2.2s
[150/961] done. elapsed=3.2s
[200/961] done. elapsed=4.3s
[250/961] done. elapsed=5.2s
[300/961] done. elapsed=6.3s
[350/961] done. elapsed=7.2s
[400/961] done. elapsed=8.2s
[450/961] done. elapsed=9.2s
[500/961] done. elapsed=10.3s
[550/961] done. elapsed=11.3s
[600/961] done. elapsed=12.3s
[650/961] done. elapsed=13.3s
[700/961] done. elapsed=14.3s
[750/961] done. elapsed=15.3s
[800/961] done. elapsed=16.4s
[850/961] done. elapsed=17.4s
[900/961] done. elapsed=18.3s
[950/961] done. elapsed=19.4s
video_meta: (961, 7)
opened false: 0


Unnamed: 0,video,video_path,opened,n_frames,fps,width,height
0,0000f77c-6257be58,/kaggle/input/driving-video-with-object-tracki...,True,1217,30.148637,1280,720
1,0000f77c-62c2a288,/kaggle/input/driving-video-with-object-tracki...,True,1211,30.040931,1280,720
2,0000f77c-cb820c98,/kaggle/input/driving-video-with-object-tracki...,True,1215,30.151377,1280,720
3,0001542f-5ce3cf52,/kaggle/input/driving-video-with-object-tracki...,True,1213,30.20293,1280,720
4,0001542f-7c670be8,/kaggle/input/driving-video-with-object-tracki...,True,1215,30.126457,1280,720


## Save video_meta.parquet

In [13]:
# P1.4 — Persist video_meta for the later phases
video_meta_path = P1_DIR.joinpath("video_meta.parquet")
video_meta.to_parquet(video_meta_path, index=False)
print("Saved:", video_meta_path)

Saved: /kaggle/working/preprocessed_v2/P1/video_meta.parquet


## Compute frame coverage metrics per video

In [14]:
# P1.5 (FIX) — Frame coverage report (robust)
import numpy as np
import pandas as pd

# 1) Per-video label statistics: row count, distinct labelled frame indices, min/max index.
#    Built-in aggregations replace the per-group Python lambdas; the explicit
#    cast keeps the three index-derived columns integer-typed.
agg = (
    labels_v2
    .groupby(video_col)[frame_col]
    .agg(
        label_rows="size",
        label_unique_frames="nunique",
        frame_min="min",
        frame_max="max",
    )
    .astype({"label_unique_frames": int, "frame_min": int, "frame_max": int})
    .reset_index()
    .rename(columns={video_col: "video"})
)

# 2) Gap statistics per video, computed on the sorted distinct frame indices.
#    A "gap" is any step larger than 1 between consecutive labelled indices.
gap_records = []
for v, sub in labels_v2.groupby(video_col):
    u = np.unique(sub[frame_col].to_numpy())
    if u.size <= 1:
        # Zero or one labelled frame: there are no steps to inspect.
        gap_records.append({"video": v, "gap_count": 0, "max_gap": 0, "gap_ratio": 0.0})
        continue
    d = np.diff(u)
    is_gap = d > 1
    gap_count = int(is_gap.sum())
    max_gap = int(d[is_gap].max()) if is_gap.any() else 0
    gap_ratio = float(gap_count / (u.size - 1))
    gap_records.append({"video": v, "gap_count": gap_count, "max_gap": max_gap, "gap_ratio": gap_ratio})

gap_df = pd.DataFrame(gap_records)

# 3) Merge with video_meta
frame_cov = (
    agg
    .merge(gap_df, on="video", how="left")
    .merge(video_meta[["video","n_frames","fps","width","height","opened"]], on="video", how="left")
)

# 4) Compute ratios safely: a missing or non-positive frame count yields NaN
#    instead of inf, so the summary's dropna() and JSON export stay valid.
# NOTE(review): frame_col holds label frame indices while n_frames is the
# native frame count reported by OpenCV. If labels are annotated at a lower
# rate than the video (the output shows ~1/6 coverage with zero gaps), these
# ratios measure sampling density rather than how much of the video's duration
# is labelled — confirm the index convention before acting on the flags below.
n_frames_valid = frame_cov["n_frames"].where(frame_cov["n_frames"] > 0)
frame_cov["coverage_ratio"] = frame_cov["label_unique_frames"] / n_frames_valid
frame_cov["max_covered_ratio"] = (frame_cov["frame_max"] + 1) / n_frames_valid

# 5) Flags (comparisons against NaN evaluate to False, i.e. "not flagged")
frame_cov["flag_sparse_labels"] = frame_cov["coverage_ratio"] < 0.2
frame_cov["flag_low_max_covered"] = frame_cov["max_covered_ratio"] < 0.5
frame_cov["flag_fragmented_timeline"] = frame_cov["gap_ratio"] > 0.2

print("frame_coverage_report:", frame_cov.shape)
print("Columns:", list(frame_cov.columns))
frame_cov.sort_values("coverage_ratio").head(10)

frame_coverage_report: (961, 18)
Columns: ['video', 'label_rows', 'label_unique_frames', 'frame_min', 'frame_max', 'gap_count', 'max_gap', 'gap_ratio', 'n_frames', 'fps', 'width', 'height', 'opened', 'coverage_ratio', 'max_covered_ratio', 'flag_sparse_labels', 'flag_low_max_covered', 'flag_fragmented_timeline']


Unnamed: 0,video,label_rows,label_unique_frames,frame_min,frame_max,gap_count,max_gap,gap_ratio,n_frames,fps,width,height,opened,coverage_ratio,max_covered_ratio,flag_sparse_labels,flag_low_max_covered,flag_fragmented_timeline
23,000f157f-30b30f5e,2767,202,0,201,0,0,0.0,1216,30.295229,1280,720,True,0.166118,0.166118,True,True,False
44,0024b742-acbed4fb,622,102,0,101,0,0,0.0,614,30.423652,1280,720,True,0.166124,0.166124,True,True,False
474,01460ec4-a1d65b66,977,201,0,200,0,0,0.0,1209,30.172199,1280,720,True,0.166253,0.166253,True,True,False
592,01a4deab-9d5e5a17,1569,202,0,201,0,0,0.0,1215,30.376266,1280,720,True,0.166255,0.166255,True,True,False
904,029556a0-0ec9fa6f,4340,202,0,201,0,0,0.0,1214,30.204014,1280,720,True,0.166392,0.166392,True,True,False
473,01460ec4-a12a1552,475,203,0,202,0,0,0.0,1220,30.215471,1280,720,True,0.166393,0.166393,True,True,False
230,0096bcca-bfb5ea6c,372,202,0,201,0,0,0.0,1213,30.181637,1280,720,True,0.166529,0.166529,True,True,False
108,0048f391-e9bfaf62,397,202,0,201,0,0,0.0,1213,30.231785,1280,720,True,0.166529,0.166529,True,True,False
3,0001542f-5ce3cf52,1548,202,0,201,0,0,0.0,1213,30.20293,1280,720,True,0.166529,0.166529,True,True,False
898,02927def-14c4c3cb,1904,203,0,202,0,0,0.0,1219,30.201924,1280,720,True,0.16653,0.16653,True,True,False


## Save frame_coverage_report.parquet + quick summary

In [15]:
# P1.6 — Save report + summary
# Persist the per-video coverage table first; the summary is derived from it.
frame_cov_path = P1_DIR.joinpath("frame_coverage_report.parquet")
frame_cov.to_parquet(frame_cov_path, index=False)
print("Saved:", frame_cov_path)

# Quick summary
def q(x):
    """Return the min, quartiles, 90th/95th percentiles and max of *x*.

    *x* must expose a pandas-style ``quantile`` method; the result is
    whatever that method returns for the seven levels requested here.
    """
    levels = [0.0, 0.25, 0.5, 0.75, 0.9, 0.95, 1.0]
    return x.quantile(levels)

import json

# Headline counts first, then quantiles per ratio column, then flag totals
# (insertion order is the order written to the JSON file).
summary = {
    "n_videos": int(frame_cov["video"].nunique()),
    "n_opened_false": int((~frame_cov["opened"]).sum()),
}
for _key, _column in (
    ("coverage_ratio_quantiles", "coverage_ratio"),
    ("gap_ratio_quantiles", "gap_ratio"),
    ("max_covered_ratio_quantiles", "max_covered_ratio"),
):
    summary[_key] = q(frame_cov[_column].dropna()).to_dict()
for _key, _flag in (
    ("n_sparse_labels(coverage<0.2)", "flag_sparse_labels"),
    ("n_fragmented_timeline(gap_ratio>0.2)", "flag_fragmented_timeline"),
    ("n_low_max_covered(max_covered_ratio<0.5)", "flag_low_max_covered"),
):
    summary[_key] = int(frame_cov[_flag].sum())

summary_path = P1_DIR.joinpath("frame_coverage_summary.json")
with summary_path.open("w") as f:
    json.dump(summary, f, indent=2)

print("Saved:", summary_path)
summary

Saved: /kaggle/working/preprocessed_v2/P1/frame_coverage_report.parquet
Saved: /kaggle/working/preprocessed_v2/P1/frame_coverage_summary.json


{'n_videos': 961,
 'n_opened_false': 0,
 'coverage_ratio_quantiles': {0.0: 0.16611842105263158,
  0.25: 0.1672185430463576,
  0.5: 0.16763485477178422,
  0.75: 0.1679790026246719,
  0.9: 0.16833333333333333,
  0.95: 0.16875522138680032,
  1.0: 0.17147707979626486},
 'gap_ratio_quantiles': {0.0: 0.0,
  0.25: 0.0,
  0.5: 0.0,
  0.75: 0.0,
  0.9: 0.0,
  0.95: 0.0,
  1.0: 0.0},
 'max_covered_ratio_quantiles': {0.0: 0.16611842105263158,
  0.25: 0.1672185430463576,
  0.5: 0.16763485477178422,
  0.75: 0.1679790026246719,
  0.9: 0.16833333333333333,
  0.95: 0.16875522138680032,
  1.0: 0.17147707979626486},
 'n_sparse_labels(coverage<0.2)': 961,
 'n_fragmented_timeline(gap_ratio>0.2)': 0,
 'n_low_max_covered(max_covered_ratio<0.5)': 961}

## 🔍 Interpretasi P1 — Frame Timeline Audit

### 1️⃣ Jumlah Video dan Validitas Metadata

**Jumlah video:** 961 video

**Status:** Semua video bisa dibuka (`n_opened_false = 0`) ✅

**Kesimpulan:** Metadata video valid 100%

---

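### 2️⃣ Coverage frameIndex (TEMUAN TERBESAR)

**Coverage ratio:** ~0.167 untuk SEMUA video

**Artinya:** Jumlah frameIndex berlabel hanya ~16.7% (≈1/6) dari jumlah frame video, dan ini konsisten di semua video (quantile hampir sama)

#### 👉 Interpretasi Penting

- Ini **BUKAN** error atau data rusak
- Ini adalah **DESAIN DATASET**

**Pada dataset ini (lihat `video_meta` dan `frame_coverage_report` di atas):**
- Video memiliki ±1200 frame pada ~30 fps (±40 detik)
- Label hanya pada ~200 frameIndex per video, yaitu ≈1/6 dari jumlah frame — konsisten dengan anotasi ~5 fps pada video ~30 fps

#### 📌 Bukti Kuat

`coverage_ratio == max_covered_ratio`

**Artinya:** frameIndex berlabel kontinu dari 0 sampai `frame_max` tanpa celah (jumlah frame unik = `frame_max + 1`). Catatan: kesamaan ini **tidak** membuktikan bahwa label berhenti di awal video. Jika frameIndex adalah indeks pada urutan frame hasil subsampling (~5 fps), label sebenarnya mencakup seluruh durasi video; hal ini perlu diverifikasi dengan memetakan frameIndex ke waktu/frame asli.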

---

### 3️⃣ Gap Ratio = 0 untuk Semua Video

**Ini sangat penting:**

- `gap_ratio == 0`
- `n_fragmented_timeline = 0`

#### 👉 Kesimpulan

- **Tidak ada** frameIndex berlabel yang hilang di tengah timeline
- Label timeline **rapi, kontigu, tanpa lompat**
- **Tidak ada** track fragmentation dari sisi frameIndex ❌

---

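### 4️⃣ Status Flag

Semua video ditandai sebagai:

- `flag_sparse_labels = True` (coverage < 0.2)
- `flag_low_max_covered = True` (`frame_max + 1` < 50% dari `n_frames`; ini belum tentu berarti label hanya ada di awal video, karena frameIndex kemungkinan berindeks pada urutan frame ~5 fps, bukan frame asli ~30 fps)

#### 👉 Kesimpulan Akhir

Ini adalah **expected behavior**, **bukan anomaly**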

# P2 - Build Frame Metadata (W/H) + Join ke Label

## Setup + load labels_v2 & video_meta

In [16]:
# P2.0 — Setup
from pathlib import Path
import pandas as pd
import numpy as np

# Output tree: P0/P1 artefacts are read, P2 artefacts are written.
PREP_DIR = Path("/kaggle/working/preprocessed_v2")
P0_DIR = PREP_DIR.joinpath("P0")
P1_DIR = PREP_DIR.joinpath("P1")
P2_DIR = PREP_DIR.joinpath("P2")
P2_DIR.mkdir(parents=True, exist_ok=True)

# Reload the earlier phases' outputs from disk so this phase can run on its own.
labels_v2 = pd.read_parquet(P0_DIR.joinpath("labels_v2.parquet"))
video_meta = pd.read_parquet(P1_DIR.joinpath("video_meta.parquet"))

print("labels_v2:", labels_v2.shape)
print("video_meta:", video_meta.shape)
labels_v2.head()

labels_v2: (1922517, 13)
video_meta: (961, 7)


Unnamed: 0,name,videoName,frameIndex,id,category,attributes.crowd,attributes.occluded,attributes.truncated,box2d.x1,box2d.x2,box2d.y1,box2d.y2,haveVideo
0,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89537,car,False,True,False,825.17321,1003.094688,355.011547,418.198614,True
1,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89538,car,False,True,False,484.295612,700.461894,346.69746,424.849885,True
2,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89539,pedestrian,False,True,False,645.588915,663.879908,338.383372,358.337182,True
3,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89540,car,False,False,False,120.969977,192.471132,359.168591,409.053118,True
4,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89541,car,False,False,False,251.501155,315.51963,354.180139,400.73903,True


## Detect video column in labels_v2

In [17]:
# P2.1 — Detect video column name in labels_v2
# Case-insensitive lookup table: lower-cased column name -> original spelling.
cols = {name.lower(): name for name in labels_v2.columns}

_video_candidates = (
    "videoname", "video_name", "video", "videoid", "video_id",
    "videokey", "video_key", "filename", "file_name",
)
# First candidate present in the table wins.
video_col = next((cols[key] for key in _video_candidates if key in cols), None)
if video_col is None:
    video_like = [c for c in labels_v2.columns if "video" in c.lower()]
    raise KeyError(f"Kolom video tidak ketemu. Kandidat: {video_like}")

print("Using video_col:", video_col)

Using video_col: videoName


## Build video_wh.parquet dari video_meta

In [18]:
# P2.2 — Build video_wh (video, width, height) from video_meta
video_wh = video_meta.loc[:, ["video", "width", "height"]].copy()

# Coerce both dimensions to numbers; unparsable entries become NaN.
for _dim in ("width", "height"):
    video_wh[_dim] = pd.to_numeric(video_wh[_dim], errors="coerce")

bad_wh = video_wh["width"].isna().sum() + video_wh["height"].isna().sum()
print("NaN in width/height:", bad_wh)

# One row per video is expected; keep the first if duplicates exist.
video_wh = video_wh.drop_duplicates(subset=["video"]).reset_index(drop=True)

video_wh_path = P2_DIR.joinpath("video_wh.parquet")
video_wh.to_parquet(video_wh_path, index=False)
print("Saved:", video_wh_path)
video_wh.head()

NaN in width/height: 0
Saved: /kaggle/working/preprocessed_v2/P2/video_wh.parquet


Unnamed: 0,video,width,height
0,0000f77c-6257be58,1280,720
1,0000f77c-62c2a288,1280,720
2,0000f77c-cb820c98,1280,720
3,0001542f-5ce3cf52,1280,720
4,0001542f-7c670be8,1280,720


## Join W/H ke labels_v2 → labels_with_meta

In [19]:
# P2.3 — Join W/H to labels_v2
# Rename the key so both sides share the label table's video column name.
_wh_by_video = video_wh.rename(columns={"video": video_col})
# validate="m:1" makes pandas raise if video_wh has duplicate keys.
labels_with_meta = pd.merge(
    labels_v2,
    _wh_by_video,
    how="left",
    on=video_col,
    validate="m:1",
)

print("labels_with_meta:", labels_with_meta.shape)

# Rows whose video had no width/height entry come out of the left join as NaN.
_missing_width = labels_with_meta["width"].isna().sum()
_missing_height = labels_with_meta["height"].isna().sum()
n_missing_wh = _missing_width + _missing_height
print("Rows missing width/height:", n_missing_wh)

labels_with_meta_path = P2_DIR.joinpath("labels_with_meta.parquet")
labels_with_meta.to_parquet(labels_with_meta_path, index=False)
print("Saved:", labels_with_meta_path)

labels_with_meta.head()

labels_with_meta: (1922517, 15)
Rows missing width/height: 0
Saved: /kaggle/working/preprocessed_v2/P2/labels_with_meta.parquet


Unnamed: 0,name,videoName,frameIndex,id,category,attributes.crowd,attributes.occluded,attributes.truncated,box2d.x1,box2d.x2,box2d.y1,box2d.y2,haveVideo,width,height
0,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89537,car,False,True,False,825.17321,1003.094688,355.011547,418.198614,True,1280,720
1,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89538,car,False,True,False,484.295612,700.461894,346.69746,424.849885,True,1280,720
2,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89539,pedestrian,False,True,False,645.588915,663.879908,338.383372,358.337182,True,1280,720
3,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89540,car,False,False,False,120.969977,192.471132,359.168591,409.053118,True,1280,720
4,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89541,car,False,False,False,251.501155,315.51963,354.180139,400.73903,True,1280,720


## Audit summary (berapa video missing join, dll)

In [20]:
# P2.4 — Audit join coverage
import json

# Rows that came out of the join without a width or without a height.
_wh_missing = labels_with_meta["width"].isna() | labels_with_meta["height"].isna()

# The 20 video keys contributing the most such rows.
missing_video_keys = (
    labels_with_meta.loc[_wh_missing, video_col]
    .dropna()
    .astype(str)
    .value_counts()
    .head(20)
)

audit = {
    "n_rows_labels_v2": len(labels_v2),
    "n_rows_labels_with_meta": len(labels_with_meta),
    "n_rows_missing_wh": int(_wh_missing.sum()),
    "n_unique_videos_labels": int(labels_v2[video_col].nunique()),
    "n_unique_videos_video_wh": int(video_wh["video"].nunique()),
    "top_missing_video_keys": missing_video_keys.to_dict(),
}

audit_path = P2_DIR.joinpath("p2_join_audit.json")
with audit_path.open("w") as f:
    json.dump(audit, f, indent=2)

print("Saved:", audit_path)
audit

Saved: /kaggle/working/preprocessed_v2/P2/p2_join_audit.json


{'n_rows_labels_v2': 1922517,
 'n_rows_labels_with_meta': 1922517,
 'n_rows_missing_wh': 0,
 'n_unique_videos_labels': 961,
 'n_unique_videos_video_wh': 961,
 'top_missing_video_keys': {}}

## ✅ Kesimpulan P2

### Dari Audit:

#### ✅ Kelengkapan Data Dimensi
* `n_rows_missing_wh = 0`
* **Artinya:** Semua label memiliki width/height

#### ✅ Integritas Join Video-Label
* `n_unique_videos_labels = 961`
* `n_unique_videos_video_wh = 961`
* **Artinya:** Setiap video di label punya tepat satu baris di `video_wh` (join m:1 lolos `validate="m:1"` dan tidak ada key yang tidak cocok)

#### ✅ Konsistensi Jumlah Data
* Jumlah row tidak berubah setelah merge
* **Artinya:** Tidak ada duplikasi akibat merge operation

# P3 - Label Canonicalization & Quality Filters

## Setup + load labels_with_meta

In [21]:
# P3.0 — Setup
from pathlib import Path
import pandas as pd
import numpy as np

# Output tree: the P2 join result is read, P3 artefacts are written.
PREP_DIR = Path("/kaggle/working/preprocessed_v2")
P2_DIR = PREP_DIR.joinpath("P2")
P3_DIR = PREP_DIR.joinpath("P3")
P3_DIR.mkdir(parents=True, exist_ok=True)

# Label rows already joined with per-video width/height.
df = pd.read_parquet(P2_DIR.joinpath("labels_with_meta.parquet"))
print("df:", df.shape)
print("Columns:", list(df.columns)[:40], "...")
df.head()

df: (1922517, 15)
Columns: ['name', 'videoName', 'frameIndex', 'id', 'category', 'attributes.crowd', 'attributes.occluded', 'attributes.truncated', 'box2d.x1', 'box2d.x2', 'box2d.y1', 'box2d.y2', 'haveVideo', 'width', 'height'] ...


Unnamed: 0,name,videoName,frameIndex,id,category,attributes.crowd,attributes.occluded,attributes.truncated,box2d.x1,box2d.x2,box2d.y1,box2d.y2,haveVideo,width,height
0,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89537,car,False,True,False,825.17321,1003.094688,355.011547,418.198614,True,1280,720
1,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89538,car,False,True,False,484.295612,700.461894,346.69746,424.849885,True,1280,720
2,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89539,pedestrian,False,True,False,645.588915,663.879908,338.383372,358.337182,True,1280,720
3,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89540,car,False,False,False,120.969977,192.471132,359.168591,409.053118,True,1280,720
4,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89541,car,False,False,False,251.501155,315.51963,354.180139,400.73903,True,1280,720


## Detect kolom kunci: video, frameIndex, track_id, class, bbox

In [22]:
# P3.1 (FIX) — Robust key-column detection (also handles dotted names such as box2d.x1)
# Case-insensitive lookup table: lower-cased column name -> original spelling.
cols = {name.lower(): name for name in df.columns}


def pick_col(candidates):
    """Return the real column name for the first candidate found in ``cols``.

    Candidates are looked up exactly as given, so they must already be
    lower-case. ``None`` is returned when no candidate matches.
    """
    return next((cols[key] for key in candidates if key in cols), None)

# Required base columns
video_col = pick_col(["videoname","video_name","video","videoid","video_id","videokey","video_key","filename","file_name"])
frame_col = pick_col(["frameindex","frame_index","frame","frameid","frame_id"])

track_col = pick_col(["trackid","track_id","track","objectid","object_id","instanceid","instance_id","id"])
cls_col   = pick_col(["category","class","label","classname","class_name","categoryname","category_name"])

if video_col is None or frame_col is None:
    raise KeyError(f"video/frame column tidak ketemu. video_like={[c for c in df.columns if 'video' in c.lower()]}, frame_like={[c for c in df.columns if 'frame' in c.lower()]}")

if track_col is None:
    cand = [c for c in df.columns if ("track" in c.lower()) or ("object" in c.lower()) or (c.lower()=="id")]
    raise KeyError(f"track_id column tidak ketemu. Kandidat: {cand}")

if cls_col is None:
    cand = [c for c in df.columns if ("class" in c.lower()) or ("category" in c.lower()) or ("label" in c.lower())]
    raise KeyError(f"class/category column tidak ketemu. Kandidat: {cand}")

print("Using columns:")
print(" - video:", video_col)
print(" - frame:", frame_col)
print(" - track:", track_col)
print(" - class:", cls_col)

# --- BBOX detection ---
# 1) Try box2d.* pattern first
x1_col = pick_col(["box2d.x1","box2d_x1","bbox.x1","bbox_x1","box.x1"])
y1_col = pick_col(["box2d.y1","box2d_y1","bbox.y1","bbox_y1","box.y1"])
x2_col = pick_col(["box2d.x2","box2d_x2","bbox.x2","bbox_x2","box.x2"])
y2_col = pick_col(["box2d.y2","box2d_y2","bbox.y2","bbox_y2","box.y2"])

# 2) If not found, fall back to common xyxy names
if not all([x1_col, y1_col, x2_col, y2_col]):
    x1_col = x1_col or pick_col(["x1","xmin","left","x_min"])
    y1_col = y1_col or pick_col(["y1","ymin","top","y_min"])
    x2_col = x2_col or pick_col(["x2","xmax","right","x_max"])
    y2_col = y2_col or pick_col(["y2","ymax","bottom","y_max"])

has_xyxy = all([x1_col, y1_col, x2_col, y2_col])

# 3) Fall back to xywh names if xyxy not available
x_col = pick_col(["x","bbox_x","tl_x","left","box2d.x","box2d_x"])
y_col = pick_col(["y","bbox_y","tl_y","top","box2d.y","box2d_y"])
w_col = pick_col(["w","bbox_w","bbox_width","width_bbox","box2d.w","box2d_w"])
h_col = pick_col(["h","bbox_h","bbox_height","height_bbox","box2d.h","box2d_h"])
has_xywh = all([x_col, y_col, w_col, h_col])

print("BBox detection:")
print(" - xyxy cols:", (x1_col, y1_col, x2_col, y2_col), "->", has_xyxy)
print(" - xywh cols:", (x_col, y_col, w_col, h_col), "->", has_xywh)

if not (has_xyxy or has_xywh):
    bbox_like = [c for c in df.columns if any(k in c.lower() for k in ["box2d","bbox","x1","x2","y1","y2","xmin","xmax","ymin","ymax","left","right","top","bottom","w","h"])]
    raise KeyError(f"Kolom bbox tidak bisa dideteksi. Kandidat bbox: {bbox_like}")

# Ensure numeric for required numeric columns
df[frame_col] = pd.to_numeric(df[frame_col], errors="coerce")
df["width"] = pd.to_numeric(df["width"], errors="coerce")
df["height"] = pd.to_numeric(df["height"], errors="coerce")

Using columns:
 - video: videoName
 - frame: frameIndex
 - track: id
 - class: category
BBox detection:
 - xyxy cols: ('box2d.x1', 'box2d.y1', 'box2d.x2', 'box2d.y2') -> True
 - xywh cols: (None, None, None, None) -> False


## Canonicalize bbox → xyxy, compute w/h/area/ar

In [23]:
# P3.2 — Canonicalize bbox to xyxy + compute derived stats
# Work on a copy so the loaded frame `df` keeps its original columns.
work = df.copy()

# Convert bbox to xyxy
if has_xyxy:
    work["x1"] = pd.to_numeric(work[x1_col], errors="coerce")
    work["y1"] = pd.to_numeric(work[y1_col], errors="coerce")
    work["x2"] = pd.to_numeric(work[x2_col], errors="coerce")
    work["y2"] = pd.to_numeric(work[y2_col], errors="coerce")
else:
    # xywh input: the second corner is the first corner plus width/height.
    work["x1"] = pd.to_numeric(work[x_col], errors="coerce")
    work["y1"] = pd.to_numeric(work[y_col], errors="coerce")
    wv = pd.to_numeric(work[w_col], errors="coerce")
    hv = pd.to_numeric(work[h_col], errors="coerce")
    work["x2"] = work["x1"] + wv
    work["y2"] = work["y1"] + hv

# Drop rows with missing essentials
# (covers both originally-missing values and values coerced to NaN above)
essential = [video_col, frame_col, track_col, cls_col, "x1","y1","x2","y2","width","height"]
before = len(work)
work = work.dropna(subset=essential).copy()
after_dropna = len(work)
print("Dropped NaN essentials:", before - after_dropna)

# Clamp to image bounds
# bounds: x in [0, width], y in [0, height]
work["x1"] = work["x1"].clip(lower=0)
work["y1"] = work["y1"].clip(lower=0)
work["x2"] = work["x2"].clip(lower=0)
work["y2"] = work["y2"].clip(lower=0)

# also clip upper bound per-row (vectorized via np.minimum)
work["x1"] = np.minimum(work["x1"].values, work["width"].values)
work["x2"] = np.minimum(work["x2"].values, work["width"].values)
work["y1"] = np.minimum(work["y1"].values, work["height"].values)
work["y2"] = np.minimum(work["y2"].values, work["height"].values)

# Ensure ordering (x1<=x2, y1<=y2)
# Swapped corners are reordered rather than rejected.
x1 = np.minimum(work["x1"].values, work["x2"].values)
x2 = np.maximum(work["x1"].values, work["x2"].values)
y1 = np.minimum(work["y1"].values, work["y2"].values)
y2 = np.maximum(work["y1"].values, work["y2"].values)
work["x1"], work["x2"], work["y1"], work["y2"] = x1, x2, y1, y2

# Compute w/h/area/ar (pixel)
work["bw"] = work["x2"] - work["x1"]
work["bh"] = work["y2"] - work["y1"]
work["area_px"] = work["bw"] * work["bh"]
# Zero heights are turned into NaN first so the ratio is NaN instead of +/-inf.
work["ar"] = work["bw"] / work["bh"].replace(0, np.nan)

# Drop invalid bbox: non-positive area or zero w/h
# (boxes that collapsed to zero width/height after clamping are removed here)
before2 = len(work)
work = work[(work["bw"] > 0) & (work["bh"] > 0) & (work["area_px"] > 0)].copy()
after2 = len(work)
print("Dropped invalid bbox (non-positive):", before2 - after2)

# Normalize names to standard columns (optional but recommended)
# Downstream cells rely on the fixed names "video", "frameIndex", "track_id", "category".
work = work.rename(columns={
    video_col: "video",
    frame_col: "frameIndex",
    track_col: "track_id",
    cls_col: "category"
})

print("Work df:", work.shape)
work[["video","frameIndex","track_id","category","x1","y1","x2","y2","bw","bh","area_px","ar","width","height"]].head()

Dropped NaN essentials: 2851
Dropped invalid bbox (non-positive): 0
Work df: (1919666, 23)


Unnamed: 0,video,frameIndex,track_id,category,x1,y1,x2,y2,bw,bh,area_px,ar,width,height
0,01c71072-718028b8,0,89537,car,825.17321,355.011547,1003.094688,418.198614,177.921478,63.187067,11242.33635,2.815789,1280,720
1,01c71072-718028b8,0,89538,car,484.295612,346.69746,700.461894,424.849885,216.166282,78.152425,16893.91911,2.765957,1280,720
2,01c71072-718028b8,0,89539,pedestrian,645.588915,338.383372,663.879908,358.337182,18.290993,19.953811,364.975012,0.916667,1280,720
3,01c71072-718028b8,0,89540,car,120.969977,359.168591,192.471132,409.053118,71.501155,49.884527,3566.801252,1.433333,1280,720
4,01c71072-718028b8,0,89541,car,251.501155,354.180139,315.51963,400.73903,64.018476,46.558891,2980.629264,1.375,1280,720


## Save labels_clean.parquet + audit ringkas

In [24]:
# P3.3 — Persist the cleaned labels and write a compact audit record
import json

labels_clean = work.copy()
labels_clean_path = P3_DIR / "labels_clean.parquet"
labels_clean.to_parquet(labels_clean_path, index=False)
print("Saved:", labels_clean_path)

# Row counts before (loaded frame) and after cleaning.
rows_in = len(df)
rows_clean = len(labels_clean)

# Distinct (video, track) and (video, frame) pairs that survived cleaning.
distinct_tracks = labels_clean[["video","track_id"]].drop_duplicates()
distinct_frames = labels_clean[["video","frameIndex"]].drop_duplicates()

audit = {
    "n_rows_in": int(rows_in),
    "n_rows_clean": int(rows_clean),
    # max(..., 1) keeps the ratio defined even for an empty input frame.
    "drop_fraction": float(1 - (rows_clean / max(rows_in, 1))),
    "n_videos": int(labels_clean["video"].nunique()),
    "n_unique_tracks": int(len(distinct_tracks)),
    "n_unique_frames": int(len(distinct_frames)),
}

audit_path = P3_DIR / "p3_clean_audit.json"
audit_path.write_text(json.dumps(audit, indent=2))

print("Saved:", audit_path)
audit

Saved: /kaggle/working/preprocessed_v2/P3/labels_clean.parquet
Saved: /kaggle/working/preprocessed_v2/P3/p3_clean_audit.json


{'n_rows_in': 1922517,
 'n_rows_clean': 1919666,
 'drop_fraction': 0.0014829517762391387,
 'n_videos': 961,
 'n_unique_tracks': 75885,
 'n_unique_frames': 187618}

## Compute track_len per (video, track_id) → track_stats.parquet

In [25]:
# P3.4 — Per-track statistics
# One output row per (video, track_id). Each entry maps an output column to
# (source column, aggregation).
track_key = ["video","track_id"]
track_aggregations = {
    "track_len": ("frameIndex","nunique"),   # number of distinct frames
    "n_rows": ("frameIndex","size"),         # number of label rows
    "frame_min": ("frameIndex","min"),
    "frame_max": ("frameIndex","max"),
    "mean_area_px": ("area_px","mean"),
    "p50_area_px": ("area_px","median"),
    "mean_ar": ("ar","mean"),
}

per_track = labels_clean.groupby(track_key).agg(**track_aggregations)
track_stats = per_track.reset_index()

track_stats_path = P3_DIR / "track_stats.parquet"
track_stats.to_parquet(track_stats_path, index=False)
print("Saved:", track_stats_path)

# Peek at the shortest tracks.
track_stats.sort_values("track_len").head(10)

Saved: /kaggle/working/preprocessed_v2/P3/track_stats.parquet


Unnamed: 0,video,track_id,track_len,n_rows,frame_min,frame_max,mean_area_px,p50_area_px,mean_ar
14947,007eddfc-528c4da4,38034,1,1,127,127,1948.87218,1948.87218,2.947368
70542,02982dfe-1dd674e5,55048,1,1,188,188,697.081597,697.081597,1.705882
55168,0204aad7-8ad8ba92,18173,1,1,175,175,4389.735882,4389.735882,0.284722
171,0000f77c-cb820c98,42472,1,1,202,202,508.145687,508.145687,0.6
55119,0204aad7-8ad8ba92,18124,1,1,127,127,2047.651189,2047.651189,0.419753
74052,02a70c1f-6df7ce64,93284,1,1,151,151,164.745301,164.745301,2.052632
5160,002d290d-90f2bab2,35191,1,1,92,92,585.636027,585.636027,1.074074
30315,0115917e-14907a6a,9319,1,1,106,106,6659.325631,6659.325631,1.939597
21003,00b04b30-501822fa,18244,1,1,19,19,420.965497,420.965497,1.380952
21008,00b04b30-501822fa,18249,1,1,21,21,1092.851314,1092.851314,1.645161


## Drop track pendek (<=10) → labels_clean_dropShort.parquet

In [26]:
# P3.5 — Drop short tracks (<=10)
# A track survives only when it covers strictly more than SHORT_THR distinct frames.
SHORT_THR = 10

# Keys of the surviving tracks; track_stats is expected to hold one row per
# (video, track_id).
keep_tracks = track_stats.loc[track_stats["track_len"] > SHORT_THR, ["video","track_id"]]

# The inner join acts as a row filter on labels_clean.
# validate="m:1" makes pandas raise a MergeError if the right-hand keys are not
# unique, so the join can never silently duplicate label rows ("m:m" would
# perform no check at all).
labels_clean_dropShort = labels_clean.merge(
    keep_tracks,
    on=["video","track_id"],
    how="inner",
    validate="m:1"
)

labels_clean_dropShort_path = P3_DIR / "labels_clean_dropShort.parquet"
labels_clean_dropShort.to_parquet(labels_clean_dropShort_path, index=False)
print("Saved:", labels_clean_dropShort_path)

print("Before dropShort:", labels_clean.shape)
print("After  dropShort:", labels_clean_dropShort.shape)
print("Dropped rows:", len(labels_clean) - len(labels_clean_dropShort))

Saved: /kaggle/working/preprocessed_v2/P3/labels_clean_dropShort.parquet
Before dropShort: (1919666, 23)
After  dropShort: (1761451, 23)
Dropped rows: 158215


## Summary dropShort (berapa track yang dibuang)

In [27]:
# P3.6 — Summarise how many tracks/rows the short-track filter removed
import json

# Boolean mask over tracks: True where the track passes the length threshold.
passes_threshold = track_stats["track_len"] > SHORT_THR
n_tracks_all = len(track_stats)
n_tracks_keep = int(passes_threshold.sum())
n_tracks_drop = n_tracks_all - n_tracks_keep

rows_before = len(labels_clean)
rows_after = len(labels_clean_dropShort)

# max(..., 1) keeps both fractions defined for empty inputs.
summary = {
    "SHORT_THR": SHORT_THR,
    "n_tracks_all": int(n_tracks_all),
    "n_tracks_keep": int(n_tracks_keep),
    "n_tracks_drop": int(n_tracks_drop),
    "frac_tracks_dropped": float(n_tracks_drop / max(n_tracks_all, 1)),
    "n_rows_clean": int(rows_before),
    "n_rows_after_dropShort": int(rows_after),
    "frac_rows_dropped": float((rows_before - rows_after) / max(rows_before, 1)),
}

summary_path = P3_DIR / "p3_dropShort_summary.json"
summary_path.write_text(json.dumps(summary, indent=2))

print("Saved:", summary_path)
summary

Saved: /kaggle/working/preprocessed_v2/P3/p3_dropShort_summary.json


{'SHORT_THR': 10,
 'n_tracks_all': 75885,
 'n_tracks_keep': 46962,
 'n_tracks_drop': 28923,
 'frac_tracks_dropped': 0.3811425182842459,
 'n_rows_clean': 1919666,
 'n_rows_after_dropShort': 1761451,
 'frac_rows_dropped': 0.08241798312831503}

## ✅ Ringkasan Hasil P3

### 📦 `labels_clean.parquet`

```json
{
  "n_rows_in": 1922517,
  "n_rows_clean": 1919666,
  "drop_fraction": 0.00148,
  "n_videos": 961,
  "n_unique_tracks": 75885,
  "n_unique_frames": 187618
}
```

#### Interpretasi:
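* Baris yang terbuang sangat sedikit (~0.15%) dan seluruhnya berasal dari nilai esensial yang kosong (NaN, 2.851 baris); tidak ada bbox yang dibuang karena invalid (0 baris) → kualitas label tinggi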
* Dataset tidak rusak (bbox mostly valid)
* 75.885 track total (sebelum filter panjang)

---

### ✂️ Drop Track Pendek (`track_len ≤ 10`)

```json
{
  "SHORT_THR": 10,
  "n_tracks_all": 75885,
  "n_tracks_keep": 46962,
  "n_tracks_drop": 28923,
  "frac_tracks_dropped": 0.381,
  "n_rows_clean": 1919666,
  "n_rows_after_dropShort": 1761451,
  "frac_rows_dropped": 0.0824
}
```

#### Interpretasi Kunci (Penting):
* **38.1% track** itu pendek (≤10 frame)
* **Tapi hanya 8.24% row** yang hilang
* **Artinya:** Banyak track pendek, tapi kontribusi frame-nya kecil

#### 📌 Ini Keputusan Preprocessing yang BENAR
* Track pendek = noise untuk MOT & detector training
* Tidak membunuh distribusi frame secara signifikan

---

### 🔐 Keputusan Preprocessing (LOCK)

#### Mulai sekarang:

🔒 **Dataset resmi untuk diagnosis lanjut:**
```
labels_clean_dropShort.parquet
```

#### Alasan:
* Bbox valid
* Track pendek (noise) sudah dibuang
* Siap untuk analisis:
   * Small object
   * Aspect ratio outlier
   * Congestion
   * Imbalance kelas

---

### 🧠 Insight Penting untuk Modeling (Early Warning)

#### Dari P3 saja kita sudah tahu:
* Banyak track pendek → banyak **transient objects**
* Ini biasanya:
   * Objek kecil
   * Occlusion
   * Pinggir frame
* → **Detector harus kuat untuk small object & crowded scenes**

# P4 - Diagnostik Inti 

## Setup + load data

In [28]:
# P4.0 — Setup
from pathlib import Path
import pandas as pd
import numpy as np
import json

# Stage directories: P4 reads the P3 artifact and writes diagnostics under P4/.
PREP_DIR = Path("/kaggle/working/preprocessed_v2")
P3_DIR = PREP_DIR / "P3"
P4_DIR = PREP_DIR / "P4"
P4_DIR.mkdir(parents=True, exist_ok=True)

# Cleaned labels with short tracks removed (output of P3.5).
# Note: this rebinds `df`, replacing the frame loaded in P3.0.
df = pd.read_parquet(P3_DIR / "labels_clean_dropShort.parquet")
print("df:", df.shape)
print("cols:", list(df.columns))
df.head()

df: (1761451, 23)
cols: ['name', 'video', 'frameIndex', 'track_id', 'category', 'attributes.crowd', 'attributes.occluded', 'attributes.truncated', 'box2d.x1', 'box2d.x2', 'box2d.y1', 'box2d.y2', 'haveVideo', 'width', 'height', 'x1', 'y1', 'x2', 'y2', 'bw', 'bh', 'area_px', 'ar']


Unnamed: 0,name,video,frameIndex,track_id,category,attributes.crowd,attributes.occluded,attributes.truncated,box2d.x1,box2d.x2,...,width,height,x1,y1,x2,y2,bw,bh,area_px,ar
0,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89537,car,False,True,False,825.17321,1003.094688,...,1280,720,825.17321,355.011547,1003.094688,418.198614,177.921478,63.187067,11242.33635,2.815789
1,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89538,car,False,True,False,484.295612,700.461894,...,1280,720,484.295612,346.69746,700.461894,424.849885,216.166282,78.152425,16893.91911,2.765957
2,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89539,pedestrian,False,True,False,645.588915,663.879908,...,1280,720,645.588915,338.383372,663.879908,358.337182,18.290993,19.953811,364.975012,0.916667
3,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89540,car,False,False,False,120.969977,192.471132,...,1280,720,120.969977,359.168591,192.471132,409.053118,71.501155,49.884527,3566.801252,1.433333
4,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89541,car,False,False,False,251.501155,315.51963,...,1280,720,251.501155,354.180139,315.51963,400.73903,64.018476,46.558891,2980.629264,1.375


## Basic sanity

In [29]:
# P4.1 — Validate required columns and normalise numeric dtypes
required = ["video","frameIndex","track_id","category","x1","y1","x2","y2","bw","bh","area_px","ar","width","height"]
missing = [name for name in required if name not in df.columns]
if missing:
    raise KeyError(f"Missing required columns: {missing}")

# frameIndex must be an integer; the remaining size columns only need to be numeric.
df["frameIndex"] = pd.to_numeric(df["frameIndex"], errors="coerce").astype(int)
for numeric_col in ("bw", "bh", "area_px", "width", "height"):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

# Headline counts: videos, distinct tracks, distinct labelled frames.
distinct_tracks = df[["video","track_id"]].drop_duplicates()
distinct_frames = df[["video","frameIndex"]].drop_duplicates()
print("n_videos:", df["video"].nunique())
print("n_tracks:", len(distinct_tracks))
print("n_frames unique:", len(distinct_frames))

n_videos: 961
n_tracks: 46962
n_frames unique: 187093


## Track fragmentation diagnosis

In [30]:
# P4.2 — Track fragmentation report
def _gap_profile(frame_series):
    """Return (track_len, gap_count, max_gap, fragmentation_rate) for one track.

    track_len is the number of distinct frame indices. A gap is any step larger
    than 1 between consecutive distinct frame indices; max_gap is the largest such
    step (0 when there is none) and fragmentation_rate is gap_count / track_len.
    """
    frames = np.sort(pd.unique(frame_series))
    n_frames = int(frames.size)
    if n_frames <= 1:
        return n_frames, 0, 0, 0.0
    steps = np.diff(frames)
    jumps = steps[steps > 1]
    n_gaps = int(jumps.size)
    widest = int(jumps.max()) if n_gaps else 0
    return n_frames, n_gaps, widest, float(n_gaps / n_frames)


# One record per (video, track_id).
stat_cols = ("track_len", "gap_count", "max_gap", "fragmentation_rate")
track_rows = []
for (vid, tid), frame_series in df.groupby(["video","track_id"])["frameIndex"]:
    record = {"video": vid, "track_id": tid}
    record.update(zip(stat_cols, _gap_profile(frame_series)))
    track_rows.append(record)

frag = pd.DataFrame(track_rows)

# Attach class + size summary per track (medians are robust to outlier frames).
# The category is taken from the track's first row.
track_aux = (
    df.groupby(["video","track_id"])
      .agg(
          category=("category", lambda labels: labels.iloc[0]),
          p50_area_px=("area_px","median"),
          p50_ar=("ar","median"),
      )
      .reset_index()
)

frag = frag.merge(track_aux, on=["video","track_id"], how="left")

frag_path = P4_DIR / "fragmentation_report.parquet"
frag.to_parquet(frag_path, index=False)
print("Saved:", frag_path)

# Quick view: most fragmented tracks first.
frag.sort_values(["gap_count","max_gap","fragmentation_rate"], ascending=False).head(10)

Saved: /kaggle/working/preprocessed_v2/P4/fragmentation_report.parquet


Unnamed: 0,video,track_id,track_len,gap_count,max_gap,fragmentation_rate,category,p50_area_px,p50_ar
39274,02506726-0e9b815e,24271,163,28,4,0.171779,car,355.598693,1.189021
28015,01ad8d8a-49bb0c93,26987,157,20,9,0.127389,car,1335.476748,1.179487
39276,02506726-0e9b815e,24273,177,20,4,0.112994,car,422.486456,1.276864
28010,01ad8d8a-49bb0c93,26981,151,19,13,0.125828,car,2976.855144,1.375
28011,01ad8d8a-49bb0c93,26982,151,19,11,0.125828,car,2200.978583,1.408163
28016,01ad8d8a-49bb0c93,26988,144,19,10,0.131944,car,438.735187,1.271079
46271,02b0443e-25d7d385,79966,80,17,6,0.2125,car,4275.99807,2.450725
28041,01ad8d8a-49bb0c93,27028,105,15,13,0.142857,car,1190.316232,0.789474
40707,026c5058-acd78182,39497,171,15,6,0.087719,car,3741.334492,1.148936
36941,0233c21b-577418b6,6929,84,15,3,0.178571,bicycle,1562.936547,0.922323


In [31]:
# P4.3 — Fragmentation summary
# Quantile levels reported for every fragmentation metric.
_QUANTILE_LEVELS = [0.0,0.25,0.5,0.75,0.9,0.95,0.99,1.0]


def q(x):
    """Return the quantiles of ``x`` at the fixed reporting levels (min .. max)."""
    return x.quantile(_QUANTILE_LEVELS)

# Share of tracks with at least one gap, plus a quantile table per metric.
has_gap = frag["gap_count"] > 0

frag_summary = {"n_tracks": int(len(frag))}
frag_summary["pct_tracks_with_gap"] = float(has_gap.mean())
frag_summary["gap_count_quantiles"] = q(frag["gap_count"]).to_dict()
frag_summary["max_gap_quantiles"] = q(frag["max_gap"]).to_dict()
frag_summary["fragmentation_rate_quantiles"] = q(frag["fragmentation_rate"]).to_dict()

frag_summary_path = P4_DIR / "fragmentation_summary.json"
frag_summary_path.write_text(json.dumps(frag_summary, indent=2))

print("Saved:", frag_summary_path)
frag_summary

Saved: /kaggle/working/preprocessed_v2/P4/fragmentation_summary.json


{'n_tracks': 46962,
 'pct_tracks_with_gap': 0.25722924918018825,
 'gap_count_quantiles': {0.0: 0.0,
  0.25: 0.0,
  0.5: 0.0,
  0.75: 1.0,
  0.9: 2.0,
  0.95: 3.0,
  0.99: 6.0,
  1.0: 28.0},
 'max_gap_quantiles': {0.0: 0.0,
  0.25: 0.0,
  0.5: 0.0,
  0.75: 2.0,
  0.9: 7.0,
  0.95: 12.0,
  0.99: 38.0,
  1.0: 169.0},
 'fragmentation_rate_quantiles': {0.0: 0.0,
  0.25: 0.0,
  0.5: 0.0,
  0.75: 0.00819672131147541,
  0.9: 0.05555555555555555,
  0.95: 0.08333333333333333,
  0.99: 0.15384615384615385,
  1.0: 0.48148148148148145}}

## Small object analysis

In [32]:
# P4.4 — Small object report
# Note: rebinds `work` (previously used in P3.2) to a copy of the current df.
work = df.copy()
# Box area as a fraction of the image area, so sizes are comparable across resolutions.
work["area_norm"] = work["area_px"] / (work["width"] * work["height"])

area_q = work["area_norm"].quantile([0.0,0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99,1.0]).to_dict()
# Candidate "small object" cut-offs, expressed as fractions of the image area.
thr = [0.001, 0.002, 0.005, 0.01, 0.02]

# Fraction of label rows whose normalised area is below each cut-off.
thr_stats = {f"pct_area_norm_lt_{t}": float((work["area_norm"] < t).mean()) for t in thr}

# per-class small ratio (for diagnosis)
per_class = (
    work.groupby("category")["area_norm"]
        .agg(
            n="size",
            p50="median",
            p90=lambda s: float(s.quantile(0.9)),
            pct_lt_0_002=lambda s: float((s < 0.002).mean()),
            pct_lt_0_005=lambda s: float((s < 0.005).mean()),
        )
        .reset_index()
        .sort_values("n", ascending=False)
)

small_report = {
    "area_norm_quantiles": area_q,
    "threshold_stats": thr_stats,
    "note": "area_norm = (bbox_area_px)/(W*H) per row-label"
}

# Save artifacts
# The parquet file holds the per-class table; the global quantiles and threshold
# statistics go to the JSON summary below.
small_report_path = P4_DIR / "small_object_report.parquet"
per_class.to_parquet(small_report_path, index=False)
print("Saved:", small_report_path)

small_summary_path = P4_DIR / "small_object_summary.json"
with open(small_summary_path, "w") as f:
    json.dump(small_report, f, indent=2)
print("Saved:", small_summary_path)

# Display both the global summary and the ten most frequent classes.
small_report, per_class.head(10)

Saved: /kaggle/working/preprocessed_v2/P4/small_object_report.parquet
Saved: /kaggle/working/preprocessed_v2/P4/small_object_summary.json


({'area_norm_quantiles': {0.0: 3.7938798174424654e-05,
   0.01: 0.0001793467735609711,
   0.05: 0.00032446147404531137,
   0.1: 0.00046951781494430553,
   0.25: 0.0009644286050909461,
   0.5: 0.002560078472533192,
   0.75: 0.00898536020006188,
   0.9: 0.03466698813621886,
   0.95: 0.07226161345799789,
   0.99: 0.18423566333340496,
   1.0: 0.8353279632545323},
  'threshold_stats': {'pct_area_norm_lt_0.001': 0.2591113803336,
   'pct_area_norm_lt_0.002': 0.437705051119787,
   'pct_area_norm_lt_0.005': 0.6487588925266726,
   'pct_area_norm_lt_0.01': 0.7659838394596273,
   'pct_area_norm_lt_0.02': 0.8499083993821003},
  'note': 'area_norm = (bbox_area_px)/(W*H) per row-label'},
          category        n       p50       p90  pct_lt_0_002  pct_lt_0_005
 2             car  1360053  0.002672  0.037060      0.429363      0.634807
 6      pedestrian   220037  0.001631  0.010011      0.569263      0.816245
 10          truck    92649  0.004245  0.061732      0.323382      0.533335
 1            

## Aspect ratio outlier + sampling untuk inspeksi

In [33]:
# P4.S0 — Inspect schema for P3 output (labels_clean_dropShort)
from pathlib import Path
import pandas as pd

PREP_DIR = Path("/kaggle/working/preprocessed_v2")
P3_PATH = PREP_DIR / "P3" / "labels_clean_dropShort.parquet"

# Loaded into a separate frame (df3) so the working `df` is left untouched.
df3 = pd.read_parquet(P3_PATH)
print("Loaded:", P3_PATH)
print("Shape:", df3.shape)

# columns
print("\n=== COLUMNS ===")
print(df3.columns.tolist())

# dtypes
print("\n=== DTYPES ===")
print(df3.dtypes)

# head
# `display` is not defined in this cell; presumably the notebook's built-in display helper.
print("\n=== HEAD ===")
display(df3.head(3))

Loaded: /kaggle/working/preprocessed_v2/P3/labels_clean_dropShort.parquet
Shape: (1761451, 23)

=== COLUMNS ===
['name', 'video', 'frameIndex', 'track_id', 'category', 'attributes.crowd', 'attributes.occluded', 'attributes.truncated', 'box2d.x1', 'box2d.x2', 'box2d.y1', 'box2d.y2', 'haveVideo', 'width', 'height', 'x1', 'y1', 'x2', 'y2', 'bw', 'bh', 'area_px', 'ar']

=== DTYPES ===
name                     object
video                    object
frameIndex                int64
track_id                 object
category                 object
attributes.crowd           bool
attributes.occluded        bool
attributes.truncated       bool
box2d.x1                float64
box2d.x2                float64
box2d.y1                float64
box2d.y2                float64
haveVideo                  bool
width                     int64
height                    int64
x1                      float64
y1                      float64
x2                      float64
y2                      float64
bw      

Unnamed: 0,name,video,frameIndex,track_id,category,attributes.crowd,attributes.occluded,attributes.truncated,box2d.x1,box2d.x2,...,width,height,x1,y1,x2,y2,bw,bh,area_px,ar
0,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89537,car,False,True,False,825.17321,1003.094688,...,1280,720,825.17321,355.011547,1003.094688,418.198614,177.921478,63.187067,11242.33635,2.815789
1,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89538,car,False,True,False,484.295612,700.461894,...,1280,720,484.295612,346.69746,700.461894,424.849885,216.166282,78.152425,16893.91911,2.765957
2,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89539,pedestrian,False,True,False,645.588915,663.879908,...,1280,720,645.588915,338.383372,663.879908,358.337182,18.290993,19.953811,364.975012,0.916667


In [34]:
# P4.S1 — Auto-detect which bbox column patterns exist in df3
# Lower-cased name -> original name, for case-insensitive exact lookups.
cols_lower = {}
for original_name in df3.columns:
    cols_lower[original_name.lower()] = original_name


def find_col_exact(name):
    """Return the df3 column matching ``name`` case-insensitively, or None."""
    key = name.lower()
    return cols_lower.get(key)


def find_first_contains(substrings):
    """Return every df3 column whose lower-cased name contains any of ``substrings``.

    Despite the name, all matching columns are returned (in column order), not
    only the first one.
    """
    return [
        column
        for column in df3.columns
        if any(fragment in column.lower() for fragment in substrings)
    ]


bbox_candidates = find_first_contains(["box2d", "bbox", "x1", "x2", "y1", "y2", "xmin", "xmax", "ymin", "ymax", "left", "right", "top", "bottom", ".x", ".y"])
print("BBox-like candidates:", bbox_candidates)

# Known xyxy naming schemes; a scheme counts as detected only if all four corners exist.
patterns = [
    ("xyxy_box2d", ["box2d.x1","box2d.y1","box2d.x2","box2d.y2"]),
    ("xyxy_plain", ["x1","y1","x2","y2"]),
    ("xyxy_bbox",  ["bbox.x1","bbox.y1","bbox.x2","bbox.y2"]),
]
for name, wanted in patterns:
    got = list(map(find_col_exact, wanted))
    if not all(got):
        continue
    print(f"Detected pattern {name}: {got}")

# Report which derived columns (from P3.2) are present.
present_columns = df3.columns
for derived in ["ar","ar_val","bw","bh","area_px"]:
    if derived in present_columns:
        print("Found derived col:", derived)

BBox-like candidates: ['box2d.x1', 'box2d.x2', 'box2d.y1', 'box2d.y2', 'x1', 'y1', 'x2', 'y2']
Detected pattern xyxy_box2d: ['box2d.x1', 'box2d.y1', 'box2d.x2', 'box2d.y2']
Detected pattern xyxy_plain: ['x1', 'y1', 'x2', 'y2']
Found derived col: ar
Found derived col: bw
Found derived col: bh
Found derived col: area_px


In [35]:
# P4.5 (STABLE FINAL) — Aspect ratio outliers (IQR per class) WITHOUT merge
import numpy as np
import pandas as pd

# Columns carried into the outlier table; fail early if any is absent.
need = ["video","frameIndex","track_id","category","x1","y1","x2","y2","bw","bh","area_px","ar","width","height"]
miss = [c for c in need if c not in df.columns]
if miss:
    raise KeyError(f"Missing required columns for P4.5: {miss}")

ar_df = df[need].copy()

# Clean numeric issues
# Infinite values are converted to NaN so a single dropna removes both kinds of bad rows.
ar_df = ar_df.replace([np.inf, -np.inf], np.nan).dropna(subset=["ar","bw","bh"]).copy()
# Keep only boxes with strictly positive width and height.
ar_df = ar_df[(ar_df["bw"] > 0) & (ar_df["bh"] > 0)].copy()

def iqr_lo_hi(s, k=1.5):
    """Return the ``(lower, upper)`` IQR fences for the values in ``s``.

    The fences lie ``k`` interquartile ranges below the first quartile and above
    the third quartile; values outside them are treated as outliers by the caller.
    """
    lower_q, upper_q = (s.quantile(level) for level in (0.25, 0.75))
    margin = k * (upper_q - lower_q)
    return (lower_q - margin, upper_q + margin)

# Compute bounds per category (stable)
# Each entry of bounds_series is the (lo, hi) tuple returned by iqr_lo_hi for one category.
bounds_series = ar_df.groupby("category")["ar"].apply(iqr_lo_hi)

# Make two dict maps
ar_lo_map = {cat: lohi[0] for cat, lohi in bounds_series.items()}
ar_hi_map = {cat: lohi[1] for cat, lohi in bounds_series.items()}

# Assign bounds via map (NO MERGE => no suffix chaos)
ar_df["ar_lo"] = ar_df["category"].map(ar_lo_map)
ar_df["ar_hi"] = ar_df["category"].map(ar_hi_map)

# Sanity check: should be no missing bounds
n_missing_bounds = ar_df["ar_lo"].isna().sum() + ar_df["ar_hi"].isna().sum()
print("Missing bounds rows:", n_missing_bounds)

# Flag outliers
# A row is an outlier when its aspect ratio falls strictly outside its class's fences.
ar_df["is_ar_outlier"] = (ar_df["ar"] < ar_df["ar_lo"]) | (ar_df["ar"] > ar_df["ar_hi"])
outliers = ar_df[ar_df["is_ar_outlier"]].copy()
# Every remaining row is either above the upper fence ("high") or below the lower one ("low").
outliers["outlier_side"] = np.where(outliers["ar"] > outliers["ar_hi"], "high", "low")

# Save
out_path = P4_DIR / "ar_outliers.parquet"
outliers.to_parquet(out_path, index=False)
print("Saved:", out_path)

print("Outlier rows:", len(outliers), "/", len(ar_df), f"({len(outliers)/max(len(ar_df),1)*100:.4f}%)")
# Show the most extreme (widest) boxes first.
outliers.sort_values("ar", ascending=False).head(10)

Missing bounds rows: 0
Saved: /kaggle/working/preprocessed_v2/P4/ar_outliers.parquet
Outlier rows: 129117 / 1761451 (7.3301%)


Unnamed: 0,video,frameIndex,track_id,category,x1,y1,x2,y2,bw,bh,area_px,ar,width,height,ar_lo,ar_hi,is_ar_outlier,outlier_side
1033366,026eb2d9-a96c8aa4,143,13421,car,709.048443,225.897473,1280.0,245.205079,570.951557,19.307605,11023.707415,29.571329,1280,720,0.177062,2.484239,True,high
1033370,026eb2d9-a96c8aa4,144,13421,car,709.109394,227.448004,1247.438855,246.75561,538.329461,19.307605,10393.852845,27.881731,1280,720,0.177062,2.484239,True,high
1304478,01c9256f-b4754064,121,51828,car,113.106682,253.223916,650.577367,273.533487,537.470685,20.309572,10915.799415,26.46391,1280,720,0.177062,2.484239,True,high
1304549,01c9256f-b4754064,125,51828,car,45.580305,238.87456,647.251732,262.725173,601.671427,23.850613,14350.232266,25.226665,1280,720,0.177062,2.484239,True,high
1304630,01c9256f-b4754064,130,51828,car,0.0,238.030481,658.060046,264.387991,658.060046,26.35751,17344.824318,24.9667,1280,720,0.177062,2.484239,True,high
1304645,01c9256f-b4754064,131,51828,car,0.0,233.810082,679.676674,261.062356,679.676674,27.252274,18522.734686,24.940182,1280,720,0.177062,2.484239,True,high
1304458,01c9256f-b4754064,120,51828,car,135.896835,253.223916,638.106236,274.364896,502.209401,21.14098,10617.199141,23.755256,1280,720,0.177062,2.484239,True,high
1304615,01c9256f-b4754064,129,51828,car,12.886836,229.468822,693.810624,260.230947,680.923788,30.762125,20946.662471,22.135135,1280,720,0.177062,2.484239,True,high
816647,014d895c-70b158b8,174,11188,car,2.078522,307.621247,483.464203,330.069284,481.385681,22.448037,10806.163562,21.444444,1280,720,0.177062,2.484239,True,high
1304598,01c9256f-b4754064,128,51828,car,3.741339,228.637413,692.147806,261.062356,688.406467,32.424942,22321.53993,21.230769,1280,720,0.177062,2.484239,True,high


## Sampling outliers

In [36]:
# P4.6 — Sample outliers for inspection (stable)
import json
import pandas as pd

# Per class: up to K_PER_CLASS samples, half from the largest aspect ratios and
# half from the smallest.
K_PER_CLASS = 30
HALF_K = K_PER_CLASS // 2
samples = []

for cat, sub in outliers.groupby("category"):
    sub_high = sub.sort_values("ar", ascending=False).head(HALF_K)
    # Take the low-end picks only from rows not already chosen above. Without this,
    # a class with fewer than K_PER_CLASS outliers would list the same row twice.
    remaining = sub.drop(index=sub_high.index)
    sub_low = remaining.sort_values("ar", ascending=True).head(HALF_K)
    pick = pd.concat([sub_high, sub_low], axis=0)

    # Convert to plain Python types so the records are JSON-serialisable.
    for r in pick.itertuples(index=False):
        samples.append({
            "category": cat,
            "video": r.video,
            "frameIndex": int(r.frameIndex),
            "track_id": str(r.track_id),
            "bbox_xyxy": [float(r.x1), float(r.y1), float(r.x2), float(r.y2)],
            "bw_bh": [float(r.bw), float(r.bh)],
            "ar": float(r.ar),
            "area_px": float(r.area_px),
            "img_wh": [int(r.width), int(r.height)],
            "outlier_side": str(r.outlier_side)
        })

samples_path = P4_DIR / "ar_outlier_samples.json"
with open(samples_path, "w") as f:
    json.dump({"k_per_class": K_PER_CLASS, "n_samples": len(samples), "samples": samples}, f, indent=2)

print("Saved:", samples_path)
print("Total samples:", len(samples))

Saved: /kaggle/working/preprocessed_v2/P4/ar_outlier_samples.json
Total samples: 330


## Class Imbalance Check (Global + Per-Video)

In [37]:
# P4.7.1 — Global class distribution
from pathlib import Path
import pandas as pd

P4_DIR = Path("/kaggle/working/preprocessed_v2/P4")

# Reload the locked dataset so this cell does not depend on earlier in-memory edits.
df = pd.read_parquet("/kaggle/working/preprocessed_v2/P3/labels_clean_dropShort.parquet")

# Label rows per category, most frequent first.
category_counts = df["category"].value_counts()
global_class_dist = category_counts.rename_axis("category").reset_index(name="n_rows")

# Share of all label rows contributed by each category.
total_rows = global_class_dist["n_rows"].sum()
global_class_dist["pct_rows"] = global_class_dist["n_rows"] / total_rows

global_class_dist_path = P4_DIR / "class_dist.parquet"
global_class_dist.to_parquet(global_class_dist_path, index=False)

print("Saved:", global_class_dist_path)
global_class_dist

Saved: /kaggle/working/preprocessed_v2/P4/class_dist.parquet


Unnamed: 0,category,n_rows,pct_rows
0,car,1360053,0.772121
1,pedestrian,220037,0.124918
2,truck,92649,0.052598
3,bus,37872,0.0215
4,bicycle,16546,0.009393
5,other vehicle,12633,0.007172
6,rider,11660,0.00662
7,motorcycle,6758,0.003837
8,other person,1362,0.000773
9,train,1043,0.000592


In [38]:
# P4.7.2 — Class distribution per video
# Number of label rows for every (video, category) pair that occurs in the data.
rows_per_video_class = df.groupby(["video", "category"]).size()
per_video_class_dist = rows_per_video_class.reset_index(name="n_rows")

per_video_path = P4_DIR / "class_dist_per_video.parquet"
per_video_class_dist.to_parquet(per_video_path, index=False)

print("Saved:", per_video_path)
per_video_class_dist.head()

Saved: /kaggle/working/preprocessed_v2/P4/class_dist_per_video.parquet


Unnamed: 0,video,category,n_rows
0,0000f77c-6257be58,car,1031
1,0000f77c-62c2a288,car,827
2,0000f77c-62c2a288,pedestrian,57
3,0000f77c-62c2a288,truck,30
4,0000f77c-cb820c98,car,1177


## Kemacetan / Crowdedness Detection

# P5 - Day/Night + Blur Flags 

## Setup + ambil frame sampel dari frame berlabel

In [39]:
# P5.0 — Setup
from pathlib import Path
import pandas as pd
import numpy as np
import cv2
from collections import defaultdict

# Stage directories: P5 reads the locked P3 labels and writes under P5/.
PREP_DIR = Path("/kaggle/working/preprocessed_v2")
P3_DIR = PREP_DIR / "P3"
P5_DIR = PREP_DIR / "P5"
P5_DIR.mkdir(parents=True, exist_ok=True)

# Load labels (trusted)
df = pd.read_parquet(P3_DIR / "labels_clean_dropShort.parquet")

# Video directory
VIDEO_DIR = Path("/kaggle/input/driving-video-with-object-tracking/bdd100k_videos_train_00/bdd100k/videos/train")

# How many frames to sample per video
# (not used in this cell; presumably consumed by the sampling plan that follows)
N_SAMPLES = 10

print("Videos:", df["video"].nunique())
df.head()

Videos: 961


Unnamed: 0,name,video,frameIndex,track_id,category,attributes.crowd,attributes.occluded,attributes.truncated,box2d.x1,box2d.x2,...,width,height,x1,y1,x2,y2,bw,bh,area_px,ar
0,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89537,car,False,True,False,825.17321,1003.094688,...,1280,720,825.17321,355.011547,1003.094688,418.198614,177.921478,63.187067,11242.33635,2.815789
1,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89538,car,False,True,False,484.295612,700.461894,...,1280,720,484.295612,346.69746,700.461894,424.849885,216.166282,78.152425,16893.91911,2.765957
2,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89539,pedestrian,False,True,False,645.588915,663.879908,...,1280,720,645.588915,338.383372,663.879908,358.337182,18.290993,19.953811,364.975012,0.916667
3,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89540,car,False,False,False,120.969977,192.471132,...,1280,720,120.969977,359.168591,192.471132,409.053118,71.501155,49.884527,3566.801252,1.433333
4,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89541,car,False,False,False,251.501155,315.51963,...,1280,720,251.501155,354.180139,315.51963,400.73903,64.018476,46.558891,2980.629264,1.375


## Build sampling plan (frameIndex per video)

In [40]:
# P5.1 — Sampling plan (frameIndex per video)
rng = np.random.default_rng(42)  # seeded generator; not consumed in this cell

# Sorted array of the distinct labeled frame indices of every video.
video_frames = df.groupby("video")["frameIndex"].apply(
    lambda col: np.sort(col.unique())
)

# Map video -> list of at most N_SAMPLES frame indices.
sample_plan = {}
for v, frames in video_frames.items():
    n_available = len(frames)
    if n_available > N_SAMPLES:
        # Evenly spaced positions over the sorted indices, truncated to int.
        positions = np.linspace(0, n_available - 1, N_SAMPLES).astype(int)
        chosen = frames[positions]
    else:
        # Few enough frames: take all of them.
        chosen = frames
    sample_plan[v] = chosen.tolist()

# Quick check
list(sample_plan.items())[:3]

[('0000f77c-6257be58', [0, 22, 44, 67, 89, 112, 134, 157, 179, 202]),
 ('0000f77c-62c2a288', [8, 27, 47, 66, 103, 122, 142, 161, 181, 201]),
 ('0000f77c-cb820c98', [0, 22, 44, 66, 88, 111, 133, 155, 177, 200])]

## Extract brightness stats per video

In [41]:
# P5.2 — Brightness (day/night) per video
records_dn = []


def resolve_video_path(v):
    """Return the path under VIDEO_DIR for video identifier *v*.

    An identifier that already contains a dot is used as the file name
    unchanged; any other identifier gets ``.mov`` appended.
    """
    filename = v if "." in v else f"{v}.mov"
    return VIDEO_DIR / filename

# Videos from the sampling plan that yielded no brightness record
# (file not found, or no sampled frame could be decoded).
skipped_dn = []

for v, frames in sample_plan.items():
    vpath = resolve_video_path(v)
    if not vpath.exists():
        skipped_dn.append(v)
        continue

    cap = cv2.VideoCapture(str(vpath))
    vals = []
    try:
        for fidx in frames:
            # NOTE(review): the label frameIndex is used directly as the decoded
            # frame position; confirm labels and video share the same frame rate.
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(fidx))
            ok, frame = cap.read()
            if not ok:
                continue
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            vals.append({
                "mean_luma": float(gray.mean()),
                "median_luma": float(np.median(gray)),
            })
    finally:
        # Release the capture even when decoding or conversion raises.
        cap.release()

    if not vals:
        skipped_dn.append(v)
        continue

    # Per-video summary: mean of the frame means, median of the frame medians.
    records_dn.append({
        "video": v,
        "mean_luma": float(np.mean([x["mean_luma"] for x in vals])),
        "median_luma": float(np.median([x["median_luma"] for x in vals])),
        "n_samples": len(vals),
    })

# Make the lost coverage visible instead of dropping videos silently.
if skipped_dn:
    print(f"Brightness: skipped {len(skipped_dn)} video(s) without readable frames, e.g.", skipped_dn[:5])

# Explicit columns keep the schema stable even when no video produced a record.
day_night_df = pd.DataFrame(
    records_dn,
    columns=["video", "mean_luma", "median_luma", "n_samples"],
)
day_night_df.head()

Unnamed: 0,video,mean_luma,median_luma,n_samples
0,0000f77c-6257be58,111.463647,98.0,10
1,0000f77c-62c2a288,112.159377,87.0,10
2,0000f77c-cb820c98,77.596216,51.5,10
3,0001542f-5ce3cf52,36.133239,26.5,10
4,0001542f-7c670be8,47.894251,43.0,10


## Threshold & flag is_night

In [42]:
# P5.3 — Day/Night flag
NIGHT_THR = 60.0

# A video is flagged as night when its median luma is strictly below NIGHT_THR.
day_night_df["is_night"] = day_night_df["median_luma"].lt(NIGHT_THR)

day_night_path = P5_DIR / "day_night_flags.parquet"
day_night_df.to_parquet(day_night_path, index=False)
print("Saved:", day_night_path)

# Share of videos per flag value.
day_night_df["is_night"].value_counts(normalize=True)

Saved: /kaggle/working/preprocessed_v2/P5/day_night_flags.parquet


is_night
False    0.543184
True     0.456816
Name: proportion, dtype: float64

## Blur Detection

In [43]:
# P5.4 — Blur score (variance of Laplacian)
records_blur = []

# Videos from the sampling plan that yielded no blur record
# (file not found, or no sampled frame could be decoded).
skipped_blur = []

for v, frames in sample_plan.items():
    vpath = resolve_video_path(v)
    if not vpath.exists():
        skipped_blur.append(v)
        continue

    cap = cv2.VideoCapture(str(vpath))
    scores = []
    try:
        for fidx in frames:
            # NOTE(review): the label frameIndex is used directly as the decoded
            # frame position; confirm labels and video share the same frame rate.
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(fidx))
            ok, frame = cap.read()
            if not ok:
                continue
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            lap = cv2.Laplacian(gray, cv2.CV_64F)
            # Score of one frame: variance of its Laplacian response.
            scores.append(lap.var())
    finally:
        # Release the capture even when decoding or filtering raises.
        cap.release()

    if not scores:
        skipped_blur.append(v)
        continue

    records_blur.append({
        "video": v,
        "blur_score_mean": float(np.mean(scores)),
        "blur_score_median": float(np.median(scores)),
        "n_samples": len(scores),
    })

# Make the lost coverage visible instead of dropping videos silently.
if skipped_blur:
    print(f"Blur: skipped {len(skipped_blur)} video(s) without readable frames, e.g.", skipped_blur[:5])

# Explicit columns keep the schema stable even when no video produced a record.
blur_df = pd.DataFrame(
    records_blur,
    columns=["video", "blur_score_mean", "blur_score_median", "n_samples"],
)
blur_df.head()

Unnamed: 0,video,blur_score_mean,blur_score_median,n_samples
0,0000f77c-6257be58,595.377104,616.273568,10
1,0000f77c-62c2a288,83.937757,87.119782,10
2,0000f77c-cb820c98,284.983534,258.074807,10
3,0001542f-5ce3cf52,42.106747,32.197536,10
4,0001542f-7c670be8,48.328218,38.466577,10


In [44]:
# P5.5 — Blur flag
BLUR_THR = 100.0

# A video is flagged as blurry when its median blur score is strictly below BLUR_THR.
blur_df["is_blur"] = blur_df["blur_score_median"].lt(BLUR_THR)

blur_path = P5_DIR / "blur_flags.parquet"
blur_df.to_parquet(blur_path, index=False)
print("Saved:", blur_path)

# Share of videos per flag value.
blur_df["is_blur"].value_counts(normalize=True)

Saved: /kaggle/working/preprocessed_v2/P5/blur_flags.parquet


is_blur
False    0.736733
True     0.263267
Name: proportion, dtype: float64

# P6 - Sampling Manifest for Training

## Setup & load data

In [45]:
# P6.0 — Setup
from pathlib import Path
import pandas as pd
import numpy as np
import json

PREP_DIR = Path("/kaggle/working/preprocessed_v2")
P3_DIR, P5_DIR, P6_DIR = (PREP_DIR / stage for stage in ("P3", "P5", "P6"))
P6_DIR.mkdir(parents=True, exist_ok=True)

# Per-video condition flags read from the P5 output directory.
day_night = pd.read_parquet(P5_DIR / "day_night_flags.parquet")
blur = pd.read_parquet(P5_DIR / "blur_flags.parquet")

# Attach both flags to every label row. Left joins keep label rows whose
# video has no flag entry; those rows get a missing value for that flag.
df = (
    pd.read_parquet(P3_DIR / "labels_clean_dropShort.parquet")
    .merge(day_night[["video", "is_night"]], on="video", how="left")
    .merge(blur[["video", "is_blur"]], on="video", how="left")
)

df.head()

Unnamed: 0,name,video,frameIndex,track_id,category,attributes.crowd,attributes.occluded,attributes.truncated,box2d.x1,box2d.x2,...,x1,y1,x2,y2,bw,bh,area_px,ar,is_night,is_blur
0,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89537,car,False,True,False,825.17321,1003.094688,...,825.17321,355.011547,1003.094688,418.198614,177.921478,63.187067,11242.33635,2.815789,False,False
1,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89538,car,False,True,False,484.295612,700.461894,...,484.295612,346.69746,700.461894,424.849885,216.166282,78.152425,16893.91911,2.765957,False,False
2,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89539,pedestrian,False,True,False,645.588915,663.879908,...,645.588915,338.383372,663.879908,358.337182,18.290993,19.953811,364.975012,0.916667,False,False
3,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89540,car,False,False,False,120.969977,192.471132,...,120.969977,359.168591,192.471132,409.053118,71.501155,49.884527,3566.801252,1.433333,False,False
4,01c71072-718028b8-0000001.jpg,01c71072-718028b8,0,89541,car,False,False,False,251.501155,315.51963,...,251.501155,354.180139,315.51963,400.73903,64.018476,46.558891,2980.629264,1.375,False,False


## Track length + sampling ratio

In [46]:
# P6.1 — Track length stats
track_keys = ["video", "track_id", "category"]

# Number of distinct frame indices per (video, track_id, category) group.
frames_per_track = df.groupby(track_keys)["frameIndex"].nunique()
track_len = frames_per_track.reset_index(name="track_len")

# Broadcast the per-track length back onto every label row.
df = df.merge(track_len, on=track_keys, how="left")

In [47]:
# P6.2 — Track-aware keep interval
def keep_interval(row):
    """Return the keep-every-Nth stride for one label row.

    Rows whose ``category`` is not ``"car"`` always get 1. For ``"car"``
    rows the stride is chosen from ``row["track_len"]``: 1 for lengths up
    to 30, 3 for lengths up to 100, and 5 otherwise.
    """
    if row["category"] != "car":
        return 1

    length = row["track_len"]
    # (inclusive upper bound, stride) pairs, checked in ascending order.
    for upper_bound, stride in ((30, 1), (100, 3)):
        if length <= upper_bound:
            return stride
    return 5

# Vectorized form of applying keep_interval to every row: the first matching
# condition wins, so the order mirrors the function's branches
# (non-car -> 1, track_len <= 30 -> 1, track_len <= 100 -> 3, otherwise 5).
# This avoids a Python-level call per label row.
df["keep_every"] = np.select(
    [
        df["category"].ne("car"),
        df["track_len"].le(30),
        df["track_len"].le(100),
    ],
    [1, 1, 3],
    default=5,
)

## Frame-level keep decision

In [48]:
# P6.3 — Frame-level decision
df = df.sort_values(["video", "frameIndex", "track_id"])

# 0-based position of each row inside its (video, track_id) group, in the
# sorted row order established above.
pos_in_track = df.groupby(["video", "track_id"]).cumcount()

# Keep a row when its position is a multiple of the row's keep_every stride.
df["keep_by_track"] = (pos_in_track % df["keep_every"]).eq(0)

## Small-object override

In [49]:
# P6.4 — Small-object protection
# Box area as a fraction of the frame area (width * height).
frame_area_px = df["width"] * df["height"]
df["area_norm"] = df["area_px"].div(frame_area_px)

# Boxes covering less than 0.5% of the frame count as small objects.
df["is_small_obj"] = df["area_norm"].lt(0.005)

## Aggregate to frame-level manifest

In [50]:
# P6.5 — Frame-level aggregation
# Collapse label rows to one row per (video, frameIndex).
rows_by_frame = df.groupby(["video", "frameIndex"])
frame_keep = rows_by_frame.agg(
    keep_track=("keep_by_track", "any"),    # any row of the frame kept by its track stride
    has_small_obj=("is_small_obj", "any"),  # any small box in the frame
    is_night=("is_night", "first"),
    is_blur=("is_blur", "first"),
).reset_index()

# A frame is kept when a track stride selects it or it contains a small object.
frame_keep["keep"] = frame_keep["keep_track"] | frame_keep["has_small_obj"]

## Per-video cap (fallback)

In [51]:
# P6.6 — Per-video cap
MAX_FRAMES_DEFAULT = 300

rng = np.random.default_rng(42)  # seeded generator; not consumed in this cell
final_frames = []

for v, sub in frame_keep.groupby("video"):
    # Frames of this video that passed the keep decision.
    keep_frames = sub.loc[sub["keep"]].copy()
    over_cap = len(keep_frames) > MAX_FRAMES_DEFAULT
    # Above the cap: draw MAX_FRAMES_DEFAULT frames with a fixed seed.
    final_frames.append(
        keep_frames.sample(MAX_FRAMES_DEFAULT, random_state=42) if over_cap else keep_frames
    )

train_frames = pd.concat(final_frames, ignore_index=True)

## Build TRAIN manifest

In [52]:
# P6.7 — Train manifest
# One {"video", "frameIndex"} record per kept frame, ordered by video then frame.
manifest_rows = train_frames[["video", "frameIndex"]].sort_values(["video", "frameIndex"])
train_manifest = manifest_rows.to_dict(orient="records")

train_manifest_path = P6_DIR / "train_manifest.json"
with train_manifest_path.open("w") as f:
    json.dump(train_manifest, f, indent=2)

print("Saved:", train_manifest_path)

Saved: /kaggle/working/preprocessed_v2/P6/train_manifest.json


## Eval strata (for analysis & reporting)

In [53]:
# P6.8 — Eval strata
# Count the kept frames for every (is_night, is_blur, has_small_obj) combination.
# dropna=False keeps frames whose flags are missing (the flags were attached
# with left joins) as their own stratum; with the default dropna=True they
# would silently vanish and n_frames would not add up to len(train_frames).
strata_keys = ["is_night", "is_blur", "has_small_obj"]
eval_strata = (
    train_frames.groupby(strata_keys, dropna=False)
    .size()
    .reset_index(name="n_frames")
)

eval_strata_path = P6_DIR / "eval_strata.parquet"
eval_strata.to_parquet(eval_strata_path, index=False)

eval_strata

Unnamed: 0,is_night,is_blur,has_small_obj,n_frames
0,False,False,False,3176
1,False,False,True,94151
2,False,True,False,192
3,False,True,True,3865
4,True,False,False,1654
5,True,False,True,38723
6,True,True,False,2582
7,True,True,True,40798
