In [4]:
import pandas as pd
from pathlib import Path
import os

# Load the image manifest; the checks below read its split, class_id, camera,
# driver_id and img_num columns.
manifest_csv = Path("/Users/claudiapacheco/TFM/outputs/manifests/manifest.csv")
manifest = pd.read_csv(manifest_csv)
split_col = manifest["split"]

# Share of all manifest rows that falls in each split.
print("Split % of total:")
split_share = split_col.value_counts(normalize=True).mul(100)
print(split_share.round(2))

# Class mix inside each split: every row of the split x class table is divided
# by its own row total, so each split's percentages add up to 100.
print("\nWithin-split class % (each split sums to 100):")
split_class_sizes = manifest.groupby(["split", "class_id"]).size()
counts = split_class_sizes.unstack(fill_value=0)
perc = counts.div(counts.sum(axis=1), axis=0) * 100
print(perc.round(2))

# Absolute row counts, overall and per split.
print("Total images:", len(manifest))
print("\nBy split:")
print(split_col.value_counts(dropna=False))

# Long-format view of the same split x class sizes computed above.
print("\nBy split x class:")
print(split_class_sizes)

print("\nBy split x camera:")
split_camera_sizes = manifest.groupby(["split", "camera"]).size()
print(split_camera_sizes)

# Number of validation rows contributed by each driver.
print("\nVAL coverage by driver (should only be VAL rows):")
val_drivers = manifest.loc[split_col == "val", "driver_id"]
print(val_drivers.value_counts(dropna=False))

print("\nMissing img_num / class_id sanity:")
n_missing_img_num = manifest["img_num"].isna().sum()
print("Missing img_num:", n_missing_img_num)
class_ids = sorted(manifest["class_id"].unique())
print("Unique class_ids:", class_ids)

# Wide-format split x class table (zero-filled), reused from the percentage step.
print("\nClass balance per split:")
print(counts)



Split % of total:
split
train    73.65
test     13.28
val      13.08
Name: proportion, dtype: float64

Within-split class % (each split sums to 100):
class_id     c0     c1     c2    c3    c4    c5    c6    c7    c8     c9
split                                                                   
test      18.03  11.06  10.08  9.35  8.88  8.83  7.43  7.43  7.58  11.32
train     21.11  11.93   8.32  7.44  9.20  7.37  7.32  6.97  7.11  13.24
val       20.46  12.29   9.23  8.02  8.97  8.86  8.07  7.86  7.44   8.81
Total images: 14500

By split:
split
train    10679
test      1925
val       1896
Name: count, dtype: int64

By split x class:
split  class_id
test   c0           347
       c1           213
       c2           194
       c3           180
       c4           171
       c5           170
       c6           143
       c7           143
       c8           146
       c9           218
train  c0          2254
       c1          1274
       c2           889
       c3           794
      

### Notes:
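- Would like to see split counts as percentages of the total number of images (the "Split % of total" output above now shows this: train 73.65 %, test 13.28 %, val 13.08 %).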
- No class seems to be severely underrepresented
- I believe the authors purposely over-represented class 0 (c0) because it is the normal-driving class, which the model had the most trouble with. TODO: review the authors' description to confirm this.
- Open question: are there any negative consequences of two of the drivers being represented significantly more than the others in the validation set?

In [5]:
# Read the three per-split CSV files from the splits directory, in the order
# train, val, test.
splits_dir = Path("/Users/claudiapacheco/TFM/outputs/splits")
train, val, test = (
    pd.read_csv(splits_dir / filename)
    for filename in ("train.csv", "val.csv", "test.csv")
)

def inter(a, b, max_examples=5):
    """Report which image paths two split tables have in common.

    Parameters
    ----------
    a, b : pandas.DataFrame
        Tables with a "path" column.
    max_examples : int, default 5
        Upper bound on how many of the shared paths are returned as examples.

    Returns
    -------
    tuple[int, list]
        The number of distinct paths present in both tables, followed by up to
        ``max_examples`` of those paths in sorted order.
    """
    shared = set(a["path"]) & set(b["path"])
    # Iteration order of a set of strings can change from one interpreter run
    # to the next (hash randomisation), so sort before slicing to show the same
    # examples on every run. key=str keeps the sort from failing if a
    # non-string value (e.g. NaN from a blank cell) is among the shared entries.
    return len(shared), sorted(shared, key=str)[:max_examples]

# Pairwise leakage check between the split tables: each line prints the tuple
# returned by inter() for one pair of splits.
for label, left, right in (
    ("VAL∩TRAIN:", val, train),
    ("VAL∩TEST :", val, test),
    ("TRAIN∩TEST:", train, test),
):
    print(label, inter(left, right))

VAL∩TRAIN: (0, [])
VAL∩TEST : (0, [])
TRAIN∩TEST: (0, [])


- No image path appears in more than one split: the pairwise intersections VAL∩TRAIN, VAL∩TEST and TRAIN∩TEST are all empty.

In [6]:
# Validation rows of the manifest, tabulated as driver x class counts of
# non-null "path" values (combinations with no rows are shown as 0).
# NOTE(review): this rebinds `val`, which the split-overlap cell loaded from
# val.csv; re-run that cell before reusing its `val`.
val = manifest.loc[manifest["split"].eq("val")]
val_pdc = val.pivot_table(
    index="driver_id",
    columns="class_id",
    values="path",
    aggfunc="count",
    fill_value=0,
)
print("\nVAL per-driver per-class counts:")
print(val_pdc)



VAL per-driver per-class counts:
class_id    c0  c1  c2  c3  c4  c5  c6  c7  c8  c9
driver_id                                         
D001       124  33  31  18  17  19  29  23  29  30
D002        72  73  29  31  42  20  30  28  13  22
D003        55  40  30  16  29  39  16  21  22  35
D004        77  27  25  27  22  30  18  17  17  20
D008        20  20  20  20  20  20  20  20  20  20
D009        20  20  20  20  20  20  20  20  20  20
D010        20  20  20  20  20  20  20  20  20  20


In [7]:
# Check that every manifest row points at a file that is actually on disk.
# A blank path cell is read from the CSV as NaN (a float), and os.path.exists
# raises TypeError when handed a float, so only str / path-like values are
# passed to it; rows with any other value are reported as missing instead of
# aborting the whole check.
path_on_disk = manifest["path"].map(
    lambda p: isinstance(p, (str, os.PathLike)) and os.path.exists(p)
).astype(bool)  # force bool dtype so `~` and boolean indexing also work on an empty manifest
missing = manifest[~path_on_disk]
print("\nMissing files referenced in CSV:", len(missing))
if len(missing):
    print(missing.head())



Missing files referenced in CSV: 0
