## Raw Readers

In [None]:
import os

# Directory of cached feature files whose names carry a "_240frames" marker to strip.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
folder = "/Users/gabriele/Desktop/Magistrale/Explainable_and_trustworthy_AI/progetti/concept_gridlock/kaggle/working/road-save/comma/cache/resnet50I3D512-Pkinetics-b4s8x1x1-commat3-h3x3x3"

# isdir (not exists): a regular file at this path would make os.listdir fail.
if not os.path.isdir(folder):
    print(f"❌ The folder '{folder}' does not exist!")
else:
    print(f"📂 Scanning folder: {folder}")

    for filename in os.listdir(folder):
        print(f"🔎 Found file: {filename}")  # debug

        if "_240frames" not in filename:
            continue

        new_name = filename.replace("_240frames", "")
        old_path = os.path.join(folder, filename)
        new_path = os.path.join(folder, new_name)

        # os.rename replaces an existing destination on POSIX without warning;
        # refuse to clobber a file that already has the target name.
        if os.path.exists(new_path):
            print(f"⚠️ Skipped: {filename} → {new_name} (target already exists)")
            continue

        os.rename(old_path, new_path)
        print(f"✅ Renamed: {filename} → {new_name}")

In [None]:
# Clone the openpilot fork into ./openpilot (relative to the current working directory).
# A later cell appends "/content/openpilot" to sys.path, so this is expected to run from /content.
!git clone https://github.com/Gabriele-Raffaele/openpilot.git

In [None]:
!pip install pycapnp
!pip install smbus2
!pip install lru-dict

In [None]:
from __future__ import print_function
import os
import numpy as np
import sys
from tqdm import tqdm
import cv2
import platform
platform.architecture()
sys.path.append("/content/openpilot")
from tools.lib.logreader import LogReader
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from tools.lib.framereader import FrameReader
import h5py

In [None]:
# Clone the comma2k19 repository into ./comma2k19 (relative to the current working directory).
!git clone "https://github.com/commaai/comma2k19.git"

In [None]:
# Download first chunk (~9GB).
# The URL is pinned to a fixed dataset revision (the commit hash after /resolve/),
# and the archive is written to ./Chunk_1.zip.
!wget https://huggingface.co/datasets/commaai/comma2k19/resolve/431c287f12295222eb427a9cff821d63101f2169/Chunk_1.zip -O Chunk_1.zip


In [None]:
# Create the dataset directory (no error if it already exists) and move the archive into it.
!mkdir -p /content/dataset
!mv Chunk_1.zip /content/dataset

In [None]:
# Extract the archive into /content/dataset; later cells read from /content/dataset/Chunk_1.
!unzip /content/dataset/Chunk_1.zip -d /content/dataset

In [None]:
removed_files = []
def get_sample(p):
    """Build one sample (frames + driving signals) from a single segment directory.

    Reads ``video.hevc`` and ``raw_log.bz2`` under ``p``. carState-derived
    signals keep every 25th message (``[1::5][1::5]`` -> indices 6, 31, 56, ...);
    the radar lead distance keeps every 5th radarState message (``[1::5]``);
    video frames keep every 5th frame and are resized to 224x224.

    Args:
        p: Path of the segment directory.

    Returns:
        A dict with the key ``'image'`` (list of float64 224x224x3 arrays) and
        one NumPy array per signal, or ``None`` when the segment is discarded.
        A segment is discarded (and ``p`` appended to the module-level
        ``removed_files``) when more than 20% of ``vEgo`` or ``dist`` samples
        are exactly zero, when ``dist`` has 230 samples or fewer, or when more
        than 20% of the derived ``desired_dist`` values are zero.
    """
    frame_reader = FrameReader(p+'/video.hevc')
    logs = list(LogReader(p + '/raw_log.bz2'))

    # Filter and subsample the carState events once. Slicing the event list
    # selects exactly the elements the per-field array slices used to select,
    # without rescanning the whole log for every field.
    car_events = [event for event in logs if event.which() == 'carState'][1::5][1::5]
    car_states = [event.carState for event in car_events]
    cruise_states = [state.cruiseState for state in car_states]

    time = np.array([event.logMonoTime for event in car_events])
    angle = np.array([state.steeringAngleDeg for state in car_states])
    vEgo = np.array([state.vEgo for state in car_states])
    gas = np.array([state.gas for state in car_states])
    gaspressed = np.array([state.gasPressed for state in car_states])
    brake = np.array([state.brake for state in car_states])
    brake_pressed = np.array([state.brakePressed for state in car_states])
    leftBlinker = np.array([state.leftBlinker for state in car_states])
    rightBlinker = np.array([state.rightBlinker for state in car_states])

    enabled = np.array([cruise.enabled for cruise in cruise_states])
    speed = np.array([cruise.speed for cruise in cruise_states])
    speedOffset = np.array([cruise.speedOffset for cruise in cruise_states])
    standstill = np.array([cruise.standstill for cruise in cruise_states])
    nonAdaptive = np.array([cruise.nonAdaptive for cruise in cruise_states])
    speedCluster = np.array([cruise.speedCluster for cruise in cruise_states])

    dist = np.array([event.radarState.leadOne.dRel for event in logs if event.which() == "radarState"])[1::5]

    if ((vEgo == 0).mean() > 0.2) or ((dist == 0).mean() > 0.2) or len(dist) <=230:
        print(f"❌ Removed {p} because vEgo/dist too low, vEgo_zeros={(vEgo==0).mean():.2f}, dist_zeros={(dist==0).mean():.2f}, len(dist)={len(dist)}\n")
        removed_files.append(p)
        return None

    # "Steady state": no pedal pressed and no blinker active.
    steady_state = ~gaspressed & ~brake_pressed & ~leftBlinker & ~rightBlinker
    desired_gap = np.zeros(steady_state.shape)
    last_idx = 0
    # Back-fill each stretch since the previous steady sample with the lead
    # distance observed at the current steady sample. The last index is never
    # visited (as before); the bound is also capped by len(dist) because the
    # radar and carState series are subsampled independently and dist[i]
    # would otherwise raise IndexError when dist is the shorter one.
    for i in range(min(len(steady_state) - 1, len(dist))):
        if steady_state[i]:
            desired_gap[last_idx:i] = int(dist[i])
            last_idx = i

    # Apply the last filter before decoding video so that rejected segments
    # do not pay for frame extraction.
    if (desired_gap == 0).mean() > 0.2:
        print(f"❌ Discarded {p} because desired_gap too low {(desired_gap == 0).mean():.2f}")
        removed_files.append(p)
        return None

    images = []
    for idx in range(frame_reader.frame_count)[1::5]:
        image = np.array(frame_reader.get(idx, pix_fmt='rgb24')[0], dtype=np.float64)
        image = cv2.resize(image, dsize=(224, 224), interpolation=cv2.INTER_CUBIC)
        images.append(image)

    sample = {
        'image': images,
        "CruiseStateenabled": enabled,
        "CruiseStatespeed": speed,
        "CruiseStatespeedOffset": speedOffset,
        "CruiseStatestandstill": standstill,
        "CruiseStatenonAdaptive": nonAdaptive,
        "CruiseStatespeedCluster": speedCluster,
        'leftBlinker': leftBlinker,
        'rightBlinker': rightBlinker,
        "gas": gas,
        "gaspressed": gaspressed,
        "brake": brake,
        "brakepressed": brake_pressed,
        'angle': angle,
        'time': time,
        'vEgo': vEgo,
        'dist': dist,
        'desired_dist': desired_gap,
        }
    return sample

In [None]:
removed_files = []
def get_sample_2(p):
    """Image-free variant of the sample builder, used to validate a segment.

    Reads only ``raw_log.bz2`` under ``p`` and applies the same signal
    filters as the image-producing builder, without decoding any video.
    The video file is no longer opened here: the original constructed a
    FrameReader and never used it.

    Args:
        p: Path of the segment directory.

    Returns:
        A dict of NumPy arrays (one per signal, no ``'image'`` key), or
        ``None`` when the segment is discarded. A segment is discarded (and
        ``p`` appended to the module-level ``removed_files``) when more than
        20% of ``vEgo`` or ``dist`` samples are exactly zero, when ``dist``
        has 230 samples or fewer, or when more than 20% of the derived
        ``desired_dist`` values are zero.
    """
    logs = list(LogReader(p + '/raw_log.bz2'))

    # Filter and subsample the carState events once ([1::5][1::5] keeps
    # indices 6, 31, 56, ...), then read every field from that short list.
    car_events = [event for event in logs if event.which() == 'carState'][1::5][1::5]
    car_states = [event.carState for event in car_events]
    cruise_states = [state.cruiseState for state in car_states]

    time = np.array([event.logMonoTime for event in car_events])
    angle = np.array([state.steeringAngleDeg for state in car_states])
    vEgo = np.array([state.vEgo for state in car_states])
    gas = np.array([state.gas for state in car_states])
    gaspressed = np.array([state.gasPressed for state in car_states])
    brake = np.array([state.brake for state in car_states])
    brake_pressed = np.array([state.brakePressed for state in car_states])
    leftBlinker = np.array([state.leftBlinker for state in car_states])
    rightBlinker = np.array([state.rightBlinker for state in car_states])

    enabled = np.array([cruise.enabled for cruise in cruise_states])
    speed = np.array([cruise.speed for cruise in cruise_states])
    speedOffset = np.array([cruise.speedOffset for cruise in cruise_states])
    standstill = np.array([cruise.standstill for cruise in cruise_states])
    nonAdaptive = np.array([cruise.nonAdaptive for cruise in cruise_states])
    speedCluster = np.array([cruise.speedCluster for cruise in cruise_states])

    dist = np.array([event.radarState.leadOne.dRel for event in logs if event.which() == "radarState"])[1::5]

    if ((vEgo == 0).mean() > 0.2) or ((dist == 0).mean() > 0.2) or len(dist) <=230:
        removed_files.append(p)
        return None

    # "Steady state": no pedal pressed and no blinker active.
    steady_state = ~gaspressed & ~brake_pressed & ~leftBlinker & ~rightBlinker
    desired_gap = np.zeros(steady_state.shape)
    last_idx = 0
    # Back-fill each stretch since the previous steady sample with the lead
    # distance at the current steady sample. The bound is capped by len(dist)
    # because dist is subsampled independently and dist[i] would otherwise
    # raise IndexError when it is the shorter series.
    for i in range(min(len(steady_state) - 1, len(dist))):
        if steady_state[i]:
            desired_gap[last_idx:i] = int(dist[i])
            last_idx = i

    if (desired_gap == 0).mean() > 0.2:
        removed_files.append(p)
        return None

    sample = {
        "CruiseStateenabled": enabled,
        "CruiseStatespeed": speed,
        "CruiseStatespeedOffset": speedOffset,
        "CruiseStatestandstill": standstill,
        "CruiseStatenonAdaptive": nonAdaptive,
        "CruiseStatespeedCluster": speedCluster,
        'leftBlinker': leftBlinker,
        'rightBlinker': rightBlinker,
        "gas": gas,
        "gaspressed": gaspressed,
        "brake": brake,
        "brakepressed": brake_pressed,
        'angle': angle,
        'time': time,
        'vEgo': vEgo,
        'dist': dist,
        'desired_dist': desired_gap,
        }
    return sample

In [None]:
def save_h5py(i, sample, h):
    """Store every entry of ``sample`` as a dataset in a new group of ``h``.

    Args:
        i: Identifier of the sample; ``str(i)`` becomes the group name.
        sample: Mapping from column name to a sequence of values.
        h: Open, writable HDF5 file (or group) that receives the new group.

    The ``'image'`` column is cast to ``int``; every other column is cast to
    ``np.float32``. All datasets are written with LZF compression.
    """
    # NOTE(review): float32 cannot hold large integer values exactly, so a
    # timestamp-like column may lose precision here; confirm with consumers.
    target_group = h.create_group(str(i))
    for column, values in sample.items():
        if column == 'image':
            out_dtype = int
        else:
            out_dtype = np.float32
        payload = np.asarray(list(values), dtype=out_dtype)
        target_group.create_dataset(column, data=payload, compression='lzf')

# Use this variant instead if you also want to save video_id as a feature
'''def save_h5py(i, sample, h):
    group = h.create_group(str(i))
    for col in sample.keys():
        if col == "video_id":
            dt = h5py.string_dtype(encoding='utf-8')
            group.create_dataset(col, data=sample[col], dtype=dt)
        else:
            dt = np.float32 if col != 'image' else int
            a = list(sample[col])
            group.create_dataset(col, data=np.asarray(a, dtype=dt), compression='lzf')'''
None

In [None]:
!rm -r "/content/gas_and_brake_train_comma_chunk_1_w_imgs.hdf5"
!rm -r "/content/gas_and_brake_test_comma_chunk_1_w_imgs.hdf5"
!rm -r "/content/gas_and_brake_val_comma_chunk_1_w_imgs.hdf5"

In [None]:
# Root folder scanned by the per-drive ("old version") export loop.
main_dir='/content/dataset/Chunk_1_reduced/'

# One output file per split. Mode 'w' creates the file, truncating any existing
# one, and the paths are relative to the current working directory.
train_filename, val_filename, test_filename = (
    "gas_and_brake_train_comma_chunk_1_w_imgs.hdf5",
    "gas_and_brake_val_comma_chunk_1_w_imgs.hdf5",
    "gas_and_brake_test_comma_chunk_1_w_imgs.hdf5",
)
h_train = h5py.File(train_filename, 'w')
h_val = h5py.File(val_filename, 'w')
h_test = h5py.File(test_filename, 'w')
# Kept for parity with the original cell, which left the last opened name here.
hdf5_filename = test_filename

In [None]:
#Old version
# Per-drive split: within each drive folder the last segment goes to validation,
# the second-to-last to test, and all earlier segments to train.
for drive_sequence_path in tqdm(sorted(os.listdir(main_dir))):
    drive_dir = os.path.join(main_dir, drive_sequence_path)
    # The directory test also rejects stray files such as .DS_Store.
    if not os.path.isdir(drive_dir):
        print(f"⚠️ Skipping non-directory or DS_Store: {drive_sequence_path}")
        continue

    # os.listdir returns entries in arbitrary order, so "last" and
    # "second-to-last" are only meaningful after sorting. Numeric names are
    # ordered by value ('9' before '10'); anything else sorts after them.
    min_sequence_paths = sorted(
        (name for name in os.listdir(drive_dir)
         if os.path.isdir(os.path.join(drive_dir, name))),
        key=lambda name: (not name.isdigit(), int(name) if name.isdigit() else 0, name),
    )
    if len(min_sequence_paths) < 3:
        print(f"⚠️ Skipping {drive_sequence_path} because minor than 3 subdirectory ")
        continue

    min_sequence_path_test = os.path.join(drive_dir, min_sequence_paths[-2])
    min_sequence_paths_val = os.path.join(drive_dir, min_sequence_paths[-1])

    sample = get_sample(min_sequence_path_test)
    if sample is not None:
        save_h5py(f"{drive_sequence_path}", sample, h_test)
        print(f"✅ Test saved: {drive_sequence_path}")
    else:
        print(f"❌ Test discarded: {min_sequence_path_test}")

    sample = get_sample(min_sequence_paths_val)
    if sample is not None:
        save_h5py(f"{drive_sequence_path}", sample, h_val)
        print(f"✅ Val saved: {drive_sequence_path}")
    else:
        # Report the validation path (the original printed the test path here).
        print(f"❌ Val discarded: {min_sequence_paths_val}")

    # Train on everything except the two held-out segments. The original
    # sliced [:-1], which also wrote the test segment into the train file.
    for min_sequence in min_sequence_paths[:-2]:
        p = os.path.join(drive_dir, min_sequence)
        sample = get_sample(p)
        if sample is not None:
            save_h5py(f"{drive_sequence_path}_{min_sequence}", sample, h_train)


In [None]:
import os
import random
from tqdm import tqdm

main_dir = "/content/dataset/Chunk_1"  # principal folder
split_ratios = {"train": 0.7, "val": 0.15, "test": 0.15}

# --- Collect all valid sequences ---
all_sequences = []      # (drive folder name, segment name, full segment path)
removed_sequences = []  # full paths of segments rejected by get_sample_2

# os.listdir returns entries in arbitrary order. Sorting both levels makes the
# list identical on every run, which the seeded shuffle below relies on to
# produce the same split.
for drive_sequence_path in tqdm(sorted(os.listdir(main_dir)), desc="Scanning drives"):
    full_drive_path = os.path.join(main_dir, drive_sequence_path)
    if not os.path.isdir(full_drive_path) or ".DS_Store" in drive_sequence_path:
        continue

    for seq in sorted(os.listdir(full_drive_path)):
        full_seq_path = os.path.join(full_drive_path, seq)
        if not os.path.isdir(full_seq_path):
            continue
        # ⚠️ Check if the sequence is valid (signal filters only, no frames decoded here)
        sample = get_sample_2(full_seq_path)
        if sample is not None:
            all_sequences.append((drive_sequence_path, seq, full_seq_path))
        else:
            removed_sequences.append(full_seq_path)

print(f"📦 Found {len(all_sequences)} valid sequences")
print(f"🗑️ Discarded {len(removed_sequences)} sequences")

# --- Shuffle ---
random.seed(42)  # for reproducibility
random.shuffle(all_sequences)

# --- Global split ---
# Segments are split individually, so segments of one drive can land in
# different splits.
n_total = len(all_sequences)
n_train = int(split_ratios["train"] * n_total)
n_val   = int(split_ratios["val"] * n_total)
# test = rest (absorbs the rounding remainder)
train_seqs = all_sequences[:n_train]
val_seqs   = all_sequences[n_train:n_train+n_val]
test_seqs  = all_sequences[n_train+n_val:]

print(f"👉 Train: {len(train_seqs)}, Val: {len(val_seqs)}, Test: {len(test_seqs)}")

In [None]:
train_seqs= [('b0c9d2329ad1606b|2018-08-10--22-42-26', '5', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-10--22-42-26/5'), ('b0c9d2329ad1606b|2018-07-30--13-03-07', '21', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-07-30--13-03-07/21'), ('b0c9d2329ad1606b|2018-08-14--10-32-01', '34', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-14--10-32-01/34'), ('b0c9d2329ad1606b|2018-08-17--14-55-39', '6', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-17--14-55-39/6'), ('b0c9d2329ad1606b|2018-07-30--13-44-30', '11', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-07-30--13-44-30/11'), ('b0c9d2329ad1606b|2018-08-17--14-17-47', '11', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-17--14-17-47/11'), ('b0c9d2329ad1606b|2018-08-02--08-34-47', '40', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-02--08-34-47/40'), ('b0c9d2329ad1606b|2018-08-06--10-04-53', '25', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-06--10-04-53/25'), ('b0c9d2329ad1606b|2018-08-14--10-32-01', '35', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-14--10-32-01/35'), ('b0c9d2329ad1606b|2018-08-03--10-35-16', '11', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-03--10-35-16/11'), ('b0c9d2329ad1606b|2018-08-14--10-32-01', '26', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-14--10-32-01/26'), ('b0c9d2329ad1606b|2018-08-14--10-32-01', '28', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-14--10-32-01/28'), ('b0c9d2329ad1606b|2018-08-17--14-17-47', '6', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-17--14-17-47/6'), ('b0c9d2329ad1606b|2018-08-02--08-34-47', '31', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-02--08-34-47/31'), ('b0c9d2329ad1606b|2018-08-10--22-42-26', '10', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-10--22-42-26/10'), ('b0c9d2329ad1606b|2018-08-17--12-07-08', '32', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-17--12-07-08/32'), ('b0c9d2329ad1606b|2018-08-10--22-42-26', '7', 
'/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-10--22-42-26/7'), ('b0c9d2329ad1606b|2018-07-30--13-44-30', '10', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-07-30--13-44-30/10'), ('b0c9d2329ad1606b|2018-08-17--12-07-08', '35', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-17--12-07-08/35'), ('b0c9d2329ad1606b|2018-08-17--12-07-08', '41', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-17--12-07-08/41'), ('b0c9d2329ad1606b|2018-08-14--10-32-01', '32', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-14--10-32-01/32'), ('b0c9d2329ad1606b|2018-08-15--09-01-03', '21', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-15--09-01-03/21'), ('b0c9d2329ad1606b|2018-07-30--13-44-30', '8', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-07-30--13-44-30/8'), ('b0c9d2329ad1606b|2018-08-10--22-42-26', '8', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-10--22-42-26/8'), ('b0c9d2329ad1606b|2018-08-03--10-35-16', '9', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-03--10-35-16/9'), ('b0c9d2329ad1606b|2018-07-29--11-17-20', '6', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-07-29--11-17-20/6'), ('b0c9d2329ad1606b|2018-08-03--10-35-16', '4', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-03--10-35-16/4'), ('b0c9d2329ad1606b|2018-07-30--13-44-30', '12', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-07-30--13-44-30/12'), ('b0c9d2329ad1606b|2018-08-03--10-35-16', '13', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-03--10-35-16/13'), ('b0c9d2329ad1606b|2018-08-14--10-32-01', '29', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-14--10-32-01/29'), ('b0c9d2329ad1606b|2018-08-10--22-42-26', '9', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-10--22-42-26/9'), ('b0c9d2329ad1606b|2018-08-14--10-32-01', '30', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-14--10-32-01/30'), ('b0c9d2329ad1606b|2018-08-14--10-32-01', '33', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-14--10-32-01/33'), ('b0c9d2329ad1606b|2018-08-15--09-01-03', '23', 
'/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-15--09-01-03/23'), ('b0c9d2329ad1606b|2018-08-14--10-32-01', '27', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-14--10-32-01/27'), ('b0c9d2329ad1606b|2018-08-02--16-41-38', '17', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-02--16-41-38/17'), ('b0c9d2329ad1606b|2018-07-27--06-50-48', '9', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-07-27--06-50-48/9'), ('b0c9d2329ad1606b|2018-08-03--10-35-16', '16', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-03--10-35-16/16'), ('b0c9d2329ad1606b|2018-08-06--10-04-53', '29', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-06--10-04-53/29'), ('b0c9d2329ad1606b|2018-08-02--08-34-47', '39', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-02--08-34-47/39'), ('b0c9d2329ad1606b|2018-08-17--14-17-47', '12', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-17--14-17-47/12'), ('b0c9d2329ad1606b|2018-08-17--14-17-47', '7', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-17--14-17-47/7'), ('b0c9d2329ad1606b|2018-08-17--12-07-08', '40', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-17--12-07-08/40'), ('b0c9d2329ad1606b|2018-08-14--20-41-07', '7', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-14--20-41-07/7'), ('b0c9d2329ad1606b|2018-08-03--10-35-16', '14', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-03--10-35-16/14'), ('b0c9d2329ad1606b|2018-07-30--13-44-30', '6', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-07-30--13-44-30/6'), ('b0c9d2329ad1606b|2018-08-17--14-55-39', '3', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-17--14-55-39/3'), ('b0c9d2329ad1606b|2018-08-02--08-34-47', '34', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-02--08-34-47/34'), ('b0c9d2329ad1606b|2018-08-03--10-35-16', '8', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-03--10-35-16/8'), ('b0c9d2329ad1606b|2018-08-17--12-07-08', '33', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-17--12-07-08/33'), ('b0c9d2329ad1606b|2018-08-06--10-04-53', '30', 
'/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-06--10-04-53/30'), ('b0c9d2329ad1606b|2018-08-02--16-41-38', '11', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-02--16-41-38/11'), ('b0c9d2329ad1606b|2018-08-06--10-04-53', '26', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-06--10-04-53/26'), ('b0c9d2329ad1606b|2018-08-14--20-41-07', '5', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-14--20-41-07/5'), ('b0c9d2329ad1606b|2018-08-14--20-41-07', '6', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-14--20-41-07/6'), ('b0c9d2329ad1606b|2018-08-10--22-42-26', '3', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-10--22-42-26/3'), ('b0c9d2329ad1606b|2018-08-14--10-32-01', '38', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-14--10-32-01/38'), ('b0c9d2329ad1606b|2018-08-15--09-01-03', '17', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-15--09-01-03/17'), ('b0c9d2329ad1606b|2018-07-30--13-03-07', '22', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-07-30--13-03-07/22'), ('b0c9d2329ad1606b|2018-08-15--09-01-03', '19', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-15--09-01-03/19'), ('b0c9d2329ad1606b|2018-07-29--12-02-42', '27', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-07-29--12-02-42/27'), ('b0c9d2329ad1606b|2018-08-02--16-41-38', '14', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-02--16-41-38/14'), ('b0c9d2329ad1606b|2018-08-14--10-32-01', '31', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-14--10-32-01/31')]
val_seqs= [('b0c9d2329ad1606b|2018-08-02--08-34-47', '38', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-02--08-34-47/38'), ('b0c9d2329ad1606b|2018-08-17--14-55-39', '5', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-17--14-55-39/5'), ('b0c9d2329ad1606b|2018-08-15--09-01-03', '20', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-15--09-01-03/20'), ('b0c9d2329ad1606b|2018-07-31--20-50-28', '9', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-07-31--20-50-28/9'), ('b0c9d2329ad1606b|2018-08-15--09-01-03', '22', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-15--09-01-03/22'), ('b0c9d2329ad1606b|2018-08-17--12-07-08', '31', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-17--12-07-08/31'), ('b0c9d2329ad1606b|2018-08-02--16-41-38', '8', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-02--16-41-38/8'), ('b0c9d2329ad1606b|2018-08-06--10-04-53', '36', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-06--10-04-53/36'), ('b0c9d2329ad1606b|2018-07-30--13-44-30', '15', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-07-30--13-44-30/15'), ('b0c9d2329ad1606b|2018-08-02--16-41-38', '10', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-02--16-41-38/10'), ('b0c9d2329ad1606b|2018-08-17--14-17-47', '10', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-17--14-17-47/10'), ('b0c9d2329ad1606b|2018-08-03--10-35-16', '10', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-03--10-35-16/10'), ('b0c9d2329ad1606b|2018-07-27--06-03-57', '11', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-07-27--06-03-57/11')]
test_seqs = [('b0c9d2329ad1606b|2018-08-15--09-01-03', '16', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-15--09-01-03/16'), ('b0c9d2329ad1606b|2018-08-15--09-01-03', '18', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-15--09-01-03/18'), ('b0c9d2329ad1606b|2018-08-17--14-55-39', '8', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-17--14-55-39/8'), ('b0c9d2329ad1606b|2018-08-17--12-07-08', '36', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-17--12-07-08/36'), ('b0c9d2329ad1606b|2018-07-30--13-44-30', '7', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-07-30--13-44-30/7'), ('b0c9d2329ad1606b|2018-07-30--13-03-07', '16', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-07-30--13-03-07/16'), ('b0c9d2329ad1606b|2018-08-17--14-17-47', '8', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-17--14-17-47/8'), ('b0c9d2329ad1606b|2018-08-02--08-34-47', '35', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-02--08-34-47/35'), ('b0c9d2329ad1606b|2018-08-02--08-34-47', '29', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-02--08-34-47/29'), ('b0c9d2329ad1606b|2018-08-03--10-35-16', '5', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-03--10-35-16/5'), ('b0c9d2329ad1606b|2018-08-03--10-35-16', '12', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-03--10-35-16/12'), ('b0c9d2329ad1606b|2018-08-03--10-35-16', '7', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-03--10-35-16/7'), ('b0c9d2329ad1606b|2018-08-17--14-55-39', '4', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-17--14-55-39/4'), ('b0c9d2329ad1606b|2018-08-02--08-34-47', '30', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-02--08-34-47/30'), ('b0c9d2329ad1606b|2018-08-02--16-41-38', '7', '/content/dataset/Chunk_1/b0c9d2329ad1606b|2018-08-02--16-41-38/7')]

In [None]:

# --- Save dataset ---
# Each split is exported the same way: rebuild the sample (with frames) for every
# listed segment and, unless it was discarded, write it to that split's HDF5 file.
for split_seqs, split_label, split_file in (
    (train_seqs, "Train", h_train),
    (val_seqs, "Val", h_val),
    (test_seqs, "Test", h_test),
):
    for drive_sequence_path, seq, full_seq_path in tqdm(split_seqs, desc=split_label):
        sample = get_sample(full_seq_path)
        if not sample:
            continue
        # Replace "|" from the drive folder name before using it in the group name.
        video_id = drive_sequence_path.replace("|", "_")
        save_h5py(f"{video_id}_{seq}", sample, split_file)

In [None]:
# Close the three output files in the original order: test, train, val.
for open_file in (h_test, h_train, h_val):
    open_file.close()

In [None]:
from google.colab import files
# Trigger a browser download for each exported split file (train, val, test).
for hdf5_path in (
    "/content/gas_and_brake_train_comma_chunk_1_w_imgs.hdf5",
    "/content/gas_and_brake_val_comma_chunk_1_w_imgs.hdf5",
    "/content/gas_and_brake_test_comma_chunk_1_w_imgs.hdf5",
):
    files.download(hdf5_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# This cell was made by Giuseppe Vacante
import os, gc
import cv2
import numpy as np
from tqdm import tqdm
from scipy import ndimage


# ---------------- CONFIG ----------------
# Folder containing one sub-folder per drive.
chunk_path = "/content/dataset/Chunk_1"
# Destination root for the extracted frames.
output_base = "/content/frames"
# Size handed to cv2.resize for every saved frame.
output_size = (1280, 960)
os.makedirs(output_base, exist_ok=True)

# Resume marker, e.g. "b0c9d2329ad1606b|2018-08-17--14-55-39|9"; None starts from the beginning.
last_done = None
# Skip sequences until the marker is reached (only when a marker is set).
skip = last_done is not None

# ---------- function get_sample ----------
def get_sample(p):
    """Load and filter one segment, returning ``(sample, reason)``.

    NOTE: this definition shadows the notebook's earlier ``get_sample``,
    which returns a bare dict or ``None``. Re-run that earlier cell before
    using the HDF5 export loops again.

    Args:
        p: Path of the segment directory holding ``video.hevc`` and
            ``raw_log.bz2``.

    Returns:
        ``(sample, "ok")`` where ``sample`` has the keys ``'image'`` (list of
        float32 RGB frames, every 5th frame starting at index 1), ``'vEgo'``,
        ``'dist'`` and ``'desired_dist'``; or ``(None, reason)`` when the
        segment cannot be read or is rejected by a filter.
    """
    try:
        fr = FrameReader(os.path.join(p, 'video.hevc'))
        logs = list(LogReader(os.path.join(p, 'raw_log.bz2')))
    except Exception as e:
        return None, f"open_failed: {e}"

    try:
        # Filter and subsample carState once ([1::5][1::5] keeps indices
        # 6, 31, 56, ...). Only the fields used below are extracted.
        car_states = [ev.carState for ev in logs if ev.which() == 'carState'][1::5][1::5]
        vEgo = np.array([cs.vEgo for cs in car_states])
        gaspressed = np.array([cs.gasPressed for cs in car_states])
        brake_pressed = np.array([cs.brakePressed for cs in car_states])
        leftBlinker = np.array([cs.leftBlinker for cs in car_states])
        rightBlinker = np.array([cs.rightBlinker for cs in car_states])

        # radar distances (every 5th radarState message)
        dist = np.array([ev.radarState.leadOne.dRel for ev in logs if ev.which() == "radarState"])[1::5]
    except Exception as e:
        return None, f"field_extract_failed: {e}"

    # Filters: vEgo/dist too many zeros or dist too short -> discard
    if ((vEgo == 0).mean() > 0.2):
        return None, f"vEgo_zeros_frac={(vEgo==0).mean():.3f}"
    if ((dist == 0).mean() > 0.2):
        return None, f"dist_zeros_frac={(dist==0).mean():.3f}"
    if len(dist) <= 230:
        return None, f"dist_too_short={len(dist)}"

    # "Steady state": no pedal pressed and no blinker active.
    steady_state = ~gaspressed & ~brake_pressed & ~leftBlinker & ~rightBlinker
    desired_gap = np.zeros(steady_state.shape)
    last_idx = 0
    # Back-fill each stretch since the previous steady sample with the lead
    # distance at the current steady sample. The bound is capped by len(dist)
    # because dist is subsampled independently and dist[i] would otherwise
    # raise IndexError when it is the shorter series.
    for i in range(min(len(steady_state) - 1, len(dist))):
        if steady_state[i]:
            desired_gap[last_idx:i] = int(dist[i])
            last_idx = i
    # Fill the tail with the mean of the last 12 distances. len(dist) > 230 is
    # guaranteed by the filter above, so no short-array fallback is needed.
    desired_gap[-12:] = dist[-12:].mean()

    # Run this cheap filter before decoding video so that rejected segments
    # never pay for frame extraction.
    if (desired_gap == 0).mean() > 0.2:
        return None, f"desired_gap_zero_frac={(desired_gap==0).mean():.3f}"

    # Read frames (1::5 sampling)
    images = []
    try:
        for idx in range(fr.frame_count)[1::5]:
            im = fr.get(idx, pix_fmt='rgb24')[0]  # presumably H,W,3 RGB; confirm against FrameReader
            images.append(np.array(im, dtype=np.float32))
    except Exception as e:
        return None, f"frame_read_failed: {e}"

    if len(images) == 0:
        return None, "no_images"

    sample = {
        'image': images,
        'vEgo': vEgo,
        'dist': dist,
        'desired_dist': desired_gap,
    }
    return sample, "ok"

# ---------------- LOOP OVER THE CHUNK FOLDER ----------------
kept_seqs = []               # list of folder_name
skipped_seqs = []            # list of tuples (folder_name, reason)
n_saved = 0                  # number of image files written
n_skipped = 0                # sequences skipped before any frame was written
n_errors = 0                 # sequences that failed while frames were being saved

if not os.path.isdir(chunk_path):
    raise FileNotFoundError(f"chunk_path not found: {chunk_path}")

for drive in tqdm(sorted(os.listdir(chunk_path)), desc="Processing drives"):
    drive_path = os.path.join(chunk_path, drive)
    if not os.path.isdir(drive_path):
        continue

    for seq in tqdm(sorted(os.listdir(drive_path)), desc=f"Processing seq in {drive}", leave=False):
        seq_path = os.path.join(drive_path, seq)
        video_path = os.path.join(seq_path, "video.hevc")
        if not os.path.exists(video_path):
            skipped_seqs.append((f"{drive}|{seq}", "no_video"))
            n_skipped += 1
            continue

        folder_name = f"{drive}|{seq}"
        # Resume support: ignore everything before last_done.
        # NOTE(review): the sequence equal to last_done is processed again,
        # not skipped; confirm that this is intended.
        if skip:
            if last_done is not None and folder_name == last_done:
                skip = False
            else:
                continue

        # apply filters with get_sample
        try:
            sample, reason = get_sample(seq_path)
        except Exception as e:
            sample = None
            reason = f"exception_in_get_sample: {e}"

        if sample is None:
            skipped_seqs.append((folder_name, reason))
            n_skipped += 1
            continue

        # resize and save the frames held in sample['image']
        try:
            images = sample['image']
            if len(images) == 0:
                skipped_seqs.append((folder_name, "no_images"))
                n_skipped += 1
                continue

            # Create the output folder only once the sequence has been
            # accepted, so discarded sequences leave no empty directories.
            save_path = os.path.join(output_base, folder_name)
            os.makedirs(save_path, exist_ok=True)

            for saved_idx, img in enumerate(images, start=1):
                arr = np.clip(img, 0, 255).astype(np.uint8)
                arr_resized = cv2.resize(arr, output_size, interpolation=cv2.INTER_CUBIC)
                # frames are handled as RGB here; cv2.imwrite expects BGR channel order
                arr_bgr = cv2.cvtColor(arr_resized, cv2.COLOR_RGB2BGR)
                filename = os.path.join(save_path, f"{saved_idx:05}.jpg")
                # cv2.imwrite reports failure through its return value instead
                # of raising, so turn a failed write into a recorded error.
                if not cv2.imwrite(filename, arr_bgr):
                    raise IOError(f"cv2.imwrite failed for {filename}")
                n_saved += 1

            kept_seqs.append(folder_name)

        except Exception as e:
            skipped_seqs.append((folder_name, f"save_error: {e}"))
            n_errors += 1
        finally:
            # Drop the decoded frames before the next sequence is loaded;
            # otherwise two sequences' worth of frames are alive at once.
            sample = images = None
            gc.collect()

# final report + saving lists
print("=== Done ===")
print(f"Saved frames (files): {n_saved}")
print(f"Skipped sequences (count): {len(skipped_seqs)}")
print(f"Errors while saving (count): {n_errors}")
print(f"Kept sequences (count): {len(kept_seqs)}")

# save two text files with the lists (ready for download)
kept_file = os.path.join(output_base, "kept_seqs.txt")
skipped_file = os.path.join(output_base, "skipped_seqs.txt")

with open(kept_file, "w") as f:
    for s in kept_seqs:
        f.write(s + "\n")

with open(skipped_file, "w") as f:
    for s, reason in skipped_seqs:
        f.write(f"{s}\t{reason}\n")

print(f"Kept list saved to: {kept_file}")
print(f"Skipped list saved to: {skipped_file}")


-------------------------------------------------------------------------------------------

In [None]:
# Old version - extracts every 5th frame of each video, without any filtering.
# Relies on `FrameReader` having been imported by an earlier cell.
import os
import cv2
from tqdm import tqdm
#from openpilot.tools.lib.video import FrameReader

chunk_path = "/content/dataset/Chunk_1"
output_base = "/content/frames"
# NOTE(review): fps_target is not used by the extraction below (the stride is
# `frame_step`); it is kept only in case another cell reads it.
fps_target = 20
output_size = (1280, 960)  # passed to cv2.resize as dsize, i.e. (width, height)
first_frame = 1            # index of the first frame that is extracted
frame_step = 5             # keep one frame out of every `frame_step`

os.makedirs(output_base, exist_ok=True)


for drive in tqdm(os.listdir(chunk_path), desc="Processing drives"):
    drive_path = os.path.join(chunk_path, drive)
    if not os.path.isdir(drive_path):
        continue

    for seq in os.listdir(drive_path):
        seq_path = os.path.join(drive_path, seq)
        video_path = os.path.join(seq_path, "video.hevc")
        if not os.path.exists(video_path):
            continue

        # One output folder per (drive, sequence) pair.
        folder_name = f"{drive}|{seq}"
        save_path = os.path.join(output_base, folder_name)
        os.makedirs(save_path, exist_ok=True)

        try:
            fr = FrameReader(video_path)
            total_frames = fr.frame_count
            print(total_frames)

            # Output files are numbered consecutively from 00001, counting
            # only the frames that were actually written.
            saved_idx = 1

            for i in range(first_frame, total_frames, frame_step):
                try:
                    img = fr.get(i, pix_fmt='rgb24')[0]  # (H, W, 3)
                    img = cv2.resize(img, output_size, interpolation=cv2.INTER_CUBIC)
                    # cv2.imwrite expects BGR channel order.
                    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
                    filename = os.path.join(save_path, f"{saved_idx:05}.jpg")
                    # imwrite signals a failed write by returning False rather
                    # than raising; turn that into an error so it is reported
                    # and does not consume an output index.
                    if not cv2.imwrite(filename, img):
                        raise IOError(f"cv2.imwrite could not write {filename}")
                    saved_idx += 1
                except Exception as e:
                    print(f"⚠️ Error in frame {i} in {folder_name}: {e}")

        except Exception as e:
            print(f"❌ Error with {video_path}: {e}")

In [None]:
# Download the extracted frames to the local machine.
# files.download transfers a single file, so the frames directory is packed
# into a zip archive first and the archive is what gets downloaded.
import shutil
from google.colab import files

# make_archive appends ".zip" to the base name and returns the archive path.
frames_zip = shutil.make_archive("/content/frames", "zip", "/content/frames")
files.download(frames_zip)

In [None]:
# Pack the extracted frames directory into a single zip archive.
import os
import shutil
from google.colab import files
folder_path = "/content/frames"
output_zip = "/content/frames.zip"
# make_archive appends the format extension itself, so it is given output_zip
# without its ".zip" suffix; the archive is therefore written to output_zip.
shutil.make_archive(os.path.splitext(output_zip)[0], 'zip', folder_path)

