In [1]:
import os
from argparse import Namespace

from datasets import load_dataset

DATA_DIR = "../0data_collection/keystroke-typing-videos"
dataset = load_dataset(
    "json",
    data_files={
        "train":      os.path.join(DATA_DIR, "train",      "metadata.jsonl"),
        "validation": os.path.join(DATA_DIR, "validation", "metadata.jsonl"),
        "test":       os.path.join(DATA_DIR, "test",       "metadata.jsonl"),
    },
    field=None,  # top-level JSONL
)

dataset

Using custom data configuration default-e2f6fbf577b386e0


Downloading and preparing dataset json/default to /Users/andrewtran/.cache/huggingface/datasets/json/default-e2f6fbf577b386e0/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /Users/andrewtran/.cache/huggingface/datasets/json/default-e2f6fbf577b386e0/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['file_name', 'text', 'duration_sec', 'keystrokes', 'actual_fps'],
        num_rows: 640
    })
    validation: Dataset({
        features: ['file_name', 'text', 'duration_sec', 'keystrokes', 'actual_fps'],
        num_rows: 80
    })
    test: Dataset({
        features: ['file_name', 'text', 'duration_sec', 'keystrokes', 'actual_fps'],
        num_rows: 80
    })
})

In [2]:
sample = dataset['train'][0]

In [3]:
def preprocess(ex, split):
    video_path = os.path.join(DATA_DIR, split, ex["file_name"])
    prompt     = f"<Video>{video_path}</Video>\nGive me the exact string that is being typed in the video."
    return {"prompt": prompt, "completion": ex["text"]}

In [4]:
preprocess(sample, 'train')

{'prompt': '<Video>../0data_collection/keystroke-typing-videos/train/758.mp4</Video>\nGive me the exact string that is being typed in the video.',
 'completion': 'Sears Savings has 6.6 billion in assets and 91 branches predominantly in southern California.'}

In [5]:
dataset['train']

Dataset({
    features: ['file_name', 'text', 'duration_sec', 'keystrokes', 'actual_fps'],
    num_rows: 640
})

# Data Augmentation

In [6]:
dataset['train'][0]

{'file_name': '758.mp4',
 'text': 'Sears Savings has 6.6 billion in assets and 91 branches predominantly in southern California.',
 'duration_sec': 11.5,
 'keystrokes': [{'event': 'down', 'key': 'Shift_L', 'timestamp_ms': 0},
  {'event': 'down', 'key': 'S', 'timestamp_ms': 253},
  {'event': 'down', 'key': 'e', 'timestamp_ms': 436},
  {'event': 'down', 'key': 'a', 'timestamp_ms': 541},
  {'event': 'down', 'key': 'r', 'timestamp_ms': 648},
  {'event': 'down', 'key': 's', 'timestamp_ms': 803},
  {'event': 'down', 'key': 'space', 'timestamp_ms': 900},
  {'event': 'down', 'key': 'Shift_L', 'timestamp_ms': 998},
  {'event': 'down', 'key': 'S', 'timestamp_ms': 1137},
  {'event': 'down', 'key': 'a', 'timestamp_ms': 1271},
  {'event': 'down', 'key': 'v', 'timestamp_ms': 1440},
  {'event': 'down', 'key': 'i', 'timestamp_ms': 1529},
  {'event': 'down', 'key': 'n', 'timestamp_ms': 1682},
  {'event': 'down', 'key': 'g', 'timestamp_ms': 1710},
  {'event': 'down', 'key': 's', 'timestamp_ms': 1880},
 

In [7]:
def add_file_path(example, split):
    example['file_path'] = os.path.join(DATA_DIR, split, example['file_name'])
    return example

for split in ['train', 'validation', 'test']:
    dataset[split] = dataset[split].map(lambda x: add_file_path(x, split))



  0%|          | 0/640 [00:00<?, ?ex/s]

  0%|          | 0/80 [00:00<?, ?ex/s]

  0%|          | 0/80 [00:00<?, ?ex/s]

In [8]:
sample = dataset['train'][0]
sample

{'file_name': '758.mp4',
 'text': 'Sears Savings has 6.6 billion in assets and 91 branches predominantly in southern California.',
 'duration_sec': 11.5,
 'keystrokes': [{'event': 'down', 'key': 'Shift_L', 'timestamp_ms': 0},
  {'event': 'down', 'key': 'S', 'timestamp_ms': 253},
  {'event': 'down', 'key': 'e', 'timestamp_ms': 436},
  {'event': 'down', 'key': 'a', 'timestamp_ms': 541},
  {'event': 'down', 'key': 'r', 'timestamp_ms': 648},
  {'event': 'down', 'key': 's', 'timestamp_ms': 803},
  {'event': 'down', 'key': 'space', 'timestamp_ms': 900},
  {'event': 'down', 'key': 'Shift_L', 'timestamp_ms': 998},
  {'event': 'down', 'key': 'S', 'timestamp_ms': 1137},
  {'event': 'down', 'key': 'a', 'timestamp_ms': 1271},
  {'event': 'down', 'key': 'v', 'timestamp_ms': 1440},
  {'event': 'down', 'key': 'i', 'timestamp_ms': 1529},
  {'event': 'down', 'key': 'n', 'timestamp_ms': 1682},
  {'event': 'down', 'key': 'g', 'timestamp_ms': 1710},
  {'event': 'down', 'key': 's', 'timestamp_ms': 1880},
 

In [9]:
import string
import math
import random
import subprocess
import copy

AUG_DIR = os.path.join(DATA_DIR, "train_aug")
os.makedirs(AUG_DIR, exist_ok=True)

def clip_video(original_video_path, new_video_path, start_ms, end_ms):
    start_sec = start_ms / 1000
    duration_sec = (end_ms - start_ms) / 1000

    # ffmpeg command to trim the video
    cmd = [
        "ffmpeg",
        "-y",  # Overwrite without asking
        "-ss", str(start_sec),        # Start time
        "-i", original_video_path,    # Input file
        "-t", str(duration_sec),      # Duration
        "-c", "copy",                 # Copy codec to avoid re-encoding
        new_video_path                # Output file
    ]

    subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

# Plan is that we have
# - 640 normal
# - Each normal sample gets 8 augmented versions (4 full length augment, 4 crops + augment)
def create_crop(sample, aug_idx: int, crop_size: float):
    orig_idx_str = os.path.basename(sample['file_path']).split(".")[0]
    original_video_path = sample["file_path"]
    new_video_path = os.path.join(AUG_DIR, f"{orig_idx_str}.crop_{aug_idx}.mp4")

    keystrokes = sample['keystrokes']
    keystroke_count = len(keystrokes)
    range_len = max(1, math.floor(crop_size * keystroke_count))
    start = random.randint(0, keystroke_count - range_len)
    end = start + range_len

    start_timestamp_ms = keystrokes[start]['timestamp_ms']
    end_timestamp_ms = keystrokes[min(end + 1, len(keystrokes) - 1)]['timestamp_ms']

    new_keystrokes = copy.deepcopy(keystrokes[start:end+1])
    new_text = ""
    for keystroke in new_keystrokes:
        if keystroke['key'] == 'space':
            new_text += ' '
        elif len(keystroke['key']) == 1 and (keystroke['key'].isalnum() or keystroke['key'] in string.punctuation):
            new_text += keystroke['key']

    clip_video(original_video_path, new_video_path, start_timestamp_ms, end_timestamp_ms)

    # Re-zero the timestamps
    for keystroke in new_keystrokes:
        keystroke['timestamp_ms'] -= start_timestamp_ms

    return {
        'file_path': new_video_path,
        'text': new_text,
        'duration_sec': (end_timestamp_ms - start_timestamp_ms) / 1000,
        'keystrokes': new_keystrokes,
        'actual_fps': sample['actual_fps']
    }


In [10]:
create_crop(sample, 0, 0.3)

{'file_path': '../0data_collection/keystroke-typing-videos/train_aug/758.crop_0.mp4',
 'text': 's predominantly in southern ',
 'duration_sec': 3.581,
 'keystrokes': [{'event': 'down', 'key': 's', 'timestamp_ms': 0},
  {'event': 'down', 'key': 'space', 'timestamp_ms': 56},
  {'event': 'down', 'key': 'p', 'timestamp_ms': 205},
  {'event': 'down', 'key': 'r', 'timestamp_ms': 302},
  {'event': 'down', 'key': 'e', 'timestamp_ms': 370},
  {'event': 'down', 'key': 'd', 'timestamp_ms': 516},
  {'event': 'down', 'key': 'o', 'timestamp_ms': 545},
  {'event': 'down', 'key': 'm', 'timestamp_ms': 701},
  {'event': 'down', 'key': 'i', 'timestamp_ms': 862},
  {'event': 'down', 'key': 'n', 'timestamp_ms': 1007},
  {'event': 'down', 'key': 'a', 'timestamp_ms': 1184},
  {'event': 'down', 'key': 'n', 'timestamp_ms': 1402},
  {'event': 'down', 'key': 't', 'timestamp_ms': 1653},
  {'event': 'down', 'key': 'l', 'timestamp_ms': 1832},
  {'event': 'down', 'key': 'y', 'timestamp_ms': 1983},
  {'event': 'down'

In [11]:
import os
import copy
import cv2
import numpy as np
import random
import subprocess

AUG_DIR = "../0data_collection/keystroke-typing-videos/train_aug"
os.makedirs(AUG_DIR, exist_ok=True)

def create_aug(sample, aug_idx: int):
    orig_fp = sample["file_path"]
    base    = os.path.splitext(os.path.basename(orig_fp))[0]
    tmp_fp  = os.path.join(AUG_DIR, f"{base}.aug_{aug_idx}.tmp.mp4")
    new_fp  = os.path.join(AUG_DIR, f"{base}.aug_{aug_idx}.mp4")

    # 1) write the tmp video
    cap = cv2.VideoCapture(orig_fp)
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    w   = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    h   = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(tmp_fp, fourcc, fps, (w, h))

    # random augmentation params
    k      = random.choice([1,3])
    angle  = random.uniform(-5,5)
    scale  = random.uniform(0.9,1.1)
    tx     = random.uniform(-0.05,0.05)*w
    ty     = random.uniform(-0.05,0.05)*h
    max_pw = 0.1*min(w,h)
    src    = np.float32([[0,0],[w,0],[w,h],[0,h]])
    dst    = src + np.random.uniform(-max_pw,max_pw,(4,2)).astype(np.float32)
    M_p    = cv2.getPerspectiveTransform(src,dst)
    M_r    = cv2.getRotationMatrix2D((w/2,h/2), angle, scale)
    M_r[0,2] += tx
    M_r[1,2] += ty

    frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.GaussianBlur(frame,(k,k),0)
        frame = cv2.warpPerspective(frame, M_p, (w,h), borderMode=cv2.BORDER_REFLECT)
        frame = cv2.warpAffine(frame, M_r, (w,h), borderMode=cv2.BORDER_REFLECT)
        out.write(frame)
        frame_count += 1

    cap.release()
    out.release()

    cmd = [
        "ffmpeg", "-y",
        "-i", tmp_fp,        # video‐only file
        "-i", orig_fp,       # original has audio+video
        "-c:v", "libx264", "-preset", "ultrafast",
        "-c:a", "copy",      # copy the audio stream from orig_fp
        new_fp
    ]
    # print("Running FFmpeg:", " ".join(cmd))
    proc = subprocess.run(cmd, capture_output=True, text=True)
    # print(proc.stderr)       # look for any warnings/errors
    proc.check_returncode()  # raises if ffmpeg failed

    # Clean up
    os.remove(tmp_fp)

    # 4) Verify
    if not os.path.exists(new_fp):
        raise FileNotFoundError(f"Final augmented file not found at {new_fp}")

    new_sample = copy.copy(sample)
    new_sample["file_path"] = new_fp
    return new_sample

In [12]:
create_aug(sample, 0)

{'file_name': '758.mp4',
 'text': 'Sears Savings has 6.6 billion in assets and 91 branches predominantly in southern California.',
 'duration_sec': 11.5,
 'keystrokes': [{'event': 'down', 'key': 'Shift_L', 'timestamp_ms': 0},
  {'event': 'down', 'key': 'S', 'timestamp_ms': 253},
  {'event': 'down', 'key': 'e', 'timestamp_ms': 436},
  {'event': 'down', 'key': 'a', 'timestamp_ms': 541},
  {'event': 'down', 'key': 'r', 'timestamp_ms': 648},
  {'event': 'down', 'key': 's', 'timestamp_ms': 803},
  {'event': 'down', 'key': 'space', 'timestamp_ms': 900},
  {'event': 'down', 'key': 'Shift_L', 'timestamp_ms': 998},
  {'event': 'down', 'key': 'S', 'timestamp_ms': 1137},
  {'event': 'down', 'key': 'a', 'timestamp_ms': 1271},
  {'event': 'down', 'key': 'v', 'timestamp_ms': 1440},
  {'event': 'down', 'key': 'i', 'timestamp_ms': 1529},
  {'event': 'down', 'key': 'n', 'timestamp_ms': 1682},
  {'event': 'down', 'key': 'g', 'timestamp_ms': 1710},
  {'event': 'down', 'key': 's', 'timestamp_ms': 1880},
 

In [13]:
final_samples = []

In [14]:
cropped = create_crop(sample, 0, 0.3)
aug_cropped = create_aug(cropped, 0)
aug_cropped

{'file_path': '../0data_collection/keystroke-typing-videos/train_aug/758.crop_0.aug_0.mp4',
 'text': 'es predominantly in southern ',
 'duration_sec': 3.692,
 'keystrokes': [{'event': 'down', 'key': 'e', 'timestamp_ms': 0},
  {'event': 'down', 'key': 's', 'timestamp_ms': 157},
  {'event': 'down', 'key': 'space', 'timestamp_ms': 213},
  {'event': 'down', 'key': 'p', 'timestamp_ms': 362},
  {'event': 'down', 'key': 'r', 'timestamp_ms': 459},
  {'event': 'down', 'key': 'e', 'timestamp_ms': 527},
  {'event': 'down', 'key': 'd', 'timestamp_ms': 673},
  {'event': 'down', 'key': 'o', 'timestamp_ms': 702},
  {'event': 'down', 'key': 'm', 'timestamp_ms': 858},
  {'event': 'down', 'key': 'i', 'timestamp_ms': 1019},
  {'event': 'down', 'key': 'n', 'timestamp_ms': 1164},
  {'event': 'down', 'key': 'a', 'timestamp_ms': 1341},
  {'event': 'down', 'key': 'n', 'timestamp_ms': 1559},
  {'event': 'down', 'key': 't', 'timestamp_ms': 1810},
  {'event': 'down', 'key': 'l', 'timestamp_ms': 1989},
  {'event'

In [15]:
import os
CROP_SIZE = 0.3

train_split = list(dataset["train"])

def process_one(args):
    """
    Given (sample, idx), produce:
      - the original sample
      - 8 crop+aug variants (j=0..7)
      - 8 full-length aug variants (j=8..15)
    """
    sample, idx = args
    out = []
    # 1) original
    out.append(sample)
    # 2) crop + augment
    for j in range(8):
        cropped = create_crop(sample, j, CROP_SIZE)
        aug_cropped = create_aug(cropped, j)
        out.append(aug_cropped)
    # 3) full-length augment
    for j in range(8, 16):
        out.append(create_aug(sample, j))
    return out

process_one((sample, 0))

[{'file_name': '758.mp4',
  'text': 'Sears Savings has 6.6 billion in assets and 91 branches predominantly in southern California.',
  'duration_sec': 11.5,
  'keystrokes': [{'event': 'down', 'key': 'Shift_L', 'timestamp_ms': 0},
   {'event': 'down', 'key': 'S', 'timestamp_ms': 253},
   {'event': 'down', 'key': 'e', 'timestamp_ms': 436},
   {'event': 'down', 'key': 'a', 'timestamp_ms': 541},
   {'event': 'down', 'key': 'r', 'timestamp_ms': 648},
   {'event': 'down', 'key': 's', 'timestamp_ms': 803},
   {'event': 'down', 'key': 'space', 'timestamp_ms': 900},
   {'event': 'down', 'key': 'Shift_L', 'timestamp_ms': 998},
   {'event': 'down', 'key': 'S', 'timestamp_ms': 1137},
   {'event': 'down', 'key': 'a', 'timestamp_ms': 1271},
   {'event': 'down', 'key': 'v', 'timestamp_ms': 1440},
   {'event': 'down', 'key': 'i', 'timestamp_ms': 1529},
   {'event': 'down', 'key': 'n', 'timestamp_ms': 1682},
   {'event': 'down', 'key': 'g', 'timestamp_ms': 1710},
   {'event': 'down', 'key': 's', 'times

In [19]:
!rm -rf {AUG_DIR}/*

In [20]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm

CROP_SIZE = 0.3
train_split = list(dataset["train"])

def _worker(task):
    sample, idx = task
    return process_one((sample, idx))

tasks = [(s, i) for i, s in enumerate(train_split)]

final_samples = []
max_workers = 8

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = {executor.submit(_worker, t): t for t in tasks}
    for future in tqdm(as_completed(futures),
                       total=len(futures),
                       desc="Augmenting"):
        sample, idx = futures[future]
        try:
            batch = future.result()
            final_samples.extend(batch)
        except Exception as e:
            print(f"Error processing sample {idx}: {e}")

print(f"Done — produced {len(final_samples)} samples")

Augmenting:   0%|          | 0/640 [00:00<?, ?it/s]

Error processing sample 28: Command '['ffmpeg', '-y', '-i', '../0data_collection/keystroke-typing-videos/train_aug/313.crop_1.aug_1.tmp.mp4', '-i', '../0data_collection/keystroke-typing-videos/train_aug/313.crop_1.mp4', '-c:v', 'libx264', '-preset', 'ultrafast', '-c:a', 'copy', '../0data_collection/keystroke-typing-videos/train_aug/313.crop_1.aug_1.mp4']' returned non-zero exit status 183.
Error processing sample 29: Command '['ffmpeg', '-y', '-i', '../0data_collection/keystroke-typing-videos/train_aug/437.aug_8.tmp.mp4', '-i', '../0data_collection/keystroke-typing-videos/train/437.mp4', '-c:v', 'libx264', '-preset', 'ultrafast', '-c:a', 'copy', '../0data_collection/keystroke-typing-videos/train_aug/437.aug_8.mp4']' returned non-zero exit status 254.
Done — produced 10846 samples


In [32]:
import pickle

with open('final_samples.pkl', 'wb') as f:
    pickle.dump(final_samples, f)

In [17]:
import pickle

with open('final_samples.pkl', 'rb') as f:
    final_samples = pickle.load(f)

In [19]:
import os

DATASETS_PATH = 'LLaMA-Factory/data'
DATASET_INFO_PATH = os.path.join(DATASETS_PATH, 'dataset_info.json')
DATASET_JSON_PATH = os.path.join(DATASETS_PATH, 'keyboard.json')

In [21]:
# Add my dataset
import json

new_entry_key = "keyboard"
new_entry_value = {
    'file_name': 'keyboard.json',
    'formatting': 'sharegpt',
    'columns': {
        'messages': 'messages',
        'videos': 'videos',
        'audios': 'audios'
    },
    'tags': {
        'role_tag': 'role',
        'content_tag': 'content',
        'user_tag': 'user',
        'assistant_tag': 'assistant'
    }
}

with open(DATASET_INFO_PATH, 'r') as f:
    data = json.load(f)

data[new_entry_key] = new_entry_value

with open(DATASET_INFO_PATH, 'w') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

In [36]:
# Copy videos
!mkdir -p {DATASETS_PATH}/keyboard_videos

In [44]:
!rsync -a {AUG_DIR}/../train/ {DATASETS_PATH}/keyboard_videos/

rsync: setpgid: Operation not permitted


In [49]:
!rsync -r --include='*.mp4' --exclude='*' {AUG_DIR} {DATASETS_PATH}/keyboard_videos/

rsync: setpgid: Operation not permitted


In [None]:
%%bash
cd LLaMA-Factory/data/keyboard_videos
parallel --bar -j32 \
  ffmpeg -y -i {} -vn -acodec pcm_s16le -ar 16000 -ac 1 {.}.wav \
  ::: *.mp4

m #0:0[0x1](und): Video: h264 (Constrained Baseline) (avc1 / 0x31637661), yuv420p(progressive), 640x480 [SAR 1:1 DAR 4:3], 1271 kb/s, 30 fps, 30 tbr, 15360 tbn (default)
      Metadata:
        handler_name    : VideoHandler
        vendor_id       : [0][0][0][0]
        encoder         : Lavc61.19.101 libx264
  Stream #0:1[0x2](und): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, mono, fltp, 197 kb/s (default)
      Metadata:
        handler_name    : SoundHandler
        vendor_id       : [0][0][0][0]
Stream mapping:
  Stream #0:1 -> #0:0 (aac (native) -> pcm_s16le (native))
Press [q] to stop, [?] for help
Output #0, wav, to 'LLaMA-Factory/data/keyboard_videos/wavs/114.aug_8.wav':
  Metadata:
    major_brand     : isom
    minor_version   : 512
    compatible_brands: isomiso2avc1mp41
    ISFT            : Lavf61.7.100
  Stream #0:0(und): Audio: pcm_s16le ([1][0][0][0] / 0x0001), 16000 Hz, mono, s16, 256 kb/s (default)
      Metadata:
        handler_name    : SoundHandler
        ven

In [6]:
len(final_samples)

10846

In [None]:
%%bash
cd LLaMA-Factory/data/keyboard_videos
echo "Checking for corrupted or unreadable .mp4/.wav files..."
echo "Writing plain paths to bad.txt"
echo "----"

# Clear output file
: > bad.txt

# Function to validate a single file
check_file() {
  file="$1"
  duration=$(ffprobe -v error -show_entries format=duration \
    -of default=noprint_wrappers=1:nokey=1 "$file" 2>/dev/null)

  # If ffprobe fails outright
  if [ $? -ne 0 ]; then
    echo "$file" >> bad.txt
    return
  fi

  # If duration is not a valid number (e.g., "N/A")
  if ! [[ "$duration" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
    echo "$file" >> bad.txt
    return
  fi

  # If duration is 0 or less
  duration_int=${duration%.*}
  if [ "$duration_int" -le 0 ]; then
    echo "$file" >> bad.txt
  fi
}

# Export function for parallel
export -f check_file

# Run checks on all visible .mp4/.wav files in parallel with a progress bar
find . -type f \( -iname "*.mp4" -o -iname "*.wav" \) ! -iname ".*" \
  | parallel -j 16 --bar check_file {}
cp bad.txt ../../../

In [15]:
bad_files = []
with open('bad.txt', 'r') as f:
    for line in f:
        bad_files.append(line.strip().replace('./', ''))
bad_files[:10]

['557.crop_7.aug_7.mp4',
 '662.aug_13.mp4',
 '42.crop_1.aug_1.mp4',
 '206.crop_1.mp4',
 '744.crop_7.mp4',
 '695.crop_4.mp4',
 '695.crop_0.aug_0.mp4',
 '475.aug_15.mp4',
 '428.mp4',
 '295.crop_5.aug_5.mp4']

In [25]:
import os

json_samples = []
skipped = 0
for sample in final_samples:
    file_name = os.path.basename(sample['file_path'])
    new_file_path = f'{DATASETS_PATH}/keyboard_videos/{file_name}'
    if not os.path.exists(new_file_path):
        print(f"Path {new_file_path} not found, skipping")
        continue

    if len(sample['text']) < 10:
        continue

    video_rel_path = new_file_path.replace('LLaMA-Factory/data/', '')
    audio_rel_path = video_rel_path.replace('.mp4', '.wav')

    if os.path.basename(video_rel_path) in bad_files or os.path.basename(audio_rel_path) in bad_files:
        skipped += 1
        continue

    json_sample = {
        'messages': [
            {
                'content': '<video><audio>What exact string is typed in this overhead recording of typing on a MacBook keyboard at an upside down view?',
                'role': 'user'
            },
            {
                'content': f"The typed string is: {sample['text']}",
                'role': 'assistant'
            }
        ],
        'videos': [
            video_rel_path
        ],
        'audios': [
            audio_rel_path
        ]
    }
    json_samples.append(json_sample)

with open(DATASET_JSON_PATH, 'w') as f:
    json.dump(json_samples, f, ensure_ascii=False, indent=2)

skipped, len(json_samples), json_samples[0]

(65,
 10032,
 {'messages': [{'content': '<video><audio>What exact string is typed in this overhead recording of typing on a MacBook keyboard at an upside down view?',
    'role': 'user'},
   {'content': 'The typed string is: This is a change of nuance.',
    'role': 'assistant'}],
  'videos': ['keyboard_videos/681.mp4'],
  'audios': ['keyboard_videos/681.wav']})