# Success: Parsing from the detailed captions


We use the function below to convert a `.vtt` file (captions with detailed, word-level timestamps) into a cleaner JSON file.

In [None]:
import os
import json 
import re

def vtt_to_json(filename, output_dir, start_line=6, step=8):
    """
    Extracts text and timestamps from a .vtt file and saves the result as JSON.

    Only every ``step``-th line, starting at index ``start_line``, is
    inspected. The defaults (6 and 8) match the fixed layout of the caption
    files this notebook was written for; pass other values for files whose
    timed caption lines sit at a different offset or stride.

    For each inspected line, the inline ``<HH:MM:SS.mmm>`` tags provide the
    timing: the first tag becomes "start" and the last tag becomes "end".
    Lines without any inline tag are reported with a warning and skipped.

    Args:
        filename (str): Path to the input VTT file.
        output_dir (str): Directory where the JSON output will be saved.
            It is created if it does not exist.
        start_line (int): Zero-based index of the first line to inspect.
        step (int): Distance, in lines, between inspected lines.

    Raises:
        ValueError: If ``start_line`` is negative or ``step`` is less than 1.
    """
    if start_line < 0:
        raise ValueError(f"start_line must be >= 0, got {start_line}")
    if step < 1:
        raise ValueError(f"step must be >= 1, got {step}")

    # Compile once instead of re-resolving the patterns on every line.
    timestamp_tag = re.compile(r"<(\d{2}:\d{2}:\d{2}\.\d{3})>")
    c_tag = re.compile(r"</?c>")

    # Read file
    with open(filename, "r", encoding="utf-8") as f:
        lines = f.read().splitlines()

    results = []

    for i in range(start_line, len(lines), step):
        raw_line = lines[i]

        # Extract timestamps
        timestamps = timestamp_tag.findall(raw_line)
        if not timestamps:
            # Lines without inline timing cannot be placed on the timeline.
            print(f"[WARN] No timestamps found in line {i}: {raw_line!r}")
            continue

        # Clean the text by removing <HH:MM:SS.mmm> and <c> tags
        clean_text = c_tag.sub("", timestamp_tag.sub("", raw_line))

        results.append({
            "text": clean_text.strip(),
            "start": timestamps[0],
            "end": timestamps[-1]
        })

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Save to JSON; only the final extension is dropped ("001.ar.vtt" -> "001.ar.json")
    base_name = os.path.splitext(os.path.basename(filename))[0]
    output_path = os.path.join(output_dir, f"{base_name}.json")

    with open(output_path, "w", encoding="utf-8") as out_file:
        json.dump(results, out_file, ensure_ascii=False, indent=2)

    print(f"[INFO] Extracted {len(results)} captions from {filename}")
    print(f"[INFO] JSON saved to {output_path}")

# Example usage:
# vtt_to_json(r".\vtt\001.ar.vtt", r".\output")


In [None]:
vtt_dir = r".\vtt"
output_dir = r".\json_output"
vtt_files = os.listdir(vtt_dir)

# Convert every caption file in vtt_dir; entries with another extension are ignored.
for vtt_file in vtt_files:
    if not vtt_file.endswith('.vtt'):
        continue
    vtt_path = os.path.join(vtt_dir, vtt_file)
    vtt_to_json(vtt_path, output_dir)

[WARN] No timestamps found in line 518: '[موسيقى]'
[INFO] Extracted 64 captions from .\vtt\001.ar.vtt
[INFO] JSON saved to .\json_output\001.ar.json
[WARN] No timestamps found in line 6: '[موسيقى]'
[WARN] No timestamps found in line 14: 'الو'
[WARN] No timestamps found in line 566: 'ريسف'
[WARN] No timestamps found in line 1030: 'تلعب'
[WARN] No timestamps found in line 1062: 'نزيد'
[WARN] No timestamps found in line 1206: 'اهلاي'
[WARN] No timestamps found in line 1526: 'الهربه'
[WARN] No timestamps found in line 1574: 'يتبدل'
[WARN] No timestamps found in line 1966: 'منخرطيه'
[WARN] No timestamps found in line 2014: '[موسيقى]'
[WARN] No timestamps found in line 2022: 'شعلك'
[WARN] No timestamps found in line 2030: '[موسيقى]'
[WARN] No timestamps found in line 2222: 'كشش'
[WARN] No timestamps found in line 2710: 'واحد'
[WARN] No timestamps found in line 2798: 'الكلمه'
[WARN] No timestamps found in line 2950: '[موسيقى]'
[WARN] No timestamps found in line 2990: 'الساعي'
[WARN] No timest

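These warnings are expected and can be safely ignored: they come from caption lines that carry no inline timestamps (single words or `[موسيقى]` music markers), which the parser simply skips.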

## The Splitting functions

**split_audio_by_integer** is the only function you need to call directly; the other functions in this cell are internal helpers.

In [None]:
import os
import json
from pydub import AudioSegment

def _time_to_ms(timestamp):
    """Convert an 'HH:MM:SS.mmm' timestamp to milliseconds.

    Besides the canonical form, the hours field may be omitted ('MM:SS.mmm')
    and the fractional part may be missing or have a length other than three
    digits. The fraction is read as a decimal fraction of a second, so
    '00:00:01.5' is 1500 ms, not 1005 ms.

    Args:
        timestamp (str): Timestamp text.

    Returns:
        int: Number of milliseconds represented by the timestamp.

    Raises:
        ValueError: If the timestamp does not have two or three
            colon-separated fields, or a field is not an integer.
    """
    clock, _, fraction = timestamp.strip().partition(".")
    fields = clock.split(":")
    if len(fields) == 2:
        # No hours field present.
        fields.insert(0, "0")
    if len(fields) != 3:
        raise ValueError(f"Unrecognized timestamp: {timestamp!r}")

    h, m, s = (int(field) for field in fields)
    # Right-pad / truncate to exactly three digits so the value is in ms.
    ms = int((fraction + "000")[:3]) if fraction else 0
    return (h * 3600 + m * 60 + s) * 1000 + ms

def _ms_to_hhmmssms(ms):
    """Format a millisecond count as an 'HH:MM:SS.mmm' string.

    The input is truncated to an integer first. Hours are zero-padded to at
    least two digits and are not wrapped at 24.
    """
    total_seconds, millis = divmod(int(ms), 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return "{:02d}:{:02d}:{:02d}.{:03d}".format(hours, minutes, seconds, millis)

def split_audio_by_integer(audio_path, json_path, output_dir, x, audio_format="mp3"):
    """
    Cut an audio file into chunks of roughly x seconds, aligned to subtitle rows.

    Subtitle rows are accumulated in the order they appear in the JSON file
    (they are not re-sorted). Once the span from the start of the first
    accumulated row to the end of the newest row reaches x seconds, the group
    is written out as one audio file plus one metadata JSON file, and a new
    group begins. The length check only runs from the second row of a group
    onwards, and a row is never cut in half. Rows left over at the end form a
    final, possibly shorter, chunk.

    Output files are named "<audio file name without extension>_chunkNNN".

    Args:
        audio_path (str): Path to the audio file (e.g., '001.mp3').
        json_path (str): Path to the JSON file with subtitle rows; each row
            must have "start" and "end" timestamps.
        output_dir (str): Folder where chunks will be saved (created if missing).
        x (int/float): Target chunk length in seconds.
        audio_format (str): Output audio format (default: 'mp3').

    Returns:
        list: One metadata dict per chunk, in creation order, with the keys
        "chunk_name", "start_time", "end_time", "duration_s" and "rows".
    """
    target_ms = int(x * 1000)
    os.makedirs(output_dir, exist_ok=True)

    # The audio file name (minus extension) prefixes every chunk name.
    file_id = os.path.splitext(os.path.basename(audio_path))[0]
    audio = AudioSegment.from_file(audio_path)

    with open(json_path, "r", encoding="utf-8") as sub_file:
        subs = json.load(sub_file)

    # Cache millisecond versions of the row boundaries under private keys.
    for entry in subs:
        entry["_start_ms"] = _time_to_ms(entry["start"])
        entry["_end_ms"] = _time_to_ms(entry["end"])

    internal_keys = ("_start_ms", "_end_ms")
    chunks_info = []

    def write_chunk(rows):
        """Export the audio covered by `rows` and its metadata JSON."""
        first_ms = rows[0]["_start_ms"]
        last_ms = rows[-1]["_end_ms"]

        # Chunks are numbered from 1 in the order they are written.
        chunk_name = f"{file_id}_chunk{len(chunks_info) + 1:03d}"
        audio_out_path = os.path.join(output_dir, f"{chunk_name}.{audio_format}")
        json_out_path = os.path.join(output_dir, f"{chunk_name}.json")

        audio[first_ms:last_ms].export(audio_out_path, format=audio_format)

        meta = {
            "chunk_name": chunk_name,
            "start_time": _ms_to_hhmmssms(first_ms),
            "end_time": _ms_to_hhmmssms(last_ms),
            "duration_s": (last_ms - first_ms) / 1000.0,
            # Drop the cached millisecond fields from the saved rows.
            "rows": [
                {key: value for key, value in item.items() if key not in internal_keys}
                for item in rows
            ]
        }
        with open(json_out_path, "w", encoding="utf-8") as meta_file:
            json.dump(meta, meta_file, ensure_ascii=False, indent=2)

        chunks_info.append(meta)
        print(f"[INFO] Saved {audio_out_path} ({meta['duration_s']:.2f}s, {len(rows)} rows)")

    pending = []
    for entry in subs:
        pending.append(entry)
        if len(pending) == 1:
            # The row that opens a group is never checked on its own.
            continue
        if entry["_end_ms"] - pending[0]["_start_ms"] >= target_ms:
            write_chunk(pending)
            pending = []

    # Whatever is left becomes the last chunk.
    if pending:
        write_chunk(pending)

    print(f"[INFO] Total chunks created: {len(chunks_info)}")
    return chunks_info


In [None]:
mp3_dir = "../audio/mp3"
json_dir = "./json_output"
output_dir = "chunks"
seconds = 60  # specify the chunk length in seconds

# os.listdir returns names in arbitrary order, so zipping two listings can pair
# an audio file with another file's captions. Sort for reproducible runs and
# match each audio file to its caption JSON by file ID instead.
mp3_files = sorted(os.listdir(mp3_dir))
json_files = sorted(name for name in os.listdir(json_dir) if name.endswith(".json"))

for audio_file in mp3_files:
    # "001.mp3" -> "001", which matches caption files such as "001.ar.json".
    file_id = os.path.splitext(audio_file)[0]
    json_file = next(
        (name for name in json_files if name.startswith(file_id + ".")),
        None,
    )
    if json_file is None:
        print(f"[WARN] No caption JSON found for {audio_file}; skipping.")
        continue

    audio_path = os.path.join(mp3_dir, audio_file)
    json_path = os.path.join(json_dir, json_file)
    split_audio_by_integer(audio_path, json_path, output_dir, seconds)
    print(audio_file, json_file, "have been processed.")

[INFO] Saved chunks\001_chunk001.mp3 (60.40s, 26 rows)
[INFO] Saved chunks\001_chunk002.mp3 (64.76s, 24 rows)
[INFO] Saved chunks\001_chunk003.mp3 (40.64s, 14 rows)
[INFO] Total chunks created: 3
001.mp3 001.ar.json have been processed.
[INFO] Saved chunks\002_chunk001.mp3 (60.04s, 17 rows)
[INFO] Saved chunks\002_chunk002.mp3 (62.08s, 20 rows)
[INFO] Saved chunks\002_chunk003.mp3 (62.80s, 19 rows)
[INFO] Saved chunks\002_chunk004.mp3 (60.72s, 18 rows)
[INFO] Saved chunks\002_chunk005.mp3 (60.08s, 20 rows)
[INFO] Saved chunks\002_chunk006.mp3 (60.36s, 14 rows)
[INFO] Saved chunks\002_chunk007.mp3 (62.08s, 18 rows)
[INFO] Saved chunks\002_chunk008.mp3 (61.96s, 17 rows)
[INFO] Saved chunks\002_chunk009.mp3 (61.48s, 18 rows)
[INFO] Saved chunks\002_chunk010.mp3 (61.36s, 21 rows)
[INFO] Saved chunks\002_chunk011.mp3 (61.76s, 18 rows)
[INFO] Saved chunks\002_chunk012.mp3 (61.00s, 17 rows)
[INFO] Saved chunks\002_chunk013.mp3 (60.56s, 16 rows)
[INFO] Saved chunks\002_chunk014.mp3 (62.28s, 14

In [None]:
# add the full text to the json files

def add_full_text_to_json(filename, chunks_dir="chunks"):
    """
    Add a "full_text" field to a chunk metadata JSON file, in place.

    The field is the "text" of every entry in the file's "rows" list, joined
    with single spaces. The joined text is also printed.

    Args:
        filename (str): Name of the JSON file inside ``chunks_dir``.
        chunks_dir (str): Directory holding the chunk files (default: "chunks").
    """
    path = os.path.join(chunks_dir, filename)
    with open(path, "r", encoding="utf-8") as f:
        chunk_data = json.load(f)

    chunk_data["full_text"] = " ".join(row["text"] for row in chunk_data["rows"])
    print(chunk_data["full_text"])

    # Overwrite the same file with the enriched metadata.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(chunk_data, f, ensure_ascii=False, indent=2)

In [None]:
# Add the joined transcript to every chunk metadata file; audio files are skipped.
for file in os.listdir("chunks"):
    if not file.endswith('.json'):
        continue
    add_full_text_to_json(file)
    print(f"Full text added to {file}")

وا بق فلاش منروحش نحوا بنينه هذ يا ودي هذ تع الس تاع الصباح ويا ربي ونكذب عليك وليت ماقدرش نطول في البحر خويا نغم وتوحش الكونكور زون تاعي كونكور زون زون خير الحوم ما تروحوا البحر ما تروحوا الجامع ما تزوجوا ما تياجي حبوس ديريكت اسمعش تض مع الحومه انا بلوطه حبي تفر قلبك لاوب مروحش بحر بلوط يخلص حب يخلص يخلص انا نروح في اكتوبر موديل جديد كاين جماعه سبتمبر تلهم فرع واحد اخر تاع اكتوبر صح نسقسيك علاش الناس تروح في سبتمبر باينه ينقصوا الغاشي وترخص الحاله شويه نروح في اكتوبر كي يخرجوا يصحاب سبتمبر ويندل البري ماشي يرخاص برك ياودي نقلع دك را 12 ح يقلعوا الكوايل عينك تذو هاهو ماكش كيفا عندي شغل دك لازم نروح نجيب كليماتيزور من عند روشدي باباه وص عليا نورمال خليه الغدوه ولا نقولك قله كي نولو في اللي ريبيري انا هذه ما نديرها خويا الراجل ينقبض ملسان عطيت لله كلمه مندورش فيها يا ودي راجلي تاعنا ويسهر في الليل نعرفو عيس ت بيتني كش خمس ايام في داركور انا ن بيتك خمس ايام كي ما نجيب كليماتيزور ويحوز ني بابا كنت تحملني انت يا حي معليش معليش انا قلت ندي صاحبي ضربت بحيره معايا كلش خالص ماكتبتش نروح مع ري و