# This section is responsible for:

# *1. Cleaning out timestamps,*

# *2. Splitting text into overlapping word chunks,*

# *3. Estimating the start time of each chunk based on words-per-second rate,*

# *4. Saving the results into a CSV for later use (e.g., search, embedding, or alignment with video)*

### The goal of this section is to break up long transcripts into smaller, overlapping chunks of text that can be later used for tasks like search, summarization, or question-answering.

### Breaking the text into chunks with some overlap helps keep the meaning clear — the overlap ensures that important context isn't lost between segments. The script also estimates when each chunk starts in the video, based on how fast people typically speak, so each chunk can be tied back to a specific time in the video. This is useful when someone asks a question and you want to jump to the exact part of the video that answers it. Finally, the code cleans up the transcripts by removing timestamps, so only the actual spoken words are kept for further analysis.

In [1]:
import os
import pickle
import pandas as pd
import spacy

In [2]:
# --- Parameters ---
# Directory scanned further below for pickled transcript files (*.pkl).
transcript_dir = "VideoProj_transcripts"  # From Part 1
# spaCy pipeline loaded once at module level and reused for every transcript
# entry by spacy_sentence_segmentation().
nlp = spacy.load("en_core_web_sm")  # Use spaCy's small English model

# --- Step 1: Segment timestamped entries using spaCy ---
def spacy_sentence_segmentation(transcript_data):
    """Split timestamped transcript entries into sentence records using spaCy.

    Args:
        transcript_data: Iterable of mappings, each providing a 'text' value
            and a 'start' value.

    Returns:
        A list of ``{"text": ..., "start": ...}`` dicts in input order. Each
        record holds one stripped sentence longer than 10 characters, paired
        with the ``int()``-converted start value of the entry it came from.
    """
    results = []
    for entry in transcript_data:
        entry_text = entry['text']
        entry_start = entry['start']  # Every sentence inherits its entry's timestamp

        # Let the module-level spaCy pipeline find the sentence boundaries
        parsed = nlp(entry_text)
        stripped_sentences = (span.text.strip() for span in parsed.sents)

        # Fragments of 10 characters or fewer are treated as noise and dropped
        results.extend(
            {"text": piece, "start": int(entry_start)}
            for piece in stripped_sentences
            if len(piece) > 10
        )
    return results

# --- Step 2: Remove repeated/near-identical sentences ---
def remove_repeated_sentences(sentences):
    """Return the sentences with later duplicates removed, keeping first occurrences.

    Two records count as duplicates when their "text" values are equal after
    lower-casing and stripping surrounding whitespace. The surviving records
    are the original objects, in their original relative order.

    Args:
        sentences: Iterable of mappings that each carry a "text" string.

    Returns:
        A new list containing only the first record for each normalized text.
    """
    already_seen = set()
    kept = []
    for record in sentences:
        key = record["text"].lower().strip()
        if key in already_seen:
            continue  # An earlier record already covers this text
        already_seen.add(key)
        kept.append(record)
    return kept

# --- Step 3: Process all transcript files ---
# Collects one row per retained sentence across every transcript file.
all_sentences = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the collected sentences reproducible from run to run.
for filename in sorted(os.listdir(transcript_dir)):
    if not filename.endswith(".pkl"):
        continue  # Only pickled transcripts are processed

    # NOTE(review): everything before the first "_" is taken as the video id;
    # an id that itself contains "_" would be cut short -- confirm against the
    # file naming used in Part 1.
    video_id = filename.split("_")[0]
    filepath = os.path.join(transcript_dir, filename)

    # Load transcript
    # NOTE: pickle.load can execute arbitrary code; only read trusted files.
    with open(filepath, "rb") as f:
        transcript_data = pickle.load(f)

    # Segment + clean
    sentences = spacy_sentence_segmentation(transcript_data)
    sentences = remove_repeated_sentences(sentences)

    # Store structured sentence chunks; chunk_id restarts at 0 for each file
    all_sentences.extend(
        {
            "video_id": video_id,
            "chunk_id": idx,
            "text": sent["text"],
            "start": sent["start"],
        }
        for idx, sent in enumerate(sentences)
    )

In [3]:
# --- Step 4: Save to CSV for Part 3 ---
# Naming the columns explicitly keeps the CSV header present even when no
# sentences were collected.
chunk_df = pd.DataFrame(all_sentences, columns=["video_id", "chunk_id", "text", "start"])
chunk_df.to_csv("VideoProj_chunks.csv", index=False)

# --- Preview ---
# Count only the .pkl transcripts (the files that are actually processed),
# not every entry that happens to live in the directory.
num_videos = sum(name.endswith(".pkl") for name in os.listdir(transcript_dir))
print(f" Total unique sentence-chunks: {len(chunk_df)} from {num_videos} videos")
chunk_df.head()

 Total unique sentence-chunks: 35329 from 80 videos


Unnamed: 0,video_id,chunk_id,text,start
0,-Hv6OPTlUZU,0,so now that we have seen in an earlier,0
1,-Hv6OPTlUZU,1,video the simple RNN kind of,2
2,-Hv6OPTlUZU,2,architecture and we went through a,4
3,-Hv6OPTlUZU,3,simple example of time serious,5
4,-Hv6OPTlUZU,4,prediction I think it's worthwhile,7
