In [1]:
!pip install sentence-transformers pandas tqdm tiktoken numpy




In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import tiktoken
from tqdm import tqdm

# -------------------------------
# Load CSV
# -------------------------------
df = pd.read_csv("cleaned_youtube_data.csv")

# Text to embed is "<title> <transcript>"; a missing title or transcript is
# treated as an empty string so the concatenation never yields NaN.
title_text = df["title"].fillna('')
transcript_text = df["transcript"].fillna('')
df["combined_text"] = title_text + " " + transcript_text

# -------------------------------
# Tokenizer (same as OpenAI models)
# Used below to count tokens and to cut text into token windows.
# -------------------------------
ENCODING_NAME = "cl100k_base"
tokenizer = tiktoken.get_encoding(ENCODING_NAME)

def count_tokens(text):
    """Return how many tokens the module-level ``tokenizer`` produces for ``text``.

    Args:
        text: String to tokenize.

    Returns:
        int: Number of tokens in the encoded text.
    """
    # tiktoken raises ValueError by default when the text happens to contain a
    # special-token marker (e.g. "<|endoftext|>"). Free-form transcripts are
    # plain data, so encode such markers as ordinary text instead of failing.
    return len(tokenizer.encode(text, disallowed_special=()))

# -------------------------------
# Chunking Function
# -------------------------------
def chunk_text(text, chunk_size=400, overlap=50):
    """Split ``text`` into overlapping windows of tokens and decode each window.

    The text is encoded with the module-level ``tokenizer``; consecutive
    windows start ``chunk_size - overlap`` tokens apart, so each window repeats
    the last ``overlap`` tokens of the previous one.

    Args:
        text: String to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        list[str]: Decoded chunks in order. Empty when ``text`` encodes to no
        tokens.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. An
            overlap that is not smaller than the chunk size would never
            advance the window.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    # disallowed_special=() keeps tiktoken from raising when the text contains
    # a special-token marker such as "<|endoftext|>"; it is encoded as text.
    tokens = tokenizer.encode(text, disallowed_special=())
    step = chunk_size - overlap

    chunks = []
    for start in range(0, len(tokens), step):
        end = start + chunk_size
        chunks.append(tokenizer.decode(tokens[start:end]))
        if end >= len(tokens):
            # This window already reached the last token. Another step would
            # only produce a chunk made entirely of tokens seen in this one.
            break
    return chunks

# -------------------------------
# Embedding Model
# -------------------------------
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# all-MiniLM-L6-v2 truncates any input longer than 256 word pieces, so the
# default 400-token chunks would have their tail silently left out of the
# embedding. Chunks are measured in cl100k tokens, which do not map 1:1 onto
# the model's word pieces, so stay well below the limit. The overlap keeps the
# same 1/8 ratio as the chunk_text defaults.
EMBED_CHUNK_TOKENS = 200
EMBED_CHUNK_OVERLAP = 25

# -------------------------------
# Generate embeddings
# -------------------------------
all_embeddings = []

for text in tqdm(df["combined_text"], desc="Generating embeddings"):
    chunks = chunk_text(text, chunk_size=EMBED_CHUNK_TOKENS, overlap=EMBED_CHUNK_OVERLAP)
    if not chunks:
        # No tokens at all: averaging an empty batch would yield NaN, so embed
        # the empty string to keep one finite vector per row.
        chunks = [""]
    chunk_embeddings = model.encode(chunks)
    # Average pooling of chunk embeddings to get one vector per transcript
    avg_embedding = np.mean(chunk_embeddings, axis=0)
    all_embeddings.append(avg_embedding.tolist())

df["embedding"] = all_embeddings

# -------------------------------
# Save output
# -------------------------------
# Rows are written without the index column. Each embedding list is stored as
# its text representation, so it has to be parsed back into a list on load.
output_path = "data_with_chunked_embeddings.csv"
df.to_csv(output_path, index=False)

print("✅ Chunked embeddings saved successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating embeddings: 100%|██████████| 622/622 [00:12<00:00, 50.04it/s]


✅ Chunked embeddings saved successfully!
