In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# 1️⃣ Load your CSV
# The path is kept in a named constant so the input location is easy to spot.
CSV_PATH = "/content/master data cleaned_output.csv"
df = pd.read_csv(CSV_PATH)

# 2️⃣ Load embedding model
# NOTE(review): 'embeddding model' looks like a placeholder rather than a real
# model identifier -- confirm the intended model name/path before running.
MODEL_NAME = 'embeddding model'
model = SentenceTransformer(MODEL_NAME)

# 3️⃣ Function to chunk long text
def chunk_text(text, chunk_size=512, overlap=50):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Args:
        text: The string to split. Words are delimited by ``str.split()``,
            i.e. any run of whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between neighbouring chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). The
        list is empty when *text* contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A zero step would make range() fail with an opaque message and a
        # negative step would silently produce no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    total = len(words)
    step = chunk_size - overlap

    chunks = []
    for start in range(0, total, step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # Once a window reaches the end of the text, any further window would
        # contain only words already covered by this one, so stop here.
        if end >= total:
            break
    return chunks

# 4️⃣ Function to get averaged embedding for long text
def get_text_embedding(text):
    """Return one embedding for *text* by averaging its chunk embeddings.

    The text is split with ``chunk_text`` (default settings), every chunk is
    encoded with the module-level ``model``, and the element-wise mean of the
    chunk vectors is returned.

    Args:
        text: The value to embed. Anything that is not a non-blank string
            (e.g. ``None``, NaN, ``""``, whitespace only) is treated as
            having no content.

    Returns:
        The mean embedding as a plain Python list, or an empty list when
        *text* has no content.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    chunks = chunk_text(text)
    # Encode all chunks in a single call so the encoder can batch them,
    # instead of invoking it once per chunk from a Python loop.
    # NOTE(review): chunks are sized in words; if the model truncates inputs
    # at a token limit, long chunks may be cut short -- confirm against the
    # chosen model's maximum sequence length.
    chunk_embeddings = model.encode(chunks)
    # Average over the chunk axis to get one fixed-size vector per text.
    mean_embedding = np.mean(chunk_embeddings, axis=0)
    return mean_embedding.tolist()

# 5️⃣ Combine title + transcript
# Missing values are replaced with empty strings before concatenation so a
# missing title or transcript does not turn the combined value into NaN.
title_text = df["title"].fillna('')
transcript_text = df["transcript"].fillna('')
df["combined_text"] = title_text + " " + transcript_text

# 6️⃣ Generate embeddings
# One embedding (a list) is produced per row of the combined text column.
combined_column = df["combined_text"]
df["embedding"] = combined_column.apply(get_text_embedding)

# 7️⃣ Save to new CSV
# The row index is not written to the output file.
OUTPUT_CSV = "youtube_embeddings.csv"
df.to_csv(OUTPUT_CSV, index=False)

print("✅ Embeddings for title+transcript created and saved successfully!")
