In [None]:
%pip install chromadb

In [None]:
# === Import required libraries ===
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import chromadb
from chromadb.config import Settings

In [None]:
# === File paths ===
# Inputs: the cleaned metadata CSV and the cleaned transcript CSV.
metadata_path, transcripts_path = (
    "master_task1_fully_cleaned.csv",
    "master_task2_clean_transcript_dataset.csv",
)

# Outputs: the merged dataset, and the same dataset with an embedding column.
merged_output_csv, embeddings_output_csv = (
    "merged_dataset.csv",
    "dataset_with_embeddings.csv",
)

In [None]:
# === Step 1: Load and Merge Metadata + Transcripts ===
# Reads both CSVs and joins them on the shared 'id' column.
print("🔹 Loading CSV files...")
meta_df = pd.read_csv(metadata_path)
trans_df = pd.read_csv(transcripts_path)

# The join key is 'id' (the progress message previously said 'video_id',
# which does not match the column actually used below).
# how='inner' keeps only ids present in BOTH files.
print("🔹 Merging datasets on 'id'...")
merged_df = pd.merge(meta_df, trans_df, on='id', how='inner')

In [None]:
# === Step 2: Clean Data ===
# Keep only rows whose transcript is present and not blank/whitespace-only.
print("🔹 Cleaning data...")
has_transcript = merged_df['transcript'].notna() & (
    merged_df['transcript'].str.strip() != ""
)
# reset_index(drop=True) gives the result a contiguous 0..n-1 index, so row
# labels coincide with row positions. Later cells build plain Python lists
# aligned with this frame by position; leaving gaps in the index after
# filtering would make label-based lookups into those lists pick the wrong
# element or fail. It also returns a fresh frame, so later column assignments
# do not act on a filtered slice of the merge result.
merged_df = merged_df.loc[has_transcript].reset_index(drop=True)

In [None]:
# Persist the merged, cleaned frame to CSV (row index is not written).
merged_df.to_csv(
    merged_output_csv,
    index=False,
)
merged_saved_message = f"✅ Merged dataset saved as {merged_output_csv}"
print(merged_saved_message)

In [None]:
# === Step 3: Generate Semantic Embeddings ===
# Load the sentence-transformers model used for both the stored documents
# and, later, the example query.
print("🔹 Generating embeddings using 'all-mpnet-base-v2' (high accuracy)...")
model = SentenceTransformer(
    model_name_or_path='sentence-transformers/all-mpnet-base-v2',
)

In [None]:
# Build the text that gets embedded: "<title> <transcript>", with missing
# values replaced by empty strings so the concatenation never yields NaN.
title_text = merged_df['title'].fillna('')
transcript_text = merged_df['transcript'].fillna('')
merged_df['text_for_embedding'] = title_text + " " + transcript_text


In [None]:
# Encode every combined text in batches of 32. The result is a NumPy array
# (convert_to_numpy=True) of normalized vectors (normalize_embeddings=True),
# one row per entry of merged_df in the same order.
# NOTE(review): the model truncates inputs beyond its maximum sequence length,
# so long transcripts are presumably only partially represented — confirm.
texts_to_embed = merged_df['text_for_embedding'].tolist()
embeddings = model.encode(
    texts_to_embed,
    batch_size=32,
    show_progress_bar=True,
    normalize_embeddings=True,
    convert_to_numpy=True,
)

In [None]:
# Serialize each embedding vector as one comma-separated string so it fits in
# a single CSV cell.
serialized_embeddings = []
for vector in embeddings:
    serialized_embeddings.append(",".join(str(component) for component in vector))
merged_df['embedding'] = serialized_embeddings


In [None]:
# Write the frame, now including the serialized 'embedding' column, to CSV
# (row index is not written).
merged_df.to_csv(
    embeddings_output_csv,
    index=False,
)
embeddings_saved_message = f"✅ Embeddings saved to {embeddings_output_csv}"
print(embeddings_saved_message)

In [None]:
# === Step 4: Store in ChromaDB ===
print("🔹 Initializing ChromaDB and adding data...")

# Use PersistentClient so the collection is actually written to disk under
# "chroma_storage". On current chromadb releases (>= 0.4), a plain
# chromadb.Client(Settings(persist_directory=...)) is an in-memory client
# unless is_persistent=True is also set, so nothing would be saved.
# NOTE(review): because the store now survives kernel restarts, re-running
# the add cell will encounter IDs that already exist — confirm that is the
# desired behavior or clear the directory between runs.
client = chromadb.PersistentClient(
    path="chroma_storage",  # directory for persistent storage
    settings=Settings(anonymized_telemetry=False),
)

collection = client.get_or_create_collection(name="video_embeddings")

In [None]:
# Parse each comma-separated embedding string back into a NumPy float array.
# The resulting list is aligned with merged_df by row position.
embedding_vectors = []
for serialized_vector in merged_df['embedding']:
    embedding_vectors.append(np.fromstring(serialized_vector, sep=","))


In [None]:
# Remove duplicate IDs before adding to ChromaDB (first occurrence wins).
merged_df_unique = merged_df.drop_duplicates(subset=['id'])

# `embedding_vectors` is a plain list aligned with merged_df by row POSITION.
# merged_df's index labels are not guaranteed to equal positions (rows were
# filtered out earlier), so indexing the list with `merged_df_unique.index`
# could attach the wrong vector to a video or raise IndexError. Compute the
# positions of the kept rows instead; duplicated() marks every repeat after
# the first, matching drop_duplicates' default keep='first'.
unique_positions = np.flatnonzero(~merged_df['id'].duplicated().to_numpy())
assert len(unique_positions) == len(merged_df_unique), \
    "positional mask and de-duplicated frame disagree"

# Add data to ChromaDB
collection.add(
    ids=merged_df_unique['id'].astype(str).tolist(),
    # Plain lists of floats, one per unique ID, selected by position.
    embeddings=[embedding_vectors[pos].tolist() for pos in unique_positions],
    metadatas=[
        {
            "video_id": row['id'],
            "title": row['title'],
            "transcript": row['transcript']
        }
        for _, row in merged_df_unique.iterrows()
    ],
    documents=merged_df_unique['text_for_embedding'].tolist()
)

print("✅ Embeddings and metadata successfully stored in ChromaDB!")

In [None]:
# === Step 5: Example Query (Fixed) ===
print("\n🔍 Running example semantic query...")

query = "How to use AI for video summarization"

# Embed the query with the same model and normalization as the stored vectors
query_embedding = model.encode([query], normalize_embeddings=True)

results = collection.query(query_embeddings=query_embedding.tolist(), n_results=3)

print("\n🔍 Example Search Results:")
# Results are nested per query; index 0 is the single query issued above.
top_documents = results['documents'][0]
top_metadatas = results['metadatas'][0]
for rank, doc in enumerate(top_documents, start=1):
    print(f"\nResult {rank}:")
    print("Title:", top_metadatas[rank - 1]['title'])
    # The stored document is the combined title + transcript text.
    print("Transcript snippet:", doc[:300])