In [1]:

import pandas as pd

# Read the video metadata table and the cleaned transcript table.
meta_df = pd.read_csv("/content/Master_Task1_withTranscriptFlag.csv")
trans_df = pd.read_csv("/content/Master_task2_Cleaned_s.csv")

# The join key is "id"; rename the transcript table's "video_id" column to match.
trans_df = trans_df.rename(columns={"video_id": "id"})

# Inner join: keep only videos that appear in both tables.
merged_df = meta_df.merge(trans_df, on="id", how="inner")

# Coerce transcripts to strings (missing values become ""), then keep only
# the rows whose transcript still has non-whitespace content.
merged_df["transcript"] = merged_df["transcript"].fillna("").astype(str)
has_text = merged_df["transcript"].str.strip().ne("")
merged_df = merged_df[has_text]

print(f"✅ Merged dataset shape: {merged_df.shape}")

# Persist the merged table in both formats (row index is not written).
merged_df.to_csv("Merged_VideoData.csv", index=False)
merged_df.to_parquet("Merged_VideoData.parquet", index=False)

print("💾 Saved merged dataset as CSV and Parquet.")


✅ Merged dataset shape: (365, 26)
💾 Saved merged dataset as CSV and Parquet.


In [3]:

import pandas as pd
from sentence_transformers import SentenceTransformer

# Load the merged dataset from the same relative path the merge step wrote it
# to, so this cell does not depend on the working directory being /content.
merged_df = pd.read_csv("Merged_VideoData.csv")

# Initialize model
model = SentenceTransformer("all-MiniLM-L6-v2")

# read_csv turns empty cells (and tokens such as "NA") into NaN, and
# NaN + str yields NaN, which is not valid text for the encoder.
# Normalise both text columns to plain strings before concatenating.
text_columns = ["title", "transcript"]
merged_df[text_columns] = merged_df[text_columns].fillna("").astype(str)

# Combine title and transcript for embeddings
merged_df["combined_text"] = merged_df["title"] + " " + merged_df["transcript"]

# Generate one embedding per row of combined text
embeddings = model.encode(merged_df["combined_text"].tolist(), show_progress_bar=True)

# Store each embedding as a plain Python list, one per row
merged_df["embedding"] = embeddings.tolist()

# Save outputs. Note: the CSV holds each embedding as the string form of a
# list, so any reader of the CSV has to parse that column back into numbers.
merged_df.to_csv("Merged_Embeddings.csv", index=False)
merged_df.to_parquet("Merged_Embeddings.parquet", index=False)

print(f"✅ Embeddings generated and saved for {len(merged_df)} videos.")


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

✅ Embeddings generated and saved for 365 videos.


In [1]:
!pip install chromadb



✅ Stored 364 videos in ChromaDB collection 'youtube_videos'.
🎯 Data is ready for semantic search queries.


In [6]:
import ast

import pandas as pd
import numpy as np
import chromadb

# Load dataset with embeddings
merged_df = pd.read_csv("Merged_Embeddings.csv")

# Initialize persistent ChromaDB client (data is stored under ./chroma_db)
client = chromadb.PersistentClient(path="./chroma_db")

# Create or get collection
collection = client.get_or_create_collection(name="youtube_videos")

# Keep the first row for each video ID; record IDs must be unique.
merged_df = merged_df.drop_duplicates(subset=["id"])

# read_csv turns empty cells into float NaN; keep the text fields as real
# strings so they are usable as documents and metadata values.
text_columns = ["title", "transcript", "combined_text"]
merged_df[text_columns] = merged_df[text_columns].fillna("").astype(str)

# The CSV holds each embedding as the string form of a list of floats.
# Parse it with ast.literal_eval, which only accepts Python literals, instead
# of eval, which would execute arbitrary code found in the file.
embedding_matrix = np.vstack(merged_df["embedding"].map(ast.literal_eval).tolist())

# Upsert rather than add: the collection is persistent, so re-running this
# cell overwrites existing records instead of colliding with their IDs.
collection.upsert(
    ids=merged_df["id"].astype(str).tolist(),
    embeddings=embedding_matrix,
    metadatas=merged_df[["title", "transcript"]].to_dict(orient="records"),
    documents=merged_df["combined_text"].tolist()
)

print(f"✅ Stored {len(merged_df)} videos in ChromaDB collection 'youtube_videos'.")
print("🎯 Data is ready for semantic search queries.")

✅ Stored 364 videos in ChromaDB collection 'youtube_videos'.
🎯 Data is ready for semantic search queries.
