In [None]:
# ✅ Install required library
!pip install -q chromadb

import pandas as pd
import chromadb
from chromadb.config import Settings

# ================================
# Step 1: Load Dataset
# ================================
# Lines that pandas cannot parse are skipped instead of aborting the load.
df = pd.read_csv(
    "/content/youtube_embeddings (1).csv",
    encoding="utf-8",
    on_bad_lines="skip",
)
print("âœ… Dataset loaded successfully!")
print("Total rows:", df.shape[0])
print("Columns:", df.columns.tolist())

# ================================
# Step 2: Validate Required Columns
# ================================
required_cols = [
    "id",
    "transcript",
    "embedding",
    "title",
    "channel_title",
    "viewcount",
    "duration",
]
# Fail fast on the first required column (in list order) that is absent.
first_missing = next(
    (name for name in required_cols if name not in df.columns),
    None,
)
if first_missing is not None:
    raise ValueError(f"Missing required column: {first_missing}")

# ================================
# Step 3: Clean and Parse Data
# ================================
# Normalise ids to strings; missing ids become "" so they can be detected below.
df["id"] = df["id"].fillna("").astype(str)

# Rows whose id is empty or whitespace-only receive a sequential placeholder
# ("auto_id_0", "auto_id_1", ...) assigned in row order.
blank_id_mask = df["id"].str.strip() == ""
placeholder_ids = [f"auto_id_{n}" for n in range(int(blank_id_mask.sum()))]
df.loc[blank_id_mask, "id"] = placeholder_ids

def parse_embedding(x):
    """Convert a serialized embedding cell into a list of floats.

    String input is expected to look like ``"[0.1, 0.2, ...]"``: surrounding
    whitespace and square brackets are removed, the remainder is split on
    commas, and blank fragments are ignored.

    Args:
        x: A cell value from the ``embedding`` column.

    Returns:
        A list of floats for a parseable string, ``[]`` for a string that
        contains a non-numeric component (or no numbers at all), or ``x``
        unchanged when it is not a string.
    """
    if not isinstance(x, str):
        return x

    # Strip whitespace first so padded cells such as " [1, 2]\n" still have
    # their brackets removed instead of being rejected as unparseable.
    body = x.strip().strip("[]")
    try:
        return [float(part) for part in body.split(",") if part.strip()]
    except ValueError:
        # float() raises ValueError on non-numeric text; such cells are
        # reported as an empty embedding rather than aborting the whole run.
        return []

# Turn every serialized embedding cell into a Python list of floats.
parsed_embeddings = df["embedding"].apply(parse_embedding)
df["embedding"] = parsed_embeddings

# Keep only rows whose embedding is a non-empty list; anything else
# (unparseable strings turned into [], non-list values) is dropped.
has_vector = df["embedding"].apply(
    lambda vec: isinstance(vec, list) and len(vec) != 0
)
df = df[has_vector]

# ================================
# Step 4: Create Persistent ChromaDB Client (Stored Locally)
# ================================
client = chromadb.PersistentClient(path="./youtube_vector_db")

# Create or load collection
collection = client.get_or_create_collection(name="youtube_embeddings_final")

# ================================
# Step 5: Add Data into Vector DB
# ================================
ids = df["id"].tolist()
embeddings = df["embedding"].tolist()
# One metadata dict per row, in the same row order as `ids` and `embeddings`.
# The source column "viewcount" is stored under the key "viewCount".
metadatas = (
    df[["title", "transcript", "channel_title", "viewcount", "duration"]]
    .rename(columns={"viewcount": "viewCount"})
    .to_dict("records")
)

# Chroma rejects a single add() larger than the client's maximum batch size,
# so the rows are inserted in slices. Older clients may not expose
# get_max_batch_size(); in that case only the conservative default is used.
batch_size = 1000
query_max_batch = getattr(client, "get_max_batch_size", None)
if callable(query_max_batch):
    batch_size = max(1, min(batch_size, query_max_batch()))

# With no rows the loop body never runs, so add() is not called with empty lists.
for start in range(0, len(ids), batch_size):
    stop = start + batch_size
    collection.add(
        ids=ids[start:stop],
        embeddings=embeddings[start:stop],
        metadatas=metadatas[start:stop],
    )

# ================================
# Step 6: Summary
# ================================
# Report the number of rows in the filtered DataFrame and where the DB lives.
stored_rows = len(df)
summary_lines = (
    "âœ… Successfully saved video_id, transcript, embedding, title, channel_title, viewCount, and duration into Chroma vector database!",
    f"ðŸ“¦ Total records stored: {stored_rows}",
    "ðŸ’¾ Data saved locally at: ./youtube_vector_db",
)
print(*summary_lines, sep="\n")
