In [3]:
import pandas as pd

# Load datasets
meta_df = pd.read_csv("/content/Master_Task1_withTranscriptFlag.csv")

# Try reading with a different engine and error handling
try:
    trans_df = pd.read_csv("/content/Master_task2_Cleaned_s14.csv", engine='python', on_bad_lines='skip')
except Exception as e:
    print(f"Error reading CSV: {e}")
    # If still fails, try reading with a different delimiter or quoting
    trans_df = pd.read_csv("/content/Master_task2_Cleaned_s14.csv", engine='python', sep='\t', on_bad_lines='skip')


# Ensure consistent column name for merging
trans_df.rename(columns={"video_id": "id"}, inplace=True)

# Merge on video ID
merged_df = pd.merge(meta_df, trans_df, on="id", how="inner")

# Remove empty or missing transcripts
merged_df["transcript"] = merged_df["transcript"].fillna("").astype(str)
merged_df = merged_df[merged_df["transcript"].str.strip() != ""]

print(f"✅ Merged dataset shape: {merged_df.shape}")

# Save outputs
merged_df.to_csv("Merged_VideoData.csv", index=False)
merged_df.to_parquet("Merged_VideoData.parquet", index=False)

print("💾 Saved merged dataset as CSV and Parquet.")

✅ Merged dataset shape: (583, 25)
💾 Saved merged dataset as CSV and Parquet.


In [4]:

import pandas as pd
from sentence_transformers import SentenceTransformer

# Load merged dataset
merged_df = pd.read_csv("/content/Merged_VideoData.csv")

# Initialize model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Combine title and transcript for embeddings
merged_df["combined_text"] = merged_df["title"] + " " + merged_df["transcript"]

# Generate embeddings
embeddings = model.encode(merged_df["combined_text"].tolist(), show_progress_bar=True)

# Add embeddings to DataFrame
merged_df["embedding"] = embeddings.tolist()

# Save outputs
merged_df.to_csv("Merged_Embeddings.csv", index=False)
merged_df.to_parquet("Merged_Embeddings.parquet", index=False)

print(f"✅ Embeddings generated and saved for {len(merged_df)} videos.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

✅ Embeddings generated and saved for 583 videos.


In [2]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-1.1.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.37.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?

In [5]:
import pandas as pd
import numpy as np
import chromadb

# ============================
# Load dataset from Parquet
# ============================
merged_df = pd.read_parquet("/content/Merged_Embeddings.parquet")

# Initialize persistent ChromaDB client
client = chromadb.PersistentClient(path="./chroma_db")

# Create or get collection
collection = client.get_or_create_collection(name="youtube_videos")

# Remove duplicate IDs
merged_df.drop_duplicates(subset=['id'], inplace=True)

# Convert embedding column safely (if stored as string)
def parse_embedding(x):
    if isinstance(x, str):
        return np.array(eval(x))
    return np.array(x)

merged_df["embedding"] = merged_df["embedding"].apply(parse_embedding)

# Stack embeddings into a single array
embeddings = np.vstack(merged_df["embedding"].values)

# Add data to ChromaDB
collection.add(
    ids=merged_df["id"].astype(str).tolist(),
    embeddings=embeddings,
    metadatas=merged_df[["title", "transcript"]].to_dict(orient="records"),
    documents=merged_df["combined_text"].astype(str).tolist()
)

print(f"✅ Stored {len(merged_df)} videos in ChromaDB collection 'youtube_videos'.")
print("🎯 Data is ready for semantic search queries.")


✅ Stored 522 videos in ChromaDB collection 'youtube_videos'.
🎯 Data is ready for semantic search queries.


In [None]:
from sentence_transformers import SentenceTransformer
import chromadb
import numpy as np

# ===============================
# 1️⃣ Query Input Handling
# ===============================
def get_user_query():
    query = input("🔍 Enter your search query: ").strip()
    if not query:
        raise ValueError("❌ Query cannot be empty. Please enter a valid search term.")
    # Optional preprocessing
    query = ''.join(c for c in query if c.isalnum() or c.isspace())
    return query


# ===============================
# 2️⃣ Query Embedding Generation
# ===============================
def generate_query_embedding(query):
    print("⚙️ Loading embedding model...")
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embedding = model.encode(query, convert_to_numpy=True)
    return embedding


# ===============================
# 3️⃣ Perform Semantic Search
# ===============================
def search_chromadb(query_embedding, top_k=5):
    print("🔎 Connecting to ChromaDB...")
    client = chromadb.PersistentClient(path="./chroma_db")
    collection = client.get_or_create_collection(name="youtube_videos")

    # Perform the semantic search
    results = collection.query(
        query_embeddings=query_embedding.tolist(),
        n_results=top_k,
        include=["metadatas", "documents", "distances"]
    )
    return results


# ===============================
# 4️⃣ Format and Filter Results
# ===============================
def format_results(results, min_score=0.2):
    formatted = []
    for i in range(len(results["ids"][0])):
        score = 1 / (1 + results["distances"][0][i])
  # Convert distance to similarity
        if score >= min_score:
            data = {
                "rank": i + 1,
                "title": results["metadatas"][0][i].get("title", "N/A"),
                "transcript": results["metadatas"][0][i].get("transcript", "")[:200] + "...",
                "similarity_score": round(score, 3)
            }
            formatted.append(data)
    return formatted


# ===============================
# 5️⃣ Display Results
# ===============================
def display_results(formatted_results):
    if not formatted_results:
        print("❌ No relevant results found.")
        return
    print("\n🎯 Top Search Results:")
    for r in formatted_results:
        print(f"\nRank {r['rank']}")
        print(f"Title: {r['title']}")
        print(f"Similarity Score: {r['similarity_score']}")
        print(f"Transcript (Preview): {r['transcript']}")


# ===============================
# 🚀 Main Script
# ===============================
if __name__ == "__main__":
    try:
        query = get_user_query()
        query_embedding = generate_query_embedding(query)
        results = search_chromadb(query_embedding, top_k=5)
        formatted_results = format_results(results, min_score=0.2)
        display_results(formatted_results)
    except Exception as e:
        print(f"⚠️ Error: {e}")


🔍 Enter your search query: css tutorial
⚙️ Loading embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔎 Connecting to ChromaDB...

🎯 Top Search Results:

Rank 1
Title: learn html css full course for beginners
Similarity Score: 0.536
Transcript (Preview): this beginner's course will teach you html and css through building and deploying five awesome projects and you will learn by solving over 75 coding challenges pair is your instructor pair borgen has ...

Rank 2
Title: html tutorial website crash course for beginners
Similarity Score: 0.531
Transcript (Preview): this is an html crash course. i'm beau carnes and i'm going to teach you the basics of html. let's jump right into it. you probably already know that html is used to create web pages. it stands for hy...

Rank 3
Title: build a simple website with html, css, javascript course for beginners
Similarity Score: 0.486
Transcript (Preview): practice your web development skills by building a social media dashboard with a dark light theme. jess, who runs the popular coder coder youtube channel, will guide you through a beginner front en

In [None]:
from sentence_transformers import SentenceTransformer
import chromadb
import numpy as np

# ===============================
# 1️⃣ Generate query embedding
# ===============================
def generate_query_embedding(query_text):
    model = SentenceTransformer("all-MiniLM-L6-v2")
    query_embedding = model.encode(query_text, convert_to_numpy=True)
    return query_embedding

# ===============================
# 2️⃣ Search top 5 results
# ===============================
def search_top_videos(query_text, top_k=5):
    # Load model & encode query
    embedding = generate_query_embedding(query_text)

    # Connect to ChromaDB
    client = chromadb.PersistentClient(path="./chroma_db")
    collection = client.get_or_create_collection(name="youtube_videos")

    # Perform semantic search
    results = collection.query(
        query_embeddings=embedding.tolist(),
        n_results=top_k,
        include=["metadatas", "documents", "distances"]
    )

    # Format results
    formatted_results = []
    for i in range(len(results["ids"][0])):
        score = 1 / (1 + results["distances"][0][i])
  # Convert distance to similarity
        formatted_results.append({
            "rank": i + 1,
            "title": results["metadatas"][0][i].get("title", "N/A"),
            "similarity_score": round(score, 3),
            "transcript": results["metadatas"][0][i].get("transcript", "")[:200] + "..."
        })

    return formatted_results

# ===============================
# 3️⃣ Display results
# ===============================
def display_results(results):
    if not results:
        print("❌ No relevant videos found.")
        return

    print("\n🎯 Top 5 Most Relevant Videos:")
    for r in results:
        print(f"\nRank {r['rank']}")
        print(f"Title: {r['title']}")
        print(f"Similarity Score: {r['similarity_score']}")
        print(f"Transcript Preview: {r['transcript']}")

# ===============================
# 🚀 Run the search
# ===============================
if __name__ == "__main__":
    user_query = input("🔍 Enter your search query: ").strip()
    if not user_query:
        print("❌ Please enter a valid query.")
    else:
        top_results = search_top_videos(user_query, top_k=5)
        display_results(top_results)


🔍 Enter your search query: css tutorial

🎯 Top 5 Most Relevant Videos:

Rank 1
Title: learn html css full course for beginners
Similarity Score: 0.536
Transcript Preview: this beginner's course will teach you html and css through building and deploying five awesome projects and you will learn by solving over 75 coding challenges pair is your instructor pair borgen has ...

Rank 2
Title: html tutorial website crash course for beginners
Similarity Score: 0.531
Transcript Preview: this is an html crash course. i'm beau carnes and i'm going to teach you the basics of html. let's jump right into it. you probably already know that html is used to create web pages. it stands for hy...

Rank 3
Title: build a simple website with html, css, javascript course for beginners
Similarity Score: 0.486
Transcript Preview: practice your web development skills by building a social media dashboard with a dark light theme. jess, who runs the popular coder coder youtube channel, will guide you through a beg

In [9]:
pip install fastapi uvicorn sentence-transformers chromadb




In [10]:
uvicorn app:app --reload


SyntaxError: invalid syntax (ipython-input-1867459706.py, line 1)

To run the FastAPI application using uvicorn, you need to execute the command in a shell. Use the `!` prefix in Colab to run shell commands.

In [None]:
!uvicorn vOMBSzOr2L1f:app --reload --port 8000

[32mINFO[0m:     Will watch for changes in these directories: ['/content']
[32mINFO[0m:     Uvicorn running on [1mhttp://127.0.0.1:8000[0m (Press CTRL+C to quit)
[32mINFO[0m:     Started reloader process [[36m[1m3391[0m] using [36m[1mWatchFiles[0m
[31mERROR[0m:    Error loading ASGI app. Could not import module "vOMBSzOr2L1f".


In [None]:
# ==========================================
# app.py — FastAPI for Semantic Video Search
# ==========================================

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
import chromadb
import numpy as np

# -------------------------------
# 1️⃣ Initialize FastAPI App
# -------------------------------
app = FastAPI(title="YouTube Video Semantic Search API",
              description="Search videos semantically using ChromaDB embeddings.",
              version="1.0")

# -------------------------------
# 2️⃣ Load ChromaDB & Model
# -------------------------------
client = chromadb.PersistentClient(path="./chroma_db")
collection = client.get_or_create_collection(name="youtube_videos")
model = SentenceTransformer("all-MiniLM-L6-v2")

# -------------------------------
# 3️⃣ Define Request Body Schema
# -------------------------------
class SearchRequest(BaseModel):
    query: str
    top_k: int = 5   # default top 5 results


# -------------------------------
# 4️⃣ Helper Function for Search
# -------------------------------
def perform_search(query: str, top_k: int = 5):
    query = query.strip().lower()
    if not query:
        raise ValueError("Query cannot be empty")

    # Generate embedding
    query_embedding = model.encode(query)

    # Query ChromaDB
    results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=top_k
    )

    # Format results
    formatted_results = []
    for idx, metadata in enumerate(results["metadatas"][0]):
        distance = results["distances"][0][idx]
        similarity_score = 1 / (1 + distance)  # ✅ Normalized positive similarity
        transcript_preview = metadata.get("transcript", "")[:200].replace("\n", " ")

        formatted_results.append({
            "rank": idx + 1,
            "title": metadata.get("title", "N/A"),
            "similarity_score": round(similarity_score, 3),
            "transcript_preview": transcript_preview
        })

    return formatted_results


# -------------------------------
# 5️⃣ Define API Endpoint
# -------------------------------
@app.post("/search")
async def search_videos(request: SearchRequest):
    try:
        results = perform_search(request.query, request.top_k)
        return {"query": request.query, "results": results}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))