In [1]:
import pandas as pd
import numpy as np
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# ============================================================
# 1️⃣ Load CSV and build FAISS index
# ============================================================
def load_and_build_index(csv_path):
    """Load the video CSV and build a flat L2 FAISS index over its embeddings.

    The CSV is read with pandas, missing ``transcript`` values are replaced
    by empty strings, and the ``text_embedding`` column is parsed from its
    textual form into float32 vectors that are added to a
    ``faiss.IndexFlatL2``.

    Args:
        csv_path: Path to a CSV file providing the columns ``transcript``,
            ``text_embedding``, ``video_id``, ``title`` and
            ``channel_title``.

    Returns:
        A ``(df, index, metadata)`` tuple: the DataFrame (with
        ``text_embedding`` replaced by numpy arrays), the populated FAISS
        index, and a list of ``{"video_id", "title", "channel_title"}``
        dicts in row order, so position ``i`` in the list describes the
        ``i``-th vector added to the index.

    Raises:
        ValueError: If the CSV has no rows, or an embedding cell is not a
            string or cannot be parsed as a list of numbers.
    """
    # Local import keeps this block self-contained; only the parser needs it.
    import ast

    df = pd.read_csv(csv_path)
    if df.empty:
        raise ValueError(f"No rows found in {csv_path!r}; cannot build an index.")
    df["transcript"] = df["transcript"].fillna("")

    def parse_embedding(emb_str):
        """Convert one CSV cell such as ``"[0.1, 0.2]"`` to a float32 vector."""
        if not isinstance(emb_str, str):
            raise ValueError(
                f"text_embedding must be a string, got {emb_str!r}"
            )
        try:
            # literal_eval only accepts Python literals, so a crafted CSV cell
            # cannot execute code the way eval() would allow.
            return np.array(ast.literal_eval(emb_str), dtype="float32")
        except (ValueError, SyntaxError, TypeError):
            # Fallback for cells that are not valid literals (e.g. containing
            # bare ``nan``): treat the cell as comma-separated floats.
            stripped = emb_str.strip("[]")
            return np.array(
                [float(x) for x in stripped.split(",")], dtype="float32"
            )

    df["text_embedding"] = df["text_embedding"].apply(parse_embedding)
    embeddings = np.vstack(df["text_embedding"].values).astype("float32")

    # Build FAISS index; its dimension is taken from the parsed vectors.
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    metadata = df[["video_id", "title", "channel_title"]].to_dict(orient="records")
    return df, index, metadata


# ============================================================
# 2️⃣ Build text models (TF-IDF + SVD)
# ============================================================
def build_text_models(df, n_components=100, max_features=5000, random_state=42):
    """Fit the TF-IDF vectorizer and truncated-SVD projector used for queries.

    The title and transcript of every row are concatenated into a new
    ``combined_text`` column (written onto ``df`` itself), a TF-IDF
    vectorizer with English stop words is fitted on that column, and a
    ``TruncatedSVD`` is fitted on the resulting matrix.

    Args:
        df: DataFrame with ``title`` and ``transcript`` columns. It is
            modified in place: a ``combined_text`` column is added.
        n_components: Output dimensionality of the SVD projection. Query
            vectors produced by this model are searched against the FAISS
            index, so this should equal the index dimension.
        max_features: Maximum TF-IDF vocabulary size.
        random_state: Seed passed to ``TruncatedSVD`` for reproducible fits.

    Returns:
        A ``(tfidf_vectorizer, svd_model)`` tuple of fitted estimators.
    """
    df["combined_text"] = df["title"].astype(str) + " " + df["transcript"].astype(str)

    tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_features=max_features)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df["combined_text"])

    svd_model = TruncatedSVD(n_components=n_components, random_state=random_state)
    svd_model.fit(tfidf_matrix)

    return tfidf_vectorizer, svd_model


# ============================================================
# 3️⃣ Search for similar videos
# ============================================================
def search_videos(query, index, metadata, tfidf_vectorizer, svd_model, k=5):
    """Return up to ``k`` metadata records nearest to a free-text query.

    The query is vectorised with ``tfidf_vectorizer``, projected with
    ``svd_model``, cast to float32 and passed to ``index.search``. Each
    returned distance ``d`` is converted to a score ``1 / (1 + d)``.

    Args:
        query: Search text.
        index: Object exposing ``search(queries, k)`` that returns
            ``(distances, indices)`` arrays with one row per query (a FAISS
            index in this file). If it has a ``d`` attribute, that is
            checked against the query vector's width.
        metadata: Sequence of dicts; position ``i`` describes vector ``i``.
        tfidf_vectorizer: Fitted object with ``transform(list_of_str)``.
        svd_model: Fitted object with ``transform(matrix)``.
        k: Maximum number of neighbours to request.

    Returns:
        A list of dicts with keys ``Rank``, ``Video ID``, ``Title``,
        ``Channel`` and ``Similarity Score``, in the order the index
        returned them. It may hold fewer than ``k`` entries.

    Raises:
        ValueError: If the query vector's dimension differs from ``index.d``.
    """
    query_tfidf = tfidf_vectorizer.transform([query])
    query_embedding = svd_model.transform(query_tfidf).astype("float32")

    # Fail with an actionable message instead of an opaque error from the
    # index when the text model and the stored embeddings disagree in width.
    index_dim = getattr(index, "d", None)
    if index_dim is not None and query_embedding.shape[1] != index_dim:
        raise ValueError(
            f"Query embedding has dimension {query_embedding.shape[1]}, "
            f"but the index expects {index_dim}."
        )

    distances, indices = index.search(query_embedding, k)
    similarity_scores = 1 / (1 + distances[0])

    results = []
    for position, i in enumerate(indices[0]):
        # FAISS pads the label row with -1 when fewer than k vectors exist;
        # metadata[-1] would silently return the last record as a fake hit.
        if i < 0:
            continue
        data = metadata[i]
        results.append({
            "Rank": len(results) + 1,
            "Video ID": data.get("video_id", "N/A"),
            "Title": data.get("title", "N/A"),
            "Channel": data.get("channel_title", "N/A"),
            "Similarity Score": round(float(similarity_scores[position]), 4),
        })
    return results


# ============================================================
# 4️⃣ Main script
# ============================================================
if __name__ == "__main__":
    # Interactive entry point: build the index and text models once, then
    # answer search queries typed by the user until they enter 'exit'.
    csv_path = "youtube_details_with_embeddings.csv"
    print("üìÇ Loading data and building FAISS index...")
    df, index, metadata = load_and_build_index(csv_path)
    print("‚úÖ FAISS index built successfully!")

    print("\nüîß Building TF-IDF + SVD models...")
    tfidf_vectorizer, svd_model = build_text_models(df)
    print("‚úÖ Text models ready!\n")

    while True:
        try:
            query = input("Enter your search query (or type 'exit' to quit): ").strip()
        except EOFError:
            # stdin was closed (e.g. piped input ran out): exit as if the
            # user had typed 'exit' instead of crashing with a traceback.
            query = "exit"

        if query.lower() == "exit":
            print("üëã Exiting search...")
            break

        if not query:
            # A blank query has no terms to vectorise, so any ranking it
            # produced would be meaningless; ask again instead.
            continue

        print("\nüîç Top 5 Most Relevant Videos:\n")
        results = search_videos(query, index, metadata, tfidf_vectorizer, svd_model, k=5)
        for r in results:
            print(f"Rank {r['Rank']}")
            print(f"Video ID: {r['Video ID']}")
            print(f"Title: {r['Title']}")
            print(f"Channel: {r['Channel']}")
            print(f"Similarity Score: {r['Similarity Score']}")
            print("-" * 60)


üìÇ Loading data and building FAISS index...
‚úÖ FAISS index built successfully!

üîß Building TF-IDF + SVD models...
‚úÖ Text models ready!



Enter your search query (or type 'exit' to quit):  python



üîç Top 5 Most Relevant Videos:

Rank 1
Video ID: koL06y7HpKo
Title: 2 first code in python
Channel: Telusko
Similarity Score: 0.7292
------------------------------------------------------------
Rank 2
Video ID: KcBd4fyHJvg
Title: 3 python setup
Channel: nan
Similarity Score: 0.7159
------------------------------------------------------------
Rank 3
Video ID: 85Ad-2jMOss
Title: 3 comparison mistakes in python python nesoacademy quickconcepts
Channel: Programming with Mosh
Similarity Score: 0.6923
------------------------------------------------------------
Rank 4
Video ID: wRWdRwAb9DM
Title: my favorite vs code extension
Channel: Bro Code
Similarity Score: 0.6888
------------------------------------------------------------
Rank 5
Video ID: ycsjulONhI4
Title: the pemdas rule in python python nesoacademy quickconcepts
Channel: ProgrammingWithHarry
Similarity Score: 0.6851
------------------------------------------------------------


Enter your search query (or type 'exit' to quit):  exit


üëã Exiting search...
