In [1]:
import pandas as pd
import numpy as np
import faiss
import json
import pickle

# Read the source table; its "text_embedding" column is decoded further down.
csv_path = "youtube_details_with_embeddings.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

# Parse embedding strings into numeric vectors
def parse_embedding(emb_str):
    """Convert one serialized embedding into a float32 NumPy array.

    Accepted inputs:
      * a JSON array string, e.g. ``"[0.1, 0.2]"``;
      * a bracketed or bare list of numbers separated by commas and/or
        whitespace, e.g. ``"0.1, 0.2"`` or the NumPy repr form
        ``"[0.1 0.2\\n 0.3]"``;
      * an already-decoded ``list`` / ``tuple`` / ``numpy.ndarray``.

    Parameters
    ----------
    emb_str : str | bytes | bytearray | list | tuple | numpy.ndarray
        The serialized (or already decoded) embedding.

    Returns
    -------
    numpy.ndarray
        The values as ``float32``.

    Raises
    ------
    TypeError
        If the input is neither text nor a sequence (for example the float
        NaN that pandas uses for a missing cell).
    ValueError
        If the text holds no numbers or a token is not a valid float.
    """
    # Already decoded: nothing to parse, only normalise the dtype.
    if isinstance(emb_str, (list, tuple, np.ndarray)):
        return np.asarray(emb_str, dtype="float32")

    if not isinstance(emb_str, (str, bytes, bytearray)):
        raise TypeError(
            "expected a serialized embedding string, got "
            f"{type(emb_str).__name__}: {emb_str!r}"
        )

    if not isinstance(emb_str, str):
        emb_str = bytes(emb_str).decode("utf-8")

    # Fast path: well-formed JSON. Only parse/convert failures are caught here,
    # so KeyboardInterrupt and unrelated errors still propagate.
    try:
        return np.array(json.loads(emb_str), dtype="float32")
    except (ValueError, TypeError):
        pass

    # Fallback for non-JSON text: treat commas as whitespace so that both
    # "0.1, 0.2" and NumPy's space/newline-separated repr are understood.
    tokens = emb_str.strip().strip("[]").replace(",", " ").split()
    if not tokens:
        raise ValueError(f"no numeric values found in embedding string: {emb_str!r}")
    return np.array([float(tok) for tok in tokens], dtype="float32")

# Decode every serialized embedding into a numeric vector (one per row).
df["embedding"] = df["text_embedding"].apply(parse_embedding)

# The index needs one rectangular (n_vectors, dimension) matrix, so every row
# must hold the same number of values. Check this up front so the error says
# which sizes were found instead of surfacing as a bare stacking failure.
embedding_sizes = df["embedding"].map(np.size)
if embedding_sizes.empty:
    raise ValueError("No embeddings to index: the input table has no rows.")
if embedding_sizes.nunique() != 1:
    raise ValueError(
        "Embeddings have inconsistent sizes (size -> row count): "
        f"{embedding_sizes.value_counts().to_dict()}"
    )

# Stack embeddings into a matrix. vstack already returns a new array, so
# ascontiguousarray only converts if the dtype is not float32 yet (the
# previous unconditional .astype always made a second copy).
embeddings = np.ascontiguousarray(np.vstack(df["embedding"].tolist()), dtype="float32")

# Create FAISS index (flat L2 index over the raw vectors)
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print(f"✅ Added {index.ntotal} embeddings to FAISS index.")

# Store only id, viewCount, and embedding.
# Records are built column-wise instead of with DataFrame.iterrows(), which
# constructs a full Series for every row. Order follows the DataFrame's row
# order, so list position i matches row i.
metadata = [
    {
        "video_id": video_id,
        "view_count": view_count,
        "embedding": vector.tolist(),
    }
    for video_id, view_count, vector in zip(
        df["video_id"].tolist(),
        df["view_count"].tolist(),
        df["embedding"],
    )
]

# Save index and metadata. The file names are bound once so the write calls
# and the confirmation message below cannot drift apart.
index_path = "youtube_faiss.index"
metadata_path = "youtube_metadata.pkl"

faiss.write_index(index, index_path)
with open(metadata_path, "wb") as f:
    pickle.dump(metadata, f)

# The status text previously contained mis-encoded (mojibake) emoji/arrows.
print(f"💾 Saved FAISS index → '{index_path}' and metadata → '{metadata_path}'")


✅ Added 622 embeddings to FAISS index.
💾 Saved FAISS index → 'youtube_faiss.index' and metadata → 'youtube_metadata.pkl'


In [2]:
import faiss
import numpy as np
import pickle

# Load index + metadata
index = faiss.read_index("youtube_faiss.index")

# NOTE(security): pickle.load can execute arbitrary code during
# deserialization. Only load a metadata file that this notebook (or another
# trusted source) produced.
with open("youtube_metadata.pkl", "rb") as f:
    metadata = pickle.load(f)

# Search hits are mapped to metadata by list position, so the two files must
# describe the same number of vectors; a mismatch means they come from
# different runs and lookups would be wrong or out of range.
if index.ntotal != len(metadata):
    raise ValueError(
        f"Index/metadata mismatch: index holds {index.ntotal} vectors "
        f"but metadata has {len(metadata)} records."
    )

# Example query embedding: a random placeholder vector with the index's
# dimensionality. The generator is seeded so that re-running the notebook
# returns the same neighbours and distances.
rng = np.random.default_rng(seed=42)
query_embedding = rng.random(index.d, dtype=np.float32).reshape(1, -1)

# Search top 5 similar videos
k = 5
distances, indices = index.search(query_embedding, k)

# Report each hit for the single query (row 0 of the result arrays).
for rank, (i, distance) in enumerate(zip(indices[0], distances[0]), start=1):
    # FAISS fills unused result slots with label -1 when the index holds fewer
    # than k vectors. metadata[-1] would silently print the LAST record, so
    # such slots are skipped.
    if i < 0:
        continue
    entry = metadata[i]
    print(f"\nRank {rank}")
    print("Video ID:", entry["video_id"])
    print("View Count:", entry["view_count"])
    print("Distance:", distance)
    print("Embedding (first 10 values):", entry["embedding"][:10])  # truncated for readability



Rank 1
Video ID: yl0YWA2K2B0
View Count: 656108.0
Distance: 36.16978
Embedding (first 10 values): [0.22847987711429596, -0.04192284867167473, 0.06950722634792328, 0.0053841229528188705, -0.1229172870516777, 0.0017060406971722841, -0.005340714007616043, 0.03562821447849274, -0.09493828564882278, -0.010290388949215412]

Rank 2
Video ID: v9F6QBaZJ8k
View Count: 128993.0
Distance: 36.30974
Embedding (first 10 values): [0.24970613420009613, -0.062416478991508484, 0.04846638813614845, 0.015476236119866371, 0.4603954255580902, 0.003714188002049923, 0.13365280628204346, 0.028333567082881927, 0.1626475304365158, 0.06328649818897247]

Rank 3
Video ID: qh6bSF133-k
View Count: 28596.0
Distance: 36.322426
Embedding (first 10 values): [0.08376958966255188, -0.017860621213912964, 0.021351883187890053, 0.021200766786932945, 0.049147315323352814, 0.024289481341838837, 0.03461398929357529, -0.006472823675721884, -0.0639449879527092, -0.015422451309859753]

Rank 4
Video ID: K1Yl3WJxZIw
View Count: 24464