In [None]:
pip install faiss-cpu



In [None]:
import faiss
import numpy as np
import pickle
import re
import pandas as pd

In [None]:
# Read the CSV of videos with precomputed embeddings into a DataFrame
csv_path = "/content/querytube_with_embeddings.csv"
df = pd.read_csv(csv_path)

In [None]:
# Discard rows whose embedding cell is missing, then renumber the index from 0
valid_df = df.dropna(subset=["embedding"]).reset_index(drop=True)
print(f"✅ Valid embeddings found for {len(valid_df)} videos.")


✅ Valid embeddings found for 606 videos.


In [None]:
def parse_embedding_fixed(x):
    """Convert one stringified embedding into a 1-D float32 NumPy array.

    Parameters
    ----------
    x : str | numpy.ndarray | object
        Text such as ``"[-0.11  0.07\\n  0.09]"``; numbers may be separated by
        whitespace and/or commas and wrapped in square brackets. An ndarray is
        accepted as already parsed. Anything else (e.g. the float NaN pandas
        uses for an empty CSV cell) is treated as missing.

    Returns
    -------
    numpy.ndarray | None
        A float32 vector, or ``None`` when the value is missing, contains no
        numbers, or contains a token that is not a valid float.
    """
    # Already-parsed vectors pass through, so re-running the parsing cell on a
    # column that was parsed earlier does not wipe it out.
    if isinstance(x, np.ndarray):
        return x.astype("float32", copy=False)

    # Empty CSV cells arrive as float NaN rather than str; report them as
    # missing instead of relying on an AttributeError from .strip().
    if not isinstance(x, str):
        return None

    # Brackets, commas and any whitespace (including newlines) all act as
    # separators; empty fragments at the edges are dropped.
    tokens = [tok for tok in re.split(r"[\s,\[\]]+", x) if tok]
    if not tokens:
        # No numbers at all: an empty array would break stacking later on.
        return None

    try:
        return np.array([float(tok) for tok in tokens], dtype="float32")
    except ValueError as e:
        # A malformed token invalidates the whole vector rather than silently
        # producing a shorter one.
        print("Error:", e)
        return None

# Replace each row's raw embedding value with its parsed vector (or None).
df["embedding"] = df["embedding"].apply(parse_embedding_fixed)

# Rows that parsed successfully; used for both the count and the sample.
parsed_embeddings = df.loc[df["embedding"].notnull(), "embedding"]
print(f"✅ Parsed {len(parsed_embeddings)} valid embeddings out of {len(df)} rows.")
# Sample from the first *valid* row: row 0 itself may be None, and slicing
# None would raise a TypeError.
if len(parsed_embeddings) > 0:
    print("Sample:", parsed_embeddings.iloc[0][:10])

Error: 'float' object has no attribute 'strip'
Error: 'float' object has no attribute 'strip'
✅ Parsed 606 valid embeddings out of 608 rows.
Sample: [-0.11733995  0.07676633  0.09278427  0.04812029 -0.03009335 -0.10167918
 -0.10489553  0.03499562  0.03447094  0.01400307]


In [None]:
# Keep only the rows whose embedding is non-null after parsing, re-indexed from 0
valid_df = df.loc[df["embedding"].notna()].reset_index(drop=True)


In [None]:
# Build a 2D float32 array with one row per embedding vector in valid_df
embedding_rows = valid_df["embedding"].to_list()
embedding_matrix = np.vstack(embedding_rows).astype(np.float32)

In [None]:
# Sanity-check the dimensions of the stacked matrix
print("Embedding matrix shape:", embedding_matrix.shape)

Embedding matrix shape: (606, 384)


In [None]:
# Build an inner-product FAISS index sized to the embedding width; with
# L2-normalized vectors the inner product serves as cosine similarity
dimension = embedding_matrix.shape[-1]
index = faiss.IndexFlatIP(dimension)

In [None]:
# Normalize for cosine similarity: with unit-length rows, the inner product
# computed by the index corresponds to cosine similarity.
# The return value is not used and the same matrix is passed to index.add
# afterwards, so this relies on normalize_L2 modifying embedding_matrix in place.
faiss.normalize_L2(embedding_matrix)

In [None]:
# Add embeddings to index.
# reset() empties the index first, so re-running this cell does not append the
# same vectors a second time and push index positions out of step with valid_df.
index.reset()
index.add(embedding_matrix)
# Row i of the index must correspond to row i of valid_df (saved as metadata).
assert index.ntotal == len(valid_df), "index size must match the number of metadata rows"
print(f"✅ FAISS index built with {index.ntotal} vectors.")


✅ FAISS index built with 606 vectors.


In [None]:
# Persist the populated FAISS index to disk
index_path = "final_embeddings.index"
faiss.write_index(index, index_path)

In [None]:
# Pickle the matching metadata: one dict per valid_df row, in the same row
# order that was used to build the embedding matrix
metadata_records = valid_df.to_dict(orient="records")
with open("final_metadata.pkl", "wb") as metadata_file:
    pickle.dump(metadata_records, metadata_file)