In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os
import pickle
from tqdm import tqdm



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# -------------------------------
# 1. Load embedding model
# -------------------------------
# Shared sentence-embedding model. Both the index-building and the query code
# in this notebook call `model.encode`, so this cell must run before them.
model = SentenceTransformer("all-MiniLM-L6-v2")

# -------------------------------
# 2. FAISS helper functions
# -------------------------------
def create_faiss_index(dim):
    """Build a new, empty flat FAISS index that compares vectors by L2 distance.

    Args:
        dim: Dimensionality of the embedding vectors the index will hold.

    Returns:
        A freshly constructed ``faiss.IndexFlatL2`` with no vectors added yet.
    """
    flat_l2_index = faiss.IndexFlatL2(dim)
    return flat_l2_index

def save_faiss_index(index, path):
    """Write a FAISS index to disk.

    Args:
        index: The FAISS index to serialise.
        path: Destination file, given as a ``str`` or any ``os.PathLike``
            object (for example a ``pathlib.Path``).
    """
    # FAISS's Python binding expects a plain string file name, so path-like
    # objects are normalised before the call.
    faiss.write_index(index, os.fspath(path))

def load_faiss_index(path):
    """Read a FAISS index back from disk.

    Args:
        path: File previously written with ``faiss.write_index``, given as a
            ``str`` or any ``os.PathLike`` object (for example a
            ``pathlib.Path``).

    Returns:
        The index object returned by ``faiss.read_index``.
    """
    # FAISS's Python binding expects a plain string file name, so path-like
    # objects are normalised before the call.
    return faiss.read_index(os.fspath(path))

def save_texts(texts, path):
    """Pickle ``texts`` into the file at ``path``.

    The file is opened in binary write mode, so an existing file at ``path``
    is overwritten.
    """
    with open(path, "wb") as sink:
        pickle.Pickler(sink).dump(texts)

def load_texts(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: unpickling can execute arbitrary code, so only load files that this
    notebook (or another trusted source) wrote.
    """
    with open(path, "rb") as source:
        stored = pickle.Unpickler(source).load()
    return stored



In [5]:
def build_faiss_from_csv(csv_path, index_path="reviews.index", texts_path="texts.pkl", max_reviews=10000, batch_size=64):
    """
    Read Yelp reviews from a CSV, embed them, and build and persist a FAISS index.

    Each review is turned into one string of the form
    ``"Business: <business_name>. Stars: <stars>. Review: <cleaned_text>"``,
    encoded with the notebook-level ``model``, and added to a new index from
    ``create_faiss_index``. The index and the list of embedded strings are then
    written to ``index_path`` and ``texts_path``.

    Args:
        csv_path: CSV file with ``business_name``, ``stars`` and
            ``cleaned_text`` columns. Malformed rows are skipped.
        index_path: Where the FAISS index is saved.
        texts_path: Where the list of embedded strings is saved.
        max_reviews: Only the first ``max_reviews`` parsed rows are used.
        batch_size: Number of texts passed to ``model.encode`` per call.

    Returns:
        ``(index, texts)`` where ``texts[i]`` is the string whose embedding was
        added to the index at position ``i``.

    Raises:
        ValueError: If ``batch_size`` is not positive, or the CSV yields no rows.
        KeyError: If a required column is missing from the CSV.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # The whole file is parsed first; only then is it cut to the first
    # `max_reviews` rows.
    df = pd.read_csv(
        csv_path,
        quotechar='"',
        escapechar='\\',
        on_bad_lines="skip",
        engine='python',
    ).head(max_reviews)
    print(f"📄 Loaded {len(df)} reviews for embedding.")

    required_columns = ("business_name", "stars", "cleaned_text")
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise KeyError(f"{csv_path} is missing required column(s): {missing}")
    if df.empty:
        # Without this guard the failure would surface later as an opaque
        # "need at least one array" error from np.vstack.
        raise ValueError(f"No reviews found in {csv_path}; nothing to index.")

    # Combine business + stars + text for embeddings.
    # NOTE: astype(str) renders missing values as the literal text "nan".
    embedding_text = (
        "Business: " + df["business_name"].astype(str) +
        ". Stars: " + df["stars"].astype(str) +
        ". Review: " + df["cleaned_text"].astype(str)
    )
    texts = embedding_text.tolist()

    # Encode in batches so progress can be reported per batch.
    vectors = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]
        emb = model.encode(batch, batch_size=len(batch), show_progress_bar=False, convert_to_numpy=True)
        vectors.append(emb)

    vectors = np.vstack(vectors).astype("float32")

    # Build FAISS index; row i of `vectors` corresponds to texts[i].
    index = create_faiss_index(vectors.shape[1])
    index.add(vectors)

    # Save index and texts
    save_faiss_index(index, index_path)
    save_texts(texts, texts_path)
    print(f"✅ FAISS index built and saved at {index_path} with {len(texts)} reviews.")

    return index, texts

# -------------------------------
# 4. Query the index
# -------------------------------
def query_index(query, index, texts, top_k=5):
    """Return the stored texts closest to ``query``.

    Args:
        query: Free-text query; it is embedded with the notebook-level ``model``.
        index: Index to search; must provide ``search(vectors, k)`` returning
            ``(distances, ids)`` as FAISS indexes do.
        texts: Sequence where ``texts[i]`` is the text for index position ``i``.
        top_k: Maximum number of hits to return.

    Returns:
        A list of ``(text, distance)`` tuples in the order the index returned
        them. The list is shorter than ``top_k`` when the index holds fewer
        than ``top_k`` vectors.
    """
    query_vec = model.encode([query], convert_to_numpy=True).astype("float32")
    distances, ids = index.search(query_vec, top_k)
    # FAISS pads the result with id -1 when it cannot find top_k neighbours.
    # Those slots are dropped; indexing texts[-1] would otherwise silently
    # return the last review as a bogus hit.
    return [
        (texts[i], float(dist))
        for dist, i in zip(distances[0], ids[0])
        if i >= 0
    ]

# -------------------------------
# 5. Run the pipeline
# -------------------------------
if __name__ == "__main__":
    # The default is a machine-specific location; set the YELP_REVIEWS_CSV
    # environment variable to point at the CSV on another machine.
    CSV_PATH = os.environ.get(
        "YELP_REVIEWS_CSV",
        "D:/pycharm/yelp-genai-poc/data/processed/final_reviews_normalised.csv",
    )
    INDEX_PATH = "reviews.index"
    TEXTS_PATH = "texts.pkl"

    index = texts = None
    if os.path.exists(INDEX_PATH) and os.path.exists(TEXTS_PATH):
        print("🔄 Loading existing FAISS index and texts...")
        index = load_faiss_index(INDEX_PATH)
        texts = load_texts(TEXTS_PATH)
        # The two cache files are written separately, so make sure they still
        # describe the same set of reviews before trusting them.
        if index.ntotal != len(texts):
            print("⚠️ Cached index and texts are out of sync; rebuilding...")
            index = texts = None

    if index is None:
        index, texts = build_faiss_from_csv(CSV_PATH, INDEX_PATH, TEXTS_PATH, max_reviews=2000)

    # Example query
    query = "bad customer service"
    results = query_index(query, index, texts, top_k=3)

    # Each hit is shown as "- (distance) first 200 characters of the text...".
    print("\n🔍 Query Results:")
    for text, dist in results:
        print(f"- ({dist:.4f}) {text[:200]}...")

📄 Loaded 2000 reviews for embedding.


100%|██████████| 32/32 [01:12<00:00,  2.26s/it]


✅ FAISS index built and saved at reviews.index with 2000 reviews.

🔍 Query Results:
- (0.8401) Business: Los Agaves. Stars: 1.0. Review: I'd like to share multiple occasion issues. I'm very  unsatisfied customer, I consider myself customer because I do love the food but customer service i have ...
- (1.0304) Business: Till Five Pizza. Stars: 1.0. Review: Absolutely HORRIBLE. Worst place ever. The service is terrible. I placed an order for delivery and it never got delivered (and this was two hours later.....
- (1.0468) Business: Costco. Stars: 5.0. Review: Came to Costco to pick up an order I placed online. The employees handling the pick ups were extremely busy but the lines didn't seem too long. There was a rather...


In [6]:
# Second example query. Reuses `index` and `texts` produced by the pipeline
# cell above, so that cell must have been run first.
query = "top restaurants for food quality"
results = query_index(query, index, texts, top_k=3)
# Each hit is shown as "- (distance) first 200 characters of the text...".
print("\n🔍 Query Results:")
for text, dist in results:
    print(f"- ({dist:.4f}) {text[:200]}...")


🔍 Query Results:
- (0.7154) Business: Chive Cafe. Stars: 5.0. Review: Simply great place.  Tried it for 3 times and never disappointed.  People who run it they really care very consistent quality and tasty foods.  The only thing...
- (0.7595) Business: Stonehouse Restaurant. Stars: 5.0. Review: I recently had lunch with my fiancé at Stonehouse, and our waiter Crispin went above and beyond to make this a memorable experience. His food recom...
- (0.7775) Business: Alessia Patisserie & Cafe. Stars: 5.0. Review: There are not enough words to describe how amazing this place is. The food, pastries, ambiance and customer service are simply superb. I don't ...
