In [1]:
# Import Libraries and Define Preprocessing Functions
import pandas as pd
import numpy as np
import faiss
import os
import pickle
from tqdm import tqdm
import re  # For regular expressions
from sentence_transformers import SentenceTransformer

# Checkpoint name passed to SentenceTransformer; kept in one place so it is easy to swap
EMBEDDING_MODEL_NAME = 'all-mpnet-base-v2'

# Shared embedding model used by every later cell that calls model.encode(...)
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

def preprocess_text(text):
    """Return ``text`` converted to lowercase.

    The same normalization is applied to product texts before embedding and to
    queries before retrieval, so both sides are compared in the same form.
    """
    return text.lower()


  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load and Clean Data
# Read the raw product table
df = pd.read_csv('train.csv')

# Keep only the three text columns, drop every row where any of them is
# missing, and renumber the surviving rows from 0
text_columns = ['TITLE', 'BULLET_POINTS', 'DESCRIPTION']
df_cleaned = df[text_columns].dropna().reset_index(drop=True)

# Report how many products survived cleaning
print(f"Cleaned dataset contains {len(df_cleaned)} products.")


Cleaned dataset contains 658 products.


In [3]:
# Compute Embeddings and Build Knowledge Base
# On-disk cache locations for the embedding matrix and its row-aligned metadata
embeddings_file = 'product_embeddings.npy'
metadata_file = 'product_metadata.pkl'

embeddings = None
metadata = None

if os.path.exists(embeddings_file) and os.path.exists(metadata_file):
    # Load embeddings and metadata from disk
    print("Loading embeddings and metadata from disk...")
    embeddings = np.load(embeddings_file)
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # metadata file that this notebook itself produced.
    with open(metadata_file, 'rb') as f:
        metadata = pickle.load(f)

    # Row i of `embeddings` must describe metadata[i]. If the two cache files
    # disagree in length they were not written together, and using them would
    # pair search hits with the wrong products, so fall back to recomputing.
    if len(embeddings) != len(metadata):
        print("Cached embeddings and metadata are inconsistent; recomputing.")
        embeddings = None
        metadata = None

if embeddings is None:
    print("Computing embeddings... This may take a while.")

    # Build the text to embed and the matching metadata record in a single
    # pass so the two lists are aligned by construction
    texts = []
    metadata = []
    for _, row in tqdm(df_cleaned.iterrows(), total=df_cleaned.shape[0]):
        texts.append(
            preprocess_text(f"{row['TITLE']} {row['BULLET_POINTS']} {row['DESCRIPTION']}")
        )
        metadata.append(
            {
                'title': row['TITLE'],
                'bullet_points': row['BULLET_POINTS'],
                'description': row['DESCRIPTION']
            }
        )

    # Encode texts in batches to improve performance
    embeddings = model.encode(texts, batch_size=64, show_progress_bar=True)

    # Convert embeddings to numpy array of type float32
    embeddings = np.array(embeddings).astype('float32')

    # Save embeddings and metadata to disk for future use
    np.save(embeddings_file, embeddings)
    with open(metadata_file, 'wb') as f:
        pickle.dump(metadata, f)


Loading embeddings and metadata from disk...


In [4]:
# Build and Save FAISS Index
# On-disk cache location for the FAISS index
index_file = 'faiss_index.index'

index = None

if os.path.exists(index_file):
    # Load the FAISS index from disk
    print("Loading FAISS index from disk...")
    index = faiss.read_index(index_file)

    # A cached index built from a different embedding matrix would return row
    # ids that do not line up with `metadata`; detect that and rebuild.
    if index.ntotal != embeddings.shape[0] or index.d != embeddings.shape[1]:
        print("Cached FAISS index does not match the embeddings; rebuilding.")
        index = None

if index is None:
    # Build an approximate FAISS index suitable for large datasets
    print("Building FAISS index...")
    dimension = embeddings.shape[1]

    # Use the IndexHNSWFlat for efficient approximate nearest neighbor search
    index = faiss.IndexHNSWFlat(dimension, 32)  # 32 is the number of neighbors in the graph

    # Normalize a copy: faiss.normalize_L2 works in place, and mutating the
    # shared `embeddings` array only on this branch would leave it in a
    # different state depending on whether the index was cached.
    normalized_embeddings = np.array(embeddings, dtype='float32', order='C')
    faiss.normalize_L2(normalized_embeddings)

    # Add embeddings to the index
    index.add(normalized_embeddings)

    # Save the index to disk
    faiss.write_index(index, index_file)


Loading FAISS index from disk...


In [None]:
# Build FAISS index
# Forced rebuild: unlike a cached load, this always constructs a fresh index
# from the in-memory `embeddings` and overwrites whatever is stored at
# `index_file`.
dimension = embeddings.shape[1]
index = faiss.IndexHNSWFlat(dimension, 32)

# faiss.normalize_L2 normalizes in place, so work on a copy to leave the
# shared `embeddings` array untouched for any other cell that reads it.
normalized_embeddings = np.array(embeddings, dtype='float32', order='C')
faiss.normalize_L2(normalized_embeddings)
index.add(normalized_embeddings)

# Save the index to disk
faiss.write_index(index, index_file)

In [None]:

def retrieve_product_info(query_title, index, metadata, top_k=5):
    """Return up to ``top_k`` metadata records that best match ``query_title``.

    Retrieval runs in two stages:

    1. The lowercased query is embedded, L2-normalized and used to fetch
       ``top_k * 5`` candidates from ``index``.
    2. Candidates are re-ranked by the number of whitespace-separated tokens
       their lowercased title shares with the query. Ties keep the order in
       which the index returned them.

    Args:
        query_title: Free-text query string.
        index: FAISS index whose row ids are positions in ``metadata``.
        metadata: Sequence of dicts, each with at least a ``'title'`` key.
        top_k: Maximum number of records to return.

    Returns:
        A list of at most ``top_k`` metadata dicts, best match first. The list
        is shorter when the index yields fewer valid candidates, and empty
        when ``top_k`` is not positive.
    """
    if top_k <= 0:
        return []

    query_title_normalized = preprocess_text(query_title)
    query_embedding = model.encode([query_title_normalized], convert_to_numpy=True)
    query_embedding = np.array(query_embedding).astype('float32')
    faiss.normalize_L2(query_embedding)

    # Over-fetch so the token-overlap re-ranking has more candidates to choose from
    _, indices = index.search(query_embedding, top_k * 5)

    # FAISS pads the result with -1 when it cannot find enough neighbours.
    # Those slots must be dropped: metadata[-1] would otherwise silently
    # return the last product as if it were a real hit.
    candidates = [metadata[idx] for idx in indices[0] if idx >= 0]

    query_tokens = set(query_title_normalized.split())

    def _title_overlap(product_info):
        # Number of distinct query tokens that also appear in the product title
        title_tokens = set(preprocess_text(product_info['title']).split())
        return len(query_tokens & title_tokens)

    # sorted() is stable, so equally scored candidates stay in index order
    ranked = sorted(candidates, key=_title_overlap, reverse=True)
    return ranked[:top_k]


In [6]:
# Test the Retrieval Function
# Sample queries used to eyeball retrieval quality
test_titles = [
    "electric air horn compressor",
    "wireless bluetooth earbuds",
    "samsung galaxy watch",
    "nike men's running shoes",
    "stainless steel water bottle",
    "portable power bank charger",
    "yoga mat for home workouts",
    "adjustable dumbbell set",
    "sony noise-canceling headphones",
    "hiking backpack for outdoor adventures",
    "non-stick frying pan",
    "baby stroller with adjustable seat",
    "robotic vacuum cleaner"
]


def _print_result(i, result):
    """Print one retrieved product under a 1-based "Result N" heading."""
    print(f"\nResult {i+1}:")
    print(f"Title: {result['title']}")
    print(f"Bullet Points: {result['bullet_points']}")
    print(f"Description: {result['description']}")


# Run every query, show its top three hits, then a separator line
for title in test_titles:
    print(f"Query Title: {title}")
    results = retrieve_product_info(title, index, metadata, top_k=3)
    for i, result in enumerate(results):
        _print_result(i, result)
    print("=" * 50)


Query Title: electric air horn compressor

Result 1:
Title: PRIKNIK Horn Red Electric Air Horn Compressor Interior Dual Tone Trumpet Loud Compatible with SP Concept
Bullet Points: [Loud Dual Tone Trumpet Horn, Compatible With SP Concept,Electric Air Horn with Compressor, Compatible With SP Concept,High Quality, High Intensity Horn Sound, Compatible With SP Concept,Color: Red, Compatible With SP Concept,Professional installation required, Compatible With SP Concept]
Description: Specifications: Color: Red, Material: Aluminium, Voltage: 12V, dB: 130 dB (around), Material: Aluminum Pump Head + Steel Pump Body + ABS Shell and Parts DB output: 130db Voltage: 12v Sound Type: Dual Tone Application: 12V Voltage Vehicles With Battery Above 20A Package included: 1 x Dual Tone Air Horn Compatible With SP Concept

Result 2:
Title: PRIKNIK Horn Red Electric Air Horn Compressor Interior Dual Tone Trumpet Loud Compatible with SX4
Bullet Points: [Loud Dual Tone Trumpet Horn, Compatible With SX4,Electr