## Keyword Search

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the three raw tables (paths are relative to the notebook's directory).
brand_category_df = pd.read_csv('../data/raw/brand_category.csv')
offer_retailer_df = pd.read_csv('../data/raw/offer_retailer.csv')
categories_df = pd.read_csv('../data/raw/categories.csv')

# One row per brand, with every category the brand belongs to gathered into a list.
brand_categories = (
    brand_category_df
    .groupby('BRAND')['BRAND_BELONGS_TO_CATEGORY']
    .agg(list)
    .reset_index()
    .rename(columns={'BRAND_BELONGS_TO_CATEGORY': 'CATEGORIES'})
)

# Left join keeps every offer row; brands without category rows get NaN in CATEGORIES.
merged_df = pd.merge(offer_retailer_df, brand_categories, on='BRAND', how='left')

# Lookup table: product category -> the category it is a child of.
category_mapping = (
    categories_df
    .set_index('PRODUCT_CATEGORY')['IS_CHILD_CATEGORY_TO']
    .to_dict()
)

def get_super_categories(categories):
    """Return the parent categories of the given product categories.

    Each category is looked up in the module-level ``category_mapping``
    (product category -> category it is a child of).

    Parameters
    ----------
    categories : list or any
        List of category names. Any non-list value (for example the NaN a
        left join produces for a brand without categories) is treated as
        "no categories".

    Returns
    -------
    list of str or str
        Sorted, de-duplicated parent category names, or ``''`` when there
        are none.
    """
    if not isinstance(categories, list):
        return ''

    super_categories = set()
    for category in categories:
        super_category = category_mapping.get(category)
        # Only keep real names. A missing parent read from CSV is a float NaN,
        # which is truthy; a NaN inside the list makes `.str.join` return NaN
        # for the whole row.
        if isinstance(super_category, str) and super_category:
            super_categories.add(super_category)

    # Sort so the output does not depend on set iteration order, which can
    # change between kernel sessions (string hash randomization).
    return sorted(super_categories) if super_categories else ''

# Derive the parent categories for each offer's category list.
merged_df['SUPER_CATEGORIES'] = merged_df['CATEGORIES'].apply(get_super_categories)

# Replace missing values with '' so the string concatenation below never yields NaN.
merged_df = merged_df.fillna('')

# One lowercase document per offer: "brand ; retailer ; categories ; super categories".
# (The original assigned to merged_df['TEXT'] twice in one chained statement; once is enough.)
merged_df['TEXT'] = (
    merged_df['BRAND'] + ' ; '
    + merged_df['RETAILER'] + ' ; '
    + merged_df['CATEGORIES'].str.join(', ') + ' ; '
    + merged_df['SUPER_CATEGORIES'].str.join(', ')
).str.lower()

In [None]:
# Preview the first rows to sanity-check the merged columns and the TEXT field.
merged_df.head()

In [None]:
# Fit TF-IDF on the per-offer TEXT documents. The fitted vectorizer is kept so
# that queries can later be transformed with the same vocabulary.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(merged_df['TEXT'])

In [None]:
def search_offers(user_input, threshold=0.05, dis_threshold=0.3):
    """Rank offers by TF-IDF cosine similarity to a free-text query.

    The query is lowercased, vectorized with the module-level
    ``tfidf_vectorizer`` and compared against ``tfidf_matrix``. Offers scoring
    above ``threshold`` are kept, sorted best-first, split into two K-Means
    clusters on their score, and plotted. The gap between the two clusters
    and the distance between their centers are printed.

    NOTE(review): a later cell in this notebook defines another
    ``search_offers``; once that cell runs it shadows this function.

    Parameters
    ----------
    user_input : str
        Search text.
    threshold : float, default 0.05
        Offers whose similarity score is not strictly greater than this
        value are dropped.
    dis_threshold : float, default 0.3
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns OFFER, RETAILER, BRAND, CATEGORIES, SIMILARITY_SCORE and
        Cluster, ordered by descending score. When fewer than two offers
        pass the threshold, clustering is skipped and Cluster is 0 for every
        returned row.
    """
    n_clusters = 2
    output_columns = ['OFFER', 'RETAILER', 'BRAND', 'CATEGORIES', 'SIMILARITY_SCORE', 'Cluster']

    user_vector = tfidf_vectorizer.transform([user_input.lower()])
    scores = cosine_similarity(user_vector, tfidf_matrix)

    results = merged_df.copy()
    results['SIMILARITY_SCORE'] = scores[0]
    results = results[results['SIMILARITY_SCORE'] > threshold]
    results = results.sort_values(by='SIMILARITY_SCORE', ascending=False)

    # Scores are plotted against themselves, so all points lie on the diagonal.
    plt.scatter(results['SIMILARITY_SCORE'], results['SIMILARITY_SCORE'])
    plt.xlabel('')
    plt.ylabel('')
    plt.title('Similarity Scores Scatter Plot')

    # K-Means needs at least as many samples as clusters. A query with no
    # (or a single) match used to raise inside `fit_predict`.
    if len(results) < n_clusters:
        results['Cluster'] = 0
        plt.show()
        return results[output_columns]

    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init='auto')
    results['Cluster'] = kmeans.fit_predict(results[['SIMILARITY_SCORE']])
    cluster_centers = kmeans.cluster_centers_

    for cluster in range(n_clusters):
        cluster_data = results[results['Cluster'] == cluster]
        plt.scatter(cluster_data['SIMILARITY_SCORE'], cluster_data['SIMILARITY_SCORE'], label=f'Cluster {cluster}')

    # Cluster whose center has the higher score.
    top_cluster = np.argmax(cluster_centers)
    in_top_cluster = results['Cluster'] == top_cluster

    lowest_point_cluster_higher_center = results.loc[in_top_cluster, 'SIMILARITY_SCORE'].min()
    highest_point_other_cluster = results.loc[~in_top_cluster, 'SIMILARITY_SCORE'].max()

    # Gap between the clusters' nearest members, and distance between centers.
    distance = np.abs(lowest_point_cluster_higher_center - highest_point_other_cluster)
    distance_between_centers = np.linalg.norm(cluster_centers[0] - cluster_centers[1])
    print(distance, distance_between_centers)

    plt.legend()
    plt.show()

    return results[output_columns]


In [None]:
# Example query against the TF-IDF search defined above.
user_input = "Carbonated Soft Drinks"
results = search_offers(user_input)


## Semantic Search

In [None]:
import torch
import faiss
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer

In [None]:
# Load the three raw tables (paths are relative to the notebook's directory).
brand_category_df = pd.read_csv('../data/raw/brand_category.csv')
offer_retailer_df = pd.read_csv('../data/raw/offer_retailer.csv')
categories_df = pd.read_csv('../data/raw/categories.csv')

# One row per brand, with every category the brand belongs to gathered into a list.
brand_categories = (
    brand_category_df
    .groupby('BRAND')['BRAND_BELONGS_TO_CATEGORY']
    .agg(list)
    .reset_index()
    .rename(columns={'BRAND_BELONGS_TO_CATEGORY': 'CATEGORIES'})
)

# Left join keeps every offer row; brands without category rows get NaN in CATEGORIES.
merged_df = pd.merge(offer_retailer_df, brand_categories, on='BRAND', how='left')

# Lookup table: product category -> the category it is a child of.
category_mapping = (
    categories_df
    .set_index('PRODUCT_CATEGORY')['IS_CHILD_CATEGORY_TO']
    .to_dict()
)

def get_super_categories(categories):
    """Return the parent categories of the given product categories.

    Each category is looked up in the module-level ``category_mapping``
    (product category -> category it is a child of).

    Parameters
    ----------
    categories : list or any
        List of category names. Any non-list value (for example the NaN a
        left join produces for a brand without categories) is treated as
        "no categories".

    Returns
    -------
    list of str or str
        Sorted, de-duplicated parent category names, or ``''`` when there
        are none.
    """
    if not isinstance(categories, list):
        return ''

    super_categories = set()
    for category in categories:
        super_category = category_mapping.get(category)
        # Only keep real names. A missing parent read from CSV is a float NaN,
        # which is truthy; a NaN inside the list makes `.str.join` return NaN
        # for the whole row.
        if isinstance(super_category, str) and super_category:
            super_categories.add(super_category)

    # Sort so the output does not depend on set iteration order, which can
    # change between kernel sessions (string hash randomization).
    return sorted(super_categories) if super_categories else ''

# Derive the parent categories for each offer's category list.
merged_df['SUPER_CATEGORIES'] = merged_df['CATEGORIES'].apply(get_super_categories)

# Replace missing values with '' so the string concatenation below never yields NaN.
merged_df = merged_df.fillna('')

# One lowercase document per offer: "brand ; retailer ; categories ; super categories".
# (The original assigned to merged_df['TEXT'] twice in one chained statement; once is enough.)
merged_df['TEXT'] = (
    merged_df['BRAND'] + ' ; '
    + merged_df['RETAILER'] + ' ; '
    + merged_df['CATEGORIES'].str.join(', ') + ' ; '
    + merged_df['SUPER_CATEGORIES'].str.join(', ')
).str.lower()

In [None]:
# Show one generated document (the row labelled 1) to check the TEXT format.
merged_df.TEXT[1]

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Candidate embedding models; the entry at index 1 is the one loaded below.
models = [
    'BAAI/bge-base-en-v1.5',
    'all-distilroberta-v1',
    'thenlper/gte-base',
]
model = SentenceTransformer(models[1])
model.to(DEVICE)

### EXP (experiment): embedding one text locally and via the Hugging Face Inference API

In [None]:
import requests

import os

API_URL = "https://api-inference.huggingface.co/models/BAAI/bge-base-en-v1.5"

# SECURITY: never commit API tokens. The token is read from the HF_API_TOKEN
# environment variable instead of being hardcoded here. The token that was
# previously committed in this cell must be treated as leaked and revoked.
_hf_api_token = os.environ.get("HF_API_TOKEN", "")

# When no token is configured, no Authorization header is sent at all rather
# than a malformed "Bearer " value.
headers = {"Authorization": f"Bearer {_hf_api_token}"} if _hf_api_token else {}

def query(text):
    """POST ``text`` to the hosted model at ``API_URL`` and return the decoded JSON.

    Parameters
    ----------
    text
        Value sent as the ``inputs`` field of the JSON payload.

    Returns
    -------
    The JSON-decoded response body.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    requests.Timeout
        If the API does not respond within the timeout.
    """
    payload = {"inputs": text}
    # A timeout keeps a stalled connection from hanging the kernel indefinitely.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
    # Fail loudly on HTTP errors instead of handing an error payload to callers
    # that expect an embedding.
    response.raise_for_status()
    return response.json()

In [None]:
# Embed a single document (the row labelled 1) with the locally loaded model.
emb = model.encode(merged_df.TEXT[1])

In [None]:
# Embed the same document through the hosted API.
emb_q = query(merged_df.TEXT[1])
# Wrap the response in a list (adds a leading axis) and cast to float32.
# NOTE(review): assumes the API returned a flat list of floats — TODO confirm;
# any other payload (e.g. an error dict) would fail at this cast.
emb_q = np.array([emb_q]).astype("float32")

In [None]:
# Same shaping for the local embedding: add a leading axis and cast to float32.
emb_a = np.array([emb]).astype("float32")

### FAISS index

In [None]:
# Embed every offer document. `encode` is documented to take a string or a list
# of strings, so pass a plain list instead of a pandas Series (which only works
# through label lookup and relies on the index being 0..n-1).
text_embeddings = model.encode(merged_df['TEXT'].tolist())

In [None]:
# GPU resources are only needed by the (currently disabled) GPU index below.
# CPU-only FAISS builds do not expose StandardGpuResources, so guard the call
# to keep this cell runnable there; `res` is None in that case.
res = faiss.StandardGpuResources() if hasattr(faiss, 'StandardGpuResources') else None

# Exact (brute-force) inner-product index sized to the embedding dimension.
# IndexFlatIP replaces creating an IndexFlatL2 and overwriting its metric_type.
index = faiss.IndexFlatIP(text_embeddings.shape[1])
# index = faiss.IndexIDMap(index)

# gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)

In [None]:
# index.add_with_ids(text_embeddings, merged_df.index.values)
# Add all embeddings. Without an ID map, FAISS assigns sequential ids in
# insertion order, i.e. the row position in `text_embeddings`.
index.add(text_embeddings)
# Sanity check: training state and number of stored vectors.
print(index.is_trained, index.ntotal)

In [None]:
import os

index_path = './stores/bge_embedding.index'
# write_index does not create missing directories, so make sure the target
# folder exists before saving.
os.makedirs(os.path.dirname(index_path), exist_ok=True)
# NOTE(review): the file name says "bge", but the model loaded in this notebook
# is models[1] ('all-distilroberta-v1') — confirm the name is intended.
faiss.write_index(index, index_path)

In [None]:
# Smoke test: embed a retailer name and fetch its 30 nearest offers.
query_vector = model.encode("Walmart".lower())
# The query is reshaped to a single-row 2-D array; D (scores) and I (ids) come
# back as 2-D arrays with one row per query.
D, I = index.search(query_vector.reshape(1, -1), 30)
# Pair each score with its id for the single query (row 0). Zipping the 2-D
# arrays directly produced one (scores_row, ids_row) tuple instead.
print(list(zip(D[0], I[0])))

In [None]:
def search_offers(user_input, threshold=0.05, dis_threshold=0.3, k=100):
    """Rank offers by embedding similarity to a free-text query.

    The query is lowercased, embedded with the module-level ``model`` and
    searched in the module-level FAISS ``index``. The hits are mapped back to
    rows of ``merged_df``, filtered by ``threshold``, split into two K-Means
    clusters on their score, and plotted. The gap between the two clusters
    and the distance between their centers are printed.

    Parameters
    ----------
    user_input : str
        Search text.
    threshold : float, default 0.05
        Hits whose score is not strictly greater than this value are dropped.
    dis_threshold : float, default 0.3
        Currently unused; kept so existing calls keep working.
    k : int, default 100
        Number of nearest neighbours requested from the index.

    Returns
    -------
    pandas.DataFrame
        Columns OFFER, RETAILER, BRAND, CATEGORIES, SIMILARITY_SCORE and
        Cluster, in the order returned by the index. When fewer than two hits
        pass the threshold, clustering is skipped and Cluster is 0 for every
        returned row.
    """
    n_clusters = 2
    output_columns = ["OFFER", "RETAILER", "BRAND", "CATEGORIES", "SIMILARITY_SCORE", "Cluster"]

    user_vector = model.encode(user_input.lower()).reshape(1, -1)
    scores, indices = index.search(user_vector, k)

    # FAISS pads the result with id -1 when fewer than k vectors are stored;
    # drop those slots instead of looking them up.
    found = indices[0] >= 0

    # Ids are insertion positions in the index, so select rows by position.
    # `.copy()` gives an independent frame that is safe to add columns to.
    results = merged_df.iloc[indices[0][found]].copy()
    results["SIMILARITY_SCORE"] = scores[0][found]
    results = results[results["SIMILARITY_SCORE"] > threshold].copy()
    # results = results.sort_values(by='SIMILARITY_SCORE', ascending=False)

    # Scores are plotted against themselves, so all points lie on the diagonal.
    plt.scatter(results["SIMILARITY_SCORE"], results["SIMILARITY_SCORE"])
    plt.xlabel("")
    plt.ylabel("")
    plt.title("Similarity Scores Scatter Plot")

    # K-Means needs at least as many samples as clusters. A query with no
    # (or a single) hit used to raise inside `fit_predict`.
    if len(results) < n_clusters:
        results["Cluster"] = 0
        plt.show()
        return results[output_columns]

    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init="auto")
    results["Cluster"] = kmeans.fit_predict(results[["SIMILARITY_SCORE"]])
    cluster_centers = kmeans.cluster_centers_

    for cluster in range(n_clusters):
        cluster_data = results[results["Cluster"] == cluster]
        plt.scatter(
            cluster_data["SIMILARITY_SCORE"],
            cluster_data["SIMILARITY_SCORE"],
            label=f"Cluster {cluster}",
        )

    # Cluster whose center has the higher score.
    top_cluster = np.argmax(cluster_centers)
    in_top_cluster = results["Cluster"] == top_cluster

    lowest_point_cluster_higher_center = results.loc[in_top_cluster, "SIMILARITY_SCORE"].min()
    highest_point_other_cluster = results.loc[~in_top_cluster, "SIMILARITY_SCORE"].max()

    # Gap between the clusters' nearest members, and distance between centers.
    distance = np.abs(lowest_point_cluster_higher_center - highest_point_other_cluster)
    distance_between_centers = np.linalg.norm(cluster_centers[0] - cluster_centers[1])
    print(distance, distance_between_centers)

    plt.legend()
    plt.show()

    return results[output_columns]

In [None]:
# Example query; "walmartz" is presumably a deliberately misspelled retailer
# name used to see how the embedding search copes with typos — confirm.
user_input = "walmartz"
search_results = search_offers(user_input)