In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

## Keyword Search

In [None]:
# # Load the datasets
# brands_category_df = pd.read_csv('./dataset/brand_category.csv')
# categories_df = pd.read_csv('./dataset/categories.csv')
# offer_retailer_df = pd.read_csv('./dataset/offer_retailer.csv')

# # Merge brand_category and categories on 'BRAND_BELONGS_TO_CATEGORY' and 'PRODUCT_CATEGORY'
# # merged_brand_category = brands_category_df.merge(categories_df, left_on='BRAND_BELONGS_TO_CATEGORY', right_on='PRODUCT_CATEGORY', how='left')

# # Merge offer_retailer with brand_category on 'BRAND' (the categories merge above is disabled)
# merged_df = offer_retailer_df.merge(brands_category_df, on='BRAND', how='left')

# # Preprocess the text data and replace NaN values with an empty string
# merged_df = merged_df.fillna('')
# merged_df['TEXT'] = (merged_df['BRAND'] + ' ' + merged_df['BRAND_BELONGS_TO_CATEGORY'] + ' ' + merged_df['RETAILER']).str.lower()

In [None]:
# # Load the datasets
# brand_category_df = pd.read_csv('./dataset/brand_category.csv')
# offer_retailer_df = pd.read_csv('./dataset/offer_retailer.csv')

# # Group brand_category_df by 'BRAND' and collect each brand's 'BRAND_BELONGS_TO_CATEGORY' values into a list
# brand_categories = brand_category_df.groupby('BRAND')['BRAND_BELONGS_TO_CATEGORY'].agg(list).reset_index()
# brand_categories.rename(columns={'BRAND_BELONGS_TO_CATEGORY': 'CATEGORIES'}, inplace=True)

# # Map the aggregated categories to the 'BRAND' column in offer_retailer_df
# merged_df = offer_retailer_df.merge(brand_categories, on='BRAND', how='left')

# # Preprocess the text data and replace NaN values with an empty string
# merged_df = merged_df.fillna('')
# merged_df['TEXT'] = merged_df['TEXT'] = (merged_df['BRAND'] + ' : ' + merged_df['RETAILER'] + ' : ' + merged_df['CATEGORIES'].str.join(', ')).str.lower()


In [None]:
# Read the three raw CSV tables that the search corpus is built from.
offer_retailer_df = pd.read_csv('../data/raw/offer_retailer.csv')
brand_category_df = pd.read_csv('../data/raw/brand_category.csv')
categories_df = pd.read_csv('../data/raw/categories.csv')

# One row per BRAND, with every category that brand belongs to gathered into a
# list stored under the 'CATEGORIES' column.
brand_categories = (
    brand_category_df
    .groupby('BRAND')['BRAND_BELONGS_TO_CATEGORY']
    .agg(list)
    .rename('CATEGORIES')
    .reset_index()
)

# Left join keeps every offer; offers whose brand has no rows in
# brand_categories end up with NaN in 'CATEGORIES'.
merged_df = offer_retailer_df.merge(brand_categories, how='left', on='BRAND')

# Lookup table: product category -> the category it is a child of.
category_mapping = (
    categories_df
    .set_index('PRODUCT_CATEGORY')['IS_CHILD_CATEGORY_TO']
    .to_dict()
)

# Function to get unique 'IS_CHILD_CATEGORY_TO' values for each brand
def get_super_categories(categories, mapping=None):
    """Return the distinct parent ("super") categories for a list of categories.

    Parameters
    ----------
    categories : list or any
        Product-category names to look up. Anything that is not a ``list``
        (for example the NaN a left merge leaves for brands without
        categories) is treated as "no categories".
    mapping : dict, optional
        Category -> parent-category lookup. Defaults to the module-level
        ``category_mapping``.

    Returns
    -------
    list of str or str
        Unique parent categories in first-seen order, or the empty string
        ``''`` when none are found (kept for compatibility with the
        downstream ``.str.join`` call, which maps ``''`` to ``''``).
    """
    if mapping is None:
        mapping = category_mapping
    if not isinstance(categories, list):
        return ''

    # Keep only real, non-empty strings: a missing parent read from CSV is a
    # float NaN, which is truthy and would otherwise leak into the result and
    # make the later ``.str.join`` produce NaN for the whole row.
    # dict.fromkeys de-duplicates while preserving first-seen order, so the
    # output is reproducible across kernel restarts (set order is not).
    parents = dict.fromkeys(
        parent
        for parent in (mapping.get(category) for category in categories)
        if isinstance(parent, str) and parent
    )
    return list(parents) if parents else ''

# Derive each offer's parent ("super") categories from its brand's category
# list, then replace every remaining NaN in the frame with an empty string.
merged_df = merged_df.assign(
    SUPER_CATEGORIES=merged_df['CATEGORIES'].apply(get_super_categories)
).fillna('')

# Pieces of the searchable text; list-valued cells are flattened to "a, b, c".
brand_text = merged_df['BRAND']
retailer_text = merged_df['RETAILER']
categories_text = merged_df['CATEGORIES'].str.join(', ')
super_categories_text = merged_df['SUPER_CATEGORIES'].str.join(', ')

# Lower-cased "brand ; retailer ; categories ; super categories" string per offer.
merged_df['TEXT'] = (
    brand_text + ' ; ' + retailer_text + ' ; ' + categories_text + ' ; ' + super_categories_text
).str.lower()

In [None]:
# Preview the first rows of merged_df (head() defaults to five) to sanity-check
# the merge and the derived SUPER_CATEGORIES / TEXT columns.
merged_df.head()

In [None]:
# Fit TF-IDF on the per-offer text. The fitted vectorizer is reused later to
# project search queries; tfidf_matrix has one row per row of merged_df.
offer_texts = merged_df['TEXT']
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(offer_texts)

In [None]:
# Function to search for offers based on user input with a similarity threshold
def search_offers(user_input, threshold=0.05, dis_threshold=0.3):
    """Rank offers by TF-IDF cosine similarity to a query and cluster the hits.

    The query is lower-cased, projected with the already-fitted
    ``tfidf_vectorizer`` and compared against every row of ``tfidf_matrix``
    (one row per row of ``merged_df``). Offers scoring strictly above
    ``threshold`` are kept and sorted from most to least similar.

    When at least two distinct scores survive, 2-cluster K-Means is run on the
    scores, the clusters are drawn on a scatter plot (score against itself),
    and two numbers are printed: the gap between the lowest score of the
    higher-centred cluster and the highest score of the other cluster,
    followed by the distance between the two cluster centres. Otherwise
    clustering and plotting are skipped and every returned row gets
    ``Cluster == 0``.

    Parameters
    ----------
    user_input : str
        Free-text search query.
    threshold : float, default 0.05
        Minimum (exclusive) cosine similarity for an offer to be returned.
    dis_threshold : float, default 0.3
        Currently unused; retained so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Columns OFFER, RETAILER, BRAND, CATEGORIES, SIMILARITY_SCORE and
        Cluster, sorted by SIMILARITY_SCORE descending. Cluster labels are
        whatever K-Means assigns; they are not ordered by score.
    """
    output_columns = ['OFFER', 'RETAILER', 'BRAND', 'CATEGORIES', 'SIMILARITY_SCORE', 'Cluster']
    n_clusters = 2

    # Project the query into the fitted TF-IDF space and score every offer.
    query_vector = tfidf_vectorizer.transform([user_input.lower()])
    scores = cosine_similarity(query_vector, tfidf_matrix)[0]

    # assign() works on a copy, so merged_df itself is never modified.
    results = merged_df.assign(SIMILARITY_SCORE=scores)
    results = results[results['SIMILARITY_SCORE'] > threshold]
    results = results.sort_values(by='SIMILARITY_SCORE', ascending=False)

    # K-Means needs at least n_clusters samples (it raises otherwise), and with
    # fewer than n_clusters distinct scores one cluster would be empty, which
    # turns the gap computation below into NaN. Skip clustering in both cases.
    if results['SIMILARITY_SCORE'].nunique() < n_clusters:
        print(f'Fewer than {n_clusters} distinct similarity scores above the threshold; skipping clustering.')
        return results.assign(Cluster=0)[output_columns]

    # Split the one-dimensional scores into two groups.
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init='auto')
    results = results.assign(Cluster=kmeans.fit_predict(results[['SIMILARITY_SCORE']]))
    cluster_centers = kmeans.cluster_centers_

    # Scatter plot of each score against itself, one colour per cluster.
    for cluster in range(n_clusters):
        cluster_scores = results.loc[results['Cluster'] == cluster, 'SIMILARITY_SCORE']
        plt.scatter(cluster_scores, cluster_scores, label=f'Cluster {cluster}')
    plt.xlabel('')
    plt.ylabel('')
    plt.title('Similarity Scores Scatter Plot')

    # cluster_centers has shape (2, 1), so the flat argmax is the label of the
    # cluster whose centre is higher.
    in_top_cluster = results['Cluster'] == np.argmax(cluster_centers)
    lowest_point_cluster_higher_center = results.loc[in_top_cluster, 'SIMILARITY_SCORE'].min()
    highest_point_other_cluster = results.loc[~in_top_cluster, 'SIMILARITY_SCORE'].max()

    # Gap between the two clusters, and distance between their centres.
    distance = np.abs(lowest_point_cluster_higher_center - highest_point_other_cluster)
    distance_between_centers = np.linalg.norm(cluster_centers[0] - cluster_centers[1])
    print(distance, distance_between_centers)

    plt.legend()
    plt.show()

    return results[output_columns]


In [None]:
# Demo query. To search interactively instead, replace the literal with
# input("Enter your search query: ").
user_input = "Carbonated Soft Drinks"
results = search_offers(user_input=user_input)
