In [None]:
# !pip install nltk streamlit sentencepiece textblob

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from textblob import TextBlob
from bs4 import BeautifulSoup


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import streamlit as st
import torch
import os
import re
import nltk
import string
import torch
import warnings


# Download NLTK resources if not already downloaded:
# the punkt tokenizer models, the stopword lists and the WordNet corpus.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


# Suppress every Python warning from this point on.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

2024-05-13 00:54:26.342290: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-13 00:54:26.409285: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to /home/dell/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/dell/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/dell/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [75]:
# Notebook-wide settings.
dataset_path = '../shopping_queries_dataset/'  # folder containing the dataset parquet files
locale = "us"                                  # product locale kept when loading the data
random_state = 42
n_dev_queries = 200
max_description_lenght = 100                   # length limit applied to product descriptions

In [3]:
""" 0. Init variables """
# Column names used throughout the notebook.
col_query = "query"
col_query_id = "query_id"
col_product_id = "product_id"
col_product_title = "product_title"
col_product_description = "product_description"
col_product_bullet_point = "product_bullet_point"
col_product_brand = "product_brand"
col_product_color = "product_color"
col_product_locale = "product_locale"
col_esci_label = "esci_label"
col_small_version = "small_version"
col_split = "split"
col_gain = 'gain'
col_features = [col_product_id]

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Three-line banner reporting the selected device.
to_print = "-" * 40
print(to_print, f"---------> {device} is activated <----------", to_print, sep="\n")

# Numeric gain assigned to each ESCI label.
esci_label2gain = dict(E=1.0, S=0.1, C=0.01, I=0.0)

----------------------------------------
---------> cuda is activated <----------
----------------------------------------


In [13]:
""" 1. Load data """    
# Read the examples table and the products table from the dataset folder.
examples_path = os.path.join(dataset_path, 'shopping_queries_dataset_examples.parquet')
products_path = os.path.join(dataset_path, 'shopping_queries_dataset_products.parquet')
df_examples = pd.read_parquet(examples_path)
df_products = pd.read_parquet(products_path)

# Left-join product attributes onto every example using (locale, product id).
join_keys = [col_product_locale, col_product_id]
df_examples_products = df_examples.merge(
    df_products,
    how='left',
    left_on=join_keys,
    right_on=join_keys,
)

# Keep only rows flagged as the small version and belonging to the configured locale.
in_small_version = df_examples_products[col_small_version] == 1
in_locale = df_examples_products[col_product_locale] == locale
df_examples_products = df_examples_products[in_small_version & in_locale]

# Translate each ESCI label into its numeric gain (unknown labels raise KeyError).
df_examples_products[col_gain] = df_examples_products[col_esci_label].apply(
    lambda label: esci_label2gain[label]
)

# Renumber rows from 0; the previous index is kept as an "index" column.
df_examples_products = df_examples_products.reset_index()

In [14]:
# Feature groups: free-text columns and categorical columns.
text_features = [
    col_query,
    col_product_title,
    col_product_description,
    col_product_bullet_point,
]
categorical_features = [col_product_brand, col_product_color]

# Replace null values with the text "Unknown" so feature processing always
# receives a string in these columns.
columns_to_fill = [
    col_product_description,
    col_product_bullet_point,
    col_product_brand,
    col_product_color,
]
df_examples_products[columns_to_fill] = (
    df_examples_products[columns_to_fill].fillna("Unknown")
)

### Process Product Description Feature
From the EDA step, the Product Description feature is:
- Formatted as HTML.

To make the feature ready:
- Clean it with the `preprocess_html` function, a simple cleaning approach for HTML text. For more advanced cleaning (denoising, normalization, lemmatization, etc.) we rely on the LLM that is used to summarize the text. In a real-world use case, I would investigate both methods (classical cleaning methods and cleaning with the help of LLMs).
- Summarize it with `Falconsai/text_summarization` if it is longer than 100 characters (the value of `max_description_lenght`), to address the input size limit of LLM models.

In [15]:
def preprocess_html(html_text):
    """Turn an HTML fragment into single-spaced plain text.

    Args:
        html_text: HTML markup as a string.

    Returns:
        The text content with tags removed, whitespace runs collapsed to one
        space, surrounding whitespace stripped, and any remaining ``&amp;``,
        ``&lt;`` and ``&gt;`` sequences replaced by ``&``, ``<`` and ``>``.
    """
    # Drop the tags, separating the text of adjacent elements with a space.
    soup = BeautifulSoup(html_text, "html.parser")
    plain = soup.get_text(separator=" ")
    # Collapse every whitespace run and trim the ends.
    plain = re.sub(r'\s+', ' ', plain).strip()
    # Replace entity sequences still present in the text, in this order.
    for entity, character in (('&amp;', '&'), ('&lt;', '<'), ('&gt;', '>')):
        plain = plain.replace(entity, character)
    return plain
# Strip the HTML markup from every product description.
df_examples_products[col_product_description] = (
    df_examples_products[col_product_description].apply(preprocess_html)
)

In [16]:
# Falconsai/text_summarization is a light LLM model for summarization tasks,
# taken from https://huggingface.co/Falconsai/text_summarization
summarizer = pipeline(
    task="summarization",
    model="Falconsai/text_summarization",
    device=device,
)
def remove_extra_spaces(text):
    """Normalize the spacing of ``text``.

    Whitespace in front of ``.``, ``,``, ``!`` or ``?`` is removed so the
    punctuation attaches to the preceding word, every remaining whitespace
    run is collapsed to a single space, and the ends are trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, e.g. ``"hello , world"`` -> ``"hello, world"``.
    """
    # Re-attach punctuation first. The replacement appends a space so the
    # punctuation never fuses with the next word ("a ,b" -> "a, b").
    text = re.sub(r'\s+([.,!?])', r'\1 ', text)
    # Collapse whitespace afterwards so the space appended above cannot leave
    # a double space behind.
    return re.sub(r'\s+', ' ', text).strip()

def suumarize_text(text, min_length=20):
    """Summarize ``text`` with the ``summarizer`` pipeline when it is too long.

    Texts of at most ``max_description_lenght`` characters are returned
    unchanged. Longer texts are summarized and the result is passed through
    ``remove_extra_spaces``.

    Args:
        text: The text to shorten.
        min_length: Lower bound handed to the summarizer as ``min_length``.
            It is clamped below ``max_description_lenght`` so the generation
            bounds never coincide.

    Returns:
        The original text, or its cleaned summary.
    """
    # Same threshold as the rest of the notebook instead of a duplicated literal.
    if len(text) <= max_description_lenght:
        return text
    # Equal min/max bounds would force every summary to the maximum length,
    # so keep the lower bound strictly below the upper one.
    lower_bound = max(0, min(min_length, max_description_lenght - 1))
    result = summarizer(
        text,
        max_length=max_description_lenght,
        min_length=lower_bound,
        do_sample=False,
    )
    return remove_extra_spaces(result[0]["summary_text"])

In [78]:
# Profiling Summarization with Falconsai/text_summarization
from transformers.utils import logging
import time


# Only report errors from transformers while timing.
logging.set_verbosity_error()

batch_size = 100
min_length = 20

# Take the first `batch_size` descriptions and keep those longer than the limit.
# NOTE: `data` can therefore hold fewer than `batch_size` texts.
df_tmp = df_examples_products[col_product_description][0:batch_size]
is_too_long = df_tmp.apply(len) > max_description_lenght
data = df_tmp[is_too_long.values].to_list()

# Time a single summarizer call over the selected texts.
tic = time.time()
summarizer(
    data,
    max_length=max_description_lenght,
    min_length=min_length,
    do_sample=False,
)[0]["summary_text"]
elapsed = time.time() - tic
minutes, seconds = divmod(elapsed, 60)
print(f"Elapsed time for one batch of size {batch_size} samples > min: {int(minutes)} sec: {int(seconds)}")

Elapsed time for one batch of size 100 samples > min: 0 sec: 35


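Since summarizing with the LLM model is time-consuming (about 35 seconds for a single batch, as measured above), we skip it for this assignment and use a lightweight frequency-based summarizer instead.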

In [79]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
import string

# Built on first use and shared by every call: reloading the stopword list and
# creating a lemmatizer per document is wasted work when summarize() is applied
# row by row.
_SUMMARIZE_RESOURCES = {}


def _summarize_resources():
    """Return the ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _SUMMARIZE_RESOURCES:
        _SUMMARIZE_RESOURCES["stop_words"] = (
            set(stopwords.words('english')) | set(string.punctuation)
        )
        _SUMMARIZE_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _SUMMARIZE_RESOURCES["stop_words"], _SUMMARIZE_RESOURCES["lemmatizer"]


def _content_lemmas(text):
    """Lowercase and tokenize ``text``, drop stopwords/punctuation, lemmatize the rest."""
    stop_words, lemmatizer = _summarize_resources()
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token not in stop_words
    ]


def summarize(document, num_sentences=1):
    """Extract the highest-scoring sentences of ``document``.

    Every sentence is scored by summing, over its content words, how often
    that word's lemma occurs in the whole document. Stopwords and punctuation
    are ignored.

    Args:
        document: The text to summarize.
        num_sentences: How many top-ranked sentences to keep.

    Returns:
        The selected sentences joined by a single space, highest score first
        (ties keep document order). An empty document yields ``""``.
    """
    sentences = sent_tokenize(document)

    # Document-wide frequency of every content lemma.
    freq = {}
    for lemma in _content_lemmas(document):
        freq[lemma] = freq.get(lemma, 0) + 1

    # Every sentence starts at 0 so a sentence without content words is still
    # ranked (last) rather than dropped, and a non-empty document never
    # summarizes to an empty string.
    scores = {sentence: 0 for sentence in sentences}
    for sentence in sentences:
        # Score with the same lemmatized form used for the frequency table;
        # raw tokens such as plurals would never match their lemma key.
        for lemma in _content_lemmas(sentence):
            scores[sentence] += freq.get(lemma, 0)

    # sorted() is stable, so equally scored sentences stay in document order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return ' '.join(sentence for sentence, _score in ranked[:num_sentences])

In [80]:
# Replace every cleaned description with its top-ranked sentence.
df_examples_products[col_product_description] = (
    df_examples_products[col_product_description].apply(summarize)
)

In [82]:
# Write the processed frame to disk once; an existing file is left untouched.
processed_description_csv = "./processed_product_description.csv"
if not os.path.exists(processed_description_csv):
    df_examples_products.to_csv(processed_description_csv)

### Process Product Bullet Points Feature

In [83]:
def clean_itemized_text(text):
    """Tidy a ``*``-bulleted text block into plain lines.

    Args:
        text: The raw itemized text.

    Returns:
        The text with surrounding whitespace trimmed, the ``*`` marker at the
        start of a line removed, repeated newlines collapsed, and leading and
        trailing whitespace stripped from each line.
    """
    # (pattern, replacement, flags) passes, applied in order.
    passes = (
        # A bullet marker at the start of a line becomes a newline.
        (r'^[\s]*\*[\s]*', '\n', re.MULTILINE),
        # Runs of newlines shrink to a single newline.
        (r'\n+', '\n', 0),
        # Whitespace at the start and end of each line is dropped.
        (r'^[\s]*|[\s]*$', '', re.MULTILINE),
    )
    cleaned = text.strip()
    for pattern, replacement, flags in passes:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned

In [85]:
# Clean the bullet-point text, then keep its top-ranked sentence.
df_examples_products[col_product_bullet_point] = (
    df_examples_products[col_product_bullet_point]
    .apply(clean_itemized_text)
    .apply(summarize)
)

# Combining Categorical Features with Text Features

To get the most out of all the features, one approach is to combine the categorical features with the text features and add some context to them. For instance:
- The product brand is \<x>, or simply Brand: \<x>

For some models like BERT, special tokens such as \[SEP\] can be inserted between the features instead. This saves significantly on sequence length while preserving accuracy.

In [88]:
# Combine all features into string

def combine_features(row):
    """Build one descriptive string from a product row.

    Args:
        row: A mapping (e.g. a DataFrame row) indexed by the brand, color,
            title, description and bullet-point column names.

    Returns:
        A string of the form ``"Brand: ..., Color: ..., Product Title: ...,
        Product Description: ..., and Product Features: ...."``.
    """
    segments = (
        f"Brand: {row[col_product_brand]}",
        f"Color: {row[col_product_color]}",
        f"Product Title: {row[col_product_title]}",
        # The last two fields are linked by ", and " and closed with a period.
        f"Product Description: {row[col_product_description]}, and "
        f"Product Features: {row[col_product_bullet_point]}.",
    )
    return ", ".join(segments)
        
# One combined feature string per row, in frame order.
combined_features = [
    combine_features(row)
    for _, row in df_examples_products.iterrows()
]
    

In [89]:
# Store the combined strings as a new column; the list was built by iterating
# the frame's rows in order, so entries line up with rows positionally.
df_examples_products["combined_features"] = combined_features

In [90]:
# Write the frame with the combined features once; an existing file is left untouched.
final_features_csv = "./final_features.csv"
if not os.path.exists(final_features_csv):
    df_examples_products.to_csv(final_features_csv)

Altogether, the feature engineering steps are time-consuming, so we continue with just the main feature, **Product Title**.

In [91]:
# Display the combined feature strings as the cell output.
df_examples_products["combined_features"]

0         Brand: RamPro, Color: 10 Inch, Product Title: ...
1         Brand: MaxAuto, Color: Unknown, Product Title:...
2         Brand: Neiko, Color: Unknown, Product Title: N...
3         Brand: Russo, Color: Unknown, Product Title: 2...
4         Brand: Antego Tire & Wheel, Color: Husqvarna S...
                                ...                        
601349    Brand: Nilight, Color: Unknown, Product Title:...
601350    Brand: Burley Design, Color: Red, Product Titl...
601351    Brand: Burley Design, Color: Yellow, Product T...
601352    Brand: BELL, Color: 20"x1.75-2.25" Schrader, P...
601353    Brand: Marcy, Color: Black/Gray/Copper, Produc...
Name: combined_features, Length: 601354, dtype: object