In [None]:
# Notebook shell commands: install Hugging Face transformers (unpinned) and the
# CPU build of FAISS pinned to version 1.7.4.
!pip install transformers
!pip install faiss-cpu==1.7.4

In [None]:
import ast
import gzip
import pandas as pd

# Colour names that check_for_color searches for, as substrings of the
# lower-cased metadata text.
list_of_color = [
    'blue', 'red', 'green', 'purple', 'black', 'orange', 'yellow', 'gold',
    'white', 'silver', 'pink', 'turquoise', 'sky blue', 'sapphire blue',
    'aquamarine', 'cyan', 'blood red', 'lime green', 'gray', 'dark blue',
    'teal', 'violet', 'brown', 'emerald green', 'light blue', 'lavender',
    'baby blue', 'mint green', 'ruby red', 'indigo', 'navy blue', 'aqua',
    'royal blue', 'chrome', 'amethyst', 'neon green', 'scarlet',
    'cobalt blue', 'sunset orange', 'azure', 'electric blue', 'neon red',
    'light pink', 'hot pink', 'bright yellow', 'coral', 'platinum',
    'midnight purple', 'grass green', 'sea green',
]

# Fabric names that check_for_material searches for in the same way.
list_of_material = [
    'cotton', 'wool', 'silk', 'linen',
    'polyester', 'nylon', 'acrylic', 'spandex',
]

def save_top_k_data(input_filepath, output_filepath, k):
    """Copy the first k lines of a gzip file into a new gzip file.

    Nothing is done when the output file already exists.

    Args:
        input_filepath (str): The file path towards the compressed file that stores metadata.
        output_filepath (str): The output file path towards the compressed file that stores metadata of the top k items.
        k (int): Number of lines (items) to copy. A negative value never
            reaches the stop condition, so every line is copied.

    Returns:
        None: Stores the metadata of the top k items in the output file path.
    """
    # Imported locally because the notebook only imports `os` in a later cell.
    import os

    if os.path.exists(output_filepath):
        return None

    remaining = k
    # Both handles are context-managed so the input file is closed as well.
    with gzip.open(input_filepath, 'rb') as source, gzip.open(output_filepath, 'ab') as target:
        for line in source:
            if remaining == 0:
                break
            target.write(line)
            remaining -= 1


def check_for_color(item):
    """Detect known colour names in an item's textual metadata.

    The "category", "feature" and "description" fields are iterated entry by
    entry and "details" is read as a single string. Every entry is lower-cased
    and searched for each name in ``list_of_color`` as a plain substring.

    Args:
        item (dict): The dictionary that stores the metadata of an item.

    Returns:
        dict: The same dictionary; when at least one colour was found, a
        sorted list of the matches is stored under the 'color' key.
    """
    texts = []
    for field in ("category", "feature", "description"):
        if field in item:
            texts.extend(entry.lower() for entry in item[field])
    if "details" in item:
        texts.append(item["details"].lower())

    # Sorted so the stored list has a reproducible order across runs
    # (iterating a set of strings does not guarantee one).
    color_found = sorted(
        {color for color in list_of_color if any(color in text for text in texts)}
    )
    if color_found:
        item['color'] = color_found

    return item

def check_for_material(item):
    """Detect known material names in an item's textual metadata.

    The "category", "feature" and "description" fields are iterated entry by
    entry and "details" is read as a single string. Every entry is lower-cased
    and searched for each name in ``list_of_material`` as a plain substring.

    Args:
        item (dict): Dictionary that stores the metadata of an item.

    Returns:
        dict: The same dictionary; when at least one material was found, a
        sorted list of the matches is stored under the 'material' key.
    """
    texts = []
    for field in ("category", "feature", "description"):
        if field in item:
            texts.extend(entry.lower() for entry in item[field])
    if "details" in item:
        texts.append(item["details"].lower())

    # Sorted so the stored list has a reproducible order across runs
    # (iterating a set of strings does not guarantee one).
    material_found = sorted(
        {material for material in list_of_material if any(material in text for text in texts)}
    )
    if material_found:
        item['material'] = material_found

    return item

def move_fit(item):
    """Replace the structured fit counts with a sentence appended to the category list.

    Only acts when ``item['fit']`` contains the 'Too small' key; otherwise the
    item is returned untouched (including its 'fit' entry).

    Args:
        item (dict): Dictionary that stores the metadata of an item.

    Returns:
        dict: The same dictionary with 'fit' removed and a summary sentence
        appended to 'category' when fit counts were present.
    """
    fit = item.get('fit')
    if fit is None or 'Too small' not in fit:
        return item

    # The fit table is not always complete, so counts missing from it default to 0.
    too_small = fit.get('Too small', 0)
    small = fit.get('Somewhat small', 0)
    perfect = fit.get('Fits as expected', 0)
    big = fit.get('Somewhat large', 0)
    too_big = fit.get('Too large', 0)
    item.pop("fit")

    # Built from adjacent literals so that source indentation does not leak
    # into the sentence as a long run of spaces.
    sentence = (
        f"{too_small} people thinks it is too small, "
        f"{small} people thinks it is somewhat small, "
        f"{perfect} people thinks it is fits as expected, "
        f"{big} people thinks it is somewhat large and "
        f"{too_big} people thinks it is too large."
    )

    item.setdefault('category', []).append(sentence)

    return item

In [None]:
import csv
import gzip
import json
import os
import re
import time

from bs4 import BeautifulSoup

# Regex fragments describing delivery / seller-service vocabulary.
# is_review_relevant joins them with '|' inside \b(...)\b and searches the
# review text case-insensitively; a review matching any fragment is rejected.
# NOTE(review): 'week(s)' and 'day(s)' require the trailing "s" (no '?'), and
# 'refund' / 'communication' appear twice -- confirm whether that is intended.
keywords = [
    'deliver(y|ies|ed|ing)?',
    'ship(ping|ped)?',
    'service(s)?',
    'packag(e|ing|ed|s)?',
    'mail(s|ing)?',
    'ship(s)?',
    'arrive(s|d)?',
    'road',
    'Amazon Your review could not be posted',
    'UPS',
    'FedEx',
    'shipment',
    'packaging',
    'helpful',
    'Responsive',
    'Return',
    'seller',
    'refund',
    'communication',
    'refund',
    'communication',
    'resolved',
    'follow up',
    'courteous(ness|ize|ing)?',
    'replace',
    'exchange',
    'support',
    'delay',
    'tracking',
    'box',
    'receiv(ed|es|ing)',
    'week(s)',
    'day(s)',
]

def parse(path: str):
    """Lazily decode a gzip-compressed JSON-lines file.

    Args:
        path (str): The file path towards the compressed file.

    Yields:
        dict: The JSON object decoded from each line of the file.
    """
    # The context manager closes the file when the generator is exhausted,
    # closed, or garbage-collected, instead of leaking the handle.
    with gzip.open(path, 'rb') as compressed:
        for line in compressed:
            yield json.loads(line)


def html_table_to_json(html: str) -> dict:
    """Convert an HTML table of (label, count) rows into a dictionary.

    For every ``<tr>`` the text of its first ``<span>`` becomes the key and the
    text of the span that follows it (thousands separators removed) becomes
    the integer value. Rows that lack either span, or whose count is not an
    integer, are skipped so one malformed row does not abort the whole run.

    Args:
        html (str): A string that stores the html table.

    Returns:
        dict: Mapping from row label to integer count; empty when no usable
        row is found.
    """
    soup = BeautifulSoup(html, 'html.parser')

    data_dict = {}
    for row in soup.find_all('tr'):
        label_span = row.find('span')
        if label_span is None:
            continue

        count_span = label_span.find_next('span')
        if count_span is None:
            continue

        try:
            count = int(count_span.text.strip().replace(',', ''))
        except ValueError:
            continue

        data_dict[label_span.text.strip()] = count
    return data_dict


def is_html(text: str) -> bool:
    """Report whether the text contains something shaped like a closing HTML tag.

    Args:
        text (str): The text to inspect.

    Returns:
        bool: True when a ``</x...>`` sequence occurs in the text, False otherwise.
    """
    closing_tag = re.compile(r'</\w.*?>')
    return closing_tag.search(text) is not None


def is_javascript(text: str) -> bool:
    """Report whether the text contains the standalone word ``var``.

    Args:
        text (str): The text to inspect.

    Returns:
        bool: True when ``var`` occurs as a whole word, False otherwise.
    """
    var_keyword = re.compile(r"\bvar\b")
    return var_keyword.search(text) is not None


def remove_unnecessary_space(text: str) -> str:
    """Collapse every run of whitespace into a single space.

    Args:
        text (str): The text to normalise.

    Returns:
        str: The text with each whitespace run replaced by one space; leading
        and trailing whitespace is collapsed, not stripped.
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text)


def convert_html_to_readable_text(html: str) -> str:
    """Extract the text content of an HTML fragment.

    Args:
        html (str): The HTML markup to convert.

    Returns:
        str: The text returned by BeautifulSoup's ``get_text()`` for the
        parsed fragment, or the input unchanged if no parse tree is available.
    """
    parsed = BeautifulSoup(html, 'html.parser')
    return html if parsed is None else parsed.get_text()


def is_review_relevant(review: dict) -> bool:
    """Decide whether a review should be kept.

    A review is kept only when all of the following hold, checked in order:
    its summary is not just "delivery" or "service", its text has more than
    30 space-separated tokens, its text matches none of the module-level
    ``keywords`` patterns, and its text does not look like HTML.

    Args:
        review (dict): A dictionary that stores the information of a review.

    Returns:
        bool: True if the review is relevant, False otherwise.
    """
    regex_pattern = r'\b(' + '|'.join(keywords) + r')\b'

    if review.get('summary', '').lower() in {'delivery', 'service'}:
        return False

    body = review['reviewText']
    if len(body.split(" ")) <= 30:
        return False

    if re.search(regex_pattern, body, re.IGNORECASE):
        return False

    return not is_html(body)


def is_valid(item: dict) -> bool:
    """Check whether a metadata record is complete enough to be kept.

    An item is valid when the 'price', 'asin', 'feature', 'category', 'title',
    'rank' and 'brand' fields are all present and non-empty, 'Clothing' is one
    of its categories, and its title looks like neither HTML nor JavaScript.

    Args:
        item (dict): A dictionary that stores the metadata of an item.

    Returns:
        bool: True if the item is valid, False otherwise.
    """
    required_keys = ('price', 'asin', 'feature', 'category', 'title', 'rank', 'brand')

    # Presence is checked before any field is indexed, so an item without
    # 'category' is rejected instead of raising KeyError.
    if any(key not in item or len(item[key]) == 0 for key in required_keys):
        return False

    if 'Clothing' not in item['category']:
        return False

    return not is_html(item['title']) and not is_javascript(item['title'])


def _clean_text_entries(entries):
    """Drop empty strings, strip HTML markup and collapse whitespace in a list of strings."""
    cleaned = []
    for entry in entries:
        if entry == "":
            continue
        text = convert_html_to_readable_text(entry) if is_html(entry) else entry
        cleaned.append(remove_unnecessary_space(text))
    return cleaned


def _percentage(part, whole):
    """Return part/whole as a percentage, or 0.0 when whole is zero."""
    return round(part / whole, 4) * 100 if whole else 0.0


def preprocess_data(input_metadata_filepath: str, input_reviews_filepath: str, output_metadata_filepath: str, output_reviews_filepath: str):
    """Clean the item metadata and reviews and write both to CSV files.

    Items are kept only when ``is_valid`` accepts them (required fields
    present and non-empty, a 'Clothing' category, title neither HTML nor
    JavaScript). For kept items the "fit" HTML table is converted to a dict,
    "details" is converted to plain text, and the "category" / "feature"
    entries are cleaned. Reviews are attached to their item when
    ``is_review_relevant`` accepts them, and only items with more than three
    such reviews are written out.

    Args:
        input_metadata_filepath (str): The file path towards the compressed file that stores metadata.
        input_reviews_filepath (str): The file path towards the compressed file that stores reviews.
        output_metadata_filepath (str): The output file path towards the csv file that stores metadata.
        output_reviews_filepath (str): The output file path towards the csv file that stores reviews.

    Returns:
        tuple: ``(items, set_of_optional_keys)`` where ``items`` maps asin to
        the cleaned item dict and ``set_of_optional_keys`` holds every key
        that was stored in an 'optional' column. Both are empty when either
        output file already exists and nothing was processed.
    """
    # Item keys that are not copied into the 'optional' column.
    non_optional_keys = {'asin', 'title', 'category', 'price', 'brand', 'feature', 'rating', 'num_reviews',
                         'rank', 'imageURLHighRes', 'main_cat', 'date', 'also_buy', 'also_view', 'imageURL',
                         'total_stars', 'num_ratings', 'reviews'}

    num_valid_item = 0
    num_total_item = 0
    num_valid_review = 0
    num_total_review = 0
    items = {}
    set_of_optional_keys = set()

    if os.path.exists(output_metadata_filepath) or os.path.exists(output_reviews_filepath):
        print('output file already exists')
        # Same shape as the normal return value so callers can always unpack it.
        return items, set_of_optional_keys

    for item in parse(input_metadata_filepath):
        num_total_item += 1

        if not is_valid(item):
            continue

        # convert "fit" field to json from html table
        if 'fit' in item:
            item['fit'] = html_table_to_json(f"<table{item['fit']} </table>")

        # convert "details" field to readable text from html
        if 'details' in item:
            item['details'] = convert_html_to_readable_text(item['details'])

        item['category'] = _clean_text_entries(item['category'])
        item['feature'] = _clean_text_entries(item['feature'])

        # The first record seen for an asin wins; later duplicates are ignored.
        items.setdefault(item['asin'], item)

    print("Finished Metadata clean up")
    for review in parse(input_reviews_filepath):
        asin = review['asin']
        if asin not in items:
            continue

        item = items[asin]
        # Every review of a kept item counts towards its rating, relevant or not.
        item['num_ratings'] = item.get('num_ratings', 0) + 1
        item['total_stars'] = item.get('total_stars', 0) + float(review['overall'])

        if 'reviewText' in review and is_review_relevant(review):
            item.setdefault('reviews', []).append([asin, review['reviewText'], review['overall']])
            num_valid_review += 1
        num_total_review += 1

    print("Finished Reviews clean up")
    with open(output_metadata_filepath, mode='w', newline='', encoding='utf-8') as meta_file:
        with open(output_reviews_filepath, mode='w', newline='', encoding='utf-8') as review_file:
            meta_writer = csv.writer(meta_file)
            review_writer = csv.writer(review_file)

            meta_writer.writerow(['item_id', 'name', 'category', 'price', 'brand', 'rating', 'num_reviews', 'rank', 'imageURLs', 'optional'])
            review_writer.writerow(['item_id', 'text', 'stars'])
            for item in items.values():
                if 'reviews' not in item or len(item['reviews']) <= 3:
                    continue

                item = check_for_color(item)
                item = check_for_material(item)
                item = move_fit(item)
                rating = round(item['total_stars'] / item['num_ratings'], 2)

                optional = {key: value for key, value in item.items() if key not in non_optional_keys}
                set_of_optional_keys.update(optional)

                if 'details' in optional:
                    optional['details'] = optional['details'].replace('\n\n', '').replace('\n', ' ')

                meta_writer.writerow([item['asin'], item['title'], list(set(item['category'] + item['feature'])), item['price'], item['brand'],
                                      rating, len(item['reviews']), item['rank'],
                                      item.get('imageURLHighRes', []), optional])
                review_writer.writerows(item['reviews'])
                num_valid_item += 1

    print("Finished Saving")

    # _percentage guards against empty inputs, which would otherwise divide by zero.
    print(f'Valid reviews: {num_valid_review}/{num_total_review} ({_percentage(num_valid_review, num_total_review)}%)')
    print(f'Valid items: {num_valid_item}/{num_total_item} ({_percentage(num_valid_item, num_total_item)}%)')

    return items, set_of_optional_keys


# A count of -1 never reaches the `k == 0` stop condition in save_top_k_data,
# so every line of the metadata file is copied.
num_review = -1
save_top_k_data('meta_Clothing_Shoes_and_Jewelry.json.gz',
                'top_all_meta_Clothing_Shoes_and_Jewelry.json.gz', num_review)

# NOTE(review): the copy above is written to the working directory, while the
# metadata is read from 'all-preprocessed-data/' here -- confirm the file is
# moved there between the two steps, or align the paths.
items, set_of_optional_keys = preprocess_data('all-preprocessed-data/top_all_meta_Clothing_Shoes_and_Jewelry.json.gz',
                'Clothing_Shoes_and_Jewelry.json.gz',
                'processed_top_all_meta_Clothing_Shoes_and_Jewelry.csv',
                'processed_top_all_review_Clothing_Shoes_and_Jewelry.csv')

In [None]:
from transformers import AutoConfig, AutoTokenizer, TFAutoModel, AutoModel
import tensorflow as tf
from tensorflow import keras
from keras import layers
import transformers
transformers.logging.set_verbosity_error()

"""
   Taken from  https://github.com/D3Mlab/rir/blob/main/prefernce_matching/LM.py
"""


def create_model(BERT_name, from_pt=True):
    """Wrap a pretrained transformer encoder in a two-input Keras model.

    Only input ids and the attention mask are fed to the encoder (no
    token_type_ids).

    Args:
        BERT_name: Name or path passed to ``TFAutoModel.from_pretrained``.
        from_pt: Accepted for interface compatibility.
            NOTE(review): the value is not forwarded -- the encoder is always
            loaded with ``from_pt=True``.

    Returns:
        tuple: The compiled Keras model, the name of the input-ids Input
        layer, and the name of the attention-mask Input layer.
    """
    pretrained_encoder = TFAutoModel.from_pretrained(BERT_name, from_pt=True)

    ids_in = layers.Input(shape=(None,), dtype=tf.int32)
    mask_in = layers.Input(shape=(None,), dtype=tf.int32)

    encoded = pretrained_encoder(input_ids=ids_in, attention_mask=mask_in)

    wrapped = keras.Model(inputs=[ids_in, mask_in], outputs=encoded)
    wrapped.compile()

    return wrapped, ids_in.name, mask_in.name


class BERT_model:
    """Tokenizer plus Keras-wrapped encoder used to turn texts into embeddings."""

    def __init__(self, BERT_name, tokenizer_name, from_pt=False):
        """
        :param BERT_name: name or address of the language model
        :param tokenizer_name: name or address of the tokenizer
        :param from_pt: passed through to create_model
        """
        self.BERT_name = BERT_name
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        built = create_model(BERT_name, from_pt)
        # name1 / name2 are the Input-layer names used as feature keys in embed().
        self.bert_model, self.name1, self.name2 = built

    def embed(self, texts, strategy=None, bs=48, verbose=0):
        """Embed a list of texts.

        :param texts: texts to tokenize (truncated / padded to 512 tokens)
        :param strategy: optional object whose ``scope()`` context wraps prediction
        :param bs: batch size used for prediction
        :param verbose: verbosity forwarded to ``predict``
        :return: position-0 vector of ``last_hidden_state`` for each text,
            reshaped to ``(-1, 768)``
        """
        encoded = self.tokenizer.batch_encode_plus(
            texts,
            max_length=512,
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            return_token_type_ids=True,
        )

        features = {
            self.name1: encoded['input_ids'],
            self.name2: encoded['attention_mask'],
        }
        autotune = tf.data.experimental.AUTOTUNE

        if strategy is None:
            batches = tf.data.Dataset.from_tensor_slices(features).prefetch(
                buffer_size=autotune).batch(bs, drop_remainder=False)
            predictions = self.bert_model.predict(batches, verbose=verbose)
            return predictions['last_hidden_state'][:, 0, :].reshape(-1, 768)

        with strategy.scope():
            batches = tf.data.Dataset.from_tensor_slices(features).batch(bs, drop_remainder=False).prefetch(
                buffer_size=autotune)
            predictions = self.bert_model.predict(batches, verbose=verbose)
            return predictions['last_hidden_state'][:, 0, :].reshape(-1, 768)

In [None]:
import pandas as pd
import os

class EmbedderCreator:
    """Embeds the texts of a review CSV and appends them to an embedding CSV."""

    def __init__(self, model: "BERT_model"):
        # Model used to convert the review texts into embeddings; only its
        # embed(list_of_texts) method is used.
        self.embedding_model = model

    def embed(self, review_file_path: str, embedding_file_path: str, batch_size: int = 1024):
        """Create the embedding for the review file

        The embedding file holds one row per review with the columns
        'Review', 'Embedding' and 'item_id'. When it already exists, work
        resumes after the rows it contains; otherwise it is created with a
        header row first. Reviews are embedded and appended batch by batch.

        Args:
            review_file_path (str): The file path towards the review file
                (columns "text" and "item_id" are read).
            embedding_file_path (str): The file path towards the embedding file.
            batch_size (int): Number of reviews embedded per model call.

        Returns:
            None: The preprocessed data is stored in the output file path.
        """
        if os.path.exists(embedding_file_path):
            # Resume: rows already written are not embedded again.
            index = pd.read_csv(embedding_file_path).shape[0]
        else:
            index = 0
            # Write only the column titles.
            pd.DataFrame({
                'Review': [],
                'Embedding': [],
                "item_id": []
            }).to_csv(embedding_file_path, index=False)

        review_dataset = pd.read_csv(review_file_path)
        size = len(review_dataset["text"])

        # Slicing from the resume point guarantees that each batch holds only
        # real reviews, that a final full batch is written, and that nothing
        # is appended when every review is already embedded.
        for start in range(index, size, batch_size):
            stop = min(start + batch_size, size)

            # Progress marker every 100 reviews.
            for i in range(start, stop):
                if i % 100 == 0:
                    print(i / 100)

            list_of_review = review_dataset["text"].iloc[start:stop].tolist()
            list_of_business_id = review_dataset["item_id"].iloc[start:stop].tolist()

            embedding = self.embedding_model.embed(list_of_review).tolist()
            df_new = pd.DataFrame(
                {'Review': list_of_review, 'Embedding': embedding, "item_id": list_of_business_id})

            # Append df_new to the existing csv file
            df_new.to_csv(embedding_file_path, mode='a', header=False, index=False)


# Load the same checkpoint as both encoder and tokenizer, then embed every row
# of the review CSV into the embeddings CSV. This runs at cell execution time.
model_name = "sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco"
embedding_model = BERT_model(model_name, model_name)
embedder = EmbedderCreator(embedding_model)
embedder.embed('processed_top_all_review_Clothing_Shoes_and_Jewelry.csv',
                'embeddings_Clothing_Shoes_and_Jewelry.csv')

In [None]:
import pandas as pd
import torch
import numpy as np
import faiss

class CreateDatabase():
    """Builds the FAISS index, metadata JSON, embedding matrix and review CSV."""

    def create_faiss_database(self, source_embedding_file_path: str, faiss_destination_file_path: str):
        """This function creates the faiss database and store it into a file.

        Args:
            source_embedding_file_path (str): The file path towards the embedding csv file.
            faiss_destination_file_path (str): The file path towards the faiss database file.

        Returns:
            pd.DataFrame: The dataframe that represents the embedding csv file.
        """
        # Load the embedding CSV file into a pandas DataFrame
        df_embedding = pd.read_csv(source_embedding_file_path)

        # Create the vector database
        dimension_size = 768
        index = faiss.IndexFlatIP(dimension_size)  # Create the index, uses dot product to measure similarity

        # Vectors are added in row order, one chunk at a time, so that only one
        # chunk of parsed Python lists is held in memory at once.
        chunk_size = 4096
        serialized = df_embedding["Embedding"]
        for start in range(0, len(serialized), chunk_size):
            # literal_eval only accepts Python literals, unlike eval which
            # would execute arbitrary code found in the CSV.
            vectors = np.array(
                [ast.literal_eval(text) for text in serialized.iloc[start:start + chunk_size]],
                dtype=np.float32,
            )
            index.add(vectors)

        faiss.write_index(index, faiss_destination_file_path)

        return df_embedding

    def convert_clothing_meta_to_json(self, input_filepath, output_filepath):
        """This function converts the clothing metadata csv file to json file

        The 'category' and 'optional' columns are parsed from their Python
        literal text form; 'rating' and 'num_reviews' are converted to float.

        Args:
            input_filepath (str): The file path towards the clothing metadata csv file.
            output_filepath (str): The output file path towards the clothing metadata json file.
        """
        df = pd.read_csv(input_filepath)
        df['category'] = df['category'].apply(ast.literal_eval)
        df['rating'] = df['rating'].apply(float)
        df['num_reviews'] = df['num_reviews'].apply(float)
        df['optional'] = df['optional'].apply(ast.literal_eval)
        df.to_json(output_filepath, orient='records', lines=True)

    def convert_restaurant_meta_to_json(self, input_file_path, output_filepath):
        """This function converts the restaurant metadata csv file to json file

        Args:
            input_file_path (str): The file path towards the restaurant metadata csv file.
            output_filepath (str): The output file path towards the restaurant metadata json file.
        """
        df = pd.read_csv(input_file_path)
        df['latitude'] = df['latitude'].apply(float)
        df['longitude'] = df['longitude'].apply(float)
        df['stars'] = df['stars'].apply(float)
        df['review_count'] = df['review_count'].apply(int)
        df['is_open'] = df['is_open'].apply(bool)
        df['categories'] = df['categories'].apply(lambda x: list(x.split(",")))
        df['hours'] = df['hours'].apply(ast.literal_eval)
        df['optional'] = df['optional'].apply(ast.literal_eval)
        df.to_json(output_filepath, orient='records', lines=True)

    def create_matrix(self, source_file_pandas: pd.DataFrame, destination_file: str):
        """This function creates the matrix and store it into a pytorch tensor file.

        Args:
            source_file_pandas (pd.DataFrame): The dataframe of the embedding csv file;
                its "Embedding" column holds each vector as list-literal text.
            destination_file (str): The file path towards the pytorch tensor file.
        """
        # One tensor per row, stacked in row order; literal_eval is used
        # instead of eval so CSV content is parsed, never executed.
        rows = [
            torch.tensor(ast.literal_eval(text))
            for text in source_file_pandas["Embedding"]
        ]

        torch.save(torch.stack(rows), destination_file)

    def create_review(self, source_file_pandas: pd.DataFrame, destination_file: str):
        """This function creates a csv file that only contain the review and corresponding item id

        Args:
            source_file_pandas (pd.DataFrame): The pandas dataframe that stores the embedding csv file.
            destination_file (str): The file path towards the review csv file.
        """
        # drop() returns a new frame, so the caller's dataframe keeps its Embedding column.
        without_embedding = source_file_pandas.drop('Embedding', axis=1)

        without_embedding.to_csv(destination_file, index=False)

In [None]:
create_database = CreateDatabase()

# Create FAISS database
embedding_df = create_database.create_faiss_database("embeddings_Clothing_Shoes_and_Jewelry.csv", "database.faiss")

# Create item metadata json
# NOTE(review): the metadata CSV is read from 'all-preprocessed-data/', while the
# preprocessing cell writes 'processed_top_all_meta_Clothing_Shoes_and_Jewelry.csv'
# to the working directory -- confirm the file is moved there, or align the paths.
create_database.convert_clothing_meta_to_json("all-preprocessed-data/processed_top_all_meta_Clothing_Shoes_and_Jewelry.csv", "item_metadata.json")

# Create the matrix
create_database.create_matrix(embedding_df, "reviews_embedding_matrix.pt")

# Create the item review
create_database.create_review(embedding_df, "items_reviews.csv")