In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m73.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m76.5 MB/s[0m eta [36m0:00:0

In [1]:
import ast
import gzip
import pandas as pd

list_of_color = ['blue', 'red', 'green', 'purple', 'black', 'orange', 'yellow', 'gold', 'white', 'silver', 'pink', 'turquoise', 'sky blue', 'sapphire blue',\
                 'aquamarine', 'cyan', 'blood red', 'lime green', 'gray', 'dark blue', 'teal', 'violet', 'brown', 'emerald green', 'light blue', 'lavender', 'baby blue', 'mint green',\
                 'ruby red', 'indigo', 'navy blue', 'aqua', 'royal blue', 'chrome', 'amethyst', 'neon green', 'scarlet', 'cobalt blue', 'sunset orange', 'azure', 'electric blue', 'neon red',\
                 'light pink', 'hot pink', 'bright yellow', 'coral', 'platinum', 'midnight purple', 'grass green', 'sea green']

list_of_material = ['cotton', 'wool', 'silk', 'linen', 'polyester', 'nylon', 'acrylic', 'spandex']

def save_top_k_data(input_filepath, output_filepath, k):
    if(os.path.exists(output_filepath)):
        return None
    g = gzip.open(input_filepath, 'rb')
    with gzip.open(output_filepath, 'ab') as f:
        for l in g:
            if k == 0:
                break
            f.write(l)
            k -= 1

def cut_reviews_csv(in_reviews_filepath, in_meta_filepath, out_reviews_filepath, out_meta_filepath, k):
    reviews_df = pd.read_csv(in_reviews_filepath)
    print(reviews_df.shape[0])
    out_reviews_df = reviews_df.head(k)
    last_id = out_reviews_df.iloc[-1]['item_id']

    meta_df = pd.read_csv(in_meta_filepath)
    print(meta_df.shape[0])
    idx = meta_df.index[meta_df['item_id'] == last_id].tolist()[0]
    out_meta_df = meta_df.iloc[:idx + 1]

    print(out_reviews_df.shape[0])
    print(out_meta_df.shape[0])
    out_meta_df.to_csv(out_meta_filepath, index=False)
    out_reviews_df.to_csv(out_reviews_filepath, index=False)


def convert_embeddings_to_json(input_filepath, output_filepath):
    chunks = pd.read_csv(input_filepath, chunksize=1000)
    i = 0
    for df in chunks:
        df['Embedding'] = df['Embedding'].apply(ast.literal_eval)
        df.to_json(output_filepath, orient='records', mode='a', lines=True)
        i += 1
        print(i)


def convert_clothing_meta_to_json(input_filepath, output_filepath):
    df = pd.read_csv(input_filepath)
    df['category'] = df['category'].apply(ast.literal_eval)
    df['feature'] = df['feature'].apply(ast.literal_eval)
    df['rating'] = df['rating'].apply(float)
    df['num_reviews'] = df['num_reviews'].apply(float)
    df['optional'] = df['optional'].apply(ast.literal_eval)
    df.to_json(output_filepath, orient='records', lines=True)


def convert_restaurant_meta_to_json(input_filepath, output_filepath):
    df = pd.read_csv(input_filepath)
    df['latitude'] = df['latitude'].apply(float)
    df['longitude'] = df['longitude'].apply(float)
    df['stars'] = df['stars'].apply(float)
    df['review_count'] = df['review_count'].apply(int)
    df['is_open'] = df['is_open'].apply(bool)
    df['categories'] = df['categories'].apply(lambda x: list(x.split(",")))
    df['hours'] = df['hours'].apply(ast.literal_eval)
    df['optional'] = df['optional'].apply(ast.literal_eval)
    df.to_json(output_filepath, orient='records', lines=True)

def check_for_color(item):
    color_found = set()
    if("category" in item):
        for i in range(len(list_of_color)):
            for category in item["category"]:
                if(list_of_color[i] in category.lower()):
                    color_found.add(list_of_color[i])

    if("feature" in item):
        for i in range(len(list_of_color)):
            for feature in item["feature"]:
                if(list_of_color[i] in feature.lower()):
                    color_found.add(list_of_color[i])

    if("description" in item):
        for i in range(len(list_of_color)):
            for description in item["description"]:
                if(list_of_color[i] in description.lower()):
                    color_found.add(list_of_color[i])

    if("details" in item):
        for i in range(len(list_of_color)):
            if(list_of_color[i] in item["details"].lower()):
                color_found.add(list_of_color[i])

    color_found = list(color_found)
    if(len(color_found)>0):
        item['color'] = color_found

    return item

def check_for_material(item):
    # If there is a tag in the category then
    material_found = set()
    if("category" in item):
        for i in range(len(list_of_material)):
            for category in item["category"]:
                if(list_of_material[i] in category.lower()):
                    material_found.add(list_of_material[i])

    if("feature" in item):
        for i in range(len(list_of_material)):
            for feature in item["feature"]:
                if(list_of_material[i] in feature.lower()):
                    material_found.add(list_of_material[i])

    if("description" in item):
        for i in range(len(list_of_material)):
            for description in item["description"]:
                if(list_of_material[i] in description.lower()):
                    material_found.add(list_of_material[i])

    if("details" in item):
        for i in range(len(list_of_material)):
            if(list_of_material[i] in item["details"].lower()):
                material_found.add(list_of_material[i])

    material_found = list(material_found)
    if(len(material_found)>0):
        item['material'] = material_found

    return item

def move_fit(item):
    too_small = 0
    small = 0
    perfect = 0
    big = 0
    too_big = 0

    if('fit' in item):
        if('Too small' in item['fit']):
            # This data is very unorganized
            too_small = item['fit']['Too small']
            small = item['fit']['Somewhat small']
            perfect = item['fit']['Fits as expected']
            big = item['fit']['Somewhat large']
            too_big = item['fit']['Too large']
            item.pop("fit")

            sentence = f"{too_small} people thinks it is too small, {small} people thinks it is somewhat small, {perfect} people thinks it is fits as expected, {big} people thinks it is somewhat large and {too_big} people thinks it is too large."

            item['category'].append(sentence)

    return item

In [2]:
import csv
import gzip
import json
import os
import re
import time

from bs4 import BeautifulSoup


def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
         yield json.loads(l)


def html_table_to_json(html):
    soup = BeautifulSoup(html, 'html.parser')

    if soup is None:
        return {}

    # Find all rows in the table
    rows = soup.find_all('tr')

    # Create an empty dictionary to store the data
    data_dict = {}

    # Iterate over each row
    for row in rows:
        # Extract the key (rating) from the first column
        key = row.find('span').text.strip()

        # Extract the value (count) from the last column
        value = int(row.find('span').find_next('span').text.strip().replace(',', ''))

        # Add the key-value pair to the dictionary
        data_dict[key] = value
    return data_dict


def is_html(text: str):
    return bool(re.search(r'</\w.*?>', text))


def is_javascript(text: str):
    return bool(re.search(r"\bvar\b", text))


def remove_unnecessary_space(text):
    return re.sub(r"\s+", " ", text)


def convert_html_to_readable_text(html) -> str:
    soup = BeautifulSoup(html, 'html.parser')
    if soup is None:
        return html
    return soup.get_text()


def is_review_relevant(review) -> bool:
    keywords = [
        'deliver(y|ies|ed|ing)?',
        'ship(ping|ped)?',
        'service(s)?',
        'packag(e|ing|ed|s)?',
        'mail(s|ing)?',
        'ship(s)?',
        'arrive(s|d)?',
        'road',
        'Amazon Your review could not be posted',
        'UPS',
        'FedEx',
        'shipment',
        'packaging',
        'helpful',
        'Responsive',
        'Return',
        'seller',
        'refund',
        'communication',
        'refund',
        'communication',
        'resolved',
        'follow up',
        'courteous(ness|ize|ing)?',
        'replace',
        'exchange',
        'support',
        'delay',
        'tracking',
        'box',
        'receiv(ed|es|ing)',
        'week(s)',
        'day(s)'
    ]

    regex_pattern = r'\b(' + '|'.join(keywords) + r')\b'

    return review.get('summary', '').lower() not in {'delivery', 'service'} and \
                len(review['reviewText'].split(" ")) > 30 and \
                not re.search(regex_pattern, review["reviewText"], re.IGNORECASE) and \
                not is_html(review["reviewText"])


def is_valid(item) -> bool:
    return all('Clothing' in item['category'] and key in item and len(item[key]) != 0 for key in {'price', 'asin', 'feature', 'category', 'title', 'rank', 'brand'}) \
           and not is_html(item['title']) and not is_javascript(item['title'])


def preprocess_data(input_metadata_filepath, input_reviews_filepath, output_metadata_filepath, output_reviews_filepath):
    """
    Preprocess the metadata.

    Remove all items where:
        - "price" field are empty
        - "feature" field are empty
        - "category" field are empty
        - "title" field are javascript or html

    ....

    :param output_reviews_filepath:
    :param input_reviews_filepath:
    :param input_metadata_filepath:
    :param output_metadata_filepath:
    :return:
    """
    num_valid_item = 0
    num_total_item = 0
    num_valid_review = 0
    num_total_review = 0
    items = {}
    set_of_optional_keys = set()

    if os.path.exists(output_metadata_filepath) or os.path.exists(output_reviews_filepath):
        print('output file already exists')
        return

    for item in parse(input_metadata_filepath):

        # check if item is valid.
        if is_valid(item):
            # convert "fit" field to json from html table
            if 'fit' in item:
                dict_fit = html_table_to_json(f"<table{item['fit']} </table>")
                item['fit'] = dict_fit

            # convert "details" field to readable text from html
            if 'details' in item:
                item['details'] = convert_html_to_readable_text(item['details'])

            # remove empty string from categories
            new_categories = []
            for category in item['category']:
                if category != "":
                    if is_html(category):
                        text = convert_html_to_readable_text(category)
                    else:
                        text = category
                    new_categories.append(remove_unnecessary_space(text))
            item['category'] = new_categories

            # remove empty string from features
            new_features = []
            for feature in item['feature']:
                if feature != "":
                    if is_html(feature):
                        text = convert_html_to_readable_text(feature)
                    else:
                        text = feature
                    new_features.append(remove_unnecessary_space(text))
            item['feature'] = new_features

            if item['asin'] not in items:
                items[item['asin']] = item

        num_total_item += 1

    print("Finished Metadata clean up")
    for review in parse(input_reviews_filepath):
        asin = review['asin']
        if asin in items:
            if 'num_ratings' not in items[asin]:
                items[asin]['num_ratings'] = 0
            if 'total_stars' not in items[asin]:
                items[asin]['total_stars'] = 0

            items[asin]['num_ratings'] += 1
            items[asin]['total_stars'] += float(review['overall'])

            if 'reviewText' in review and is_review_relevant(review):
                if 'reviews' not in items[asin]:
                    items[asin]['reviews'] = []

                items[asin]['reviews'].append([asin, review['reviewText'], review['overall']])
                num_valid_review += 1
            num_total_review += 1

    print("Finished Reviews clean up")
    with open(output_metadata_filepath, mode='w', newline='', encoding='utf-8') as meta_file:
        with open(output_reviews_filepath, mode='w', newline='', encoding='utf-8') as review_file:
            meta_writer = csv.writer(meta_file)
            review_writer = csv.writer(review_file)

            meta_writer.writerow(['item_id', 'name', 'category', 'price', 'brand', 'rating', 'num_reviews', 'rank', 'imageURLs', 'optional'])
            review_writer.writerow(['item_id', 'text', 'stars'])
            for asin, item in items.items():
                if 'reviews' in item and len(item['reviews']) > 3:
                    item = check_for_color(item)
                    item = check_for_material(item)
                    item = move_fit(item)
                    rating = round(item['total_stars'] / item['num_ratings'], 2)
                    optional = {
                        key: item[key] for key in item if
                        key not in {'asin', 'title', 'category', 'price', 'brand', 'feature', 'rating', 'num_reviews',
                                    'rank', 'imageURLHighRes', 'main_cat', 'date', 'also_buy', 'also_view', 'imageURL',
                                    'total_stars', 'num_ratings', 'reviews'}
                    }
                    for key in item.keys():
                        if(key not in {'asin', 'title', 'category', 'price', 'brand', 'feature', 'rating', 'num_reviews',
                                    'rank', 'imageURLHighRes', 'main_cat', 'date', 'also_buy', 'also_view', 'imageURL',
                                    'total_stars', 'num_ratings', 'reviews'}):
                            set_of_optional_keys.add(key)

                    if('details' in optional):
                        optional['details'] = optional['details'].replace('\n\n', '')
                        optional['details'] = optional['details'].replace('\n', ' ')
                    meta_writer.writerow([item['asin'], item['title'], list(set(item['category'] + item['feature'])), item['price'], item['brand'],
                                         rating, len(item['reviews']), item['rank'],
                                          item.get('imageURLHighRes', []), optional])
                    review_writer.writerows(item['reviews'])
                    num_valid_item += 1

    print("Finished Saving")

    print(f'Valid reviews: {num_valid_review}/{num_total_review} ({round(num_valid_review / num_total_review, 4) * 100}%)')
    print(f'Valid items: {num_valid_item}/{num_total_item} ({round(num_valid_item / num_total_item, 4) * 100}%)')

    return items, set_of_optional_keys


if __name__ == '__main__':
    num_review = -1
    save_top_k_data('meta_Clothing_Shoes_and_Jewelry.json.gz',
                    'top_all_meta_Clothing_Shoes_and_Jewelry.json.gz', num_review)

    items, set_of_optional_keys = preprocess_data('top_all_meta_Clothing_Shoes_and_Jewelry.json.gz',
                    'Clothing_Shoes_and_Jewelry.json.gz',
                    'processed_top_all_meta_Clothing_Shoes_and_Jewelry.csv',
                    'processed_top_all_review_Clothing_Shoes_and_Jewelry.csv')

Finished Metadata clean up
Finished Reviews clean up
Finished Saving
Valid reviews: 985526/4513858 (21.83%)
Valid items: 35485/2685059 (1.32%)


In [3]:
from transformers import AutoConfig, AutoTokenizer, TFAutoModel, AutoModel
import tensorflow as tf
from tensorflow import keras
from keras import layers
import transformers
transformers.logging.set_verbosity_error()

"""
   Taken from  https://github.com/D3Mlab/rir/blob/main/prefernce_matching/LM.py
"""


def create_model(BERT_name, from_pt=True):
    ## BERT encoder
    encoder = TFAutoModel.from_pretrained(BERT_name, from_pt=True)

    ## Model
    input_ids = layers.Input(shape=(None,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(None,), dtype=tf.int32)
    # token_type_ids = layers.Input(shape=(None,), dtype=tf.int32)

    embedding = encoder(
        # input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        input_ids=input_ids, attention_mask=attention_mask
    )

    model = keras.Model(
        # inputs=[input_ids, attention_mask, token_type_ids],
        inputs=[input_ids, attention_mask],
        outputs=embedding, )

    model.compile()
    return model, input_ids.name, attention_mask.name


class BERT_model:
    def __init__(self, BERT_name, tokenizer_name, from_pt=False):
        """
        :param BERT_name: name or address of language prefernce_matching
        :param tokenizer_name: name or address of the tokenizer
        """
        self.BERT_name = BERT_name
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.bert_model, self.name1, self.name2 = create_model(BERT_name, from_pt)

    def embed(self, texts, strategy=None, bs=48, verbose=0):
        tokenized_review = self.tokenizer.batch_encode_plus(
            texts,
            max_length=512,
            add_special_tokens=True,
            truncation=True,
            # truncation_strategy='longest_first',
            padding="max_length",
            return_token_type_ids=True,
        )

        data = {self.name1: tokenized_review['input_ids'],
                self.name2: tokenized_review['attention_mask'],
                # 'input_3': tokenized_review['token_type_ids']
                }

        if strategy is not None:
            with strategy.scope():
                dataset = tf.data.Dataset.from_tensor_slices(data).batch(bs, drop_remainder=False).prefetch(
                    buffer_size=tf.data.experimental.AUTOTUNE)
                outputs = self.bert_model.predict(dataset, verbose=verbose)
                return outputs['last_hidden_state'][:, 0, :].reshape(-1, 768)
        else:
            dataset = tf.data.Dataset.from_tensor_slices(data).prefetch(
                buffer_size=tf.data.experimental.AUTOTUNE).batch(bs, drop_remainder=False)
            outputs = self.bert_model.predict(dataset, verbose=verbose)
            return outputs['last_hidden_state'][:, 0, :].reshape(-1, 768)

In [9]:
import pandas as pd
import os

class EmbedderCreator:
    def __init__(self, model: BERT_model):
        # This model is the model used to convert the restaurant review file into
        # embeddings
        self.embedding_model = model

    def embed(self, review_file_path: str, embedding_file_path: str):
        index = 0
        if os.path.exists(embedding_file_path):
            df = pd.read_csv(embedding_file_path)
            index = df.shape[0]
        else:
            # Creating the column title
            df = pd.DataFrame({
                'Review': [],
                'Embedding': [],
                "item_id": []
            })

            # Writing the DataFrame to CSV
            df.to_csv(embedding_file_path, index=False)

        review_dataset = pd.read_csv(review_file_path)

        size = len(review_dataset["text"])

        # # Writing the embedding into the file
        # df = pd.read_csv(embedding_file_path)

        batch_size = 1024

        list_of_review = [0] * batch_size
        list_of_business_id = [0] * batch_size

        for i in range(index, size):
            if i % 100 == 0:
                print(i / 100)

            if i % batch_size == 0 and i != 0:
                embedding = self.embedding_model.embed(list_of_review)
                embedding = embedding.tolist()
                df_new = pd.DataFrame(
                    {'Review': list_of_review, 'Embedding': embedding, "item_id": list_of_business_id})

                # Append df_new to an existing csv file
                df_new.to_csv(embedding_file_path, mode='a', header=False, index=False)

                list_of_review[0] = review_dataset["text"][i]
                list_of_business_id[0] = review_dataset["item_id"][i]
            else:
                list_of_review[i % batch_size] = review_dataset["text"][i]
                list_of_business_id[i % batch_size] = review_dataset["item_id"][i]

        # Embed the remaining reviews
        remaining_reviews = size % batch_size
        list_of_review = list_of_review[:remaining_reviews]
        list_of_business_id = list_of_business_id[:remaining_reviews]

        embedding = self.embedding_model.embed(list_of_review)
        embedding = embedding.tolist()
        df_new = pd.DataFrame({'Review': list_of_review, 'Embedding': embedding, "item_id": list_of_business_id})

        # Append df_new to an existing csv file
        df_new.to_csv(embedding_file_path, mode='a', header=False, index=False)


if __name__ == '__main__':
    model_name = "sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco"
    embedding_model = BERT_model(model_name, model_name)
    embedder = EmbedderCreator(embedding_model)
    embedder.embed('processed_top_all_review_Clothing_Shoes_and_Jewelry.csv',
                   'embeddings_Clothing_Shoes_and_Jewelry.csv')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
3658.0
3659.0
3660.0
3661.0
3662.0
3663.0
3664.0
3665.0
3666.0
3667.0
3668.0
3669.0
3670.0
3671.0
3672.0
3673.0
3674.0
3675.0
3676.0
3677.0
3678.0
3679.0
3680.0
3681.0
3682.0
3683.0
3684.0
3685.0
3686.0
3687.0
3688.0
3689.0
3690.0
3691.0
3692.0
3693.0
3694.0
3695.0
3696.0
3697.0
3698.0
3699.0
3700.0
3701.0
3702.0
3703.0
3704.0
3705.0
3706.0
3707.0
3708.0
3709.0
3710.0
3711.0
3712.0
3713.0
3714.0
3715.0
3716.0
3717.0
3718.0
3719.0
3720.0
3721.0
3722.0
3723.0
3724.0
3725.0
3726.0
3727.0
3728.0
3729.0
3730.0
3731.0
3732.0
3733.0
3734.0
3735.0
3736.0
3737.0
3738.0
3739.0
3740.0
3741.0
3742.0
3743.0
3744.0
3745.0
3746.0
3747.0
3748.0
3749.0
3750.0
3751.0
3752.0
3753.0
3754.0
3755.0
3756.0
3757.0
3758.0
3759.0
3760.0
3761.0
3762.0
3763.0
3764.0
3765.0
3766.0
3767.0
3768.0
3769.0
3770.0
3771.0
3772.0
3773.0
3774.0
3775.0
3776.0
3777.0
3778.0
3779.0
3780.0
3781.0
3782.0
3783.0
3784.0
3785.0
3786.0
3787.0
3788.0
3789.0
3790.0
3791