In [1]:
from google.cloud import bigquery
client = bigquery.Client()

query = """ SELECT Distinct art_id as text FROM `ingka-feed-student-dev.RR.RatingsReviews` AS rr
            INNER JOIN `ingka-feed-student-dev.RR.product_categories` AS pc 
            ON rr.art_id = SPLIT(pc.global_id, ',')[SAFE_OFFSET(1)] 
            WHERE country_code = 'us' and PRODUCT_AREA = 'Open storage' """

query_job = client.query(query)

art_ids = []

for art_id in query_job:
     art_ids.append(art_id.text)

print(len(art_ids))
art_ids[:5]

222


['50339292', '30275861', '20423720', '40415601', '90324509']

In [2]:
import random

sample_size = min(5, len(art_ids))

# Read the blacklist file (if it exists) and store IDs in a set
blacklist = set()
try:
    with open("csv/GroundTruth_complete.csv", 'r') as f:
        for line in f:
            blacklist.add(line.strip().split(';')[0])
except FileNotFoundError:
    pass

blacklist

{'19189030',
 '19294535',
 '19442654',
 '19442692',
 '29278304',
 '29294554',
 '29442615',
 '29481701',
 '39442672',
 '40501892',
 '49189203',
 '49442676',
 '50454507',
 '59189472',
 '59294557',
 '59442685',
 '69329387',
 '69442680',
 '70301542',
 '70538846',
 '79278250',
 '79442689',
 '79442694',
 '80401292',
 '80487813',
 '89278259',
 '90301555',
 '90510865',
 '9442678',
 '99017445',
 '\ufeffArticleID'}

In [3]:
# Filter art_ids excluding entries in the first column of the blacklist
filtered_art_ids = [art_id for art_id in art_ids if art_id not in blacklist]

# Get a random sample of the filtered list (size at most 30)
ground_truth_ids = random.sample(filtered_art_ids, sample_size)

print(ground_truth_ids)

blacklist.add(ground_truth_id for ground_truth_id in ground_truth_ids)

blacklist = [str(number).zfill(8) for number in blacklist] #Add 0 padding for articleIDs

blacklist

['69442642', '99442674', '29197572', '79278269', '20423720']


['70538846',
 '59189472',
 '99017445',
 '19189030',
 '50454507',
 '49189203',
 '79442694',
 '29278304',
 '29481701',
 '19294535',
 '09442678',
 '19442654',
 '39442672',
 '80487813',
 '29294554',
 '40501892',
 '<generator object <genexpr> at 0x7f3eb99b3bc0>',
 '19442692',
 '89278259',
 '59442685',
 '69329387',
 '80401292',
 '29442615',
 '90510865',
 '\ufeffArticleID',
 '90301555',
 '79278250',
 '70301542',
 '49442676',
 '69442680',
 '79442689',
 '59294557']

In [4]:
filtered_art_ids = [art_id for art_id in art_ids if art_id not in blacklist]
ground_truth_ids = random.sample(filtered_art_ids, sample_size)

ground_truth_ids

['49306872', '59481653', '49275088', '89442641', '79552908']

In [3]:
all_reviews = []

for art_id in ground_truth_ids:
    query = f""" SELECT concat(title,'. ',text) as text FROM `ingka-feed-student-dev.RR.RatingsReviews` AS rr
                INNER JOIN `ingka-feed-student-dev.RR.product_categories` AS pc 
                ON rr.art_id = SPLIT(pc.global_id, ',')[SAFE_OFFSET(1)] 
                WHERE country_code = 'us' and PRODUCT_AREA = 'Open storage' and art_id = '{art_id}'
                ORDER BY inserted_on DESC """

    query_job = client.query(query)

    article_reviews = []

    for review in query_job:
         article_reviews.append(review.text)

    all_reviews.append(article_reviews)
len(all_reviews)

5

In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import download
download('punkt')

def count_tokens_in_reviews(review_list):
    total_tokens = 0
    for review in review_list:
        total_tokens += len(word_tokenize(review))
    return total_tokens

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
def categorize_reviews_by_token_count(reviews, max_tokens):
    above_limit = []
    below_limit = []

    for review_group in reviews:
        token_count = count_tokens_in_reviews(review_group)
        if token_count > max_tokens:
            above_limit.append(review_group)
        else:
            below_limit.append(review_group)
    
    return below_limit, above_limit

In [6]:
# Applying the function
below_limit, above_limit = categorize_reviews_by_token_count(all_reviews, max_tokens=5000)

print("Products with <= 5000 tokens:")
print(len(below_limit))

print("Products with > 5000 tokens:")
print(len(above_limit))

Products with <= 5000 tokens:
5
Products with > 5000 tokens:
0


In [7]:
!pip install accelerate
import transformers
import torch

access_token = "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"
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    token=access_token,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
import time

start_time = time.time()

answers = []

for reviews in all_reviews:
    messages = [
        {"role": "system", "content": "You are a helpful chatbot who only answer with a comma-separated list of the requested topics and no additional text"},
        {"role": "user", "content": 'From these reviews, generate maximum 15 non-redundant topics. Topics should be 1 word maximum, can be 2 or 3 if necessary. The topics should be about qualities and characteristics of the products (ex: quality, price, …) and not names of objects (like Books, shelves, …). The topics should be names (ex: not durable but durability). Answer only with a comma-separated list of the topics you identify. Reviews: "Perfect Simple TV Stand. I actually went in to buy something different, but when I saw the possibility of changing the orientation of this bookshelf and combining with a stand (maybe not even necessary but I think it enhances the appearance), I bought this instead. It seems sturdy and durable, looks good, I’m happy with my purchase. I put a couple of Fossta bins in mine but there are many other options for the storage bays.","Good quality. Good looking and quality","TV Stand With Base. A Simple, clean and light weight unit when cocmbined with the metal base works well as a tv stand in my case a 55" TV. I\'ll probably get the black baskets which fit perfectly into this or craft something simple to cover the back. I was worried that the round bottom of the legs would not be secure into the sqare legs of the base but they seem to fit without movement. But be aware that the top is not secured in any way- meaning you can lift this kallax right up out of the base. Of course the weight of the tv will prevent that from occurring but ikea should have engineered a way to secure theKallax base to this Kallax unit.","IKEA perfection. IKEA perfection"	,"Sturdy, good quality and minimalist, Love it!. This media console is just what I needed! It simple, sturdy and minimalist. It was easy to built. Thanks to the visual instructions! Very good quality and it\'s a minimalist piece. It adds to the TV room without interfering with the rest of the furniture but adding weigh to it, which was necessary. It is also useful because I store a blanket in one of the shelves, my sleepers in another one and tv stuff...",	"Nice piece. I like it.","Simple and elegant. Simple and elegant".'},
        {"role": "assistant", "content": "Durability, Simplicity, Sturdiness, Quality, Weight, Minimalism, Appearance, Security, Ease of Assembly, Flexibility"},
        {"role": "user", "content": f'From these reviews, generate maximum 15 non-redundant topics. Topics should be 1 word maximum, can be 2 or 3 if necessary. The topics should be about qualities and characteristics of the products (ex: quality, price, …) and not names of objects (like Books, shelves, …). The topics should be names (ex: not durable but durability). Answer only with a comma-separated list of the topics you identify. Reviews: {reviews}.'},
    ]

    prompt = pipeline.tokenizer.apply_chat_template(
            messages, 
            tokenize=False, 
            add_generation_prompt=True
    )

    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = pipeline(
    prompt,
    max_new_tokens=100,
    eos_token_id=terminators,    
    pad_token_id=pipeline.tokenizer.eos_token_id,
    do_sample=True,
    temperature=0.3,
    top_p=0.9,
    )

    answer=outputs[0]["generated_text"][len(prompt):]
    answers.append(answer)
    if (answers.index(answer) + 1) % 10 == 0: 
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"{answers.index(answer) + 1}th file completed in {elapsed_time} seconds.")
    torch.cuda.empty_cache()

In [9]:
import csv

filename = '5Shot_Examples.csv'

# Writing to the csv file
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    # Creating a csv writer object with quoting set to quote all fields
    csvwriter = csv.writer(csvfile, delimiter=';')
    
    # Writing the columns
    csvwriter.writerow(['ArticleID', 'Topics'])
    
    # Writing the data
    for article, answer in zip(ground_truth_ids, answers):
        csvwriter.writerow([article, answer])

print(f'CSV file "{filename}" has been written successfully.')

CSV file "5Shot_Examples.csv" has been written successfully.
