In [1]:
from google.cloud import bigquery
client = bigquery.Client()

# Distinct article ids that have US reviews in the 'Open storage' product area.
# The id is aliased to `text`, which is the attribute read from each row below.
query = """ SELECT Distinct art_id as text FROM `ingka-feed-student-dev.RR.RatingsReviews` AS rr
            INNER JOIN `ingka-feed-student-dev.RR.product_categories` AS pc 
            ON rr.art_id = SPLIT(pc.global_id, ',')[SAFE_OFFSET(1)] 
            WHERE country_code = 'us' and PRODUCT_AREA = 'Open storage' """

query_job = client.query(query)

# Iterating the job yields one row per distinct article id.
art_ids = [row.text for row in query_job]

print(len(art_ids))
art_ids[:5]

222


['50339292', '30275861', '20423720', '40415601', '90324509']

In [2]:
import random

# Fixed seed so that Restart & Run All draws the same ground-truth articles.
GROUND_TRUTH_SEED = 42

# Never ask for more articles than the query returned.
sample_size = min(30, len(art_ids))

# The id query has no ORDER BY, so the order of `art_ids` is not guaranteed
# between runs. Sorting first makes the seeded draw depend only on the set of
# ids, and a dedicated Random instance keeps the global RNG state untouched.
ground_truth_ids = random.Random(GROUND_TRUTH_SEED).sample(sorted(art_ids), sample_size)

ground_truth_ids

['19294535',
 '29294554',
 '90301555',
 '69329387',
 '79442689',
 '39442672',
 '09442678',
 '90510865',
 '89278259',
 '29278304',
 '80401292',
 '29442615',
 '69442680',
 '50454507',
 '19189030',
 '59442685',
 '49442676',
 '70538846',
 '19442654',
 '29481701',
 '40501892',
 '49189203',
 '80487813',
 '19442692',
 '59189472',
 '70301542',
 '79278250',
 '59294557',
 '99017445',
 '79442694']

In [3]:
# Reviews ("title. text") of a single article, newest first. The article id is
# supplied as the named query parameter @art_id instead of being formatted
# into the SQL text.
REVIEWS_QUERY = """ SELECT concat(title,'. ',text) as text FROM `ingka-feed-student-dev.RR.RatingsReviews` AS rr
                INNER JOIN `ingka-feed-student-dev.RR.product_categories` AS pc 
                ON rr.art_id = SPLIT(pc.global_id, ',')[SAFE_OFFSET(1)] 
                WHERE country_code = 'us' and PRODUCT_AREA = 'Open storage' and art_id = @art_id
                ORDER BY inserted_on DESC """

# One list of review strings per article, in the same order as ground_truth_ids.
all_reviews = []

for art_id in ground_truth_ids:
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            # The ids are handled as strings throughout the notebook.
            bigquery.ScalarQueryParameter("art_id", "STRING", art_id),
        ]
    )
    query_job = client.query(REVIEWS_QUERY, job_config=job_config)

    all_reviews.append([review.text for review in query_job])
len(all_reviews)

30

In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import download
# Fetch the 'punkt' data (no-op when it is already up to date).
nltk.download('punkt')

def count_tokens_in_reviews(review_list):
    """Return the combined token count of every review in `review_list`.

    Each review is split with `word_tokenize`; an empty list yields 0.
    """
    return sum(len(word_tokenize(entry)) for entry in review_list)

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
def categorize_reviews_by_token_count(reviews, max_tokens):
    """Split review groups by their total token count.

    Args:
        reviews: iterable of review groups (each a list of review strings).
        max_tokens: budget; a group is "above" only when its total token
            count is strictly greater than this value.

    Returns:
        A tuple `(below_limit, above_limit)` of lists holding the original
        group objects, each in input order.
    """
    within_budget, over_budget = [], []

    for group in reviews:
        exceeds_budget = count_tokens_in_reviews(group) > max_tokens
        target = over_budget if exceeds_budget else within_budget
        target.append(group)

    return within_budget, over_budget

In [6]:
# Partition the per-product review groups at the 5000-token budget.
below_limit, above_limit = categorize_reviews_by_token_count(all_reviews, max_tokens=5000)

# Report how many products landed on each side of the budget.
for heading, bucket in (
    ("Products with <= 5000 tokens:", below_limit),
    ("Products with > 5000 tokens:", above_limit),
):
    print(heading)
    print(len(bucket))

Products with <= 5000 tokens:
29
Products with > 5000 tokens:
1


In [7]:
import json
import nltk
import random
from nltk.tokenize import word_tokenize

# Make sure the 'punkt' tokenizer data is available before tokenizing.
nltk.download('punkt')

def count_tokens(text):
    """Return the number of tokens `word_tokenize` produces for `text`."""
    tokens = word_tokenize(text)
    return len(tokens)

# Token budget for one product's reviews, and the seed that makes the random
# subsample reproducible across kernel restarts.
SUBSAMPLE_MAX_TOKENS = 5000
SUBSAMPLE_SEED = 42
subsample_rng = random.Random(SUBSAMPLE_SEED)

# above_limit_final[i] is the subsample drawn from above_limit[i];
# selected_indices[i] lists the positions (within above_limit[i]) that were kept.
above_limit_final = []
selected_indices = []

for reviews in above_limit:
    # (position, review, token count) triples, visited in random order.
    start_reviews = [(i, review, count_tokens(review)) for i, review in enumerate(reviews)]

    subsample_rng.shuffle(start_reviews)

    selected_reviews = []
    selected_indices_set = []
    current_token_count = 0
    for index, review, tokens in start_reviews:
        # Stop at the first review that would exceed the budget; later,
        # shorter reviews are deliberately not tried.
        if current_token_count + tokens > SUBSAMPLE_MAX_TOKENS:
            break
        selected_reviews.append(review)
        selected_indices_set.append(index)
        current_token_count += tokens

    above_limit_final.append(selected_reviews)
    selected_indices.append(selected_indices_set)

# Persist the subsampled review lists under the "reviews" key.
with open('selected_reviews.json', mode='w', encoding='utf-8') as file:
    file.write(json.dumps({"reviews": above_limit_final}))

# Summarise the subsampling: number of sets, then before/after sizes of the first one.
print("Number of sets of selected reviews:", len(above_limit_final))
if len(above_limit_final) > 0:
    first_original, first_selected = above_limit[0], above_limit_final[0]
    print("Number of reviews in the first set:", len(first_original))
    print("Number of selected reviews in the first set:", len(first_selected))


Number of sets of selected reviews: 1
Number of reviews in the first set: 145
Number of selected reviews in the first set: 100


[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Rewrites selected_reviews.json with the same payload the previous cell
# already dumped, so the file always mirrors the current above_limit_final.
with open('selected_reviews.json', mode='w', encoding='utf-8') as file:
    file.write(json.dumps({"reviews": above_limit_final}))

In [9]:
# Number of subsampled review sets (one per entry of above_limit).
len(above_limit_final)

1

In [10]:
# Swap every over-budget review group for its subsample *in place*, keeping the
# original order of all_reviews. The order matters: answers are later zipped
# with ground_truth_ids, so moving the subsampled groups to the front (as a
# plain `above_limit_final + below_limit` does) pairs topics with the wrong
# article ids.
#
# above_limit holds the very list objects taken from all_reviews, and
# above_limit_final[i] is the subsample of above_limit[i], so object identity
# is enough to find the groups to replace.
subsample_by_group_id = {
    id(original_group): subsample
    for original_group, subsample in zip(above_limit, above_limit_final)
}
all_reviews = [subsample_by_group_id.get(id(group), group) for group in all_reviews]
len(all_reviews)

30

In [11]:
!pip install accelerate
import transformers
import torch

access_token = "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"
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    token=access_token,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
import time

# Instruction shared by the worked example and by every real request.
TOPIC_INSTRUCTION = 'From these reviews, generate maximum 15 non-redundant topics. Topics should be 1 word maximum, can be 2 or 3 if necessary. The topics should be about qualities and characteristics of the products (ex: quality, price, …) and not names of objects (like Books, shelves, …). The topics should be names (ex: not durable but durability). Answer only with a comma-separated list of the topics you identify. Reviews: '

# Reviews used in the one-shot example, one literal per review.
# NOTE(review): the exported cell shows a raw line break inside the fifth
# review; it is written as an explicit \n here so the literal stays valid.
# Confirm the separator against the original notebook.
FEW_SHOT_REVIEWS = (
    '"Perfect Simple TV Stand. I actually went in to buy something different, but when I saw the possibility of changing the orientation of this bookshelf and combining with a stand (maybe not even necessary but I think it enhances the appearance), I bought this instead. It seems sturdy and durable, looks good, I’m happy with my purchase. I put a couple of Fossta bins in mine but there are many other options for the storage bays.",'
    '"Good quality. Good looking and quality",'
    '"TV Stand With Base. A Simple, clean and light weight unit when cocmbined with the metal base works well as a tv stand in my case a 55" TV. I\'ll probably get the black baskets which fit perfectly into this or craft something simple to cover the back. I was worried that the round bottom of the legs would not be secure into the sqare legs of the base but they seem to fit without movement. But be aware that the top is not secured in any way- meaning you can lift this kallax right up out of the base. Of course the weight of the tv will prevent that from occurring but ikea should have engineered a way to secure theKallax base to this Kallax unit.",'
    '"IKEA perfection. IKEA perfection"\t,'
    '"Sturdy, good quality and minimalist, Love it!. This media console is just what I needed! It simple, sturdy and minimalist. It was easy to built. Thanks to the visual instructions! Very good quality and it\'s a minimalist piece. It adds to the TV room without interfering with the rest of the furniture but adding weigh to it, which was necessary. \n'
    'It is also useful because I store a blanket in one of the shelves, my sleepers in another one and tv stuff...",\t'
    '"Nice piece. I like it.",'
    '"Simple and elegant. Simple and elegant".'
)

# System prompt plus one worked example (user request -> assistant answer).
# These never change between products, so they are built once.
FEW_SHOT_MESSAGES = [
    {"role": "system", "content": "You are a helpful chatbot who only answer with a comma-separated list of the requested topics and no additional text"},
    {"role": "user", "content": TOPIC_INSTRUCTION + FEW_SHOT_REVIEWS},
    {"role": "assistant", "content": "Durability, Simplicity, Sturdiness, Quality, Weight, Minimalism, Appearance, Security, Ease of Assembly, Flexibility"},
]

# Generation stops on either the tokenizer's EOS token or the <|eot_id|> token.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

start_time = time.time()

# answers[i] is the raw model answer for all_reviews[i].
answers = []

# Count iterations directly: looking the answer up with answers.index() finds
# the first *equal* answer, which misreports progress when two products get
# the same topic list.
for file_number, reviews in enumerate(all_reviews, start=1):
    messages = FEW_SHOT_MESSAGES + [
        # `reviews` is a list; it is rendered into the prompt through its str() form.
        {"role": "user", "content": f'{TOPIC_INSTRUCTION}{reviews}.'},
    ]

    prompt = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    outputs = pipeline(
        prompt,
        max_new_tokens=100,
        eos_token_id=terminators,
        pad_token_id=pipeline.tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
    )

    # generated_text starts with the prompt; keep only what follows it.
    answer = outputs[0]["generated_text"][len(prompt):]
    answers.append(answer)

    if file_number % 10 == 0:
        elapsed_time = time.time() - start_time
        print(f"{file_number}th file completed in {elapsed_time} seconds.")
    torch.cuda.empty_cache()

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


10th file completed in 30.23777461051941 seconds.
20th file completed in 60.05793499946594 seconds.
30th file completed in 87.76824188232422 seconds.


In [13]:
import csv

filename = 'aGroundTruth_topics_llama3.csv'

# Dump one (article id, raw model answer) pair per row, semicolon-separated.
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    # Only the delimiter is customised; quoting stays at the csv default.
    csvwriter = csv.writer(csvfile, delimiter=';')

    # Header row
    header = ['ArticleID', 'Topics']
    csvwriter.writerow(header)

    # Data rows: ids and answers are paired by position.
    csvwriter.writerows([article, answer] for article, answer in zip(ground_truth_ids, answers))

print(f'CSV file "{filename}" has been written successfully.')

CSV file "aGroundTruth_topics_llama3.csv" has been written successfully.


In [14]:
import csv

def process_csv(input_file, output_file):
    """Copy a semicolon-delimited CSV, trimming a leading "...:" from column two.

    For every row with at least two columns, if the second column contains a
    colon, everything up to and including the first colon is dropped and the
    remainder is stripped of surrounding whitespace. All rows (including
    shorter ones) are written to `output_file` otherwise unchanged.

    Args:
        input_file: path of the CSV to read (UTF-8, ';' delimiter).
        output_file: path of the CSV to write (UTF-8, ';' delimiter).
    """
    with open(input_file, mode='r', newline='', encoding='utf-8') as source:
        with open(output_file, mode='w', newline='', encoding='utf-8') as target:
            rows_in = csv.reader(source, delimiter=';')
            rows_out = csv.writer(target, delimiter=';')

            for record in rows_in:
                if len(record) >= 2:
                    # partition() splits on the first colon only; an empty
                    # separator means the column had no colon at all.
                    _prefix, separator, remainder = record[1].partition(':')
                    if separator:
                        record[1] = remainder.strip()
                rows_out.writerow(record)

# Clean the raw topics file into the final ground-truth file.
process_csv(
    input_file='aGroundTruth_topics_llama3.csv',
    output_file='GroundTruth_topics_llama3.csv',
)
