In [1]:
import pandas as pd

# Ground-truth file, read as a semicolon-separated table.
file_path = "csv/GroundTruthV2.csv"
df = pd.read_csv(file_path, sep=";")

# Columns are taken by position: column 0 holds the article IDs, column 1 the topics.
topics = df.iloc[:, 1].tolist()

# Article IDs are left-padded with zeros to 8 characters
# (presumably because numeric parsing dropped the leading zeros — TODO confirm).
art_ids = [str(raw_id).zfill(8) for raw_id in df.iloc[:, 0].tolist()]

# Print the IDs to verify
print("Art Ids:", art_ids)

Art Ids: ['70301542', '19294535', '89442655', '90301555', '69329387', '79442689', '39442672', '09442678', '90510865', '89278259', '29278304', '80401292', '29442615', '69442680', '50454507', '19189030', '59442685', '49442676', '70538846', '19442654', '29481701', '40501892', '49189203', '80487813', '19442692', '59189472', '79278250', '59294557', '99017445', '79442694']


In [2]:
from google.cloud import bigquery
client = bigquery.Client()

# The article ID is bound as a named query parameter (@art_id) rather than being
# interpolated into the SQL text, so a value from the CSV can never change the
# structure of the statement. The query text is therefore the same for every article.
query = """SELECT concat(title,'. ',text) as text FROM `ingka-feed-student-dev.RR.RatingsReviews` AS rr
           INNER JOIN `ingka-feed-student-dev.RR.product_categories` AS pc
           ON rr.art_id = SPLIT(pc.global_id, ',')[SAFE_OFFSET(1)]
           WHERE country_code = 'us' and PRODUCT_AREA = 'Open storage' and art_id = @art_id
           ORDER BY inserted_on DESC """

# all_reviews[i] is the list of "title. text" strings for art_ids[i], newest first
# (possibly empty when the query returns no rows).
all_reviews = []

for art_id in art_ids:
    job_config = bigquery.QueryJobConfig(
        query_parameters=[bigquery.ScalarQueryParameter("art_id", "STRING", art_id)]
    )
    query_job = client.query(query, job_config=job_config)

    # Iterating the query job yields the result rows.
    article_reviews = [review.text for review in query_job]

    all_reviews.append(article_reviews)
len(all_reviews)

30

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import download
# Fetch the 'punkt' NLTK data package (a no-op when it is already up to date).
# Presumably this is the tokenizer data word_tokenize relies on — TODO confirm for the installed NLTK version.
download('punkt')

def count_tokens_in_reviews(review_list):
    """Return the combined number of word tokens over all reviews.

    Args:
        review_list: iterable of review strings.

    Returns:
        The sum of ``len(word_tokenize(review))`` over ``review_list``;
        0 when ``review_list`` is empty.
    """
    return sum(len(word_tokenize(text)) for text in review_list)

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
def categorize_reviews_by_token_count(reviews, max_tokens, token_counter=None):
    """Split review groups into those within and those over a token budget.

    Args:
        reviews: sequence of review groups (each group is a list of review strings).
        max_tokens: a group is "above the limit" when its token count is
            strictly greater than this value.
        token_counter: optional callable mapping a review group to its token
            count. Defaults to ``count_tokens_in_reviews``.

    Returns:
        A tuple ``(below_limit, above_limit, index_above)`` where
        ``below_limit`` and ``above_limit`` hold the groups in their original
        relative order and ``index_above`` holds the positions, within
        ``reviews``, of the groups placed in ``above_limit``.
    """
    # Resolve the default lazily so a custom counter never needs the global helper.
    counter = count_tokens_in_reviews if token_counter is None else token_counter

    below_limit = []
    above_limit = []
    index_above = []

    # enumerate() gives the true position of each group in `reviews`. Looking the
    # group up with list.index() would return the first *equal* group (wrong when
    # two products have identical review lists) and previously searched the
    # global `all_reviews` rather than the argument.
    for position, review_group in enumerate(reviews):
        if counter(review_group) > max_tokens:
            above_limit.append(review_group)
            index_above.append(position)
        else:
            below_limit.append(review_group)

    return below_limit, above_limit, index_above

In [5]:
# Token budget per product. Kept in one place so the threshold passed to the
# function and the threshold shown in the messages cannot drift apart.
MAX_TOKENS = 5000

# Split the products by total review token count.
below_limit, above_limit, index_above = categorize_reviews_by_token_count(all_reviews, max_tokens=MAX_TOKENS)

print(f"Products with <= {MAX_TOKENS} tokens:")
print(len(below_limit))

print(f"Products with > {MAX_TOKENS} tokens:")
print(len(above_limit))

Products with <= 5000 tokens:
29
Products with > 5000 tokens:
1


In [6]:
import json

# Read the JSON document and keep its "reviews" entry.
# NOTE(review): presumably these are the reduced review lists that stand in for
# the over-limit products (they are concatenated with below_limit later) — confirm.
with open('selected_reviews.json', encoding='utf-8') as file:
    data = json.load(file)

above_limit_final = data["reviews"]

In [7]:
# Move the IDs of the over-limit products to the front, keeping the relative
# order of everything else.
# NOTE: art_ids is reassigned from itself, so re-running this cell applies
# index_above to the already reordered list.
moved_positions = set(index_above)

elements_to_move = [art_ids[pos] for pos in index_above]
remaining_elements = [
    art_id for pos, art_id in enumerate(art_ids) if pos not in moved_positions
]

art_ids = elements_to_move + remaining_elements

art_ids

['70301542',
 '19294535',
 '89442655',
 '90301555',
 '69329387',
 '79442689',
 '39442672',
 '09442678',
 '90510865',
 '89278259',
 '29278304',
 '80401292',
 '29442615',
 '69442680',
 '50454507',
 '19189030',
 '59442685',
 '49442676',
 '70538846',
 '19442654',
 '29481701',
 '40501892',
 '49189203',
 '80487813',
 '19442692',
 '59189472',
 '79278250',
 '59294557',
 '99017445',
 '79442694']

In [8]:
# Rebuild the per-product review lists: the entries loaded from
# selected_reviews.json first, then the products that were within the token limit.
# NOTE(review): this only lines up with the reordered art_ids if above_limit_final
# has one entry per position in index_above, in the same order — verify.
all_reviews = above_limit_final + below_limit
len(all_reviews)

30

In [9]:
import pandas as pd

# Keyphrase table; the 'art_id' and 'core_keyphrases' columns are used below.
data = pd.read_csv('csv/core_keyphrases_hierarchical.csv')

# One list of keyphrase strings per article, in the same order as art_ids.
# The zero-padded ID strings are cast to int before comparing with the
# 'art_id' column; an article with no matching row gets an empty list.
core_keyphrases = [
    data.loc[data['art_id'] == int(padded_id), 'core_keyphrases'].tolist()
    for padded_id in art_ids
]

print(core_keyphrases)

[['Nice and sturdy, Great storage space, easily assemble, loose construction,'], [], [], ['several sizes, works well, sturdy, great storage space, great value, easy to assemble, looks good, Arrived damaged,'], ["Frustrating item, latch doesn't work, easy to clean underneath, permanent room feature, Architectural Digest look, crooked drawers,"], ['looks good, optional drawers, would recommend Kallax products, easy assembly,'], [], ['looks wonderful, many options,'], [], ['great storage, Unable to purchase,'], ['Easy to transport, Kallax series, Installing doors and drawers, Durable, Excellent storage solution, Looks great, Great product, affordable, easy assembly,'], ['great use of space, well-built, Handy for craft supplies, for kallax shelves, holds wine bottles, Easy to assemble,'], ['comes together nicely, addition of legs,'], ['Built an entire wall, decent appearance, heavy-duty c-clamps, clear directions for base installation, several different colors, Works great for storage, Old

In [10]:
import pandas as pd

# Few-shot example file, read as a semicolon-separated table.
file_path = "5Shot_Examples.csv"
df = pd.read_csv(file_path, sep=";")

# Columns are taken by position: column 0 holds the article IDs, column 1 the topics.
topics_5shot = df.iloc[:, 1].tolist()

# Article IDs are left-padded with zeros to 8 characters.
art_ids_5shot = [str(raw_id).zfill(8) for raw_id in df.iloc[:, 0].tolist()]

# Print both lists to verify
print("Art Ids:", art_ids_5shot)
print("Topics:", topics_5shot)

Art Ids: ['80508504', '49398678', '09294526', '10538849', '29278262']
Topics: ['Quality, Price, Availability, Practicality, Style, Versatilite, Ease of Assembly, Screw', 'Quality, Ease of Assembly, Delivery', 'Quality, Size, Sturdiness, Space, Height, Ease of Assembly', 'Quality, Ease of Assembly, Instructions, Appearance, Material, Price, Design, Sturdiness', 'Quality, Ease of Assembly, Appearance, Versatility, Limitations, Value, Functionality, Sturdiness']


In [11]:
import pandas as pd

# Keyphrase table; the 'art_id' and 'core_keyphrases' columns are used below.
data = pd.read_csv('csv/core_keyphrases_hierarchical.csv')

# One list of keyphrase strings per few-shot article, in the same order as
# art_ids_5shot. The zero-padded ID strings are cast to int before comparing
# with the 'art_id' column; an article with no matching row gets an empty list.
core_keyphrases_5shot = [
    data.loc[data['art_id'] == int(padded_id), 'core_keyphrases'].tolist()
    for padded_id in art_ids_5shot
]

print(core_keyphrases_5shot)

[['Good product, Kallax shelf unit, easy installation, limited availability, new price increase, stripped screws, white interior, wish for right/left options,'], ['easy to assemble, Exceptional pieces of storage, good experience,'], ["Perfect storage solution, didn't always line up right,"], ['nice material, include wall anchor kit, Clear installation instructions, pleasing shelf unit, carpenter friend found it challenging,'], ['Sturdy, Organization, Happy about product, Easy to find, junk doors, Easy assembly, good looking, slightly different,']]


In [12]:
from google.cloud import bigquery
client = bigquery.Client()

# The article ID is bound as a named query parameter (@art_id) rather than being
# interpolated into the SQL text, so a value from the CSV can never change the
# structure of the statement. The query text is therefore the same for every article.
query = """SELECT concat(title,'. ',text) as text FROM `ingka-feed-student-dev.RR.RatingsReviews` AS rr
           INNER JOIN `ingka-feed-student-dev.RR.product_categories` AS pc
           ON rr.art_id = SPLIT(pc.global_id, ',')[SAFE_OFFSET(1)]
           WHERE country_code = 'us' and PRODUCT_AREA = 'Open storage' and art_id = @art_id
           ORDER BY inserted_on DESC """

# reviews_5shot[i] is the list of "title. text" strings for art_ids_5shot[i],
# newest first (possibly empty when the query returns no rows).
reviews_5shot = []

for art_id in art_ids_5shot:
    job_config = bigquery.QueryJobConfig(
        query_parameters=[bigquery.ScalarQueryParameter("art_id", "STRING", art_id)]
    )
    query_job = client.query(query, job_config=job_config)

    # Iterating the query job yields the result rows.
    article_reviews = [review.text for review in query_job]

    reviews_5shot.append(article_reviews)
len(reviews_5shot)

5

In [13]:
!pip install accelerate
import os

import transformers
import torch

# Never store access tokens in the notebook: they end up in version control and
# in shared copies. The token is read from the HF_TOKEN environment variable
# instead. The token that used to be hard-coded here must be considered leaked
# and should be revoked in the Hugging Face account settings.
# NOTE(review): when HF_TOKEN is unset this passes token=None, which is expected
# to fall back to locally stored credentials — confirm for the installed
# transformers version.
access_token = os.environ.get("HF_TOKEN")

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Text-generation pipeline; weights are loaded in bfloat16 and placed
# automatically across the available devices.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    token=access_token,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
import time

# Number of worked examples (user request + assistant answer) placed before the real request.
N_SHOTS = 5

SYSTEM_PROMPT = "You are a helpful chatbot who only answer with a comma-separated list of the requested topics and no additional text"
TASK_INSTRUCTION = 'From these reviews and core keyphrases, generate maximum 8 non-redundant topics. Topics should be 1 word maximum, can be 2 or 3 if necessary. The topics should be about qualities and characteristics of the products (ex: quality, price, …). The topics should be nouns (ex: not durable but durability). Answer only with a comma-separated list of the topics you identify.'


def build_user_message(reviews, keyphrases):
    """Return the user chat message asking for topics for one product.

    Args:
        reviews: list of review strings for the product.
        keyphrases: list of core keyphrase strings for the product.

    Returns:
        A chat message dict with role "user"; both arguments are embedded
        using their default string formatting.
    """
    return {"role": "user", "content": f'{TASK_INSTRUCTION} Reviews: {reviews}. Keyphrases: {keyphrases}.'}


# The system prompt and the few-shot examples are the same for every product,
# so they are built once instead of being repeated inside the loop.
few_shot_messages = [{"role": "system", "content": SYSTEM_PROMPT}]
for shot in range(N_SHOTS):
    few_shot_messages.append(build_user_message(reviews_5shot[shot], core_keyphrases_5shot[shot]))
    few_shot_messages.append({"role": "assistant", "content": f'{topics_5shot[shot]}'})

# Generation stops on either the tokenizer's EOS token or the "<|eot_id|>" token.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

start_time = time.time()

# answers[i] is the raw model output for all_reviews[i].
answers = []

# `completed` is a 1-based counter of processed products. Deriving it from
# answers.index(answer) reported the position of the first identical answer,
# so progress was misreported whenever two products produced the same text.
for completed, (reviews, core) in enumerate(zip(all_reviews, core_keyphrases), start=1):
    messages = few_shot_messages + [build_user_message(reviews, core)]

    prompt = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    outputs = pipeline(
        prompt,
        max_new_tokens=50,
        eos_token_id=terminators,
        pad_token_id=pipeline.tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )

    # Drop the prompt prefix so only the newly generated continuation is kept.
    answer = outputs[0]["generated_text"][len(prompt):]
    answers.append(answer)

    if completed % 10 == 0:
        elapsed_time = time.time() - start_time
        print(f"{completed}th file completed in {elapsed_time} seconds.")
    torch.cuda.empty_cache()

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


10th file completed in 27.975313425064087 seconds.
20th file completed in 52.160367488861084 seconds.
30th file completed in 76.06164073944092 seconds.


In [15]:
import csv

# Destination of the generated topics; passed to write_csv at the end of this cell.
filename = "csv/GTv2/5ShotLlama3ReviewsKeyphrasesHierarchical.csv"

def process_answer(answer, max_words=8):
    """Keep at most ``max_words`` comma-separated items of ``answer``.

    The string is split on ',' and the leading items are re-joined with ','.
    Items are not stripped, so whitespace around them is preserved.

    Args:
        answer: comma-separated string.
        max_words: maximum number of items to keep.

    Returns:
        The truncated comma-separated string.
    """
    items = answer.split(',')
    kept = items[:max_words]
    return ','.join(kept)

def write_csv(filename, art_ids, answers, max_words=8):
    """Write article IDs and their truncated topic lists to a CSV file.

    The file is (over)written as UTF-8 with ';' as delimiter and a header row
    ``ArticleID;Topics``. IDs and answers are paired positionally with zip(),
    so pairing stops at the shorter of the two sequences.

    Args:
        filename: path of the file to write.
        art_ids: article IDs, one per row.
        answers: comma-separated topic strings, one per row.
        max_words: maximum number of comma-separated items kept per answer
            (forwarded to process_answer).
    """
    header = ['ArticleID', 'Topics']

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        # Semicolon-delimited writer using the csv module's default quoting.
        csvwriter = csv.writer(csvfile, delimiter=';')
        csvwriter.writerow(header)

        # One row per (article, answer) pair, with the answer truncated first.
        csvwriter.writerows(
            [article, process_answer(answer, max_words)]
            for article, answer in zip(art_ids, answers)
        )

# Write one row per article; max_words is left at its default of 8, so each
# answer is cut to at most 8 comma-separated items before being written.
write_csv(filename, art_ids, answers)
print(f"{filename} written with limited answer length!")

csv/GTv2/5ShotLlama3ReviewsKeyphrasesHierarchical.csv written with limited answer length!
