In [1]:
#!pip install mlxtend

In [2]:
### POST PROCESSING - ADDITION OF KEYPHRASES
#
# Merges the keyphrase-derived topics (first CSV) into the review-derived
# topics (second CSV), per ArticleID, and writes the combined result out.

import pandas as pd

# Upper bound on the number of topics kept per article after merging.
MAX_TOPICS_PER_ARTICLE = 10

# Keyphrase fragments that must never be added.
# NOTE(review): these are compared verbatim, including the leading space left
# behind by splitting on ','. A fragment in first position (no leading space)
# would therefore not be filtered -- confirm this is intended.
EXCLUDED_TOPICS = (' Ease', ' Assembly')

# Load the CSV files
df1 = pd.read_csv('csv/GTv2/5ShotLlama3KeyphrasesKMeans.csv', sep=';')
df2 = pd.read_csv('csv/GTv2/5ShotLlama3Reviews.csv', sep=';')

# Ensure Topics are read as strings and handle NaN values
df1['Topics'] = df1['Topics'].fillna('').astype(str)
df2['Topics'] = df2['Topics'].fillna('').astype(str)

# Convert comma-separated string to list
df1['Topics'] = df1['Topics'].apply(lambda x: x.split(',') if x != '' else [])
df2['Topics'] = df2['Topics'].apply(lambda x: x.split(',') if x != '' else [])

# Merging topics with conditions
for _, keyphrase_row in df1.iterrows():
    article_id = keyphrase_row['ArticleID']
    topics_to_add = keyphrase_row['Topics']

    # Find the corresponding rows in the second dataframe
    match_indices = df2.index[df2['ArticleID'] == article_id]
    for idx in match_indices:
        # Work on a copy so the stored list is replaced in a single assignment.
        updated_topics = list(df2.at[idx, 'Topics'])

        for topic in topics_to_add:
            if len(updated_topics) >= MAX_TOPICS_PER_ARTICLE:
                break  # length cap reached; nothing further can be added
            # Check against the list being built (not only the topics that were
            # already stored) so a keyphrase repeated within the same CSV cell
            # is added at most once.
            if topic in EXCLUDED_TOPICS or topic in updated_topics:
                continue
            updated_topics.append(topic)

        # Update the second dataframe
        df2.at[idx, 'Topics'] = updated_topics

# Convert the topics list back to a string (an empty list joins to '')
df2['Topics'] = df2['Topics'].apply(','.join)

# Save the updated second file
df2.to_csv('csv/GTv2/H10topics-5shotReviews.csv', index=False, sep=';')

In [3]:
import pandas as pd

file_path = "csv/GTv2/H10topics-5shotReviews.csv"

df = pd.read_csv(file_path, sep=";")

# The file is addressed positionally: column 0 holds the article ids,
# column 1 holds the topics string.
id_column = df.iloc[:, 0]
topic_column = df.iloc[:, 1]

topics = topic_column.tolist()

# Article ids are rendered as text and left-padded with zeros to 8 characters.
art_ids = [str(raw_id).zfill(8) for raw_id in id_column.tolist()]

# Print the ids to verify
print("Art Ids:", art_ids)

Art Ids: ['70301542', '19294535', '89442655', '90301555', '69329387', '79442689', '39442672', '09442678', '90510865', '89278259', '29278304', '80401292', '29442615', '69442680', '50454507', '19189030', '59442685', '49442676', '70538846', '19442654', '29481701', '40501892', '49189203', '80487813', '19442692', '59189472', '79278250', '59294557', '99017445', '79442694']


In [4]:
from google.cloud import bigquery
client = bigquery.Client()

# One query text for every article: the article id is bound as a named query
# parameter (@target_art_id) instead of being interpolated into the SQL string,
# so values coming from the CSV can never alter the statement itself.
query = """SELECT concat(title,'. ',text) as text FROM `ingka-feed-student-dev.RR.RatingsReviews` AS rr
                INNER JOIN `ingka-feed-student-dev.RR.product_categories` AS pc 
                ON rr.art_id = SPLIT(pc.global_id, ',')[SAFE_OFFSET(1)] 
                WHERE country_code = 'us' and PRODUCT_AREA = 'Open storage' and art_id = @target_art_id
                ORDER BY inserted_on DESC """

# all_reviews[i] is the list of review texts for art_ids[i], in query order.
all_reviews = []

for art_id in art_ids:
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("target_art_id", "STRING", art_id),
        ]
    )
    query_job = client.query(query, job_config=job_config)

    # Iterating the job yields result rows; keep only the concatenated text.
    article_reviews = [review.text for review in query_job]

    all_reviews.append(article_reviews)
len(all_reviews)

30

In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import download
download('punkt')

def count_tokens_in_reviews(review_list):
    """Return the combined number of tokens across all reviews in ``review_list``.

    Each review is tokenized with ``word_tokenize`` and the per-review token
    counts are summed. An empty list yields 0.
    """
    return sum(len(word_tokenize(single_review)) for single_review in review_list)

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
def categorize_reviews_by_token_count(reviews, max_tokens):
    """Split review groups by whether their total token count exceeds a limit.

    Parameters
    ----------
    reviews : list
        Sequence of review groups; each group is passed as-is to
        ``count_tokens_in_reviews``.
    max_tokens : int
        Groups with strictly more tokens than this go to the "above" bucket.

    Returns
    -------
    tuple
        ``(below_limit, above_limit, index_above)`` where ``below_limit`` and
        ``above_limit`` hold the groups in input order and ``index_above``
        holds the positions *within ``reviews``* of the above-limit groups.
    """
    above_limit = []
    below_limit = []
    index_above = []

    # enumerate() gives the true position of each group. Looking the group up
    # with list.index() would return the first *equal* group (wrong when two
    # products have identical review lists) and would also depend on a global
    # list instead of the ``reviews`` argument.
    for position, review_group in enumerate(reviews):
        if count_tokens_in_reviews(review_group) > max_tokens:
            above_limit.append(review_group)
            index_above.append(position)
        else:
            below_limit.append(review_group)

    return below_limit, above_limit, index_above

In [7]:
# Split the per-product review lists around the 5000-token limit.
below_limit, above_limit, index_above = categorize_reviews_by_token_count(
    all_reviews,
    max_tokens=5000,
)

# Report how many products landed in each bucket (heading, then count).
for heading, bucket in (
    ("Products with <= 5000 tokens:", below_limit),
    ("Products with > 5000 tokens:", above_limit),
):
    print(heading)
    print(len(bucket))

Products with <= 5000 tokens:
29
Products with > 5000 tokens:
1


In [8]:
import json

# Read the JSON document and keep its "reviews" entry.
# NOTE(review): presumably a reduced review set for the products that exceeded
# the token limit -- confirm how selected_reviews.json is produced.
with open('selected_reviews.json', mode='r', encoding='utf-8') as file:
    data = json.load(file)

above_limit_final = data["reviews"]

In [9]:
# all_reviews is rebuilt below as (above-limit products first, then the rest),
# so every list that runs parallel to it must be permuted the same way.
# That includes `topics`, which is later zipped with all_reviews when the
# prompts are built; leaving it in file order would pair products with the
# wrong topic list whenever an above-limit product is not already first.
# NOTE: this cell permutes art_ids/topics in place; run it once per kernel.
moved_positions = set(index_above)

elements_to_move = [art_ids[i] for i in index_above]
remaining_elements = [art_id for i, art_id in enumerate(art_ids) if i not in moved_positions]

topics_to_move = [topics[i] for i in index_above]
remaining_topics = [topic for i, topic in enumerate(topics) if i not in moved_positions]

art_ids = elements_to_move + remaining_elements
topics = topics_to_move + remaining_topics

all_reviews = above_limit_final + below_limit
len(all_reviews)

30

In [10]:
# Spot-check: display the article id stored at position 8 of the reordered list.
art_ids[8]

'90510865'

In [11]:
!pip install accelerate
import transformers
import torch

access_token = "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"

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    token=access_token,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
import time

start_time = time.time()

# One generated answer per product, in the same order as all_reviews.
answers = []

# The two lists are consumed pairwise; a length mismatch would silently drop
# products in zip(), so fail loudly instead.
assert len(all_reviews) == len(topics), "all_reviews and topics must be the same length"

# Generation stops at either the tokenizer's EOS token or the "<|eot_id|>" token.
# These do not change between iterations, so they are computed once.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

for completed, (reviews, topic) in enumerate(zip(all_reviews, topics), start=1):

    messages = [
        {"role": "system", "content":"You are a precise review analyzer. Your task is to count the frequency of user-specified topics in product reviews. Count both explicit mentions and implicit references that strongly relate to each topic. Report results only as 'topic: count' pairs. Only include topics with non-zero counts."},
        {"role": "user", "content": f'Analyze the following product review and provide the count for each listed topic. Include both explicit mentions and implicit references that strongly relate to each topic. Report only as "topic: count" pairs for non-zero counts.Review Text: {reviews}.Topics to analyze: {topic}.Examples of implicit mentions:- Quality: "good product", "great product", "works well"- Appearance: "looks great", "stylish", "attractive"- Ease of Assembly: "simple to put together", "no hassle setup"- Functionality: "works as expected", "does the job"- Sturdiness: "built to last", "sturdy"- Value: "worth the price", "cheap"Provide your analysis as "topic: count" pairs only for topics with non-zero counts.'},
        ]

    prompt = pipeline.tokenizer.apply_chat_template(
            messages, 
            tokenize=False, 
            add_generation_prompt=True
    )

    outputs = pipeline(
        prompt,
        max_new_tokens=150,
        eos_token_id=terminators,
        pad_token_id=pipeline.tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )

    # The generated text starts with the prompt; keep only the continuation.
    answer = outputs[0]["generated_text"][len(prompt):]

    answers.append(answer)

    # Progress report every 10 products. The running counter from enumerate()
    # is used rather than answers.index(answer), which returns the position of
    # the first *equal* answer and so misreports when two answers are identical.
    if completed % 10 == 0:
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"{completed}th file completed in {elapsed_time} seconds.")
    torch.cuda.empty_cache()

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


10th file completed in 41.79352641105652 seconds.
20th file completed in 88.07239603996277 seconds.
30th file completed in 129.66538786888123 seconds.


In [15]:
# Flatten each answer onto one line: newlines and '*' both become ', '.
answers = [
    raw_answer.replace('\n', ', ').replace('*', ', ')
    for raw_answer in answers
]


In [18]:
import csv

filename = "csv/GTv2/llamaPostProcessing.csv"

def write_csv(filename, art_ids, answers):
    """Write article ids and their answers to a ';'-delimited CSV file.

    The file is created (or overwritten) as UTF-8 with a header row
    ``ArticleID;Frequencies`` followed by one row per ``(art_id, answer)``
    pair. Pairs are formed with ``zip``, so rows stop at the shorter input.
    """
    header = ['ArticleID', 'Frequencies']

    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        # Default csv quoting applies: a field is quoted only when it contains
        # the delimiter, a quote character, or a line break.
        table = csv.writer(out_file, delimiter=';')
        table.writerow(header)
        table.writerows([article, answer] for article, answer in zip(art_ids, answers))

# Persist each article id next to its model answer, then confirm on stdout.
write_csv(filename, art_ids, answers)
print(f"{filename} written!")

csv/GTv2/llamaPostProcessing.csv written!


In [19]:
df = pd.read_csv('csv/GTv2/llamaPostProcessing.csv', sep=';')

# Marker that ends the model's introductory sentence: a colon followed by a
# blank line, after newlines were rewritten to ', ' before the CSV was saved.
PREAMBLE_MARKER = ':, ,'


def _drop_preamble(text):
    """Return the part of ``text`` after the first preamble marker.

    Values without the marker -- including non-string cells such as NaN for
    empty answers -- are returned unchanged. Guarding on the full marker
    (rather than on a bare ':') avoids an IndexError for answers that contain
    'topic: count' pairs but no preamble, and makes re-running this cell on an
    already-cleaned file a no-op.
    """
    if not isinstance(text, str) or PREAMBLE_MARKER not in text:
        return text
    return text.split(PREAMBLE_MARKER, 1)[1]


df['Frequencies'] = df['Frequencies'].apply(_drop_preamble)

df.to_csv('csv/GTv2/llamaPostProcessing.csv', sep=';', index=False)

In [20]:
def _keep_leading_segment(text):
    """Return the part of ``text`` before the first ', ,' (or ``text`` itself if absent)."""
    if ", ," not in text:
        return text
    return text.split(", ,")[0]


# Drop everything from the first ', ,' onwards in each answer, then save in place.
df['Frequencies'] = df['Frequencies'].apply(_keep_leading_segment)

df.to_csv('csv/GTv2/llamaPostProcessing.csv', sep=';', index=False)

In [21]:
import pandas as pd

def process_frequencies(freq_str):
    """Extract the topics with a positive count from a 'topic: count' string.

    Parameters
    ----------
    freq_str : str
        Comma-separated ``topic: count`` pairs, e.g. ``"Quality: 3, Value: 1"``.
        Non-string input (such as NaN read from an empty CSV cell) yields ``[]``.

    Returns
    -------
    list of str
        Topic names, in input order, whose count is greater than zero,
        limited to the first 8.

    Items that are not exactly ``topic: count`` (no colon, several colons, or
    a non-integer count) are skipped rather than raising, since the string is
    free-form model output.
    """
    if not isinstance(freq_str, str):
        return []

    topics = []
    for item in freq_str.strip().split(','):
        parts = item.split(':')
        if len(parts) != 2:
            continue  # not a single 'topic: count' pair
        topic, freq = parts
        try:
            count = int(freq.strip())
        except ValueError:
            continue  # count is not an integer
        if count > 0:
            topics.append(topic.strip())

    # Limit to a maximum of 8 topics
    return topics[:8]

def process_frequencies_freq(freq_str):
    """Return up to 8 topics from a 'topic: count' string, most frequent first.

    ``freq_str`` is split on commas; each piece that splits on ':' into
    exactly two parts is read as ``topic: count``. Pieces of any other shape
    are ignored. Topics with a count above zero are ordered by decreasing
    count (ties keep their input order) and the first 8 names are returned.
    """
    scored = []
    for chunk in freq_str.strip().split(','):
        pieces = chunk.split(':')
        if len(pieces) != 2:
            continue
        label, raw_count = pieces
        count = int(raw_count.strip())
        if count > 0:
            scored.append((label.strip(), count))

    # sorted() is stable, so equal counts stay in their original order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return [label for label, _ in ranked[:8]]

# Read the cleaned answers back from disk.
df = pd.read_csv('csv/GTv2/llamaPostProcessing.csv', sep=';')


def _topics_as_text(freq_str):
    """Turn a frequencies string into its retained topics, joined with ', '."""
    return ', '.join(process_frequencies(freq_str))


# Derive the Topics column, keep only the two output columns, and save.
df['Topics'] = df['Frequencies'].apply(_topics_as_text)
df = df[['ArticleID', 'Topics']]

df.to_csv('csv/GTv2/llamaPostProcessedKmeans.csv', index=False, sep=';')
print("File written!")

File written!
