In [1]:
import pandas

asos_path = "asos-v1.xlsx"
asos_dataframe = pandas.read_excel(asos_path, header=1).rename(columns=lambda column_name: column_name.lower())
asos_dataframe.drop(asos_dataframe.tail(2).index, inplace=True)

review_column = "review_text"
sentiment_column = "tone"
multilabel_columns = asos_dataframe.columns[-27:]

asos_dataframe[multilabel_columns] = asos_dataframe[multilabel_columns].fillna(False).astype(bool)

asos_dataframe[[review_column, sentiment_column, *multilabel_columns]]

Unnamed: 0,review_text,tone,fit,durable materials,fabric,durability in use,price,durability while laundering,manufacturing (sewing),suitability in use,...,ethcally,ecological,long life span,reason why returned,reason why not returned,match w/ photo,fit suggestions,style suggestions,practicality,feelings
0,"Great Color but impossible to wear, makes too ...",neutral,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1,I don‚Äôt understand why some people say that ...,neutral,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,The first think I noticed out of the bag was a...,negative,True,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
3,I wanted to love these but sadly returned as t...,negative,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,"I will be sending these back, not true to size...",negative,True,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,This will be perfect for when I go on holiday....,positive,True,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
195,Waste of money having to pay for duties and th...,negative,False,False,False,False,True,False,False,False,...,False,False,False,True,False,False,False,False,False,False
196,I‚Äôm so glad I bought this dress. Usually I d...,positive,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
197,Are usually wear a 10/12 in dresses I am an E ...,neutral,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [2]:
sentiments = ["positive", "neutral", "negative"]

In [3]:
import torch

device = torch.device("cpu")
device_string = "cpu"
if torch.backends.mps.is_available():
    device = torch.device("mps")
    device_string = "mps"
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    device_string = "cuda:0"

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", token="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")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", token="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").to(device)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
def run_model(*prompts):
    input_ids = tokenizer("\n".join(prompts), return_tensors="pt").to(device)
    outputs = model.generate(
        **input_ids,
        do_sample=True,
        max_new_tokens=200,
        temperature=1.2,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.5,
    )
    return tokenizer.decode(outputs[0])


In [6]:
import torch
from tqdm import tqdm

stride = 1

# taken from https://huggingface.co/docs/transformers/perplexity
def get_perplexity(*prompts, progress=False):
    encodings = tokenizer("\n".join(prompts), return_tensors="pt")
    max_length = model.config.max_position_embeddings
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    locs = tqdm(range(0, seq_len, stride)) if progress else range(0, seq_len, stride)
    for begin_loc in locs:
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    return torch.exp(torch.stack(nlls).mean())

In [7]:
print(get_perplexity("Who was the US president in 2010? Barack Obama"))
print(get_perplexity("Who was the US president in 2010? Donald Trump"))

tensor(1015.7299, device='mps:0')
tensor(1512.7534, device='mps:0')


In [8]:
def most_probable_answer(*prompts, answers=None):
    answers = answers or []
    probabilities = [(get_perplexity(*prompts, answer).item(), answer) for answer in answers]
    return sorted(probabilities)

In [9]:
most_probable_answer("Who was the US president in 2010? ", answers=["George Bush", "Donald Trump", "Barack Obama"])

[(716.4212646484375, 'Barack Obama'),
 (1189.866455078125, 'Donald Trump'),
 (1567.660888671875, 'George Bush')]

In [10]:
review_sentiments = asos_dataframe[[review_column, sentiment_column]].to_numpy()

In [11]:
import random

review, sentiment = random.choice(review_sentiments)
question = "What is the sentiment of the following review of a clothing item?"

print(f"{review}\n{sentiment}")
print(most_probable_answer(question, "REVIEW:", review, "The sentiment is ", answers=sentiments))
print(run_model(question, "REVIEW:", review, "The sentiment is "))

Happy with this! I‚Äôm anywhere between 10 - 14 depending on brand and the 12 was right for me.
positive
[(296.5985412597656, 'positive'), (326.2908935546875, 'neutral'), (366.61968994140625, 'negative')]
<bos>What is the sentiment of the following review of a clothing item?
REVIEW:
Happy with this! I‚Äôm anywhere between 10 - 14 depending on brand and the 12 was right for me.
The sentiment is 5 out of 5, suggesting that the customer is happy with their purchase.<eos>


In [12]:
for review, sentiment in review_sentiments:
    question = "What is the sentiment of the following review of a clothing item?"

    print(f"{review}\n{sentiment}")
    answer = most_probable_answer(question, "REVIEW:", review, "The sentiment is ", answers=sentiments)
    print(answer)
    # print(1 - (answer[0][0] / answer[1][0]))
    print()
    # print(run_model(question, "REVIEW:", review, "The sentiment is "))

Great Color but impossible to wear, makes too much noise when walking! would have kept it otherwise. will return
neutral
[(668.9400024414062, 'negative'), (792.3026733398438, 'neutral'), (843.6640014648438, 'positive')]

I don‚Äôt understand why some people say that it‚Äôs too big‚Ä¶ It‚Äôs too small for my size (36) and I weight 55kg. I will size up for sure.
neutral
[(115.72589874267578, 'negative'), (121.41893005371094, 'neutral'), (121.44758605957031, 'positive')]

The first think I noticed out of the bag was a fishy/chemical smell. It was not flattering at all if you have even a little bit of curve. It made a loud awful plastic noise when walking or moving. Returned.
negative
[(178.47996520996094, 'negative'), (196.32516479492188, 'neutral'), (202.2724609375, 'positive')]

I wanted to love these but sadly returned as they were too big. The colour and style is great but these trousers are not petite in my opinion
negative
[(172.5623779296875, 'negative'), (188.97222900390625, 'neut

In [13]:
from sklearn.metrics import classification_report

def classify_dataset(reviews, prompt_function, answers):
    all_predictions = [most_probable_answer(*prompt_function(review), answers=answers) for review, _ in tqdm(reviews)]
    all_predictions = [min(answers)[1] for answers in all_predictions]
    all_labels = [sentiment for _, sentiment in reviews]

    print(classification_report(all_labels, all_predictions))

In [16]:
not_neutral_sentiments = [sentiment for sentiment in sentiments if sentiment != "neutral"]

not_neutral_dataframe = asos_dataframe[asos_dataframe[sentiment_column] != "neutral"]
not_neutral_review_sentiments = not_neutral_dataframe[[review_column, sentiment_column]].to_numpy()

question = "What is the sentiment of the following review of a clothing item?"
print("Version 1, filtered out neutral")
classify_dataset(not_neutral_review_sentiments, lambda review: (question, "REVIEW:", review, "The sentiment is "), not_neutral_sentiments)

Version 1, filtered out neutral


100%|██████████| 173/173 [01:19<00:00,  2.16it/s]

              precision    recall  f1-score   support

    negative       0.80      0.97      0.87        61
    positive       0.98      0.87      0.92       112

    accuracy                           0.90       173
   macro avg       0.89      0.92      0.90       173
weighted avg       0.92      0.90      0.90       173






In [18]:
question = "What is the sentiment of the following review of a clothing item?"
print("Version 1, including neutral")
classify_dataset(review_sentiments, lambda review: (question, "REVIEW:", review, "The sentiment is "), sentiments)

Version 1, including neutral


100%|██████████| 199/199 [02:15<00:00,  1.47it/s]

              precision    recall  f1-score   support

    negative       0.62      0.95      0.75        61
     neutral       0.43      0.12      0.18        26
    positive       0.96      0.85      0.90       112

    accuracy                           0.78       199
   macro avg       0.67      0.64      0.61       199
weighted avg       0.79      0.78      0.76       199






In [14]:
def few_shot(reviews, number):
    chosen_reviews = random.sample(list(reviews), number)

    given = "Given the following review:"
    extract = "Your task is to extract the following sentiment:"
    return "\n".join("\n".join((given, review, extract, sentiment)) for review, sentiment in chosen_reviews)

In [16]:
number = 1
question = "What is the sentiment of the following review of a clothing item?"
prompt_function = lambda review: (few_shot(review_sentiments, number), question, "REVIEW:", review, "The sentiment is ")

classify_dataset(review_sentiments, prompt_function, sentiments)

100%|██████████| 199/199 [02:18<00:00,  1.43it/s]

              precision    recall  f1-score   support

    negative       0.64      0.67      0.66        61
     neutral       0.19      0.19      0.19        26
    positive       0.80      0.78      0.79       112

    accuracy                           0.67       199
   macro avg       0.54      0.55      0.55       199
weighted avg       0.67      0.67      0.67       199






In [15]:
number = 2
question = "What is the sentiment of the following review of a clothing item?"
prompt_function = lambda review: (few_shot(review_sentiments, number), question, "REVIEW:", review, "The sentiment is ")

classify_dataset(review_sentiments, prompt_function, sentiments)

100%|██████████| 199/199 [04:00<00:00,  1.21s/it]

              precision    recall  f1-score   support

    negative       0.64      0.72      0.68        61
     neutral       0.08      0.08      0.08        26
    positive       0.92      0.86      0.89       112

    accuracy                           0.71       199
   macro avg       0.55      0.55      0.55       199
weighted avg       0.73      0.71      0.72       199






In [17]:
number = 5
question = "What is the sentiment of the following review of a clothing item?"
prompt_function = lambda review: (few_shot(review_sentiments, number), question, "REVIEW:", review, "The sentiment is ")

classify_dataset(review_sentiments, prompt_function, sentiments)

100%|██████████| 199/199 [12:18<00:00,  3.71s/it]

              precision    recall  f1-score   support

    negative       0.69      0.80      0.74        61
     neutral       0.27      0.31      0.29        26
    positive       0.94      0.82      0.88       112

    accuracy                           0.75       199
   macro avg       0.63      0.64      0.63       199
weighted avg       0.77      0.75      0.76       199




