In [30]:
# Answer strings used for every binary label.
# ("True" / "False" is an alternative pair that can be swapped in here.)
true_label, false_label = "Yes", "No"

# Order matters: the positive answer comes first, the negative one second.
label_values = [true_label, false_label]

In [2]:
import pandas

asos_path = "asos.xlsx"

# The header is read from the second spreadsheet row; column names are lower-cased.
asos_dataframe = pandas.read_excel(asos_path, header=1).rename(columns=str.lower)
# Remove the last two rows of the sheet (by index label) without mutating in place.
asos_dataframe = asos_dataframe.drop(index=asos_dataframe.tail(2).index)

review_column = "review_text"
sentiment_column = "tone"
# The 27 right-most columns are treated as the multilabel columns.
multilabel_columns = list(asos_dataframe.columns[-27:])

# Missing cells become False; every cell is then rendered as true_label / false_label.
label_value_mapper = dict(zip([True, False], label_values))
asos_dataframe[multilabel_columns] = (
    asos_dataframe[multilabel_columns]
    .fillna(False)
    .astype(bool)
    .replace(label_value_mapper)
)

asos_dataframe[[review_column, sentiment_column, *multilabel_columns]]

Unnamed: 0,review_text,tone,fit,durable materials,fabric,durability in use,price,durability while laundering,manufacturing (sewing),suitability in use,...,ethcally,ecological,long life span,reason why returned,reason why not returned,match w/ photo,fit suggestions,style suggestions,practicality,feelings
0,"Great Color but impossible to wear, makes too ...",neutral,No,No,No,No,No,No,No,No,...,No,No,No,Yes,No,No,No,No,No,No
1,I don‚Äôt understand why some people say that ...,neutral,Yes,No,No,No,No,No,No,No,...,No,No,No,No,No,No,Yes,No,No,No
2,The first think I noticed out of the bag was a...,negative,Yes,No,No,No,No,No,No,No,...,No,No,No,Yes,No,No,No,No,No,No
3,I wanted to love these but sadly returned as t...,negative,Yes,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
4,"I will be sending these back, not true to size...",negative,Yes,No,No,No,No,No,No,No,...,No,No,No,Yes,No,No,No,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,This will be perfect for when I go on holiday....,positive,Yes,No,No,No,No,No,No,Yes,...,No,No,No,No,No,No,No,No,No,No
195,Waste of money having to pay for duties and th...,negative,No,No,No,No,Yes,No,No,No,...,No,No,No,Yes,No,No,No,No,No,No
196,I‚Äôm so glad I bought this dress. Usually I d...,positive,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
197,Are usually wear a 10/12 in dresses I am an E ...,neutral,Yes,No,No,No,No,No,No,No,...,No,No,No,No,No,No,Yes,No,No,No


In [3]:
import torch

# Pick the best available backend: CUDA wins over Apple MPS, CPU is the fallback.
if torch.cuda.is_available():
    device_string = "cuda:0"
elif torch.backends.mps.is_available():
    device_string = "mps"
else:
    device_string = "cpu"

device = torch.device(device_string)

In [4]:
import os

from transformers import AutoTokenizer, AutoModelForCausalLM

# Never commit access tokens to a notebook: the token is read from the HF_TOKEN
# environment variable instead. When the variable is unset, None is passed and the
# library falls back to its default credential lookup.
# NOTE(review): the token that used to be hard-coded here should be revoked.
model_name = "google/gemma-2b-it"
hf_token = os.environ.get("HF_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token).to(device)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
def run_model(*prompts):
    """Generate a sampled continuation for the prompts joined with newlines.

    Args:
        *prompts: Prompt fragments; they are joined with a newline into one input.

    Returns:
        The decoded first generated sequence (prompt followed by the
        continuation), decoded without stripping special tokens.
    """
    prompt_text = "\n".join(prompts)
    encoded_prompt = tokenizer(prompt_text, return_tensors="pt").to(device)

    # Sampling configuration for the generation call.
    generation_options = dict(
        do_sample=True,
        max_new_tokens=200,
        temperature=1.2,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.5,
    )
    generated = model.generate(**encoded_prompt, **generation_options)
    return tokenizer.decode(generated[0])


In [6]:
import torch
from tqdm import tqdm

# Number of tokens the scoring window advances per step (only relevant for texts
# longer than the model's maximum context).
stride = 1

# adapted from https://huggingface.co/docs/transformers/perplexity
def get_perplexity(*prompts, progress=False):
    """Compute the perplexity the model assigns to the prompts joined with newlines.

    The text is scored with a sliding window of at most
    ``model.config.max_position_embeddings`` tokens that advances by the
    module-level ``stride``. A text that fits into one window is scored in a
    single forward pass. Window losses are weighted by the number of tokens they
    actually score, so the result is the exponential of the mean negative
    log-likelihood per scored token.

    Args:
        *prompts: Text fragments, joined with a newline before tokenization.
        progress: When True, show a tqdm progress bar over the windows.

    Returns:
        A 0-dim tensor with the perplexity; nan when no token could be scored
        (for example when the tokenized text has fewer than two tokens).
    """
    encodings = tokenizer("\n".join(prompts), return_tensors="pt")
    max_length = model.config.max_position_embeddings
    seq_len = encodings.input_ids.size(1)

    nll_sum = 0.0  # becomes a tensor once the first window loss is added
    n_tokens = 0
    prev_end_loc = 0
    window_starts = range(0, seq_len, stride)
    for begin_loc in tqdm(window_starts) if progress else window_starts:
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        # Only the last trg_len tokens are new in this window; mask the overlap.
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

        # outputs.loss is the mean over the scored labels. The model shifts the
        # labels left by one internally, so position 0 never contributes.
        num_loss_tokens = (target_ids[:, 1:] != -100).sum().item()
        if num_loss_tokens:
            nll_sum = nll_sum + outputs.loss * num_loss_tokens
            n_tokens += num_loss_tokens

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    if n_tokens == 0:
        return torch.tensor(float("nan"), device=device)
    return torch.exp(nll_sum / n_tokens)

In [7]:
# Sanity check: compare the perplexity of a correct and an incorrect completion.
for candidate in ("Barack Obama", "Donald Trump"):
    print(get_perplexity(f"Who was the US president in 2010? {candidate}"))

tensor(1015.7299, device='mps:0')
tensor(1512.7534, device='mps:0')


In [8]:
def most_probable_answer(*prompts, answers=None):
    """Rank candidate answers by the perplexity of the prompts followed by each answer.

    Args:
        *prompts: Prompt fragments forwarded to ``get_perplexity`` ahead of the answer.
        answers: Candidate answer strings; None or an empty sequence yields [].

    Returns:
        A list of ``(perplexity, answer)`` tuples sorted in ascending order, so
        the lowest-perplexity answer comes first.
    """
    scored_answers = []
    for candidate in answers or []:
        perplexity = get_perplexity(*prompts, candidate).item()
        scored_answers.append((perplexity, candidate))

    scored_answers.sort()
    return scored_answers

In [9]:
# Rank three candidate completions for the same question; lowest perplexity first.
most_probable_answer(
    "Who was the US president in 2010? ",
    answers=["George Bush", "Donald Trump", "Barack Obama"],
)

[(716.4212646484375, 'Barack Obama'),
 (1189.866455078125, 'Donald Trump'),
 (1567.660888671875, 'George Bush')]

In [10]:
# 2-D array with one row per review: column 0 is the review text, the remaining
# columns follow the order of multilabel_columns.
review_multilabels = asos_dataframe.loc[:, [review_column, *multilabel_columns]].to_numpy()

In [11]:
import random

# Spot-check: pick a random review and a random label, then compare the
# perplexity-based ranking with what the model generates freely.
chosen_row = random.choice(review_multilabels)
review, multilabels = chosen_row[0], list(chosen_row[1:])
label, label_name = random.choice(list(zip(multilabels, multilabel_columns)))

pre_question = f"Does the following review correspond to '{label_name}'?"
post_question = f"Can we label the review with '{label_name}'? "
prompt_parts = (pre_question, "REVIEW:", review, post_question, "ANSWER:")

print(f"{review}\n{label_name}: {label}")
print(most_probable_answer(*prompt_parts, answers=label_values))
print(run_model(*prompt_parts))

I felt really comfy  in this dress I got a lot of compliments love how it opens at the back I wore a size 10\r\nI‚Äôm usually a 12
fabric: No
[(335.684814453125, 'No'), (338.4942932128906, 'Yes')]
<bos>Does the following review correspond to 'fabric'?
REVIEW:
I felt really comfy  in this dress I got a lot of compliments love how it opens at the back I wore a size 10\r\nI‚Äôm usually a 12
Can we label the review with 'fabric'? 
ANSWER: Yes, the review definitely corresponds to "Fabric". It provides information about the material and fit of the garment.<eos>


In [12]:
from sklearn.metrics import classification_report

def classify_dataset(reviews, prompt_function, answers):
    """Classify every review by lowest answer perplexity and print a report.

    Args:
        reviews: Sequence of two-element rows ``(review, label)``.
        prompt_function: Callable mapping a review to the prompt fragments that
            are passed to ``most_probable_answer``.
        answers: Candidate answer strings to rank for each review.

    Returns:
        None; the scikit-learn classification report is printed.
    """
    all_predictions = []
    for review, _ in tqdm(reviews):
        ranking = most_probable_answer(*prompt_function(review), answers=answers)
        # The smallest (perplexity, answer) tuple carries the predicted answer.
        all_predictions.append(min(ranking)[1])

    all_labels = [label for _, label in reviews]

    print(classification_report(all_labels, all_predictions))

In [17]:
# Zero-shot evaluation on one randomly chosen label column.
label_name = random.choice(multilabel_columns)
print(f"Classifying {label_name}")

# Column 0 of review_multilabels is the review text, hence the +1 offset.
label = multilabel_columns.index(label_name) + 1
review_labels = review_multilabels[:, [0, label]]

pre_question = f"Does the following review correspond to '{label_name}'?"
post_question = f"Can we label the review with '{label_name}'? "


def zero_shot_prompt(review):
    """Wrap a review in the question / REVIEW / ANSWER scaffold."""
    return (pre_question, "REVIEW:", review, post_question, "ANSWER:")


classify_dataset(review_labels, zero_shot_prompt, label_values)

Classifying practicality


100%|██████████| 199/199 [00:52<00:00,  3.78it/s]

              precision    recall  f1-score   support

          No       0.98      0.90      0.94       195
         Yes       0.00      0.00      0.00         4

    accuracy                           0.88       199
   macro avg       0.49      0.45      0.47       199
weighted avg       0.96      0.88      0.92       199






In [None]:
# Zero-shot evaluation repeated for every label column.
for label_name in multilabel_columns:
    print(f"Classifying '{label_name}'")

    # Column 0 of review_multilabels is the review text, hence the +1 offset.
    label = multilabel_columns.index(label_name) + 1
    review_labels = review_multilabels[:, [0, label]]

    pre_question = f"Does the following review correspond to '{label_name}'?"
    post_question = f"Can we label the review with '{label_name}'? "

    def zero_shot_prompt(review):
        """Wrap a review in the question / REVIEW / ANSWER scaffold."""
        return (pre_question, "REVIEW:", review, post_question, "ANSWER:")

    classify_dataset(review_labels, zero_shot_prompt, label_values)

In [28]:
def few_shot(reviews, label_name, number):
    """Build a few-shot prompt from randomly sampled labelled reviews.

    Args:
        reviews: Sequence of ``(review, label)`` pairs to sample from.
        label_name: Name of the label the examples refer to.
        number: How many examples to sample (without replacement).

    Returns:
        The rendered examples joined by blank lines. Each example states whether
        the review corresponds to ``label_name``, decided by comparing its label
        with the module-level ``true_label``.
    """
    given = "Given the following review:"
    corresponds = "The review corresponds to the following label:"
    not_correspond = "The review does not correspond to the following label:"

    def render_example(review, label):
        # Positive examples use the "corresponds" sentence, all others the negative one.
        verdict = corresponds if label == true_label else not_correspond
        return "\n".join((given, review, verdict, label_name))

    chosen_reviews = random.sample(list(reviews), number)
    return "\n\n".join(render_example(review, label) for review, label in chosen_reviews)

In [31]:
# Few-shot evaluation on one randomly chosen label column.
label_name = random.choice(multilabel_columns)
print(f"Classifying {label_name}")

# Column 0 of review_multilabels is the review text, hence the +1 offset.
label = multilabel_columns.index(label_name) + 1
review_labels = review_multilabels[:, [0, label]]


number = 1
pre_question = f"Does the following review correspond to '{label_name}'?"
post_question = f"Can we label the review with '{label_name}'? "


def prompt_function(review):
    """Build the few-shot prompt for one review.

    The in-context examples are sampled from every labelled review except the
    one being classified, so its gold label can never leak into its own prompt.
    """
    example_pool = [pair for pair in review_labels if pair[0] != review]
    return (few_shot(example_pool, label_name, number), pre_question, "REVIEW:", review, post_question, "ANSWER:")


classify_dataset(review_labels, prompt_function, label_values)

Classifying style






[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A

              precision    recall  f1-score   support

          No       0.74      0.79      0.76       141
         Yes       0.39      0.33      0.36        58

    accuracy                           0.65       199
   macro avg       0.56      0.56      0.56       199
weighted avg       0.64      0.65      0.64       199






In [None]:
# Few-shot sentiment classification with 2 in-context examples.
# NOTE(review): as written this cell does not match the definitions visible in this notebook:
#   - few_shot is defined as few_shot(reviews, label_name, number) but is called with two arguments;
#   - `sentiments` is not assigned in any visible cell;
#   - classify_dataset unpacks each row as (review, label), while review_multilabels rows hold
#     the review text followed by every multilabel column.
# TODO confirm the intended inputs (presumably review_column / sentiment_column pairs) before running.
number = 2
question = "What is the sentiment of the following review of a clothing item?"
prompt_function = lambda review: (few_shot(review_multilabels, number), question, "REVIEW:", review, "The sentiment is ")

classify_dataset(review_multilabels, prompt_function, sentiments)

In [None]:
# Few-shot sentiment classification with 5 in-context examples.
# NOTE(review): as written this cell does not match the definitions visible in this notebook:
#   - few_shot is defined as few_shot(reviews, label_name, number) but is called with two arguments;
#   - `sentiments` is not assigned in any visible cell;
#   - classify_dataset unpacks each row as (review, label), while review_multilabels rows hold
#     the review text followed by every multilabel column.
# TODO confirm the intended inputs (presumably review_column / sentiment_column pairs) before running.
number = 5
question = "What is the sentiment of the following review of a clothing item?"
prompt_function = lambda review: (few_shot(review_multilabels, number), question, "REVIEW:", review, "The sentiment is ")

classify_dataset(review_multilabels, prompt_function, sentiments)