# LLM Decoder Inference of the Restaurant Reviews

## Download the necessary packages
Recommend using Linux or WSL for Windows.
A good Nvidia GPU (rtx-30xx/40xx) is required for reasonable speed.

In [None]:
!nvidia-smi

The following install the required package for cuda 12.1 in Linux, for the newer RTX 30xx GPUs or higher 

In [None]:
# !python -m pip install --upgrade pip
# !python -m pip install --upgrade --force-reinstall --no-cache-dir torch==2.2.0 triton \
#   --index-url https://download.pytorch.org/whl/cu121
# !python -m pip install "unsloth[cu121-ampere-torch220] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
import pandas as pd
from unsloth import FastLanguageModel
from scipy.stats import spearmanr, pearsonr
import numpy as np
from tqdm import trange, tqdm # progress bar
from matplotlib import pyplot as plt
import torch

If installed without failure, starts by downloading the model we are gonna use.

In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
    dtype=None,        # None for auto detect
    load_in_4bit=True,
) # about 6GB in size

## Acquire Dataset

We will be using the yelp dataset for experimenting

In [None]:
df = pd.read_csv('dataset/la_mini_df.csv',sep='|', encoding='utf-8', escapechar='\\')

In [None]:
df_reviews = df[df['text'].notnull()]

In [None]:
# Sample Dataset for Testing
from datasets import Dataset

dataset = Dataset.from_pandas(df_reviews)
dataset.set_format(type="torch")

In [None]:
BATCH_SIZE = 32
num_batch = len(dataset) // BATCH_SIZE
DATASET_SIZE = num_batch * BATCH_SIZE

dataset = dataset.select(range(DATASET_SIZE))

## Preprocesssing

Now let's set up the function to format the prompt according to LLama 3's specication, as well as tokenizing it.

In [None]:
categories = ["FOOD", "LOCATION", "ATOMSPHERE", "SERVICE", "PRICE", "MENU", "SPEED"]

def format_prompt(category, review):
    # review = 'The waitress is nice and the food is the best thing i have ever had, it is so good! However, the atomsphere sucks and i hate how loud it is.'

    messages = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": f'Rate the following restaurant review in the category of {category} from 1 to 5 where 1 means the worst possible and 5 means the best in their life. Only rate how good the {category} is. Do not pay attention to other factors. If the category {category} is not mentioned in the review, output "NOT MENTIONED" instead. Review: "{review}" '},
        {"role": "assistant", "content": f"The rating of {category} is: "}
    ]
    # The apply_chat_template auto inserts a [eot_id] token at the end, which we will discard.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
    )[:-10]
    return prompt

def preprocess(batch):
    reviews = batch['text']
    results = {}
    for category in categories:
        prompts = []
        for review in reviews:
            prompts.append(format_prompt(category, review))

        results[category] = prompts
    return results

dataset = dataset.map(preprocess, batched=True, batch_size=1024)

Let's checkout the length distrbution of the review:

In [None]:
empty_prmopt_length = preprocess({"text": [""]})
empty_prmopt_length = tokenizer(empty_prmopt_length['ATOMSPHERE'], return_tensors='pt')['input_ids'].shape[1]
empty_prmopt_length

In [None]:
tokens = tokenizer(
    dataset['ATOMSPHERE'][:10000],
    return_tensors="pt",
    padding=True
)

input_ids = tokens.input_ids
attention_mask = tokens.attention_mask

token_lengths = torch.sum(attention_mask, dim=-1)

# Plot the distribution of token lengths
plt.hist(token_lengths, bins=120)
plt.show()

In [None]:
low_index = token_lengths.topk(1, largest=False).indices.item()

In [None]:
token_lengths = token_lengths.to(torch.float32)

print(f"Mean: {torch.mean(token_lengths).item()}")
print(f"Std: {torch.std(token_lengths).item()}")
print(f"95th percentile: {torch.quantile(token_lengths, 0.95).item()}")

From the histogram, we can see that 95% review has length of less than 284. We will use that as the max length.

In [None]:
MAX_LENGTH = 284

Let's batch up the data and send it to GPU before the actual inteference. This should hopefully speed things up because there is now less cpu to gpu communication.

In [None]:
batched_input_ids = {}
batched_attention_masks = {}

for category in tqdm(categories):
    tokens = tokenizer(
        dataset[category],
        return_tensors="pt",
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH
    )

    input_ids = tokens.input_ids.to(model.device)
    attention_mask = tokens.attention_mask.to(model.device)

    input_ids = input_ids.reshape(-1, BATCH_SIZE, MAX_LENGTH)
    attention_mask = attention_mask.reshape(-1, BATCH_SIZE, MAX_LENGTH)

    batched_input_ids[category] = input_ids
    batched_attention_masks[category] = attention_mask

## Inference

We will use the probability output of the model for the last token.

We will record the logits for each of the rating number, as well as the probability of the model saying "NOT".

Let the logits for rating number $i$ be $l_i$, the score can be cacluated as

$$s = \frac{\sum_{i=1}^5 i \exp(l_i)}{\sum_{i=1}^5 \exp(l_i)}$$
for that category.

If the probablity of the model saying "NOT" is very high, that means the model thinks the category is not mentioned in the review. In that case we should discard the rating.

In particular, denote the logits for predicting NOT as $l_n$, we can extract a "usefulness" parameter $u$

$$u = \frac{\sum_{i=1}^5 \exp(l_i)}{exp(l_n) + \sum_{i=1}^5 \exp(l_i)}$$

In [None]:
for category in categories:
    df[f'{category}_score'] = pd.Series(dtype='float')
    df[f'{category}_usefulness'] = pd.Series(dtype='float')

In [None]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

In [57]:
# llama 3 stores digit id as 15 + that digit
numbers = torch.tensor([1,2,3,4,5]).to(model.device)
indices = numbers + 15

# the id for the word "NOT" is 14394
# If the model has a high probability of predicting this word, it means the review score is not going to be useful
not_id = 14394

review_scores = torch.zeros(len(categories), len(dataset))
review_usefulness= torch.zeros(len(categories), len(dataset))

with torch.no_grad():
    for i, category in enumerate(categories):
        for j in trange(num_batch):
            input_ids = batched_input_ids[category][j]
            attention_mask = batched_attention_masks[category][j]
            # get the logits for the 5 number we are interested in
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                output_hidden_states=False,
                use_cache=False
                ).logits[:, -1]

            number_logits = logits[:, indices]
            exp_logits = torch.exp(number_logits)
            exp_logits_sum = torch.sum(exp_logits, dim=1)

            not_logits = logits[:, not_id]
            exp_not_logits = torch.exp(not_logits)

            usefulness = exp_not_logits / (exp_not_logits + exp_logits_sum)
            review_usefulness[i, j*BATCH_SIZE:(j+1)*BATCH_SIZE] = usefulness

            scores = torch.sum(exp_logits * numbers, dim=1) / exp_logits_sum
            review_scores[i, j*BATCH_SIZE:(j+1)*BATCH_SIZE] = scores

            df_ids = dataset['__index_level_0__'][j*BATCH_SIZE:(j+1)*BATCH_SIZE]
            df.loc[df_ids, f'{category}_score'] = scores.to(torch.float16).cpu().numpy()
            df.loc[df_ids, f'{category}_usefulness'] = usefulness.to(torch.float16).cpu().numpy()
        df.to_csv(f'dataset/la_mini_{category}.csv', sep='|', encoding='utf-8', escapechar='\\')

100%|██████████| 3754/3754 [1:25:12<00:00,  1.36s/it]
100%|██████████| 3754/3754 [1:25:12<00:00,  1.36s/it]
100%|██████████| 3754/3754 [1:38:21<00:00,  1.57s/it]  
 81%|████████  | 3041/3754 [1:09:04<16:16,  1.37s/it]

In [None]:
dataset['__index_level_0__']

In [None]:
dataset['FOOD'][0]

In [None]:
with torch.no_grad():
    input_ids = tokenizer(
        dataset['FOOD'][0],
        return_tensors="pt"
    )['input_ids'].to(model.device)
    logits = model(
                input_ids=input_ids,
                output_hidden_states=False,
                use_cache=False
                ).logits[:, -1]
    number_logits = logits[:, indices]
    exp_logits = torch.exp(number_logits)
    exp_logits_sum = torch.sum(exp_logits, dim=1)

    not_logits = logits[:, not_id]
    exp_not_logits = torch.exp(not_logits)

    usefulness = exp_not_logits / (exp_not_logits + exp_logits_sum)
    scores = torch.sum(exp_logits * numbers, dim=1) / exp_logits_sum

scores, usefulness

In [None]:
df[3]

## Analyze

Let's plot the review usefulness to see just how much review can we consider as useful.

In [None]:
usefulness = review_usefulness[:, ].flatten().cpu().numpy()

plt.hist(usefulness, bins=20)
plt.plot()

Sadly the model thinks that 2/3 of the reviews is not helpful. This is to be expected because a normal review won't cover all the categories.

Let's choose a usefulness threshold of 0.5 for the mask.

In [None]:
review_mask = review_usefulness > 0.7

print(f"Useful Review: {torch.sum(review_mask).item() / review_mask.numel() * 100:.2f}%")

Let's save the review_mask and review_scores just in case

In [1]:
np.save("usefulness.npy", review_usefulness.cpu().numpy())
np.save("scores.npy", review_scores.cpu().numpy()) # Good thing that we didn't shuffle the dataset!

NameError: name 'np' is not defined

In [None]:
review_mask = review_usefulness > 0.5

# let's see how much responses we are ignoring
for i, category in enumerate(categories):
    print(f"{category}: {num_batch*BATCH_SIZE - torch.sum(review_mask[i]).item()} out of {num_batch*BATCH_SIZE} reviews are ignored")

# let's also see how much useful category does each review have
useful_reviews = torch.sum(review_mask[:, :num_batch*BATCH_SIZE], dim=0)
plt.hist(useful_reviews, bins=len(categories), width=0.7)
plt.title("Number of Useful Categories per Review")
plt.show()

Let's insepct somes reviews that are relevant to the location. Just to make sure they are correct.

In [None]:
location_id = categories.index("LOCATION")
index = torch.nonzero(review_mask[location_id] == True)

# let's see some of the reviews that are useful for location
for i in range(10):
    print(dataset['text'][index[i].item()])

Most of these reviews have mentioned something about the location, which means the model behaves as expected.

In [None]:
actual_scores = dataset['label']

actual_scores.shape

In [None]:
review_scores.shape, review_mask.shape

For each category, let's calculate the spearmanr and pearsonr corrleation to see how relevant they are. We will only consider the reviews that is useful to that category.

In [None]:
# spearsman and pearson correlation
spearmans = []
pearsons = []
for i, category in enumerate(categories):
    mask = review_mask[i]
    predicated_scores = review_scores[i][mask].cpu()
    real_scores = actual_scores[mask]
    spearman = spearmanr(predicated_scores, real_scores.cpu())
    pearson = pearsonr(predicated_scores, real_scores.cpu())
    spearmans.append(spearman)
    pearsons.append(pearson)

fig = plt.figure(figsize=(12, 6))

plt.bar(categories, [pearson[0] for pearson in pearsons], label='Pearson')
plt.bar(categories, [spearman.correlation for spearman in spearmans], label='Spearman')

plt.legend()
plt.show()

In [None]:
# Let's see the box plot of ratings for each category
fig = plt.figure(figsize=(12, 6))

for i, category in enumerate(categories):
    mask = review_mask[i]
    predicated_scores = review_scores[i][mask].cpu()
    real_scores = actual_scores[mask]

    plt.boxplot(predicated_scores, positions=[i], showfliers=False)

plt.xticks(range(len(categories)), categories)
plt.title("Predicted Scores")
plt.show()


We can also export the reviews to a csv file for further analysis.

In [None]:
usefulness = np.load("usefulness.npy")
scores = np.load("scores.npy")

df = dataset.to_pandas()
for i, category in enumerate(categories):
    df[f"{category}_usefulness"] = usefulness[i]
    df[f"{category}_score"] = scores[i]

# save the dataframe
df.to_csv("yelp_restaurant_review_labelled.csv", index=False)