In [2]:
# Autoreload: load the IPython extension and use mode 2, which re-imports
# modules before each cell executes so edits to imported source files take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
from pathlib import Path

from tqdm.notebook import tqdm

import torch

import datasets
# Don't show progress datasets bars
datasets.disable_progress_bar()
from datasets import load_dataset

from fastchat.model import get_conversation_template

import sys
sys.path.insert(0, str(Path.cwd().parent.resolve()))
from model import get_model
from utils import get_tokenizer

In [2]:
# Use the currently selected CUDA device when one is available; otherwise run on CPU.
if torch.cuda.is_available():
    device = torch.device("cuda", torch.cuda.current_device())
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [3]:
# Checkpoint identifier. It is passed to `get_tokenizer`, `get_model` and
# `get_conversation_template` further down, so it selects the tokenizer, the
# model weights and the chat template together.
# Alternatives that were tried (uncomment exactly one):
# tokenizer_name = "gpt2-xl"
# tokenizer_name = "databricks/dolly-v2-3b"
# tokenizer_name = "meta-llama/Llama-2-13b-hf"
tokenizer_name = "lmsys/vicuna-7b-v1.3"

# Dataset identifier handed to `datasets.load_dataset`.
# dataset_name = "AugustasM/burns-datasets-VINC-imdb-ppo-training-v2"
dataset_name = "imdb"

In [4]:
# Tokenizer: loaded through the project helper `get_tokenizer` for the
# checkpoint named in `tokenizer_name`.
tokenizer = get_tokenizer(tokenizer_name)

Loading tokenizer lmsys/vicuna-7b-v1.3...


Loaded tokenizer.



## Dataset

In [5]:
# Dataset for PPO training
NUM_FEW_SHOT_EXAMPLES = 5  # rows held out to serve as in-context examples
SHUFFLE_SEED = 42  # fixed so the held-out rows are the same on every run

tmp_dataset = load_dataset(dataset_name, split="train")

# Shuffle once and carve both splits out of the same permutation: the first
# NUM_FEW_SHOT_EXAMPLES rows become few-shot examples and the remainder is the
# working dataset, so the two are disjoint by construction.
shuffled_dataset = tmp_dataset.shuffle(seed=SHUFFLE_SEED)

few_shot_examples_dataset = shuffled_dataset.select(
    range(NUM_FEW_SHOT_EXAMPLES)
).to_list()
dataset = shuffled_dataset.select(
    range(NUM_FEW_SHOT_EXAMPLES, len(shuffled_dataset))
)

print(f"{len(few_shot_examples_dataset)=}")
print(dataset)

Found cached dataset imdb (/admin/home-augustas/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Loading cached shuffled indices for dataset at /admin/home-augustas/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-9c48ce5d173413c7.arrow
Loading cached shuffled indices for dataset at /admin/home-augustas/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-9c48ce5d173413c7.arrow


len(few_shot_examples_dataset)=5
Dataset({
    features: ['text', 'label'],
    num_rows: 24995
})


In [53]:
# Dataset template
def get_prompt_template(few_shot_examples_dataset):
    """Build a prompt formatter for movie-review sentiment classification.

    Args:
        few_shot_examples_dataset: Indexable, sized collection of examples.
            Each example is a mapping with a "text" entry (the review) and a
            "label" entry (truthy means positive, falsy means negative).

    Returns:
        A function ``prompt_template(example, num_few_shot_examples=0)`` that
        returns the full prompt string for ``example["text"]``, optionally
        preceded by the first ``num_few_shot_examples`` labelled examples.
    """

    def format_review(text):
        # Single source of truth for the review block, so few-shot reviews and
        # the target review are fenced identically (closing fence on its own line).
        return f"Movie review: ```\n{text}\n```\n"

    def prompt_template(example, num_few_shot_examples=0):
        """Render the prompt for one example.

        Raises:
            ValueError: If ``num_few_shot_examples`` is negative or exceeds
                the number of available few-shot examples.
        """
        num_available = len(few_shot_examples_dataset)
        if not 0 <= num_few_shot_examples <= num_available:
            raise ValueError(
                f"num_few_shot_examples must be between 0 and {num_available}, "
                f"got {num_few_shot_examples}"
            )

        few_shot_parts = []
        for i in range(num_few_shot_examples):
            item = few_shot_examples_dataset[i]
            sentiment = "positive" if item["label"] else "negative"
            few_shot_parts.append(
                f"{format_review(item['text'])}Sentiment: {sentiment}\n"
            )
        few_shot_examples = "".join(few_shot_parts)

        # The prompt deliberately ends on "Sentiment:" so the model's next
        # tokens are the predicted label.
        return (
            "Classify the movie review as either positive or negative.\n"
            f"{few_shot_examples}"
            "Desired format:\n"
            "Sentiment: <identified_sentiment>\n"
            f"{format_review(example['text'])}"
            "Sentiment:"
        )

    return prompt_template

# Bind the held-out few-shot examples to the template, then preview the
# one-shot prompt produced for the first example of the working dataset.
apply_template = get_prompt_template(few_shot_examples_dataset)
input_text = apply_template(dataset[0], num_few_shot_examples=1)
print(input_text)

Classify the movie review as either positive or negative.
Movie review: ```
There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier's plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it's the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...```
Sentiment: positive
Desired format:
Sentiment: <identified_sentiment>
Movie review: ```
While this movie's style isn't as understated and realistic as a sound ver

## Model

In [7]:
# Model: loaded through the project helper with 8-bit weights requested.
BYTES_PER_GIB = 1024 ** 3

model = get_model(tokenizer_name, device, load_in_8bit=True)

# Report the footprint of the wrapped pretrained model in GiB.
footprint_bytes = model.pretrained_model.get_memory_footprint()
memory_usage = footprint_bytes / BYTES_PER_GIB
print(f"{memory_usage=:.2f} GB")

Loading policy model...

is_bf16_possible=False
kwargs={'load_in_8bit': True, 'torch_dtype': None}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded subject model with 6,738,419,713 parameters.
Model dtype: torch.float16

memory_usage=6.58 GB


In [56]:
# Sample a short completion for the first few examples and print it next to
# the gold label, as a quick sanity check of the prompt format.

# TODO: do I need to set the pad_token?
generation_kwargs = {
    "top_k": 0,
    "top_p": 1.0,
    "do_sample": True,
    # "pad_to_multiple_of": 8,
    "pad_token_id": tokenizer.pad_token_id,
    # NOTE(review): presumably an id outside the vocabulary so that sampling is
    # never cut short by EOS -- confirm, and consider tokenizer.eos_token_id.
    "eos_token_id": 100_000,
    "max_new_tokens": 4,
    "repetition_penalty": 1.2,
}

NUM_PREVIEW_EXAMPLES = 5  # how many examples to run through the model
MIN_NEW_TOKENS_BUDGET = 4  # lower bound on the per-example generation budget

for i, example in enumerate(dataset):
    if i >= NUM_PREVIEW_EXAMPLES:
        break

    label = "positive" if example["label"] else "negative"
    print(f"{label=}")

    # Budget: the tokenized label's length plus one, but never below the minimum.
    # Passed as a per-call override so the shared `generation_kwargs` dict is
    # not mutated from inside the loop.
    label_ids = tokenizer([label])["input_ids"][0]
    max_new_tokens = max(len(label_ids) + 1, MIN_NEW_TOKENS_BUDGET)

    input_text = apply_template(example, num_few_shot_examples=0)

    # Wrap the task prompt in the chat template for this checkpoint, leaving
    # the assistant turn open for the model to fill in.
    conv = get_conversation_template(tokenizer_name)
    conv.append_message(conv.roles[0], input_text)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    inputs = tokenizer([prompt], return_tensors="pt").to(device)
    # Not every tokenizer returns token_type_ids; drop the key only if present
    # instead of raising KeyError.
    inputs.pop("token_type_ids", None)
    prompt_length = inputs["input_ids"].shape[1]

    output_ids = model.generate(
        **inputs,
        **{**generation_kwargs, "max_new_tokens": max_new_tokens},
    )
    # Keep only the newly generated tokens (everything after the prompt).
    output_ids = output_ids[0][prompt_length:]
    outputs = tokenizer.decode(
        output_ids, skip_special_tokens=True, spaces_between_special_tokens=False
    )
    # outputs = outputs.strip().lower()
    outputs = outputs.lower()
    print(f"{outputs=}")
    print("-" * 100)

label='positive'


outputs='sentiment: pos'
----------------------------------------------------------------------------------------------------
label='positive'
outputs='sentiment: neg'
----------------------------------------------------------------------------------------------------
label='negative'
outputs='sentiment: neg'
----------------------------------------------------------------------------------------------------
label='negative'
outputs='negative'
----------------------------------------------------------------------------------------------------
label='positive'
outputs='positive'
----------------------------------------------------------------------------------------------------


In [52]:
# Show the last prompt/response pair. `conv`, `input_text` and `outputs` are
# whatever the final iteration of the generation loop left behind.
print(
    f"{conv.roles[0]}: {input_text}",
    f"{conv.roles[1]}: {outputs}",
    sep="\n",
)

USER: Classify the movie review as either positive or negative.
Desired format:
Sentiment: <identified_sentiment>
Movie review: ```
Home Room deals with a Columbine-like high-school shooting but rather than hashing over the occurrence itself the film portrays the aftermath and what happened to the survivors, their trauma, guilt and denial.<br /><br />*Spoilers* The shooting itself is treated as a foregone conclusion, with no action footage other than the reaction of an almost teenage SWAT commando after shooting the high school killer. The film has three protagonists; the detective investigating the crime of which no guilty parties are left to convict and two teenage girls surviving the incident, played by a very young Erika Christensen and Busy Philipps.<br /><br />The two girls having nothing in common besides the shooting are put together because of it and the drama ensues.<br /><br />Erika Christensen, though only 24 has been around the block so much that film viewers are pretty mu