In [1]:
# Autoreload
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

from tqdm.notebook import tqdm

import torch

import datasets
# Don't show progress datasets bars
datasets.disable_progress_bar()
from datasets import load_dataset

from fastchat.model import get_conversation_template

import sys
sys.path.insert(0, str(Path.cwd().parent.resolve()))
from model import get_model
from utils import get_tokenizer

In [3]:
# Use the currently selected CUDA device when one is available, else the CPU.
if torch.cuda.is_available():
    device = torch.device(f"cuda:{torch.cuda.current_device()}")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [4]:
# Checkpoint name shared by the tokenizer, the model and the conversation
# template in the cells below. Alternative checkpoints are kept commented out
# so they can be switched in by hand.
# tokenizer_name = "gpt2-xl"
# tokenizer_name = "databricks/dolly-v2-3b"
# tokenizer_name = "meta-llama/Llama-2-13b-hf"
tokenizer_name = "lmsys/vicuna-7b-v1.3"

# Dataset identifier passed to `load_dataset`; the commented line is an
# alternative dataset.
# dataset_name = "AugustasM/burns-datasets-VINC-imdb-ppo-training-v2"
dataset_name = "imdb"

In [5]:
# Tokenizer: loaded for `tokenizer_name` through the project helper
# `utils.get_tokenizer` (imported at the top of the notebook).
tokenizer = get_tokenizer(tokenizer_name)

Loading tokenizer lmsys/vicuna-7b-v1.3...
Loaded tokenizer.



## Dataset

In [6]:
# Dataset for PPO training
SHUFFLE_SEED = 42  # fixed so the split below is reproducible
NUM_FEW_SHOT_EXAMPLES = 5  # size of the pool reserved for few-shot prompts

tmp_dataset = load_dataset(dataset_name, split="train")

# Shuffle once and carve both subsets out of the same permutation: the first
# NUM_FEW_SHOT_EXAMPLES rows become the few-shot pool, everything after them is
# the working dataset, so the two never overlap.
shuffled_dataset = tmp_dataset.shuffle(seed=SHUFFLE_SEED)
few_shot_examples_dataset = shuffled_dataset.select(
    range(NUM_FEW_SHOT_EXAMPLES)
).to_list()
dataset = shuffled_dataset.select(
    range(NUM_FEW_SHOT_EXAMPLES, len(shuffled_dataset))
)

print(f"{len(few_shot_examples_dataset)=}")
print(dataset)

Found cached dataset imdb (/admin/home-augustas/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
Loading cached shuffled indices for dataset at /admin/home-augustas/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-9c48ce5d173413c7.arrow
Loading cached shuffled indices for dataset at /admin/home-augustas/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-9c48ce5d173413c7.arrow


len(few_shot_examples_dataset)=5
Dataset({
    features: ['text', 'label'],
    num_rows: 24995
})


In [54]:
def get_sentiment(label):
    """Translate a truthy/falsy label into its lowercase sentiment word.

    Args:
        label: Any value; only its truthiness is inspected.

    Returns:
        "positive" when `label` is truthy, otherwise "negative".
    """
    if label:
        return "positive"
    return "negative"

# Dataset template
def get_prompt_template(few_shot_examples_dataset):
    """Build a prompt-rendering function bound to a pool of few-shot examples.

    Args:
        few_shot_examples_dataset: Sized, integer-indexable collection whose
            items expose a "text" and a "label" entry.

    Returns:
        A function `prompt_template(example, num_few_shot_examples=0)` that
        renders the classification prompt for one example.
    """

    def prompt_template(example, num_few_shot_examples=0):
        """Render the sentiment-classification prompt for `example`.

        Args:
            example: Mapping with a "text" entry holding the review to classify.
            num_few_shot_examples: How many solved examples from the pool to
                place before the instructions. Values <= 0 add none.

        Returns:
            The prompt string, ending in "Sentiment:" so that the model
            completes it with the label.

        Raises:
            IndexError: If more few-shot examples are requested than the pool
                contains.
        """
        pool_size = len(few_shot_examples_dataset)
        if num_few_shot_examples > pool_size:
            raise IndexError(
                f"Requested {num_few_shot_examples} few-shot examples but only "
                f"{pool_size} are available"
            )

        shots = []
        for i in range(num_few_shot_examples):
            item = few_shot_examples_dataset[i]
            sentiment = get_sentiment(item["label"])
            # Same fence layout as the query below (newline before the closing
            # fence), followed by a blank line so consecutive examples and the
            # instructions stay visually separated.
            shots.append(
                f"Movie review: ```\n{item['text']}\n```\n"
                f"Sentiment: {sentiment}\n\n"
            )
        few_shot_examples = "".join(shots)

        return (
            "Classify the movie review as either positive or negative.\n\n"
            f"{few_shot_examples}"
            "Desired format:\n"
            "Sentiment: <identified_sentiment>\n"
            "Do not print \"Sentiment:\" again, just the sentiment.\n\n"
            f"Movie review: ```\n{example['text']}\n```\n"
            "Sentiment:"
        )

    return prompt_template

# Bind the few-shot pool to the template builder, then render the first
# dataset example with zero few-shot examples and print it to eyeball the
# prompt layout.
apply_template = get_prompt_template(few_shot_examples_dataset)
input_text = apply_template(dataset[0], num_few_shot_examples=0)
print(input_text)

Classify the movie review as either positive or negative.

Desired format:
Sentiment: <identified_sentiment>
Do not print "Sentiment:" again, just the sentiment.

Movie review: ```
While this movie's style isn't as understated and realistic as a sound version probably would have been, this is still a very good film. In fact, it was seen as an excellent film in its day, as it was nominated for the first Best Picture Oscar (losing to WINGS). I still consider WINGS to be a superior film, but this one is excellent despite a little bit of overacting by the lead, Emil Jannings.<br /><br />Jannings is a general from Czarist Russia who is living out his final days making a few bucks in the 1920s by being a Hollywood extra. His luck appears to have changed as he gets a casting call--to play an Imperial Russian general fighting against the Communists during the revolution. Naturally this isn't much of a stretch acting-wise, but it also gets the old man to thinking about the old days and the revo

## Model

In [8]:
# Model
# Loaded through the project helper `model.get_model`; the active call requests
# 8-bit loading, the call without `load_in_8bit` is kept commented out.
# model = get_model(tokenizer_name, device)
model = get_model(tokenizer_name, device, load_in_8bit=True)

# Footprint of the wrapped `pretrained_model`, divided by 1024**3
# (presumably `get_memory_footprint()` reports bytes — TODO confirm).
memory_usage = model.pretrained_model.get_memory_footprint() / (1024 ** 3)
print(f"{memory_usage=:.2f} GB")

Loading policy model...

is_bf16_possible=False
kwargs={'load_in_8bit': True, 'torch_dtype': None}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded subject model with 6,738,419,713 parameters.
Model dtype: torch.float16

memory_usage=6.58 GB


In [55]:
# Number of dataset examples pushed through the model in this smoke test.
NUM_PREVIEW_EXAMPLES = 5
# Lower bound on the per-example generation budget, in tokens.
MIN_NEW_TOKENS = 4

# TODO: do I need to set the pad_token?
generation_kwargs = {
    "top_k": 0,
    "top_p": 1.0,
    # "do_sample": True,
    "do_sample": False,
    # "pad_to_multiple_of": 8,
    "pad_token_id": tokenizer.pad_token_id,
    # NOTE(review): why is this value like this? Presumably 100_000 is meant to
    # be an id the model never emits, so generation always runs for the full
    # `max_new_tokens` — confirm, or switch to `tokenizer.eos_token_id`.
    "eos_token_id": 100_000,
    "max_new_tokens": MIN_NEW_TOKENS,
    # "repetition_penalty": 1.2,
}

# Only iterate over the rows that are actually inspected, so the progress bar
# reflects the real amount of work.
preview_dataset = dataset.select(range(min(NUM_PREVIEW_EXAMPLES, len(dataset))))

for i, example in tqdm(
    enumerate(preview_dataset), total=len(preview_dataset), leave=False
):
    label = "Positive" if example["label"] else "Negative"
    print(f"{label=}")

    # Budget enough new tokens to spell out the label (plus one spare), without
    # touching the shared `generation_kwargs` dict.
    label_ids = tokenizer([label])["input_ids"][0]
    max_new_tokens = max(len(label_ids) + 1, MIN_NEW_TOKENS)

    input_text = apply_template(example, num_few_shot_examples=0)

    # Wrap the prompt in the chat format that fastchat associates with this
    # checkpoint, leaving the assistant turn open for the model to fill in.
    conv = get_conversation_template(tokenizer_name)
    conv.append_message(conv.roles[0], input_text)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    inputs = tokenizer([prompt], return_tensors="pt").to(device)
    # Not every tokenizer returns `token_type_ids`; drop the field when present
    # instead of raising KeyError when it is absent.
    inputs.pop("token_type_ids", None)

    output_ids = model.generate(
        **inputs,
        **{**generation_kwargs, "max_new_tokens": max_new_tokens},
    )

    # Keep only the tokens that follow the prompt.
    prompt_length = inputs["input_ids"].shape[1]
    output_ids = output_ids[0][prompt_length:]
    outputs = tokenizer.decode(
        output_ids, skip_special_tokens=True, spaces_between_special_tokens=False
    )
    # outputs = outputs.strip().lower()
    # outputs = outputs.lower()
    print(f"{outputs=}")
    print("-" * 100)

  0%|          | 0/24995 [00:00<?, ?it/s]

label='Positive'
outputs='Positive'
----------------------------------------------------------------------------------------------------
label='Positive'
outputs='Positive'
----------------------------------------------------------------------------------------------------
label='Negative'
outputs='Negative'
----------------------------------------------------------------------------------------------------
label='Negative'
outputs='Negative'
----------------------------------------------------------------------------------------------------
label='Positive'
outputs='Positive'
----------------------------------------------------------------------------------------------------


In [56]:
# Print the most recent prompt/response pair as a two-turn conversation.
# NOTE: relies on `conv`, `input_text` and `outputs` as left behind by the last
# iteration of the generation loop in the previous cell.
print(f"{conv.roles[0]}: {input_text}")
print(f"{conv.roles[1]}: {outputs}")

USER: Classify the movie review as either positive or negative.

Desired format:
Sentiment: <identified_sentiment>
Do not print "Sentiment:" again, just the sentiment.

Movie review: ```
Home Room deals with a Columbine-like high-school shooting but rather than hashing over the occurrence itself the film portrays the aftermath and what happened to the survivors, their trauma, guilt and denial.<br /><br />*Spoilers* The shooting itself is treated as a foregone conclusion, with no action footage other than the reaction of an almost teenage SWAT commando after shooting the high school killer. The film has three protagonists; the detective investigating the crime of which no guilty parties are left to convict and two teenage girls surviving the incident, played by a very young Erika Christensen and Busy Philipps.<br /><br />The two girls having nothing in common besides the shooting are put together because of it and the drama ensues.<br /><br />Erika Christensen, though only 24 has been a

## Inspect the model

In [17]:
# model.pretrained_model

In [12]:
# model.pretrained_model.requires_grad_(True)

In [13]:
# Total element count over the wrapped model's parameters that have
# `requires_grad` set.
num_trainable_params = 0
for param in model.pretrained_model.parameters():
    if param.requires_grad:
        num_trainable_params += param.numel()
num_trainable_params

262410240

In [14]:
import functools
from typing import Tuple, Union


def rhasattr(obj, attr):
    """`hasattr` for dotted attribute paths.

    `rhasattr(obj, "foo.bar.baz")` is True only when `obj.foo`, `obj.foo.bar`
    and `obj.foo.bar.baz` all exist.
    Reference: https://stackoverflow.com/a/67303315
    """
    *parent_names, leaf_name = attr.split(".")
    current = obj
    for name in parent_names:
        # Stop at the first missing link in the chain.
        if not hasattr(current, name):
            return False
        current = getattr(current, name)
    return hasattr(current, leaf_name)


def rgetattr(obj, attr: str, *args) -> object:
    """`getattr` for dotted attribute paths.

    `rgetattr(obj, "foo.bar.baz")` returns `obj.foo.bar.baz`. Any extra
    positional arguments are forwarded to every underlying `getattr` call, so
    an optional default applies at each step of the path.
    Reference: https://stackoverflow.com/a/31174427
    """
    current = obj
    for name in attr.split("."):
        current = getattr(current, name, *args)
    return current

def findattr(obj, attrs: Tuple[str]) -> Union[object, None]:
    """Return the value of the first dotted path in `attrs` that exists on `obj`.

    Paths are tried in the order given.

    Raises:
        ValueError: If none of the paths resolves.
    """
    first_match = next((path for path in attrs if rhasattr(obj, path)), None)
    if first_match is None:
        raise ValueError(f"Could not find an attribute from `{attrs}` in `{obj}`")
    return rgetattr(obj, first_match)

def hf_get_decoder_blocks(model: torch.nn.Module) -> Tuple[torch.nn.Module]:
    """Locate the decoder hidden layers of `model`.

    The attribute holding the layers is named differently across model
    configurations, so a fixed list of dotted paths is probed in order and the
    first one present on `model` wins. Known examples:
        - transformer.h: (BloomForCausalLM, GPT2LMHeadModel, GPTJForCausalLM)
        - model.decoder.layers: (OPTForCausalLM)
        - gpt_neox.layers: (GPTNeoXForCausalLM)
        - decoder.block: (T5ForConditionalGeneration)
    The shorter paths ("h", "layers", "model.layers", "decoder.layers") are
    probed before these.

    Raises:
        ValueError: Propagated from `findattr` when no candidate path exists.
    """
    # Order matters: earlier entries take precedence.
    candidate_paths = (
        "h",
        "layers",
        "model.layers",
        "decoder.layers",
        "transformer.h",
        "model.decoder.layers",
        "gpt_neox.layers",
        "decoder.block",
    )
    decoder_blocks = findattr(model, candidate_paths)
    return decoder_blocks

In [15]:
def freeze_bottom_causal_layers(model: torch.nn.Module, num_layers_unfrozen: int = 0):
    """Disable gradients on the bottom decoder blocks of `model`.

    Args:
        model: Model whose decoder blocks are located with
            `hf_get_decoder_blocks`.
        num_layers_unfrozen: Number of blocks at the end of the stack to leave
            untouched. 0 freezes every block; a negative value freezes none.
    """
    hidden_layers = hf_get_decoder_blocks(model)

    if num_layers_unfrozen > 0:
        # Everything except the last `num_layers_unfrozen` blocks.
        frozen_blocks = list(hidden_layers)[:-num_layers_unfrozen]
    elif num_layers_unfrozen != 0:
        # Negative: leave all blocks as they are.
        frozen_blocks = []
    else:
        frozen_blocks = list(hidden_layers)

    for block in frozen_blocks:
        block.requires_grad_(False)

In [16]:
# model.pretrained_model.requires_grad_(True)

# freeze_bottom_causal_layers(model.pretrained_model, num_layers_unfrozen=3)

# num_trainable_params = sum(p.numel() for p in model.pretrained_model.parameters() if p.requires_grad)
# num_trainable_params