In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and local source edits take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Imports


In [2]:
import sys
from pathlib import Path

import torch
from torch import nn

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

In [3]:
# Relative location of the local `elk` checkout, three directory levels up.
ELK_PATH = Path("..", "..", "..", "elk")
# Display the absolute form to confirm where the relative path lands.
ELK_PATH.resolve()

PosixPath('/rds/user/am3052/hpc-work/elk')

In [4]:
# Directories to make importable: the checkout root plus two sub-packages
# (presumably the homes of the bare `reporter` / `templates` imports used
# later in this notebook — confirm against the elk repo layout).
modules = [
    ELK_PATH,
    ELK_PATH / "elk" / "training",
    ELK_PATH / "elk" / "promptsource",
]

for module in modules:
    # Test membership with the *resolved* string, i.e. exactly what gets
    # inserted. Checking the unresolved relative path never matched, so every
    # re-run of this cell pushed duplicate entries onto sys.path.
    module_path = str(module.resolve())
    if module_path not in sys.path:
        sys.path.insert(0, module_path)

# Show the two most recently prepended entries as a sanity check.
sys.path[:2]

['/rds/user/am3052/hpc-work/elk/elk/promptsource',
 '/rds/user/am3052/hpc-work/elk/elk/training']

In [5]:
from reporter import Reporter
from templates import DatasetTemplates

## Data

In [53]:
# Load the "test" split of the "imdb" dataset; the bare name on the last line
# displays its summary (features and row count).
dataset = load_dataset("imdb", split="test")
dataset

Found cached dataset imdb (/home/am3052/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [55]:
# Spot-check the label stored for one row (index 12501) of the split.
dataset[12501]["label"]

1

In [7]:
# Fetch the prompt templates registered under the "imdb" dataset name and
# list the names that are available.
dataset_templates = DatasetTemplates("imdb")
dataset_templates.all_template_names

['Movie Expressed Sentiment',
 'Movie Expressed Sentiment 2',
 'Negation template for positive and negative',
 'Reviewer Enjoyment',
 'Reviewer Enjoyment Yes No',
 'Reviewer Expressed Sentiment',
 'Reviewer Opinion bad good choices',
 'Reviewer Sentiment Feeling',
 'Sentiment with choices ',
 'Text Expressed Sentiment',
 'Writer Expressed Sentiment',
 'burns_1',
 'burns_2']

In [8]:
# Name of the template to use for the IMDB examples.
TEMPLATE_NAME = "Movie Expressed Sentiment"

# Gather every registered template carrying that name and keep the first hit
# (indexing raises IndexError if nothing matches).
matching_templates = list(
    filter(lambda tpl: tpl.name == TEMPLATE_NAME, dataset_templates.templates.values())
)
template = matching_templates[0]

# Show the raw Jinja source of the selected template.
print(template.jinja)

{{text}}
The sentiment expressed for the movie is ||| {{ answer_choices [label] }}


In [9]:
# Display the selected template's answer-choice specification.
template.answer_choices

'negative ||| positive'

In [10]:
# Build two variants of the first review that differ only in their label, so
# the template can be rendered once per answer choice.
examples = []

for label in (True, False):
    # New dict with the same fields as the first row, with "label" overridden.
    example = dict(dataset[0], label=label)
    examples.append(example)

examples

[{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as

In [11]:
# Render the template for the first example (label=True); element 1 of the
# result is the answer text that follows the "|||" separator in the template.
res = template.apply(examples[0])
print(res[1])

positive


In [12]:
# Render the template for the second example (label=False) and print its
# answer text. `res` is rebound here and reused by the following cell.
res = template.apply(examples[1])
print(res[1])

negative


In [13]:
# Show the rendered prompt followed by its answer, separated by one space.
rendered_parts = (f"{res[0]}", f"{res[1]}")
print(" ".join(rendered_parts))

I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to alway

## Pass through a language model

In [28]:
# Load the tokenizer published with the GPT-J 6B checkpoint.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
# Reuse the end-of-sequence token for padding
# (presumably the checkpoint ships without a dedicated pad token — confirm).
tokenizer.pad_token = tokenizer.eos_token
tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

GPT2TokenizerFast(name_or_path='EleutherAI/gpt-j-6B', vocab_size=50257, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [29]:
%%time

model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
model.to("cuda");

Downloading (…)lve/main/config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

CPU times: user 49.3 s, sys: 58.5 s, total: 1min 47s
Wall time: 4min 12s


GPTJForCausalLM(
  (transformer): GPTJModel(
    (wte): Embedding(50400, 4096)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-27): 28 x GPTJBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (out_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): GPTJMLP(
          (fc_in): Linear(in_features=4096, out_features=16384, bias=True)
          (fc_out): Linear(in_features=16384, out_features=4096, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f)

In [30]:
# Total number of elements across all model parameters, in billions.
sum(p.numel() for p in model.parameters()) / 1e9

6.050882784

In [92]:
index = 12497  # row of the dataset used for the prompt and generation cells


def get_prompt(text):
    """Wrap a review in a question/answer prompt asking for its sentiment.

    Args:
        text: The review text; it is interpolated as the first line of the prompt.

    Returns:
        A string made of the review, the question line, and a final line
        reading "A:" for the model to continue.
    """
    question = (
        "Q: is the sentiment expressed for the movie positive or negative? "
        "Answer positive/negative."
    )
    return "\n".join((f"{text}", question, "A:"))

# Preview the full prompt built from the review at `index`.
print(get_prompt(dataset[index]["text"]))

If you decide to watch Wild Rebels, don't expect anything deep and meaningful. If you're looking for a film that explores the relationships and structure of a motorcycle gang, Wild Rebels is the wrong movie. If you're looking for an expose on the breakdown of the American educational system and the problem of juvenile delinquency, Wild Rebels is the wrong movie. If you're looking for a movie that examines how undermanned rural police departments are when facing a well-financed, well-organized gang, Wild Rebels is the wrong movie. But if you're looking for an absurd movie filled with scene after scene of unintentional humor, horrendous acting, a paper-thin plot, and community theater style production values, Wild Rebels is the right movie.<br /><br />Wild Rebels is the story of a down-on-his-luck stock-car driver named Rod Tillman (Steve Alaimo). After a fiery crash (which Rod walks away from completely unscathed despite having only a cotton pants and a London Fog style jacket for prote

In [93]:
# Label stored in the dataset for the review used in the prompt.
dataset[index]["label"]

0

In [94]:
# Build the prompt for the selected review, then tokenize it as a batch of
# one, asking for PyTorch tensors.
prompt = get_prompt(dataset[index]["text"])
inputs = tokenizer(prompt, return_tensors="pt", padding=True)
# Alternative input kept for reference: the rendered template instead of the
# hand-written prompt.
# inputs = tokenizer(template.apply(dataset[0])[0], return_tensors="pt", padding=True)

# Shape of the encoded prompt tensor.
inputs["input_ids"].shape

torch.Size([1, 470])

In [95]:
%%time

generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "eos_token_id": -1,
    "max_new_tokens": 2,
}

output = model.generate(
    input_ids=inputs["input_ids"].to("cuda"),
    attention_mask=inputs["attention_mask"].to("cuda"),
    **generation_kwargs
)
text_output = tokenizer.batch_decode(output)

CPU times: user 440 ms, sys: 0 ns, total: 440 ms
Wall time: 441 ms


In [96]:
# Print the decoded text of the first (and only) generated sequence.
print(text_output[0])

If you decide to watch Wild Rebels, don't expect anything deep and meaningful. If you're looking for a film that explores the relationships and structure of a motorcycle gang, Wild Rebels is the wrong movie. If you're looking for an expose on the breakdown of the American educational system and the problem of juvenile delinquency, Wild Rebels is the wrong movie. If you're looking for a movie that examines how undermanned rural police departments are when facing a well-financed, well-organized gang, Wild Rebels is the wrong movie. But if you're looking for an absurd movie filled with scene after scene of unintentional humor, horrendous acting, a paper-thin plot, and community theater style production values, Wild Rebels is the right movie.<br /><br />Wild Rebels is the story of a down-on-his-luck stock-car driver named Rod Tillman (Steve Alaimo). After a fiery crash (which Rod walks away from completely unscathed despite having only a cotton pants and a London Fog style jacket for prote