In [None]:
# This block handles some basic setup and data loading.  

## imports
from collections import defaultdict, Counter
import numpy as np
import math
import matplotlib.pyplot as plt
import tqdm
import random
import pdb

import torch
from torch import nn
import torch.nn.functional as F
import torchtext.legacy as torchtext

## download and load the data
# One Field instance is shared by every WikiText2 split.
text_field = torchtext.data.Field()
datasets = torchtext.datasets.WikiText2.splits(
    root='.',
    text_field=text_field,
)
(train_dataset, validation_dataset, test_dataset) = datasets

# The vocabulary is built over all three splits, not just the training split.
text_field.build_vocab(*datasets)
vocab = text_field.vocab
vocab_size = len(vocab)

In [None]:
!pip install transformers
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.1 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 53.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 63.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 50.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

The following code downloads model weights and can take a few minutes to run. For debugging purposes, feel free to swap out `gpt2-large` with a smaller model that downloads faster.

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Running on GPU" if device == "cuda" else "Running on CPU")

# Load the pretrained weights, move the model to the chosen device, then load
# the matching tokenizer.
model = GPT2LMHeadModel.from_pretrained("gpt2-large")
model = model.to(device)
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2-large")

Running on GPU


Downloading:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.02G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [None]:
# Invert the tokenizer vocabulary (token string -> id) into an id -> token
# string lookup table.
vocab = tokenizer.vocab
vocab_map = {}
for token, idx in tqdm.tqdm(vocab.items()):
    vocab_map[idx] = token

100%|██████████| 50257/50257 [00:00<00:00, 1533554.03it/s]


In [None]:
# Tokenize a sample string, then show the ids and the token strings they map to.
sentence = "<|endoftext|>I love LSTMs"
encodings = tokenizer(sentence, return_tensors='pt')
print(encodings.input_ids)
print([vocab_map[token_id] for token_id in encodings.input_ids[0].tolist()])

tensor([[50256,    40,  1842,   406,  2257, 10128]])
['<|endoftext|>', 'I', 'Ġlove', 'ĠL', 'ST', 'Ms']


In [None]:
# Look up the token id of each candidate label word. A notebook cell only
# displays its final expression, so the ids are collected into one dict instead
# of four separate expressions (of which only the last would be shown).
# `.item()` only succeeds on a one-element tensor, so it doubles as a check
# that each word is encoded as exactly one token.
label_token_ids = {
    word: tokenizer(word, return_tensors='pt')["input_ids"].item()
    for word in ("good", "bad", "positive", "negative")
}
label_token_ids

31591

### Prompting with Language Models

Language models can be coerced into performing a variety of different tasks via *prompting*, as shown in the [GPT-3 paper](https://arxiv.org/abs/2005.14165). At a high level, prompting involves putting a few training examples into the context of a model and then using next word prediction to predict labels. In this section, we'll use prompting and the GPT-2 Large model on the sentiment task SST-2. But first, we'll download the dataset:

In [None]:
!pip install datasets
from datasets import load_dataset
dataset = load_dataset("glue", "sst2")

Collecting datasets
  Downloading datasets-2.0.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 4.9 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 9.6 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 66.6 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.2.0-py3-none-any.whl (134 kB)
[K     |████████████████████████████████| 134 kB 69.9 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 62.2 MB/s 
[?25hCollecting frozenlist>=1.1.1
  Downloading frozenlist-1

Downloading builder script:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/sst2 (download: 7.09 MiB, generated: 4.81 MiB, post-processed: Unknown size, total: 11.90 MiB) to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Now, it's time to write a prompt for sentiment prediction. Here's an example of what a prompt might look like for the task of machine translation:


```
The French "Le Tea Party est atterré." in English is "The tea party is aghast"
The French "Cela est vraiment indispensable pour notre nation." in English is "This really is a must for our nation."
The French "Il va y avoir du changement dans la façon dont nous payons ces taxes." in English is "There is going to be a change in how we pay these taxes."
The French "La technologie est là pour le faire." in English is
```
The model would then predict the next tokens, hopefully resulting in an English translation of the final sentence. Note that these examples are all taken from the `wmt14` translation dataset. In our case, however, we'll be predicting sentiment, which is represented by a binary value in the Stanford Sentiment Treebank. To convert next-word predictions into a binary decision, we'll compute the probability of two words (e.g., "yes" vs. "no") and then choose our label based on whichever one has the highest probability. 

In [None]:
def predict_sentiment(prompt, sentence, neg_token_id=14774, pos_token_id=11274):
  """Predict a binary sentiment label for `sentence` by prompting the language model.

  The sentence is substituted into `prompt`, the filled prompt is run through
  the module-level `model`, and the log-probabilities that the model assigns to
  two candidate next tokens are compared.

  Args:
    prompt: Template string with a single `{}` placeholder; it is filled via
      `prompt.format(sentence)`.
    sentence: The text to classify.
    neg_token_id: Vocabulary id of the token that stands for the negative label.
      The default is presumably the id of 'bad' -- verify against the
      tokenizer.
    pos_token_id: Vocabulary id of the token that stands for the positive label.
      The default is presumably the id of 'good' -- verify against the
      tokenizer.

  Returns:
    0 if the negative token is strictly more likely than the positive token as
    the next token, otherwise 1 (ties resolve to 1).
  """
  # Assignment constraints:
  # - Write a prompt using 3-5 example sentences
  # - Append the current example to the prompt, with a template
  # - Compare probabilities of possible next tokens to get a predicted label
  # - Warning: DO NOT TRAIN OR FINE-TUNE A MODEL FOR THIS ASSIGNMENT
  filled_prompt = prompt.format(sentence)

  encodings = tokenizer(filled_prompt, return_tensors='pt')
  input_ids = encodings.input_ids.to(device)

  # Inference only: no gradients are needed, so skip building the autograd graph.
  with torch.no_grad():
    logits = model(input_ids).logits

  # logits is indexed [batch, position, vocab]; there is a single sequence in
  # the batch, and its last position holds the next-token scores.
  next_token_log_probs = F.log_softmax(logits[0, -1], dim=-1)
  neg = next_token_log_probs[neg_token_id]
  pos = next_token_log_probs[pos_token_id]

  return 0 if neg > pos else 1

In [None]:
# Few-shot prompt: five labelled examples (labels "good" / "bad") followed by a
# `{}` slot that predict_sentiment fills with the sentence to classify.
# NOTE: the backslash continuations sit inside the string literal, so the
# leading indentation of every continued line is part of the prompt text that
# the model sees. Re-indenting these lines changes the model input.
prompt = "The sentiment of 'love and romance are integral parts of human nature' is good. \
            The sentiment of 'distressed widow living alone' is bad. The sentiment \
            of 'wonderful joyous refreshing days ahead' is good. The sentiment of 'sad \
            depressing nights' is bad. The sentiment of 'the broken glass \
            shattered onto the bloody floor' is bad. The sentiment of '{}' is "

In [None]:
# Score the current prompt on the first `num_examples` examples of the SST-2
# training split. The sample size is named once so the loop bound and the
# accuracy denominator cannot drift apart.
num_examples = 1000
num_correct = 0
for idx in tqdm.tqdm(range(num_examples)):
  example = dataset["train"][idx]
  predicted_label = predict_sentiment(prompt, example["sentence"])
  if predicted_label == example["label"]:
    num_correct += 1
# Blank line so the accuracy is printed below the progress bar.
print()
print("Accuracy: {}".format(num_correct / num_examples))

100%|██████████| 1000/1000 [01:35<00:00, 10.49it/s]


Accuracy: 0.81





In [None]:
# Alternative few-shot prompt: the first example uses a "'<text>' Sentiment: good."
# layout, the second drops the "The sentiment of" lead-in, and the remaining
# examples keep the "The sentiment of '<text>' is <label>." template.
# NOTE(review): this rebinds `prompt`, but no visible cell evaluates it -- the
# mixed layouts look like a work-in-progress experiment; confirm intent.
# NOTE: the backslash continuations sit inside the string literal, so the
# leading indentation of every continued line is part of the prompt text.
prompt = "'Love and romance are integral parts of human nature' Sentiment: good. \
           'Distressed widow living alone' is bad. The sentiment \
            of 'wonderful joyous refreshing days ahead' is good. The sentiment of 'sad \
            depressing nights' is bad. The sentiment of 'the broken glass \
            shattered onto the bloody floor' is bad. The sentiment of '{}' is "

Our best performing model gets 85% accuracy on this task, and our template-only baseline (i.e., without any training examples in the context) gets 69%. We'll assign full credit to any solutions that score above 70%, but we encourage you to try to beat our scores! Please be aware that prompting can lead to high variance results, and check out the following paper for additional tips and details: https://arxiv.org/pdf/2102.09690.pdf

Are we cheating? How do we explain the gap between a top-of-the-line model and something worse than guessing? Note that this variability in accuracy arises not only from the keyword we search for, but also from the order of the prompts, the distribution of the prompts, and the number of prompts.