<a href="https://colab.research.google.com/github/FernandoFGr/Automatic-Visual-Inspection-of-Road-Pavements/blob/master/Final_project/Fernando_Fortes_Granado/Pruning_GPT_J_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Tue Jul  5 02:43:19 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   56C    P0    43W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import torch
torch.cuda.empty_cache()

In [6]:
import random
from transformers import AutoTokenizer, GPTNeoXForCausalLM
from transformers import GPTJForCausalLM, GPTNeoXTokenizerFast
from transformers import GPT2Tokenizer, GPT2Model

random.seed(10)

model = GPTJForCausalLM.from_pretrained(
    "EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16, low_cpu_mem_usage=True
)

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")

device = torch.device("cuda")
model.to(device)

GPTJForCausalLM(
  (transformer): GPTJModel(
    (wte): Embedding(50400, 4096)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0): GPTJBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (out_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): GPTJMLP(
          (fc_in): Linear(in_features=4096, out_features=16384, bias=True)
          (fc_out): Linear(in_features=16384, out_features=4096, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
      (1): GPTJBlock(
  

In [7]:
from typing import *
from datasets import load_dataset
import numpy as np

class Dataset:
  def __init__(self, dataset, max_tokens):
    self.max_tokens = max_tokens
    self.dataset = dataset
    self.token_dataset = [
        (
            tokenizer(dataset[idx]["text"], return_tensors="pt"),
            "Positive" if dataset[idx]["label"] else "Negative",
        ) for idx in range(len(dataset))
    ]
    num_tokens = np.array([self.token_dataset[idx][0].input_ids.shape[1] for idx in range(len(dataset))])
    self.dataset_idx = np.argwhere(num_tokens <= max_tokens).reshape(-1)
  
  def __getitem__(self, idx):
    return self.token_dataset[int(self.dataset_idx[idx])]

  def get_raw_item(self, idx):
    item = self.dataset[int(self.dataset_idx[idx])]
    return item["text"], "Positive" if item["label"] else "Negative"

  def __len__(self):
    return len(self.dataset_idx)

  
class InferencePromptGen:
  def __init__(self, learning_examples: List[Tuple[str, str]], dataset: Dataset):
    self.dataset = dataset
    self.learning_examples = learning_examples
    self.fixed_text = self._get_fixed_text()

  def __getitem__(self, idx):
    return f"{self.fixed_text}\n\n{self._get_inference_text(idx)}", self.dataset[idx][1]

  def _get_fixed_text(self):
    instruction = "Instruction: Given a movie review, answer if the sentiment of the review is positive or negative."
    examples = "\n\n".join([f"""Example {i}:\nReview: {text}\nSentiment: {label}""" for i, (text, label) in enumerate(self.learning_examples)])
    return f"{instruction}\n\n{examples}"

  def _get_inference_text(self, idx):
    return f"Example {len(self.learning_examples)}:\nReview: {self.dataset.get_raw_item(idx)[0]}\nSentiment"


def predict(prompt, model, tokenizer):
  tokenized = tokenizer(prompt, return_tensors="pt")
  input_ids = tokenized.input_ids.to(device)
  attention_mask = tokenized.attention_mask.to(device)

  with torch.no_grad():
    gen_tokens = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=False,
        temperature=1,
        max_length=2048,
    )
  gen_text = tokenizer.batch_decode(gen_tokens)[0]
  return gen_text.split("\n")[len(prompt.split("\n")) - 1].split()[1]


imdb = load_dataset("imdb")
MAX_TOKENS = 300
NUM_INFERENCE_SAMPLES = 200
NUM_LEARNING_SAMPLES = 7

train_dataset = Dataset(
    dataset=imdb["train"].shuffle(seed=42),
    max_tokens=MAX_TOKENS,
)

test_dataset = Dataset(
    dataset=imdb["test"].shuffle(seed=42),
    max_tokens=MAX_TOKENS,
)

train_prompt_gen = InferencePromptGen(
    learning_examples=[train_dataset.get_raw_item(i) for i in range(NUM_LEARNING_SAMPLES)],
    dataset=train_dataset,
)

test_prompt_gen = InferencePromptGen(
    learning_examples=[train_dataset.get_raw_item(i) for i in range(NUM_LEARNING_SAMPLES)],
    dataset=test_dataset,
)

samples = list(range(NUM_INFERENCE_SAMPLES))
test_prompts = [test_prompt_gen[i][0] for i in samples]
test_labels = [test_prompt_gen[i][1] for i in samples]

samples = list(range(NUM_LEARNING_SAMPLES, NUM_LEARNING_SAMPLES + NUM_INFERENCE_SAMPLES))
train_prompts = [train_prompt_gen[i][0] for i in samples]
train_labels = [train_prompt_gen[i][1] for i in samples]

Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-f700cd6f222bd13f.arrow
Token indices sequence length is longer than the specified maximum sequence length for this model (2174 > 2048). Running this sequence through the model will result in indexing errors
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-b23cfeb68a931a8d.arrow


## **Balanceamento**


*   200 prompts de teste: 101 positivos, 99 negativos



In [8]:
print(test_labels.count("Positive"))
print(test_labels.count("Negative"))

101
99


In [9]:
import torch
def evaluate(model, tokenizer, prompts: List[str], labels: List[str]):
  num_correct_preds = 0
  for idx in range(len(prompts)):
    if ((idx % 10) == 0) and (idx != 0):
      print(f"Number of samples evaluated: {idx}, Number of correct preds: {num_correct_preds}, Accuracy: {num_correct_preds / idx}")
    output = predict(prompts[idx], model, tokenizer)
    print(labels[idx], output, labels[idx] == output)

    if labels[idx] == output:
      num_correct_preds += 1
    else:
      print(f"\n_____________________________\n\nPROMPT\n{prompts[idx]}\n\n_____________________________\n\nLABEL\n{labels[idx]}\nOUTPUT\n{output}\n\n_____________________________\n\n")
  return num_correct_preds / len(prompts)


def evaluate_perplexity(model, tokenizer, dataset: Dataset, num_samples):
  acc_loss = 0
  acc_tokens = 0
  all_idxs = list(range(len(dataset)))
  sampled_idxs = random.sample(all_idxs, num_samples)
  for samples_evaluated, idx in enumerate(sampled_idxs):
    if ((samples_evaluated % 10) == 0) and (samples_evaluated != 0):
      print(f"Number of samples evaluated: {samples_evaluated}, Average perplexity: {torch.exp(torch.tensor(acc_loss / acc_tokens))}")

    input_ids = dataset[idx][0].input_ids.to(device)
    attention_mask = dataset[idx][0].attention_mask.to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
    loss = outputs.loss
    logits = outputs.logits
    acc_loss += loss.item() * input_ids.shape[1]  # soma do loss para todos os tokens da frase
    acc_tokens += input_ids.shape[1]  # numero de tokens

  return torch.exp(torch.tensor(acc_loss / acc_tokens))

## **Cálculo da Perplexidade:**


*   Filtro de amostras com até 2048 tokens
*   20 amostras (5100 tokens). 13 segundos por avaliação
*   **Baseline de treino: X**
*   **Baseline de teste: X**

TODO: fazer predicts em batch para aumentar velocidade e avaliar com mais dados?




In [10]:
train_dataset_lim_2048 = Dataset(
    dataset=imdb["train"].shuffle(seed=42),
    max_tokens=2048,
)
evaluate_perplexity(model, tokenizer, train_dataset_lim_2048, len(train_dataset_lim_2048))

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-f700cd6f222bd13f.arrow


Number of samples evaluated: 10, Average perplexity: 18.094568252563477
Number of samples evaluated: 20, Average perplexity: 21.42386245727539
Number of samples evaluated: 30, Average perplexity: 20.825502395629883
Number of samples evaluated: 40, Average perplexity: 20.08266830444336
Number of samples evaluated: 50, Average perplexity: 19.392093658447266


KeyboardInterrupt: ignored

In [None]:
test_dataset_lim_2048 = Dataset(
    dataset=imdb["test"].shuffle(seed=42),
    max_tokens=2048,
)
evaluate_perplexity(model, tokenizer, test_dataset_lim_2048, len(test_dataset_lim_2048))

## **Precisão Few-Shot:**


*   200 prompts
*   Filtro de amostras com até 300 tokens
*   **Baseline de treino: X**
*   **Baseline de teste: X**

In [10]:
evaluate(model, tokenizer, train_prompts, train_labels)

RuntimeError: ignored

In [35]:
evaluate(model, tokenizer, test_prompts, test_labels)

Positive Positive True


KeyboardInterrupt: ignored

## **Testes de Pruning de cabeças de atenção:**


*   Pruning de todas as cabeças de uma camada devem levar o desempenho a ser péssimo, pois acaba com o fluxo de informação
*   Pruning de poucas cabeças devem interferir pouco no resultado
*   Pruning de muitas cabeças aleatoriamente devem progressivamente derrubar o desempenho

Implementação: o GPT-J possui camadas com 16 cabeças de atenção, cada uma com dimensão 256. Para realizar o pruning, iremos zerar um bloco de 256 elementos do vetor v_proj, fazendo com que o vetor de valores de uma cabeça seja sempre 0, de forma que sua saída será 0.

Adicionar camada de pruning pode fazer o modelo ocupar mais memória durante o desenvolvimento.

In [10]:
v_proj_layers = []
for name, param in model.named_parameters():
    if "v_proj" in name:
        v_proj_layers.append((name, param))

In [19]:
# new_shape = param.size()[:-1] + (16, 256)  # torch.Size([4096, 16, 256])
# param = param.view(new_shape)
# [:, 0:256, :]

torch.Size([4096, 16, 256])

In [21]:
print(param[:, 0:256])

tensor([[ 0.0047, -0.0033, -0.0013,  ...,  0.0275,  0.0188,  0.0024],
        [-0.0052, -0.0143, -0.0018,  ...,  0.0131, -0.0031,  0.0146],
        [ 0.0203,  0.0271,  0.0326,  ..., -0.0054, -0.0129, -0.0244],
        ...,
        [ 0.0131, -0.0034,  0.0187,  ...,  0.0260, -0.0007,  0.0075],
        [-0.0081, -0.0316,  0.0125,  ...,  0.0189,  0.0147,  0.0095],
        [ 0.0006,  0.0200,  0.0051,  ...,  0.0244, -0.0071,  0.0047]],
       device='cuda:0', dtype=torch.float16, grad_fn=<SliceBackward0>)


In [22]:
print(param.view(torch.Size([4096, 16, 256]))[:,0,:])

tensor([[ 0.0047, -0.0033, -0.0013,  ...,  0.0275,  0.0188,  0.0024],
        [-0.0052, -0.0143, -0.0018,  ...,  0.0131, -0.0031,  0.0146],
        [ 0.0203,  0.0271,  0.0326,  ..., -0.0054, -0.0129, -0.0244],
        ...,
        [ 0.0131, -0.0034,  0.0187,  ...,  0.0260, -0.0007,  0.0075],
        [-0.0081, -0.0316,  0.0125,  ...,  0.0189,  0.0147,  0.0095],
        [ 0.0006,  0.0200,  0.0051,  ...,  0.0244, -0.0071,  0.0047]],
       device='cuda:0', dtype=torch.float16, grad_fn=<SliceBackward0>)


In [11]:
layer = 0
head = 0
model.state_dict()[v_proj_layers[layer][0]].view(torch.Size([4096, 16, 256]))[:,head,:]

tensor([[ 0.0047, -0.0033, -0.0013,  ...,  0.0275,  0.0188,  0.0024],
        [-0.0052, -0.0143, -0.0018,  ...,  0.0131, -0.0031,  0.0146],
        [ 0.0203,  0.0271,  0.0326,  ..., -0.0054, -0.0129, -0.0244],
        ...,
        [ 0.0131, -0.0034,  0.0187,  ...,  0.0260, -0.0007,  0.0075],
        [-0.0081, -0.0316,  0.0125,  ...,  0.0189,  0.0147,  0.0095],
        [ 0.0006,  0.0200,  0.0051,  ...,  0.0244, -0.0071,  0.0047]],
       device='cuda:0', dtype=torch.float16)

In [12]:
model.state_dict()[v_proj_layers[layer][0]].view(torch.Size([4096, 16, 256]))[:,:,:] = 0

In [13]:
model.state_dict()[v_proj_layers[layer][0]].view(torch.Size([4096, 16, 256]))[:,head,:]

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0', dtype=torch.float16)

In [14]:
evaluate(model, tokenizer, test_prompts, test_labels)
# hipotese: skip connection ta fazendo o fluxo funcionar

Positive Positive True
Negative Negative True
Positive Positive True


KeyboardInterrupt: ignored

In [15]:
train_dataset_lim_2048 = Dataset(
    dataset=imdb["train"].shuffle(seed=42),
    max_tokens=2048,
)
evaluate_perplexity(model, tokenizer, train_dataset_lim_2048, len(train_dataset_lim_2048))

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-f700cd6f222bd13f.arrow


RuntimeError: ignored

In [None]:
# Ideia de pruning: Setar o v_proj em 0. Setar apenas as posicoes referentes a cabeca. 16 cabeças de 256.

In [None]:
# Obter baseline (perplexidade) com predict de proxima palavra no IMDB em paralelo. Fixar N amostras para obter essas avaliações.
# Implementar um pruning de uma cabeça especificada por um indice

In [None]:
# Acuracia no treino: 93%, NUM_INFERENCE_SAMPLES: 200 5 examples, 93,5% com 7 exemplos

In [None]:
# Acuracia no teste: 93,5%, NUM_INFERENCE_SAMPLES: 200 5 examples