In [1]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [2]:
%pwd

'/content'

In [3]:
%cd drive/MyDrive/milestone_3_NotAnAGI/project_folders/notebooks/evaluation

/content/drive/MyDrive/milestone_3_NotAnAGI/project_folders/notebooks/evaluation


In [4]:
import sys
MODULES_PATHS = ["../../modules"]
for mpath in MODULES_PATHS:
    if mpath not in sys.path:
        sys.path.append(mpath)

In [6]:
!pip install torch==1.13.1
!pip install -U bitsandbytes
!pip install -U git+https://github.com/huggingface/transformers.git
!pip install -U git+https://github.com/huggingface/peft.git
!pip install -U git+https://github.com/huggingface/accelerate.git
!pip install datasets
!pip install evaluate
!pip install "git+https://github.com/AIPHES/DiscoScore.git"
!pip install bert-score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-tneoo_e_
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-tneoo_e_
  Resolved https://github.com/huggingface/transformers.git to commit 0c3fdccf2f271fb7c44f6ea6e9f4ee234795f2c5
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/huggin

In [45]:
import json
import torch
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from peft import PeftConfig, PeftModel


from datasets_ft import build_hf_dataset
from evaluation import Evaluator

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

### Preparation

In [55]:
gen_config = {
    # general
    "num_return_sequences": 1,
    "min_length": 16,
    "max_length": 512,

    # sampling
    "top_k": 0,
    "do_sample": True,
    "top_p": 0.9,
    "temperature": 0.3,

    # against repetition
    "no_repeat_ngram_size": 4,
    "repetition_penalty": 1.5,

    # speed-up
    "use_cache": True
}


def generate_responses(model, tokenizer, all_input_ids, device=torch.device(device)):
    responses = []
    for input_ids in tqdm(all_input_ids):
      gen = model.generate(input_ids=input_ids.to(device).unsqueeze(0), **gen_config).squeeze()
      response = tokenizer.decode(gen, skip_special_tokens=True)
      responses.append(response)

    return responses

In [16]:
evaluator = Evaluator(model_type="bert-base-uncased", device_name=device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Our Assistant Model: Evaluating T5-LARGE (FINETUNED)

In [52]:
datapath = "../../../submission_items/datasets/gen_dataset_NotAnAGI.json"

tokenizer_id = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
with open(datapath, "r") as reader:
    _, testset = build_hf_dataset(json.load(reader), tokenizer)
    testset = testset.select(range(512))

Map:   0%|          | 0/10647 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10647 [00:00<?, ? examples/s]

In [18]:
model_path = "../../../submission_items/assistant-t5-large-lm"

config = PeftConfig.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, device_map="auto")
model = PeftModel.from_pretrained(model, model_path)
model = model.to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [56]:
queries = testset["query"]
responses = generate_responses(
    model=model,
    tokenizer=tokenizer,
    all_input_ids=testset["input_ids"])



  0%|          | 0/512 [00:00<?, ?it/s][A[A

  0%|          | 1/512 [00:06<51:32,  6.05s/it][A[A

  0%|          | 2/512 [00:10<44:21,  5.22s/it][A[A

  1%|          | 3/512 [00:21<1:04:52,  7.65s/it][A[A

  1%|          | 4/512 [00:24<48:28,  5.73s/it]  [A[A

  1%|          | 5/512 [00:27<42:19,  5.01s/it][A[A

  1%|          | 6/512 [00:31<39:33,  4.69s/it][A[A

  1%|▏         | 7/512 [00:38<44:07,  5.24s/it][A[A

  2%|▏         | 8/512 [00:41<39:38,  4.72s/it][A[A

  2%|▏         | 9/512 [00:43<30:54,  3.69s/it][A[A

  2%|▏         | 10/512 [00:45<25:59,  3.11s/it][A[A

  2%|▏         | 11/512 [00:52<38:06,  4.56s/it][A[A

  2%|▏         | 12/512 [01:09<1:09:21,  8.32s/it][A[A

  3%|▎         | 13/512 [01:17<1:08:28,  8.23s/it][A[A

  3%|▎         | 14/512 [01:19<52:20,  6.31s/it]  [A[A

  3%|▎         | 15/512 [01:22<43:30,  5.25s/it][A[A

  3%|▎         | 16/512 [01:32<54:14,  6.56s/it][A[A

  3%|▎         | 17/512 [01:47<1:16:04,  9.22s/it][A

In [66]:
responses_in_dict = [{"query": q, "model_answer": r} for q, r in zip(queries, responses)]
with open("./t5-large-finetuned-responses.json", "w") as writer:
  json.dump(responses_in_dict, writer, indent=2)

In [None]:
with open("./t5-large-finetuned-responses.json", "r") as reader:
  queries, responses = zip(*[(x["query"], x["model_answer"]) for x in json.load(reader)])
  queries = list(queries)
  responses = list(responses)

In [57]:
result = evaluator.evaluate(responses, [[q] for q in queries])
result

Already downloaded a model for the 'en' language


### Evaluating T5-BASE (FINETUNED)

In [65]:
tokenizer_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
with open(datapath, "r") as reader:
    _, testset = build_hf_dataset(json.load(reader), tokenizer)
    testset = testset.select(range(512))

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Map:   0%|          | 0/10647 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10647 [00:00<?, ? examples/s]

In [67]:
model_path = "../../models/t5-finetuned/t5-base"

config = PeftConfig.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, device_map="auto")
model = PeftModel.from_pretrained(model, model_path)
model = model.to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [69]:
queries = testset["query"]
responses = generate_responses(
    model=model,
    tokenizer=tokenizer,
    all_input_ids=testset["input_ids"])

100%|██████████| 512/512 [42:10<00:00,  4.94s/it]


In [70]:
responses_in_dict = [{"query": q, "model_answer": r} for q, r in zip(queries, responses)]
with open("./t5-base-finetuned-responses.json", "w") as writer:
  json.dump(responses_in_dict, writer, indent=2)

In [None]:
with open("./t5-base-finetuned-responses.json", "r") as reader:
  queries, responses = zip(*[(x["query"], x["model_answer"]) for x in json.load(reader)])
  queries = list(queries)
  responses = list(responses)

In [None]:
result = evaluator.evaluate(responses, [[q] for q in queries])
result

### Evaluating T5-BASE (FINETUNED + PPO)

In [71]:
model_path = "../../models/t5-base-finetuned-ppo/full-training"

config = PeftConfig.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, device_map="auto")
model = PeftModel.from_pretrained(model, model_path)
model = model.to(device)

In [72]:
queries = testset["query"]
responses = generate_responses(
    model=model,
    tokenizer=tokenizer,
    all_input_ids=testset["input_ids"])

100%|██████████| 512/512 [17:57<00:00,  2.10s/it]


In [73]:
responses_in_dict = [{"query": q, "model_answer": r} for q, r in zip(queries, responses)]
with open("./t5-base-finetuned-ppo-responses.json", "w") as writer:
  json.dump(responses_in_dict, writer, indent=2)

In [None]:
with open("./t5-base-finetuned-ppo-responses.json", "r") as reader:
  queries, responses = zip(*[(x["query"], x["model_answer"]) for x in json.load(reader)])
  queries = list(queries)
  responses = list(responses)

In [None]:
result = evaluator.evaluate(responses, [[q] for q in queries])
result

### Evaluating GPT2 (PRETRAINED)

In [86]:
tokenizer_id = "gpt2-large"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

with open(datapath, "r") as reader:
    _, testset = build_hf_dataset(json.load(reader), tokenizer)
    testset = testset.select(range(512))

Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/10647 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10647 [00:00<?, ? examples/s]

In [87]:
model_path = "gpt2-large"

model = AutoModelForCausalLM.from_pretrained(model_path)
model = model.to(device)

Downloading model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [100]:
def gpt2_generate_responses(model, tokenizer, all_input_ids, all_queries, device=device):
  attn_masks = [torch.ones_like(input_ids) for input_ids in all_input_ids]

  responses = []
  for query, input_ids, attn_mask in tqdm(list(zip(all_queries, all_input_ids, attn_masks))):
      gen = model.generate(input_ids=input_ids.to(device).unsqueeze(0), attention_mask=attn_mask.to(device).unsqueeze(0), **gen_config).squeeze()
      response = tokenizer.decode(gen, skip_special_tokens=True)
      response = response.split(query)[-1].strip()
      responses.append(response)

  return responses

In [None]:
queries = testset["query"]
responses = gpt2_generate_responses(
    model=model,
    tokenizer=tokenizer,
    all_input_ids=testset["input_ids"],
    all_queries=queries)

  0%|          | 0/512 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 1/512 [00:00<03:22,  2.53it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 2/512 [00:20<1:39:49, 11.74s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 3/512 [00:29<1:28:59, 10.49s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 4/512 [00:31<1:02:01,  7.33s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 605, but `max_length` is set to 512. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
  1%|          | 5/512 [00:31<40:35,  4.80s/it]  Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 6/512 [00:41<55:26,  6.58s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|▏        

In [None]:
responses_in_dict = [{"query": q, "model_answer": r} for q, r in zip(queries, responses)]
with open("./gpt2-large-responses.json", "w") as writer:
  json.dump(responses_in_dict, writer, indent=2)

In [None]:
with open("./gpt2-large-responses.json", "r") as reader:
  queries, responses = zip(*[(x["query"], x["model_answer"]) for x in json.load(reader)])
  queries = list(queries)
  responses = list(responses)

In [None]:
result = evaluator.evaluate(responses, [[q] for q in queries])
result