In [1]:
import transformers
from datasets import load_dataset

dataset = load_dataset("squad_it")
DEVICE = "cuda:0"

  from .autonotebook import tqdm as notebook_tqdm
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [2]:
from get_model import get_model
model_name = "saiga-7b"

model, tokenizer = get_model(model_name)
model = model.to(DEVICE)

Loading checkpoint shards: 100%|██████████| 2/2 [01:38<00:00, 49.36s/it]


In [3]:
from get_prompt import get_prompt

generate_prompt, stop = get_prompt(model_name)

In [4]:
def build_question(context, question):
    return f"Dato il seguente testo:\n{context}\nRispondi brevemente a questa domanda:\n{question}"

def build_answer(answer):
    return f"Risposta breve: {answer}"

In [5]:
import random

def get_shots(dataset, n):
    conversation = []
    for i in range(n):
        elem = random.choice(dataset["train"])
        q_shot = build_question(elem["context"], elem["question"])
        conversation.append(dict(
            role="user",
            text=q_shot
        ))
        conversation.append(dict(
            role="ai",
            text=build_answer(elem["answers"]["text"][0])
        ))
    return conversation

In [6]:
import torch

def forward_model(*, prompt, model, tokenizer):
    if "pad_token" not in tokenizer.special_tokens_map:
        tokenizer.pad_token = tokenizer.eos_token

    input_ids = tokenizer(
        prompt,
        return_tensors='pt',
        padding=True,
    ).input_ids
    with torch.no_grad():
        input_ids = input_ids.to(DEVICE)
        output_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=32,
            pad_token_id=tokenizer.eos_token_id
        )

    ret = []
    for i in range(0, len(output_ids)):
        generated_text = tokenizer.decode(
            output_ids[i],
            skip_special_tokens=True
        )
        generated_text = generated_text[len(prompt[i]):]

        if "\n" in generated_text:
            generated_text = generated_text[:generated_text.index("\n")]
        
        ret.append(generated_text.strip())

    return ret

In [7]:
import time

t0 = time.time()
out = forward_model(
    prompt=["La vita è"],
    model=model,
    tokenizer=tokenizer,
)
print(time.time() - t0)

print(out)

4.7999794483184814
['bella (Life is Beautiful) is a 1997 Italian film directed by Roberto Benigni. The film is a Holocaust drama set']


In [8]:
import uuid
from tqdm import tqdm

batch_size = 1

predicted_answers = []
theoretical_answers = []

ds = dataset["test"].shuffle(seed=42)

bar = tqdm(ds.iter(batch_size=batch_size), total=len(ds))
total = 0
for elem in bar:
    try:
        ids = elem["id"]
        context = elem["context"]
        question = elem["question"]
        answers = elem["answers"]

        model_inputs = []
        for c, q in zip(context, question):
            model_inputs.append(
                generate_prompt(
                    get_shots(dataset, 3) + [
                        dict(
                            role="user",
                            text=build_question(c, q)
                        )
                    ],
                    do_continue=True
                ) + " " + build_answer("").strip()
            )

        model_outputs = forward_model(
            prompt=model_inputs,
            model=model,
            tokenizer=tokenizer,
        )

        for model_output, id in zip(model_outputs, ids):
            predicted_answers.append(dict(
                id=id,
                prediction_text=model_output,
            ))

        for ans, id in zip(answers, ids):
            theoretical_answers.append(dict(
                id=id,
                answers=ans,
            ))
        
    except Exception as e:
        print(e)
        continue






100%|██████████| 7609/7609 [12:58:06<00:00,  6.14s/it]  


FileNotFoundError: [Errno 2] No such file or directory: './cache/generated-squad-saiga-7b.json'

In [10]:
import os
import json

# Create the directory if it doesn't exist
os.makedirs('./cache', exist_ok=True)
with open(f"./cache/generated-squad-{model_name}.json", "w") as f:
    json.dump(dict(
        predicted_answers=predicted_answers,
        theoretical_answers=theoretical_answers,
    ), f, indent=4)

In [11]:
import json
with open(f"./cache/generated-squad-{model_name}.json", "r") as f:
    data = json.load(f)

print(len(data["predicted_answers"]))

7609


In [12]:
import evaluate

predicted_answers = data["predicted_answers"]
theoretical_answers = data["theoretical_answers"]

metric = evaluate.load("squad")
results = metric.compute(predictions=predicted_answers, references=theoretical_answers)

import time


print("=== REPORT ===")
print("current date:", time.strftime("%d/%m/%Y %H:%M:%S"))
print("Dataset: SQuAD-it")
print("Model:", model_name)
print(results)
print("==========================")

Downloading builder script: 100%|██████████| 4.53k/4.53k [00:00<?, ?B/s]
Downloading extra modules: 100%|██████████| 3.32k/3.32k [00:00<?, ?B/s]


=== REPORT ===
current date: 13/01/2024 11:19:59
Dataset: SQuAD-it
Model: saiga-7b
{'exact_match': 51.79392824287029, 'f1': 70.87586474960958}
