In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import json
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === LLaMA Student Model ===
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(device)

def generate_output(prompt, max_new_tokens=128):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def clean_prediction(pred, task):
    if task == "summarization":
        return pred.split("Summary:")[-1].strip() if "Summary:" in pred else pred.strip()
    elif task == "qa":
        return pred.split("Answer:")[-1].strip() if "Answer:" in pred else pred.strip()
    elif task == "paraphrase":
        return pred.replace("Paraphrase:", "").strip()
    return pred.strip()

# === 1. CNN/DailyMail: Summarization ===
summarization_data = load_dataset("cnn_dailymail", "3.0.0", split="test[:100]")

summarization_outputs = []
for item in tqdm(summarization_data, desc="Summarization"):
    prompt = f"Summarize the following article:\n\n{item['article']}\n\nSummary:"
    output = generate_output(prompt)
    summarization_outputs.append({
        "id": item["id"],
        "reference": item["highlights"],
        "prediction": output,
        "clean_prediction": clean_prediction(output, "summarization")
    })

with open("llama_summarization_outputs.json", "w") as f:
    json.dump(summarization_outputs, f, indent=2)


# === 2. SQuAD v2: Question Answering ===
qa_data = load_dataset("squad_v2", split="validation[:100]")

qa_outputs = []
for item in tqdm(qa_data, desc="QA"):
    prompt = (
        f"Context: {item['context']}\n"
        f"Question: {item['question']}\n"
        f"Answer:"
    )
    output = generate_output(prompt)
    qa_outputs.append({
        "id": item["id"],
        "reference": item["answers"],  # Dict: {text: [...], answer_start: [...]}
        "prediction": output,
        "clean_prediction": clean_prediction(output, "qa")
    })

with open("llama_qa_outputs.json", "w") as f:
    json.dump(qa_outputs, f, indent=2)


# === 3. Quora Question Pairs: Paraphrase Generation ===
quora_data = load_dataset("quora", split="train[:100]")

para_outputs = []
for item in tqdm(quora_data, desc="Paraphrasing"):
    if item["is_duplicate"]:
        prompt = f"Paraphrase this question:\n{item['questions']['text']}\n"
        output = generate_output(prompt)
        para_outputs.append({
            "reference": item["questions"]["text"],
            "prediction": output,
            "clean_prediction": clean_prediction(output, "paraphrase")
        })

with open("llama_paraphrase_outputs.json", "w") as f:
    json.dump(para_outputs, f, indent=2)

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

2025-05-09 21:43:57.580879: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746827037.772913      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746827037.825016      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Summarization:  31%|███       | 31/100 [01:23<03:00,  2.61s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (2197 > 2048). Running this sequence through the model will result in indexing errors
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
Summarization: 100%|██████████| 100/100 [04:37<00:00,  2.78s/it]


README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

QA: 100%|██████████| 100/100 [04:46<00:00,  2.86s/it]


README.md:   0%|          | 0.00/5.69k [00:00<?, ?B/s]

quora.py:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

The repository for quora contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/quora.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/58.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/404290 [00:00<?, ? examples/s]

Paraphrasing: 100%|██████████| 100/100 [01:40<00:00,  1.01s/it]


In [3]:
del model
del tokenizer
import gc
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [4]:
!pip install evaluate --quiet

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.12.0 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.8.4.1 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cudnn-cu12 9.3.0.75 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cufft-cu12==11.2.1.3; platform_system == "Linux" and platform_machine == "x86_64", 

In [5]:
pip install rouge_score bert_score sacrebleu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.m

In [6]:
import json
from datasets import load_dataset
from tqdm import tqdm
import evaluate  # This is the new official way

# Load metrics using evaluate (not datasets)
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
bleu = evaluate.load("sacrebleu")
meteor = evaluate.load("meteor")

# === Load Predictions ===
with open("llama_summarization_outputs.json") as f:
    summarization_preds = json.load(f)

with open("llama_qa_outputs.json") as f:
    qa_preds = json.load(f)

with open("llama_paraphrase_outputs.json") as f:
    paraphrase_preds = json.load(f)

# === 1. Summarization Evaluation ===
ref_sum = [item["reference"] for item in summarization_preds]
pred_sum = [item["clean_prediction"] for item in summarization_preds]

print("=== Summarization Metrics ===")
print(rouge.compute(predictions=pred_sum, references=ref_sum, use_stemmer=True)['rougeL'])

# === 2. QA Evaluation ===
ref_qa = [item["reference"]["text"][0] if item["reference"]["text"] else "no answer" for item in qa_preds]
pred_qa = [item["clean_prediction"] for item in qa_preds]

print("\n=== Question Answering Metrics ===")
print("ROUGE-L:", rouge.compute(predictions=pred_qa, references=ref_qa, use_stemmer=True)['rougeL'])

bert = bertscore.compute(predictions=pred_qa, references=ref_qa, lang="en")
print(f"BERTScore (F1): {sum(bert['f1']) / len(bert['f1']):.4f}")

# === 3. Paraphrase Generation Evaluation ===
ref_para = [item["reference"] for item in paraphrase_preds]
pred_para = [item["clean_prediction"] for item in paraphrase_preds]

print("\n=== Paraphrase Generation Metrics ===")
print("SacreBLEU:", bleu.compute(predictions=pred_para, references=[[r] for r in ref_para])['score'])
print("METEOR:", meteor.compute(predictions=pred_para, references=ref_para)['meteor'])

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


=== Summarization Metrics ===
0.21415716736243506

=== Question Answering Metrics ===
ROUGE-L: 0.018712809985732416


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore (F1): 0.7869

=== Paraphrase Generation Metrics ===
SacreBLEU: 21.49810392211137
METEOR: 0.4045737687460605
