## 0. Set up

In [1]:
!pip install datasets evaluate transformers
!pip install -q evaluate
!pip install -q rouge_score
!pip install tree-sitter-python==0.21
!pip install codebleu

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m9

In [2]:
from datasets import load_dataset
from evaluate import load
import re
import json
import torch
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from transformers import (AutoTokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Trainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, GPT2LMHeadModel,
                          DataCollatorForLanguageModeling, TrainingArguments,EvalPrediction)
import os
import pickle
from tqdm import tqdm
from codebleu import calc_codebleu
os.environ["WANDB_DISABLED"] = "true"

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## 1. Load and clean dataset

In [5]:
number_of_solutions = 1
def clean_dataset(batch):
  cleaned_questions = []
  cleaned_solutions = []
  difficulties = []

  for question, solutions_str, difficulty in zip(batch["question"], batch["solutions"], batch["difficulty"]):
      cleaned_question = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)', '', question)
      solutions = json.loads(solutions_str) if solutions_str  else []
      for sol in solutions[:number_of_solutions]:
          cleaned_questions.append(cleaned_question)
          cleaned_solutions.append(sol)
          difficulties.append(difficulty)

  return {"question": cleaned_questions, "solution": cleaned_solutions, "difficulty": difficulties}

def tokenize_function_for_seq2seq(example, tokenizer): # tokenization function
    inputs = tokenizer(example['question'], truncation=True, max_length=512) # Tokenize inputs (questions)
    labels = tokenizer(example['solution'], truncation=True, max_length=512) # Tokenize targets (solutions)

    labels_with_ignore = [] # Replace padding token id with -100 so it's ignored in the loss
    for label in labels['input_ids']:
        labels_with_ignore.append([-100 if token == tokenizer.pad_token_id else token for token in label])
    inputs['labels'] = labels_with_ignore

    inputs['difficulty'] = example['difficulty']

    return inputs

def tokenize_function_for_auto_regressive_evaluation(example, tokenizer): # tokenization function
    inputs = tokenizer(example['question'][:500] + '\n[CODE]\n', truncation=True, max_length=512)
    solution = tokenizer(example['solution'][:500] + tokenizer.eos_token, truncation=True, max_length=512)

    # Create labels - we only want to compute loss on the solution part
    labels = solution['input_ids']

    inputs['labels'] = labels
    return inputs

In [6]:
cpa_dataset = load_dataset('codeparrot/apps')
cpa_test_set = cpa_dataset["test"]

cleaned_cpa_test_set = cpa_test_set.map(clean_dataset, batched=True, remove_columns=cpa_test_set.column_names)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.63k [00:00<?, ?B/s]

apps.py:   0%|          | 0.00/4.95k [00:00<?, ?B/s]

The repository for codeparrot/apps contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/codeparrot/apps.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


train.jsonl:   0%|          | 0.00/107M [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/1.29G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

## 2. Load all models to evaluate

In [7]:
all_model_locs = {
    "T5-base": "Salesforce/codet5-base",
    "T5-base (finetuned)": '/content/drive/MyDrive/NLP_Code_Generation/t5-base-codeparrots', ## replace this with your fine-tuned model
    "GPT2-base": "gpt2",
    "GPT2-base (finetuned)": '/content/drive/MyDrive/NLP_Code_Generation/gpt2-codeparrots',  ## replaced this with your fidne-tuned model
}
### dictionary storing the model and tokenizer for each model for evaluation
all_models = {}
for model_name, model_loc in all_model_locs.items():
  all_models[model_name] = {}
  all_models[model_name]["tokenizer"] = AutoTokenizer.from_pretrained(model_loc)

  if model_name.startswith("T5"):
    all_models[model_name]["model"] = T5ForConditionalGeneration.from_pretrained(model_loc)

  elif model_name.startswith("GPT2"):
      all_models[model_name]["tokenizer"].pad_token = all_models[model_name]["tokenizer"].eos_token
      all_models[model_name]["model"] = GPT2LMHeadModel.from_pretrained(model_loc)

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

## 3. Load and define evaluation metrics and utility functions

In [10]:
bleu = load('bleu')
rouge = load('rouge')
meteor = load('meteor')

def preprocess_logits_for_metrics_for_seq2seq(logits, labels):
    '''
    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.
    https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941/15
    '''
    if isinstance(logits, tuple): pred_ids = logits[0]
    else: pred_ids = logits
    if pred_ids.ndim == 3: pred_ids = torch.argmax(pred_ids, dim=-1)
    return pred_ids, labels

def compute_metrics(eval_preds, tokenizer):
    preds = eval_preds.predictions[0]
    labels = eval_preds.label_ids
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id) # Replace -100 with pad token id
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id) # Replace -100 with pad token id

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BLEU, ROUGE, and exact match score
    bleu_results = bleu.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    rouge_results = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    meteor_results = meteor.compute(predictions=decoded_preds, references=decoded_labels)
    codebleu_results = calc_codebleu(decoded_labels, decoded_preds, lang="python", weights=(0.1, 0.1, 0.4, 0.4), tokenizer=None)
    # exact_match = sum(pred == label for pred, label in zip(decoded_preds, decoded_labels)) / len(decoded_preds)

    return {
        'bleu': bleu_results['bleu'],
        'rouge1': rouge_results['rouge1'],
        'rouge2': rouge_results['rouge2'],
        'rougeL': rouge_results['rougeL'],
        'meteor': meteor_results['meteor'],
        'codebleu': codebleu_results['codebleu'],
        'ngram_match_score': codebleu_results['ngram_match_score'],
        'weighted_ngram_match_score': codebleu_results['weighted_ngram_match_score'],
        'syntax_match_score': codebleu_results['syntax_match_score'],
        'dataflow_match_score': codebleu_results['dataflow_match_score']
        # 'codebleu': meteor_results['codebleu'],
        # 'exact_match': exact_match,
    }

# Create eval trainer for seq2seq model
def create_eval_trainer_for_seq2seq(model, tokenizer, eval_batch_size=128, predict_with_generate=True, generation_max_length=512):
  eval_args = Seq2SeqTrainingArguments(
      per_device_eval_batch_size=eval_batch_size,                       # Evaluation batch size
      predict_with_generate=predict_with_generate,          # Whether to use generate to calculate generative metrics (ROUGE, BLEU)
      generation_max_length=generation_max_length,                            # max generation length
      fp16=torch.cuda.is_available(),                       # Enable mixed-precision training if a CUDA GPU is available (faster, less memory)
      report_to=[]                                          # disable wandb report
  )

  eval_trainer = Seq2SeqTrainer(
      model=model,
      args=eval_args,
      data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model), # Set up data collator for dynamic padding
      preprocess_logits_for_metrics=preprocess_logits_for_metrics_for_seq2seq,
      compute_metrics=lambda x: compute_metrics(x, tokenizer),
  )
  return eval_trainer

# Evaluation functions seq2seq model: use Trainer.evaluate #
def evaluate_seq2seq_model(model, tokenizer, test_dataset):
  tokenized_test_set= test_dataset.map(lambda x: tokenize_function_for_seq2seq(x, tokenizer), batched=True, remove_columns=test_dataset.column_names)
  ## Create trainer
  eval_trainer = create_eval_trainer_for_seq2seq(model, tokenizer, predict_with_generate=True)
  preds = eval_trainer.predict(tokenized_test_set, metric_key_prefix='test')
  metrics = preds.metrics
  # metrics = eval_trainer.evaluate(tokenized_test_set, metric_key_prefix='test')
  return metrics, preds


def shift_padding_to_left(batch, pad_token=-100):
    shifted_batch = []

    for sample in batch:
        # Find the index of the last non-padding token
        last_non_pad_idx = (sample != pad_token).nonzero(as_tuple=True)[0].max().item() if (sample != pad_token).any() else -1

        # Create a new tensor with padding shifted to the left
        shifted_sample = torch.full_like(sample, pad_token)  # Start with all padding (-100)
        shifted_sample[-(last_non_pad_idx + 1):] = sample[:last_non_pad_idx + 1]  # Copy valid tokens to the right side

        shifted_batch.append(shifted_sample)

    return torch.stack(shifted_batch)

# Evaluation functions for autoregressive #
def evaluate_autoregressive_model(model, tokenizer, test_dataset, generation_config={"do_sample":True, "temperature":0.9}, max_new_tokens=512):
  tokenized_test_set= test_dataset.map(lambda x: tokenize_function_for_auto_regressive_evaluation(x, tokenizer), batched=False, remove_columns=test_dataset.column_names)
  eval_dataloader = DataLoader(tokenized_test_set, shuffle=False, batch_size=16,
                               collate_fn=DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True))
  model.to(device)
  model.eval()
  ## predictions and labels of all samples
  preds = []
  labels = []
  for batch in tqdm(eval_dataloader):
    input_ids = batch['input_ids']
    ## Reverse the pad token to the left for proper generation
    input_ids = shift_padding_to_left(input_ids, tokenizer.pad_token_id)

    with torch.no_grad():
        gen_ids = model.generate(
            input_ids.to(device),
            # do_sample=True,
            # temperature=0.9,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.pad_token_id,
            attention_mask=batch['attention_mask'].to(device),
            eos_token_id=tokenizer.eos_token_id,
            **generation_config
        )
        # Get the generated sequences but ignore the input part
        gen_ids = gen_ids[:, input_ids.shape[-1]:]

    preds.extend(gen_ids.tolist())
    labels.extend(batch['labels'].tolist())

  preds = np.array(preds)
  labels = np.array(labels)
  eval_preds = EvalPrediction(predictions=(preds,), label_ids=labels)
  # gen_text = tokenizer.batch_decode(gen_ids.to('cpu'))

  metrics = compute_metrics(eval_preds, tokenizer)

  return metrics, eval_preds

  # bleu_results = bleu.compute(predictions=decoded_preds, references=[[label] for label in labels])
  # rouge_results = rouge.compute(predictions=decoded_preds, references=labels)
  # meteor_results = meteor.compute(predictions=decoded_preds, references=labels)
  # codebleu_results = calc_codebleu(labels, decoded_preds, lang="python", weights=(0.25, 0.25, 0.25, 0.25), tokenizer=tokenizer)

  # return {
  #     'test_bleu': bleu_results['bleu'],
  #     'test_rouge1': rouge_results['rouge1'],
  #     'test_rouge2': rouge_results['rouge2'],
  #     'test_rougeL': rouge_results['rougeL'],
  #     'test_meteor': meteor_results['meteor'],
  # }

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# finetuned_T5 = all_models['T5-base (finetuned)']['model']
# tokenizer = all_models['T5-base (finetuned)']['tokenizer']
# pretrained_T5 = all_models['T5-base']['model']

# tokenized_test_set = cleaned_cpa_test_set.map(lambda x: tokenize_function_for_seq2seq(x, tokenizer), batched=True, remove_columns=cleaned_cpa_test_set.column_names)
# tokenized_samples = tokenized_test_set.select(range(1))

In [None]:
# decoded_pretrained = tokenizer.decode(pretrained_T5.generate(torch.tensor(tokenized_samples['input_ids']).to(device), max_new_tokens=512)[0], skip_special_tokens=True)
# decoded_finetuned = tokenizer.decode(finetuned_T5.generate(torch.tensor(tokenized_samples['input_ids']).to(device), max_new_tokens=512)[0], skip_special_tokens=True)
# decoded_labels = tokenizer.decode(torch.tensor(tokenized_samples['labels'])[0].to(device), skip_special_tokens=True)

In [None]:
# calc_codebleu([decoded_labels], [decoded_pretrained], lang="python", weights=(0.25, 0.25, 0.25, 0.25), tokenizer=None)

{'codebleu': 0.004464285714285714,
 'ngram_match_score': 0,
 'weighted_ngram_match_score': 0,
 'syntax_match_score': 0.0,
 'dataflow_match_score': 0.017857142857142856}

## 4. Run evaluation

In [None]:
import os, pickle
saved_predictions_path = "/content/drive/MyDrive/NLP_Code_Generation/saved_predictions"
if not os.path.exists(saved_predictions_path):
  os.makedirs(saved_predictions_path)

saved_results_path = "/content/drive/MyDrive/NLP_Code_Generation/saved_experiment_results"
if not os.path.exists(saved_predictions_path):
  os.makedirs(saved_results_path)

### 4.1. Comparison between different Seq2Seq on the entire test set (different difficulty levels)

In [None]:
all_metric_names = ['bleu', 'codebleu', 'ngram_match_score', 'weighted_ngram_match_score', 'syntax_match_score', 'dataflow_match_score', 'rouge1', 'rouge2', 'rougeL', 'meteor']
model_comparison_results_df = pd.DataFrame(columns=['model', 'difficulty', *all_metric_names])

# all_difficulty_lvls = ['all'] + list(set(cleaned_cpa_test_set['difficulty']))
all_difficulty_lvls = list(set(cleaned_cpa_test_set['difficulty']))
def select_samples_based_on_difficulties(dataset, difficulty='all'):
  return dataset if (difficulty == 'all') else dataset.filter(lambda example: example['difficulty'] == difficulty)

for model_name in ["T5-base (finetuned)", "T5-base"]:
  print(f"Evaluating model: {model_name}")
  model = all_models[model_name]["model"]
  tokenizer = all_models[model_name]["tokenizer"]

  ## for each difficulty levels
  for lvl in all_difficulty_lvls:
    ### Run eval from scratch ###
    # Filter out the test set to only the current level
    # test_dataset_lvl = select_samples_based_on_difficulties(cleaned_cpa_test_set, difficulty=lvl) # test
    ## Run evaluation
    # metrics, preds = evaluate_seq2seq_model(model, tokenizer, test_dataset_lvl)
    # Save pred path for later use
    # with open(os.path.join(saved_predictions_path, f"{model_name}_{lvl}_predictions.pkl"), "wb") as f:
    #   pickle.dump(preds, f)

    # model_comparison_results_df.loc[len(model_comparison_results_df)] = [model_name, lvl, metrics['test_bleu'], metrics['test_codebleu'], metrics['test_rouge1'], metrics['test_rouge2'], metrics['test_rougeL'], metrics['test_meteor']]

    ### Run eval from saved predictions ###
    with open(os.path.join(saved_predictions_path, f"{model_name}_{lvl}_predictions.pkl"), "rb") as f:
      preds = pickle.load(f)

    metrics = compute_metrics(preds, tokenizer)
    model_comparison_results_df.loc[len(model_comparison_results_df)] = [model_name, lvl] + [metrics[x] for x in all_metric_names]


# model_comparison_results_df = model_comparison_results_df.pivot(index="model", columns="difficulty", values=all_metric_names)
# model_comparison_results_df = model_comparison_results_df.sort_index(axis=1, level=1)
# model_comparison_results_df.columns = pd.MultiIndex.from_tuples([(d, m) for m, d in model_comparison_results_df.columns])
model_comparison_results_df.to_excel(os.path.join(saved_results_path, "seq2seq_model_comparison_results.xlsx"))
model_comparison_results_df

Evaluating model: T5-base (finetuned)
Evaluating model: T5-base


Unnamed: 0,model,difficulty,bleu,codebleu,ngram_match_score,weighted_ngram_match_score,syntax_match_score,dataflow_match_score,rouge1,rouge2,rougeL,meteor
0,T5-base (finetuned),interview,0.138539,0.172883,0.0391087,0.050606,0.233583,0.176195,0.291358,0.131047,0.243383,0.236653
1,T5-base (finetuned),introductory,0.123787,0.2061,0.03120243,0.057608,0.272952,0.220097,0.30674,0.147805,0.265407,0.274619
2,T5-base (finetuned),competition,0.12469,0.145114,0.03608964,0.040422,0.20544,0.138216,0.271158,0.115966,0.222534,0.197653
3,T5-base,interview,1.7e-05,0.027464,1.366067e-06,1.8e-05,0.003201,0.065453,0.034176,0.002335,0.031879,0.010242
4,T5-base,introductory,0.000177,0.033826,1.049057e-05,4.1e-05,0.00412,0.080431,0.048339,0.004594,0.04489,0.012921
5,T5-base,competition,3e-06,0.0218,6.916208e-07,2.3e-05,0.004689,0.049805,0.027598,0.002114,0.026181,0.009615


### 4.2. Comparison between Autoregressive model

In [None]:
all_metric_names = ['bleu', 'codebleu', 'ngram_match_score', 'weighted_ngram_match_score', 'syntax_match_score', 'dataflow_match_score', 'rouge1', 'rouge2', 'rougeL', 'meteor']
model_comparison_results_df = pd.DataFrame(columns=['model', 'difficulty', *all_metric_names])

# all_difficulty_lvls = ['all'] + list(set(cleaned_cpa_test_set['difficulty']))
all_difficulty_lvls = list(set(cleaned_cpa_test_set['difficulty']))

def select_samples_based_on_difficulties(dataset, difficulty='all'):
  return dataset if (difficulty == 'all') else dataset.filter(lambda example: example['difficulty'] == difficulty)

for model_name in ["GPT2-base (finetuned)", "GPT2-base"]:
  print(f"Evaluating model: {model_name}")
  model = all_models[model_name]["model"]
  tokenizer = all_models[model_name]["tokenizer"]

  ## for each difficulty levels
  for lvl in all_difficulty_lvls:
    ### Run eval from scratch ###
    # Filter out the test set to only the current level
    # test_dataset_lvl = select_samples_based_on_difficulties(cleaned_cpa_test_set, difficulty=lvl)
    # metrics, preds = evaluate_autoregressive_model(model, tokenizer, test_dataset_lvl)
    # # Save pred path for later use
    # with open(os.path.join(saved_predictions_path, f"{model_name}_{lvl}_predictions.pkl"), "wb") as f:
    #   pickle.dump(preds, f)
    # model_comparison_results_df.loc[len(model_comparison_results_df)] = [model_name, lvl, metrics['bleu'], metrics['codebleu'], metrics['rouge1'], metrics['rouge2'], metrics['rougeL'], metrics['meteor']]

    ### Run eval from saved predictions
    with open(os.path.join(saved_predictions_path, f"{model_name}_{lvl}_predictions.pkl"), "rb") as f:
      preds = pickle.load(f)

    metrics = compute_metrics(preds, tokenizer)
    model_comparison_results_df.loc[len(model_comparison_results_df)] = [model_name, lvl] + [metrics[x] for x in all_metric_names]

# model_comparison_results_df = model_comparison_results_df.pivot(index="model", columns="difficulty", values=all_metric_names)
# model_comparison_results_df = model_comparison_results_df.sort_index(axis=1, level=1)
# model_comparison_results_df.columns = pd.MultiIndex.from_tuples([(d, m) for m, d in model_comparison_results_df.columns])
model_comparison_results_df.to_excel(os.path.join(saved_results_path, "autoregressive_model_comparison_results.xlsx"))
model_comparison_results_df

Evaluating model: GPT2-base (finetuned)
Evaluating model: GPT2-base


Unnamed: 0,model,difficulty,bleu,codebleu,ngram_match_score,weighted_ngram_match_score,syntax_match_score,dataflow_match_score,rouge1,rouge2,rougeL,meteor
0,GPT2-base (finetuned),interview,0.077352,0.275308,0.01362,0.035847,0.286767,0.389135,0.233589,0.066501,0.156462,0.277233
1,GPT2-base (finetuned),introductory,0.067874,0.303866,0.011042,0.040813,0.32495,0.42175,0.213437,0.065542,0.147998,0.27287
2,GPT2-base (finetuned),competition,0.07495,0.256727,0.012783,0.029719,0.264627,0.366565,0.239567,0.063431,0.157147,0.270258
3,GPT2-base,interview,0.001489,0.202024,0.000109,0.000634,0.040732,0.464141,0.064855,0.003048,0.048637,0.076565
4,GPT2-base,introductory,0.001527,0.20683,6.6e-05,0.000487,0.03953,0.477408,0.06852,0.004736,0.052187,0.081602
5,GPT2-base,competition,0.001616,0.191766,0.000139,0.000676,0.042404,0.436806,0.079479,0.004645,0.058189,0.08304


### 4.3. Comparison on different decoding method (T5)

In [None]:
selected_model_name = "T5-base (finetuned)" ## you can set it to another model ##
model = all_models[selected_model_name]["model"]
tokenizer = all_models[selected_model_name]["tokenizer"]

### Tokenization ###
if selected_model_name.startswith("T5"):
  tokenized_test_set= cleaned_cpa_test_set.map(lambda x: tokenize_function_for_seq2seq(x, tokenizer), batched=True, remove_columns=cleaned_cpa_test_set.column_names)

eval_trainer = create_eval_trainer_for_seq2seq(model, tokenizer, eval_batch_size=128, predict_with_generate=True, generation_max_length=256)

Map:   0%|          | 0/3765 [00:00<?, ? examples/s]

In [None]:
### evaluate with different generation schema
generation_schemas =  {
    "greedy": {"do_sample": False, "num_beams": 1},
    "beam_search": {"do_sample": False, "num_beams": 5, "early_stopping": True},
    "random_sampling": {"do_sample": True, "temperature": 0.6, "top_k": 0},
    "top_k_sampling": {"do_sample": True, "top_k": 50},
    "top_p_sampling": {"do_sample": True, "top_p": 0.9, "top_k": 0},
}

schema_results_df = pd.DataFrame(columns=["generation_method", "bleu", "codebleu", "rouge1", "rouge2", "rougeL", "meteor"])
for schema in generation_schemas:
  print(f"Evaluating generation method: {schema}")
  schema_config = generation_schemas[schema]

  ### EVAL FROM SCRATCH ###
  preds = eval_trainer.predict(tokenized_test_set ,metric_key_prefix="test", **schema_config)
  metrics = preds.metrics
  Save pred path for later use
  with open(os.path.join(saved_predictions_path, f"{model_name}_{schema}_predictions.pkl"), "wb") as f:
    pickle.dump(preds, f)

  schema_results_df.loc[len(schema_results_df)] = [schema, metrics['test_bleu'], metrics["test_codebleu"], metrics['test_rouge1'], metrics['test_rouge2'], metrics['test_rougeL'], metrics['test_meteor']]

  ### EVAL FROM SAVED PREDICTIONS
  # with open(os.path.join(saved_predictions_path, f"{model_name}_{schema}_predictions.pkl"), "rb") as f:
  #   preds = pickle.load(f)
  # metrics = compute_metrics(preds, tokenizer)
  # schema_results_df.loc[len(schema_results_df)] = [schema, metrics['bleu'], metrics["codebleu"], metrics['rouge1'], metrics['rouge2'], metrics['rougeL'], metrics['meteor']]

schema_results_df.to_excel(os.path.join(saved_results_path, "generation_method_comparison_results.xlsx"))
schema_results_df

Evaluating generation method: greedy
Evaluating generation method: beam_search
Evaluating generation method: random_sampling
Evaluating generation method: top_k_sampling
Evaluating generation method: top_p_sampling


Unnamed: 0,generation_method,bleu,codebleu,rouge1,rouge2,rougeL,meteor
0,greedy,0.020162,0.185963,0.073733,0.009302,0.073733,0.129619
1,beam_search,0.047615,0.1923,0.094737,0.010638,0.094737,0.143086
2,random_sampling,0.052105,0.188874,0.368421,0.196429,0.350877,0.202079
3,top_k_sampling,0.069455,0.186272,0.322581,0.131148,0.306452,0.213733
4,top_p_sampling,0.143751,0.322066,0.393443,0.165289,0.295082,0.372553


### 4.4. Comparison on different decoding method (GPT)

In [19]:
selected_model_name = "GPT2-base (finetuned)" ## you can set it to another model ##
model = all_models[selected_model_name]["model"]
tokenizer = all_models[selected_model_name]["tokenizer"]

## different schema config
generation_schemas =  {
    "greedy": {"do_sample": False, "num_beams": 1},
    "beam_search": {"do_sample": False, "num_beams": 5, "early_stopping": True},
    "random_sampling": {"do_sample": True, "temperature": 0.6, "top_k": 0},
    "top_k_sampling": {"do_sample": True, "top_k": 50},
    "top_p_sampling": {"do_sample": True, "top_p": 0.9, "top_k": 0},
}

schema_results_df = pd.DataFrame(columns=["generation_method", "bleu", "codebleu", "rouge1", "rouge2", "rougeL", "meteor"])
for schema in generation_schemas:
  print(f"Evaluating generation method: {schema}")
  schema_config = generation_schemas[schema]

  ### EVAL FROM SCRATCH ###
  metrics, preds = evaluate_autoregressive_model(model, tokenizer, cleaned_cpa_test_set.select(range(5)), schema_config, max_new_tokens=256)

  # Save pred path for later use
  # with open(os.path.join(saved_predictions_path, f"{model_name}_{schema}_predictions.pkl"), "wb") as f:
  #   pickle.dump(preds, f)

  schema_results_df.loc[len(schema_results_df)] = [schema, metrics['bleu'], metrics["codebleu"], metrics['rouge1'], metrics['rouge2'], metrics['rougeL'], metrics['meteor']]

  ### EVAL FROM SAVED PREDICTIONS
  # with open(os.path.join(saved_predictions_path, f"{model_name}_{schema}_predictions.pkl"), "rb") as f:
  #   preds = pickle.load(f)
  # metrics = compute_metrics(preds, tokenizer)
  # schema_results_df.loc[len(schema_results_df)] = [schema, metrics['bleu'], metrics["codebleu"], metrics['rouge1'], metrics['rouge2'], metrics['rougeL'], metrics['meteor']]

# schema_results_df.to_excel(os.path.join(saved_results_path, "generation_method_comparison_results_gpt.xlsx"))
schema_results_df

Evaluating generation method: greedy


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

100%|██████████| 1/1 [00:02<00:00,  2.90s/it]


Evaluating generation method: beam_search


100%|██████████| 1/1 [00:05<00:00,  5.64s/it]


Evaluating generation method: random_sampling


100%|██████████| 1/1 [00:02<00:00,  2.88s/it]


Evaluating generation method: top_k_sampling


100%|██████████| 1/1 [00:02<00:00,  2.90s/it]


Evaluating generation method: top_p_sampling


100%|██████████| 1/1 [00:02<00:00,  2.99s/it]


Unnamed: 0,generation_method,bleu,codebleu,rouge1,rouge2,rougeL,meteor
0,greedy,0.020304,0.131584,0.069118,0.010697,0.062465,0.159184
1,beam_search,0.095869,0.152902,0.181515,0.062095,0.140307,0.186167
2,random_sampling,0.077777,0.155443,0.145631,0.047134,0.092261,0.18462
3,top_k_sampling,0.126479,0.227029,0.292054,0.084603,0.187875,0.297308
4,top_p_sampling,0.105864,0.194887,0.281784,0.085976,0.217557,0.250854
