## 0. Set up

In [1]:
!pip install -U datasets evaluate transformers
!pip install -q evaluate
!pip install -q rouge_score
!pip install tree-sitter-python==0.21
!pip install codebleu

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    U

In [2]:
from datasets import load_dataset
from evaluate import load
import re
import json
import torch
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from transformers import (AutoTokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Trainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, GPT2LMHeadModel,
                          DataCollatorForLanguageModeling, TrainingArguments,EvalPrediction)
import os
import pickle
from tqdm import tqdm
from codebleu import calc_codebleu
os.environ["WANDB_DISABLED"] = "true"

In [3]:
# Mount Google Drive: the fine-tuned checkpoints, cached predictions and result
# spreadsheets used below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Prefer the GPU when one is visible to torch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


## 1. Load and clean dataset

In [5]:
number_of_solutions = 1  # how many reference solutions to keep per problem
def clean_dataset(batch):
  """Strip URLs from each question and flatten the JSON-encoded solutions.

  Args:
    batch: mapping with parallel lists under "question", "solutions"
      (each a JSON string, possibly empty) and "difficulty".

  Returns:
    A dict with parallel lists "question", "solution" and "difficulty": one
    row per kept solution (at most `number_of_solutions` per problem).
    Problems whose solutions string is empty contribute no rows.
  """
  url_pattern = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)'
  rows = {"question": [], "solution": [], "difficulty": []}

  for raw_question, raw_solutions, level in zip(batch["question"], batch["solutions"], batch["difficulty"]):
      question_without_urls = re.sub(url_pattern, '', raw_question)
      candidates = json.loads(raw_solutions) if raw_solutions else []
      kept = candidates[:number_of_solutions]
      # Repeat the question/difficulty once per kept solution so the lists stay aligned.
      rows["question"].extend([question_without_urls] * len(kept))
      rows["solution"].extend(kept)
      rows["difficulty"].extend([level] * len(kept))

  return rows

def tokenize_function_for_seq2seq(example, tokenizer): # tokenization function
    """Tokenize a batch of question/solution pairs for an encoder-decoder model.

    Args:
        example: batch mapping with 'question', 'solution' and 'difficulty' lists.
        tokenizer: callable tokenizer exposing `pad_token_id`.

    Returns:
        The tokenized questions, extended with 'labels' (solution token ids in
        which every pad id is replaced by -100 so the loss ignores it) and the
        untouched 'difficulty' column.
    """
    model_inputs = tokenizer(example['question'], truncation=True, max_length=512)
    target_encoding = tokenizer(example['solution'], truncation=True, max_length=512)

    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in target_encoding['input_ids']
    ]
    model_inputs['difficulty'] = example['difficulty']

    return model_inputs

def tokenize_function_for_auto_regressive_evaluation(example, tokenizer): # tokenization function
    """Tokenize one sample as a generation prompt plus reference labels.

    The prompt is the first 500 characters of the question followed by the
    '\\n[CODE]\\n' separator; the labels are the token ids of the first 500
    characters of the solution followed by the tokenizer's EOS token.

    Args:
        example: single sample with 'question' and 'solution' strings.
        tokenizer: callable tokenizer exposing `eos_token`.

    Returns:
        The tokenized prompt with an added 'labels' entry (solution token ids).
    """
    prompt_text = example['question'][:500] + '\n[CODE]\n'
    encoded_prompt = tokenizer(prompt_text, truncation=True, max_length=512)

    reference_text = example['solution'][:500] + tokenizer.eos_token
    encoded_reference = tokenizer(reference_text, truncation=True, max_length=512)

    # The reference ids are only used as targets for the metrics, not fed to the model.
    encoded_prompt['labels'] = encoded_reference['input_ids']
    return encoded_prompt

In [6]:
# Load the APPS benchmark and flatten its test split into (question, solution, difficulty) rows.
# `trust_remote_code=True` answers the "run custom code?" prompt up front, so the cell
# no longer blocks on interactive input during Restart & Run All.
# NOTE(review): `force_redownload` re-fetches the full dataset on every run (the test
# file alone is ~1.3 GB); drop it once the local cache is known to be good.
cpa_dataset = load_dataset('codeparrot/apps', download_mode='force_redownload', trust_remote_code=True)
cpa_test_set = cpa_dataset["test"]
cleaned_cpa_test_set = cpa_test_set.map(clean_dataset, batched=True, remove_columns=cpa_test_set.column_names)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.63k [00:00<?, ?B/s]

apps.py:   0%|          | 0.00/4.95k [00:00<?, ?B/s]

apps.py:   0%|          | 0.00/4.95k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.63k [00:00<?, ?B/s]

The repository for codeparrot/apps contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/codeparrot/apps.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


train.jsonl:   0%|          | 0.00/107M [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/1.29G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [7]:
def select_samples_based_on_difficulties(dataset, difficulty='all'):
  """Return the rows of `dataset` whose 'difficulty' equals `difficulty`.

  Passing 'all' (the default) returns `dataset` itself, unfiltered.
  """
  if difficulty == 'all':
    return dataset
  return dataset.filter(lambda row: row['difficulty'] == difficulty)

# Split the cleaned test set by difficulty tier and report how many samples each tier holds.
introductory_set, interview_set, competition_set = (
    select_samples_based_on_difficulties(cleaned_cpa_test_set, difficulty=tier)
    for tier in ('introductory', 'interview', 'competition')
)

for label, subset in (
    ("Number of introductory samples: ", introductory_set),
    ("Number of interview samples: ", interview_set),
    ("Number of competition samples: ", competition_set),
):
    print(label, len(subset))

Filter:   0%|          | 0/3765 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3765 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3765 [00:00<?, ? examples/s]

Number of introductory samples:  728
Number of interview samples:  2727
Number of competition samples:  310


## 2. Load all models to evaluate

In [8]:
# Display name -> Hugging Face hub id or Drive path of each checkpoint to evaluate.
all_model_locs = {
    "T5-base": "Salesforce/codet5-base",
    "T5-base (finetuned)": '/content/drive/MyDrive/NLP_Code_Generation/t5-base-codeparrots', ## replace this with your fine-tuned model
    "GPT2-base": "gpt2",
    "GPT2-base (finetuned)": '/content/drive/MyDrive/NLP_Code_Generation/gpt2-codeparrots',  ## replace this with your fine-tuned model
}

# Display name -> {"tokenizer": ..., "model": ...} for every checkpoint above.
all_models = {}
for model_name, model_loc in all_model_locs.items():
  loaded = all_models.setdefault(model_name, {})
  loaded["tokenizer"] = AutoTokenizer.from_pretrained(model_loc)

  if model_name.startswith("T5"):
    loaded["model"] = T5ForConditionalGeneration.from_pretrained(model_loc)
  elif model_name.startswith("GPT2"):
    # The GPT-2 tokenizer gets EOS assigned as its padding token so batches can be padded.
    loaded["tokenizer"].pad_token = loaded["tokenizer"].eos_token
    loaded["model"] = GPT2LMHeadModel.from_pretrained(model_loc)

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

## 3. Load and define evaluation metrics and utility functions

In [9]:
# Text-overlap metrics loaded once through `evaluate.load` and reused by compute_metrics below.
bleu = load('bleu')
rouge = load('rouge')
meteor = load('meteor')

def preprocess_logits_for_metrics_for_seq2seq(logits, labels):
    '''
    Reduce model outputs to token ids before the Trainer accumulates them.

    Original Trainer may have a memory leak; keeping only the ids avoids
    storing tensors that are not needed for the metrics.
    https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941/15

    If `logits` is a tuple, only its first element is used. A 3-D tensor is
    collapsed with argmax over the last axis; anything else is passed through.
    Returns the pair (pred_ids, labels).
    '''
    pred_ids = logits[0] if isinstance(logits, tuple) else logits
    if pred_ids.ndim == 3:
        pred_ids = torch.argmax(pred_ids, dim=-1)
    return pred_ids, labels

def compute_metrics(eval_preds, tokenizer):
    """Decode predictions/labels and score them with BLEU, ROUGE, METEOR and CodeBLEU.

    Args:
        eval_preds: object whose `predictions[0]` holds predicted token ids and
            whose `label_ids` holds reference token ids; -100 marks ignored slots.
        tokenizer: tokenizer used for decoding (provides `pad_token_id`).

    Returns:
        Dict with 'bleu', 'rouge1', 'rouge2', 'rougeL', 'meteor', 'codebleu' and
        the four CodeBLEU component scores.
    """
    pad_id = tokenizer.pad_token_id

    # -100 cannot be decoded; swap it for the pad id, which decoding then skips as a special token.
    pred_ids = eval_preds.predictions[0]
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    label_ids = eval_preds.label_ids
    label_ids = np.where(label_ids != -100, label_ids, pad_id)

    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    bleu_results = bleu.compute(predictions=decoded_preds, references=[[reference] for reference in decoded_labels])
    rouge_results = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    meteor_results = meteor.compute(predictions=decoded_preds, references=decoded_labels)
    # The last CodeBLEU weight is 0, so the data-flow component does not contribute to
    # 'codebleu'; it is still reported separately below.
    codebleu_results = calc_codebleu(decoded_labels, decoded_preds, lang="python", weights=(0.2, 0.4, 0.4, 0), tokenizer=None)

    scores = {'bleu': bleu_results['bleu']}
    for rouge_key in ('rouge1', 'rouge2', 'rougeL'):
        scores[rouge_key] = rouge_results[rouge_key]
    scores['meteor'] = meteor_results['meteor']
    for codebleu_key in ('codebleu', 'ngram_match_score', 'weighted_ngram_match_score',
                         'syntax_match_score', 'dataflow_match_score'):
        scores[codebleu_key] = codebleu_results[codebleu_key]
    return scores

# Create eval trainer for seq2seq model
def create_eval_trainer_for_seq2seq(model, tokenizer, eval_batch_size=128, predict_with_generate=True, generation_max_length=512):
  """Build a Seq2SeqTrainer configured for evaluation only.

  Args:
    model: seq2seq model to evaluate.
    tokenizer: tokenizer used for padding and for decoding inside the metrics.
    eval_batch_size: per-device evaluation batch size.
    predict_with_generate: whether predictions come from `generate`.
    generation_max_length: maximum generation length.

  Returns:
    The configured Seq2SeqTrainer.
  """
  def _metrics_with_tokenizer(eval_preds):
    # Bind the tokenizer so the trainer can call the metric function with one argument.
    return compute_metrics(eval_preds, tokenizer)

  training_args = Seq2SeqTrainingArguments(
      per_device_eval_batch_size=eval_batch_size,
      predict_with_generate=predict_with_generate,
      generation_max_length=generation_max_length,
      fp16=torch.cuda.is_available(),   # mixed precision only when a CUDA GPU is available
      report_to=[],                     # no experiment-tracker reporting
  )
  dynamic_padding_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

  return Seq2SeqTrainer(
      model=model,
      args=training_args,
      data_collator=dynamic_padding_collator,
      preprocess_logits_for_metrics=preprocess_logits_for_metrics_for_seq2seq,
      compute_metrics=_metrics_with_tokenizer,
  )

# Evaluation functions seq2seq model: use Trainer.evaluate #
def evaluate_seq2seq_model(model, tokenizer, test_dataset):
  """Tokenize `test_dataset`, generate predictions with a Seq2SeqTrainer and score them.

  Returns:
    (metrics, preds): `preds` is the object returned by `Trainer.predict` and
    `metrics` is its `.metrics` dict (computed with metric_key_prefix='test').
  """
  def _tokenize(batch):
    return tokenize_function_for_seq2seq(batch, tokenizer)

  tokenized_test_set = test_dataset.map(_tokenize, batched=True, remove_columns=test_dataset.column_names)
  trainer = create_eval_trainer_for_seq2seq(model, tokenizer, predict_with_generate=True)
  preds = trainer.predict(tokenized_test_set, metric_key_prefix='test')
  return preds.metrics, preds


def shift_padding_to_left(batch, pad_token=-100):
    """Convert a right-padded 2-D batch into a left-padded one.

    For every row, everything up to and including the last element that differs
    from `pad_token` is moved flush to the right edge, and the freed positions on
    the left are filled with `pad_token`. Elements equal to `pad_token` that sit
    before the last non-pad element are kept in place relative to their neighbours.

    Args:
        batch: 2-D tensor (rows are sequences).
        pad_token: value that marks padding.

    Returns:
        A new tensor with the same shape and dtype as `batch`. Rows consisting
        only of padding are returned as all padding, and an empty batch yields an
        empty copy.
    """
    shifted_batch = []

    for sample in batch:
        non_pad_positions = (sample != pad_token).nonzero(as_tuple=True)[0]
        # Number of leading elements to keep (0 when the row is entirely padding).
        n_valid = non_pad_positions.max().item() + 1 if non_pad_positions.numel() > 0 else 0

        shifted_sample = torch.full_like(sample, pad_token)  # start from an all-padding row
        if n_valid > 0:
            # Guarded because `-0:` would select the whole row rather than nothing.
            shifted_sample[-n_valid:] = sample[:n_valid]

        shifted_batch.append(shifted_sample)

    if not shifted_batch:
        # torch.stack rejects an empty list; an empty batch has nothing to shift.
        return batch.clone()
    return torch.stack(shifted_batch)

def pad_to_same_width(a, b, pad_value=-100):
    """Stack two 2-D arrays row-wise after right-padding the narrower one.

    Both arrays are padded on the right with `pad_value` up to the larger column
    count, then concatenated along axis 0 (rows of `a` first).
    """
    target_cols = max(a.shape[1], b.shape[1])

    widened = []
    for arr in (a, b):
        missing_cols = target_cols - arr.shape[1]
        if missing_cols > 0:
            arr = np.pad(arr, ((0, 0), (0, missing_cols)), constant_values=pad_value)
        widened.append(arr)

    return np.concatenate(widened, axis=0)

def evaluate_autoregressive_model(model, tokenizer, test_dataset, max_new_tokens=512, generation_config={"do_sample": True, "temperature":0.9}):
  """Generate completions with a decoder-only model and score them.

  Each sample is tokenized into a prompt and reference labels, prompts are
  batched (16 per batch) and converted to left padding, and `model.generate`
  produces up to `max_new_tokens` new tokens per prompt. Only the newly
  generated tokens are kept and compared against the labels.

  Args:
    model: causal language model exposing `generate`.
    tokenizer: tokenizer providing `pad_token_id` and `eos_token_id`.
    test_dataset: dataset with 'question' and 'solution' columns.
    max_new_tokens: generation budget per sample.
    generation_config: extra keyword arguments forwarded to `generate`
      (only read, never mutated).

  Returns:
    (metrics, eval_preds): the dict from `compute_metrics` and the
    EvalPrediction holding all generated ids and label ids.
  """
  tokenized_test_set= test_dataset.map(lambda x: tokenize_function_for_auto_regressive_evaluation(x, tokenizer), batched=False, remove_columns=test_dataset.column_names)
  eval_dataloader = DataLoader(tokenized_test_set, shuffle=False, batch_size=16,
                               collate_fn=DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True))
  model.to(device)
  model.eval()
  ## predictions and labels of all samples
  preds = None
  labels = None
  for batch in tqdm(eval_dataloader):
    ## Move the padding to the left so generation continues right after the prompt
    input_ids = shift_padding_to_left(batch['input_ids'], tokenizer.pad_token_id)
    # The attention mask must be shifted the same way; passing the collator's
    # original mask would hide real prompt tokens and expose padding instead.
    # NOTE: predictions cached before this fix were generated with the misaligned mask.
    attention_mask = shift_padding_to_left(batch['attention_mask'], 0)
    label = batch['labels'].cpu().numpy()

    with torch.no_grad():
        gen_ids = model.generate(
            input_ids.to(device),
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.pad_token_id,
            attention_mask=attention_mask.to(device),
            eos_token_id=tokenizer.eos_token_id,
            **generation_config
        )
        # Get the generated sequences but ignore the input part
        gen_ids = gen_ids[:, input_ids.shape[-1]:].cpu().numpy()

    # Batches differ in width, so pad with -100 before stacking.
    preds = gen_ids if (preds is None) else pad_to_same_width(preds, gen_ids)
    labels = label if (labels is None) else pad_to_same_width(labels, label)

  preds = np.array(preds)
  labels = np.array(labels)
  # compute_metrics reads predictions[0], hence the one-element tuple.
  eval_preds = EvalPrediction(predictions=(preds,), label_ids=labels)

  metrics = compute_metrics(eval_preds, tokenizer)

  return metrics, eval_preds

# NOTE: this re-declares the helper already defined in the dataset-loading cell;
# the two definitions are identical, so shadowing it here does not change behavior.
def select_samples_based_on_difficulties(dataset, difficulty='all'):
  # Return `dataset` untouched for 'all'; otherwise keep only the rows whose
  # 'difficulty' field equals the requested level.
  return dataset if (difficulty == 'all') else dataset.filter(lambda example: example['difficulty'] == difficulty)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


## 4. Run evaluation

In [10]:
read_saved_preds = True ## True: recompute metrics from the pickled predictions; False: rerun generation from scratch and overwrite them

In [11]:
import os, pickle

# Folder holding the pickled predictions of every evaluation run.
saved_predictions_path = "/content/drive/MyDrive/NLP_Code_Generation/saved_predictions"
# Folder holding the exported result spreadsheets.
saved_results_path = "/content/drive/MyDrive/NLP_Code_Generation/saved_experiment_results"

# exist_ok=True makes creation idempotent and avoids the check-then-create race.
os.makedirs(saved_predictions_path, exist_ok=True)
os.makedirs(saved_results_path, exist_ok=True)

### 4.1. Comparison between pretrained and finetuned CodeT5 models

In [13]:
all_metric_names = ['bleu', 'codebleu', 'ngram_match_score', 'weighted_ngram_match_score', 'syntax_match_score', 'dataflow_match_score', 'rouge1', 'rouge2', 'rougeL', 'meteor']
model_comparison_results_df = pd.DataFrame(columns=['model', 'difficulty', *all_metric_names])

all_difficulty_lvls = list(set(cleaned_cpa_test_set['difficulty']))

for model_name in ["T5-base (finetuned)", "T5-base"]:
  print(f"Evaluating model: {model_name}")
  model = all_models[model_name]["model"]
  tokenizer = all_models[model_name]["tokenizer"]

  ## for each difficulty level
  for lvl in all_difficulty_lvls:
    preds_file = os.path.join(saved_predictions_path, f"{model_name}_{lvl}_predictions.pkl")

    ### Run eval from scratch ###
    if not read_saved_preds:
      print("Running evaluation from scratch")
      # Restrict the test set to the current difficulty level
      test_dataset_lvl = select_samples_based_on_difficulties(cleaned_cpa_test_set, difficulty=lvl)
      ## Run evaluation
      metrics, preds = evaluate_seq2seq_model(model, tokenizer, test_dataset_lvl)
      # Save the predictions for later use
      with open(preds_file, "wb") as f:
        pickle.dump(preds, f)
      # Trainer.predict(metric_key_prefix='test') prefixes every key with "test_",
      # so the prefix belongs on the key, not on the value.
      row_metrics = [metrics[f"test_{x}"] for x in all_metric_names]

    ### Run eval from saved predictions ###
    else:
      print("Loading predictions from saved directory")
      # NOTE: pickle.load executes arbitrary code; only load files written by this notebook.
      with open(preds_file, "rb") as f:
        preds = pickle.load(f)
      # compute_metrics returns unprefixed keys.
      metrics = compute_metrics(preds, tokenizer)
      row_metrics = [metrics[x] for x in all_metric_names]

    model_comparison_results_df.loc[len(model_comparison_results_df)] = [model_name, lvl] + row_metrics

model_comparison_results_df.to_excel(os.path.join(saved_results_path, "codet5_model_comparison_results.xlsx"))
model_comparison_results_df

Evaluating model: T5-base (finetuned)
Loading predictions from saved directory
Loading predictions from saved directory
Loading predictions from saved directory
Evaluating model: T5-base
Loading predictions from saved directory
Loading predictions from saved directory
Loading predictions from saved directory


Unnamed: 0,model,difficulty,bleu,codebleu,ngram_match_score,weighted_ngram_match_score,syntax_match_score,dataflow_match_score,rouge1,rouge2,rougeL,meteor
0,T5-base (finetuned),interview,0.138539,0.121497,0.0391087,0.050606,0.233583,0.175749,0.291387,0.130957,0.243339,0.236653
1,T5-base (finetuned),introductory,0.123787,0.138464,0.03120243,0.057608,0.272952,0.220344,0.306245,0.147855,0.26565,0.274619
2,T5-base (finetuned),competition,0.12469,0.105563,0.03608964,0.040422,0.20544,0.137451,0.270791,0.115386,0.222807,0.197653
3,T5-base,interview,1.7e-05,0.001288,1.366067e-06,1.8e-05,0.003201,0.065415,0.034153,0.002325,0.031754,0.010242
4,T5-base,introductory,0.000177,0.001666,1.049057e-05,4.1e-05,0.00412,0.080184,0.048308,0.004582,0.044621,0.012921
5,T5-base,competition,3e-06,0.001885,6.916208e-07,2.3e-05,0.004689,0.049805,0.027591,0.00209,0.025971,0.009615


### 4.2. Comparison between GPT-2 pretrained and finetuned models

In [15]:
all_metric_names = ['bleu', 'codebleu', 'ngram_match_score', 'weighted_ngram_match_score', 'syntax_match_score', 'dataflow_match_score', 'rouge1', 'rouge2', 'rougeL', 'meteor']
model_comparison_results_df = pd.DataFrame(columns=['model', 'difficulty', *all_metric_names])

all_difficulty_lvls = list(set(cleaned_cpa_test_set['difficulty']))

# select_samples_based_on_difficulties is reused from the earlier cells rather than re-declared here.

for model_name in ["GPT2-base (finetuned)", "GPT2-base"]:
  print(f"Evaluating model: {model_name}")
  model = all_models[model_name]["model"]
  tokenizer = all_models[model_name]["tokenizer"]

  ## for each difficulty level
  for lvl in all_difficulty_lvls:
    preds_file = os.path.join(saved_predictions_path, f"{model_name}_{lvl}_predictions.pkl")

    ### Run eval from scratch ###
    if not read_saved_preds:
      print("Running evaluation from scratch")
      # Restrict the test set to the current difficulty level
      test_dataset_lvl = select_samples_based_on_difficulties(cleaned_cpa_test_set, difficulty=lvl)
      metrics, preds = evaluate_autoregressive_model(model, tokenizer, test_dataset_lvl)
      # Save the predictions for later use
      with open(preds_file, "wb") as f:
        pickle.dump(preds, f)

    ### Run eval from saved predictions
    else:
      print("Loading predictions from saved directory")
      # NOTE: pickle.load executes arbitrary code; only load files written by this notebook.
      with open(preds_file, "rb") as f:
        preds = pickle.load(f)
      metrics = compute_metrics(preds, tokenizer)

    # Exactly one row per (model, difficulty), whichever branch ran; both branches
    # yield unprefixed metric keys.
    model_comparison_results_df.loc[len(model_comparison_results_df)] = [model_name, lvl] + [metrics[x] for x in all_metric_names]

model_comparison_results_df.to_excel(os.path.join(saved_results_path, "gpt2_model_comparison_results.xlsx"))
model_comparison_results_df

Evaluating model: GPT2-base (finetuned)
Loading predictions from saved directory
Loading predictions from saved directory
Loading predictions from saved directory
Evaluating model: GPT2-base
Loading predictions from saved directory
Loading predictions from saved directory
Loading predictions from saved directory


Unnamed: 0,model,difficulty,bleu,codebleu,ngram_match_score,weighted_ngram_match_score,syntax_match_score,dataflow_match_score,rouge1,rouge2,rougeL,meteor
0,GPT2-base (finetuned),interview,0.077352,0.13177,0.01362,0.035847,0.286767,0.389581,0.233508,0.066519,0.156433,0.277233
1,GPT2-base (finetuned),introductory,0.067874,0.148514,0.011042,0.040813,0.32495,0.420265,0.21323,0.065429,0.14776,0.27287
2,GPT2-base (finetuned),competition,0.07495,0.120295,0.012783,0.029719,0.264627,0.368484,0.24,0.063161,0.1569,0.270258
3,GPT2-base,interview,0.001489,0.016568,0.000109,0.000634,0.040732,0.464321,0.064801,0.003049,0.048574,0.076565
4,GPT2-base,introductory,0.001527,0.01602,6.6e-05,0.000487,0.03953,0.477247,0.068347,0.004731,0.05194,0.081602
5,GPT2-base,competition,0.001616,0.01726,0.000139,0.000676,0.042404,0.436866,0.079319,0.004647,0.058097,0.08304


### 4.3. Comparison of different decoding methods (T5)

In [17]:
selected_model_name = "T5-base (finetuned)" ## you can set it to another model ##
model = all_models[selected_model_name]["model"]
tokenizer = all_models[selected_model_name]["tokenizer"]

### Tokenization ###
tokenized_test_set= cleaned_cpa_test_set.map(lambda x: tokenize_function_for_seq2seq(x, tokenizer), batched=True, remove_columns=cleaned_cpa_test_set.column_names)
eval_trainer = create_eval_trainer_for_seq2seq(model, tokenizer, eval_batch_size=128, predict_with_generate=True, generation_max_length=256)

### Evaluate with different generation schemas
all_metric_names = ['bleu', 'codebleu', 'ngram_match_score', 'weighted_ngram_match_score', 'syntax_match_score', 'dataflow_match_score', 'rouge1', 'rouge2', 'rougeL', 'meteor']
# Decoding strategy name -> keyword arguments forwarded to generation.
generation_schemas =  {
    "greedy": {"do_sample": False, "num_beams": 1},
    "beam_search": {"do_sample": False, "num_beams": 5, "early_stopping": True},
    "random_sampling": {"do_sample": True, "temperature": 0.6, "top_k": 0},
    "top_k_sampling": {"do_sample": True, "top_k": 50},
    "top_p_sampling": {"do_sample": True, "top_p": 0.9, "top_k": 0},
}

schema_results_df = pd.DataFrame(columns=["model", "generation_method", *all_metric_names])
for schema in generation_schemas:
  print(f"Evaluating generation method: {schema}")
  schema_config = generation_schemas[schema]
  preds_file = os.path.join(saved_predictions_path, f"{selected_model_name}_{schema}_predictions.pkl")

  ### EVAL FROM SCRATCH ###
  if not read_saved_preds:
    print("Running evaluation from scratch")
    preds = eval_trainer.predict(tokenized_test_set ,metric_key_prefix="test", **schema_config)
    metrics = preds.metrics
    # Save the predictions for later use
    with open(preds_file, "wb") as f:
      pickle.dump(preds, f)
    # One value per column in all_metric_names; predict() prefixes its keys with "test_".
    row_metrics = [metrics[f"test_{x}"] for x in all_metric_names]

  ### EVAL FROM SAVED PREDICTIONS
  else:
    print("Loading predictions from saved directory")
    # NOTE: pickle.load executes arbitrary code; only load files written by this notebook.
    with open(preds_file, "rb") as f:
      preds = pickle.load(f)
    # compute_metrics returns unprefixed keys.
    metrics = compute_metrics(preds, tokenizer)
    row_metrics = [metrics[x] for x in all_metric_names]

  schema_results_df.loc[len(schema_results_df)] = [selected_model_name, schema] + row_metrics

schema_results_df.to_excel(os.path.join(saved_results_path, "decoding_method_comparison_t5.xlsx"))
schema_results_df

Map:   0%|          | 0/3765 [00:00<?, ? examples/s]

Evaluating generation method: greedy
Loading predictions from saved directory
Evaluating generation method: beam_search
Loading predictions from saved directory
Evaluating generation method: random_sampling
Loading predictions from saved directory
Evaluating generation method: top_k_sampling
Loading predictions from saved directory
Evaluating generation method: top_p_sampling
Loading predictions from saved directory


Unnamed: 0,model,generation_method,bleu,codebleu,ngram_match_score,weighted_ngram_match_score,syntax_match_score,dataflow_match_score,rouge1,rouge2,rougeL,meteor
0,T5-base (finetuned),greedy,0.143637,0.123407,0.046177,0.050089,0.235339,0.174689,0.318012,0.143658,0.266825,0.247294
1,T5-base (finetuned),beam_search,0.151042,0.119455,0.039178,0.044817,0.234233,0.175938,0.314247,0.138313,0.261931,0.254133
2,T5-base (finetuned),random_sampling,0.155124,0.133487,0.042212,0.047213,0.265399,0.204328,0.340752,0.141349,0.266736,0.263666
3,T5-base (finetuned),top_k_sampling,0.152996,0.133474,0.033747,0.037053,0.279758,0.228958,0.332337,0.116932,0.239357,0.271075
4,T5-base (finetuned),top_p_sampling,0.158858,0.13753,0.03864,0.042172,0.282334,0.227189,0.335465,0.124042,0.247004,0.273379


### 4.4. Comparison of different decoding methods (GPT-2)

In [19]:
selected_model_name = "GPT2-base (finetuned)" ## you can set it to another model ##
model = all_models[selected_model_name]["model"]
tokenizer = all_models[selected_model_name]["tokenizer"]

all_metric_names = ['bleu', 'codebleu', 'ngram_match_score', 'weighted_ngram_match_score', 'syntax_match_score', 'dataflow_match_score', 'rouge1', 'rouge2', 'rougeL', 'meteor']
# Decoding strategy name -> keyword arguments forwarded to generation.
generation_schemas =  {
    "greedy": {"do_sample": False, "num_beams": 1},
    "random_sampling": {"do_sample": True, "temperature": 0.6, "top_k": 0},
    "top_k_sampling": {"do_sample": True, "top_k": 50},
    "top_p_sampling": {"do_sample": True, "top_p": 0.9, "top_k": 0},
    "beam_search": {"do_sample": False, "num_beams": 5, "early_stopping": True},
}

schema_results_df = pd.DataFrame(columns=["model", "generation_method", *all_metric_names])
for schema, schema_config in generation_schemas.items():
  print(f"Evaluating generation method: {schema}")
  preds_file = os.path.join(saved_predictions_path, f"{selected_model_name}_{schema}_predictions.pkl")

  if read_saved_preds:
    ### EVAL FROM SAVED PREDICTIONS
    print("Loading predictions from saved directory")
    with open(preds_file, "rb") as f:
      preds = pickle.load(f)
    metrics = compute_metrics(preds, tokenizer)
  else:
    ### EVAL FROM SCRATCH ###
    print("Running evaluation from scratch")
    metrics, preds = evaluate_autoregressive_model(model, tokenizer, cleaned_cpa_test_set, max_new_tokens=256, generation_config=schema_config)
    # Save the predictions for later use
    with open(preds_file, "wb") as f:
      pickle.dump(preds, f)

  # Both branches produce the same metric keys, so the row is built once here.
  schema_results_df.loc[len(schema_results_df)] = [selected_model_name, schema, *[metrics[x] for x in all_metric_names]]

schema_results_df.to_excel(os.path.join(saved_results_path, "decoding_method_comparison_gpt2.xlsx"))
schema_results_df

Evaluating generation method: greedy
Loading predictions from saved directory
Evaluating generation method: random_sampling
Loading predictions from saved directory
Evaluating generation method: top_k_sampling
Loading predictions from saved directory
Evaluating generation method: top_p_sampling
Loading predictions from saved directory
Evaluating generation method: beam_search
Loading predictions from saved directory


Unnamed: 0,model,generation_method,bleu,codebleu,ngram_match_score,weighted_ngram_match_score,syntax_match_score,dataflow_match_score,rouge1,rouge2,rougeL,meteor
0,GPT2-base (finetuned),greedy,0.054022,0.045944,0.004678,0.009316,0.103205,0.290784,0.127987,0.035211,0.102601,0.146434
1,GPT2-base (finetuned),random_sampling,0.08074,0.073584,0.01111,0.018916,0.159489,0.300564,0.19139,0.050436,0.141434,0.214395
2,GPT2-base (finetuned),top_k_sampling,0.102605,0.106045,0.014973,0.023714,0.233911,0.303742,0.254562,0.066792,0.172601,0.26106
3,GPT2-base (finetuned),top_p_sampling,0.102581,0.101296,0.015393,0.024362,0.221183,0.294451,0.243451,0.066011,0.167986,0.251972
4,GPT2-base (finetuned),beam_search,0.129377,0.102292,0.023256,0.026897,0.217206,0.247287,0.232208,0.094964,0.185391,0.230372
