## 0. Set up

In [1]:
!pip install datasets evaluate transformers
!pip install -q evaluate
!pip install -q rouge_score

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m9

In [1]:
from datasets import load_dataset
from evaluate import load
import re
import json
import torch
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from transformers import (AutoTokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Trainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, GPT2LMHeadModel,
                          DataCollatorForLanguageModeling, TrainingArguments)
import os
from tqdm import tqdm
os.environ["WANDB_DISABLED"] = "true"

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## 1. Load and clean dataset

In [26]:
number_of_solutions = 1
def clean_dataset(batch):
  cleaned_questions = []
  cleaned_solutions = []
  difficulties = []

  for question, solutions_str, difficulty in zip(batch["question"], batch["solutions"], batch["difficulty"]):
      cleaned_question = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)', '', question)
      solutions = json.loads(solutions_str) if solutions_str  else []
      for sol in solutions[:number_of_solutions]:
          cleaned_questions.append(question)
          cleaned_solutions.append(sol)
          difficulties.append(difficulty)

  return {"question": cleaned_questions, "solution": cleaned_solutions, "difficulty": difficulties}

def tokenize_function_for_seq2seq(example, tokenizer): # tokenization function
    inputs = tokenizer(example['question'], truncation=True, max_length=512) # Tokenize inputs (questions)
    labels = tokenizer(example['solution'], truncation=True, max_length=512) # Tokenize targets (solutions)

    labels_with_ignore = [] # Replace padding token id with -100 so it's ignored in the loss
    for label in labels['input_ids']:
        labels_with_ignore.append([-100 if token == tokenizer.pad_token_id else token for token in label])
    inputs['labels'] = labels_with_ignore

    inputs['difficulty'] = example['difficulty']

    return inputs

def tokenize_function_for_auto_regressive(example, tokenizer): # tokenization function
    text = example['question'] + '\n[CODE]\n' + example['solution'] + tokenizer.eos_token
    inputs = tokenizer(text, truncation=True, padding='max_length', max_length=512)
    question = tokenizer(example['question'] + '\n[CODE]\n', truncation=True, max_length=256)

    # Create labels - we only want to compute loss on the solution part
    solution_start = len(question['input_ids'])  # Find where the solution starts in the encoded sequence
    labels = [-100] * solution_start + inputs['input_ids'][solution_start:] # Set labels to -100 for question part (ignored in loss calculation)
    inputs['labels'] = labels[:512]
    inputs['difficulty'] = example['difficulty']
    return inputs

In [5]:
cpa_dataset = load_dataset('codeparrot/apps')
cpa_test_set = cpa_dataset["test"]

cleaned_cpa_test_set = cpa_test_set.map(clean_dataset, batched=True, remove_columns=cpa_test_set.column_names)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## 2. Load all models to evaluate

In [6]:
all_model_locs = {
    "T5-base": "Salesforce/codet5-base",
    "T5-base (finetuned)": '/content/drive/MyDrive/NLP_Code_Generation/t5-base-codeparrots', ## replace this with your fine-tuned model
    "GPT2-base": "gpt2",
    # "GPT2-base (finetuned)": '/content/drive/MyDrive/NLP_Code_Generation/gpt2-base-codeparrots',  ## replaced this with your fidne-tuned model
}
### dictionary storing the model and tokenizer for each model for evaluation
all_models = {}
for model_name, model_loc in all_model_locs.items():
  all_models[model_name] = {}
  all_models[model_name]["tokenizer"] = AutoTokenizer.from_pretrained(model_loc)

  if model_name.startswith("T5"):
    all_models[model_name]["model"] = T5ForConditionalGeneration.from_pretrained(model_loc)

  elif model_name.startswith("GPT2"):
      all_models[model_name]["tokenizer"].pad_token = all_models[model_name]["tokenizer"].eos_token
      all_models[model_name]["model"] = GPT2LMHeadModel.from_pretrained(model_loc)

## 3. Load and define evaluation metrics and utility functions

In [27]:
bleu = load('bleu')
rouge = load('rouge')
meteor = load('meteor')

def preprocess_logits_for_metrics_for_seq2seq(logits, labels):
    '''
    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.
    https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941/15
    '''
    if isinstance(logits, tuple): pred_ids = logits[0]
    else: pred_ids = logits
    if pred_ids.ndim == 3: pred_ids = torch.argmax(pred_ids, dim=-1)
    return pred_ids, labels

def compute_metrics(eval_preds):
    preds = eval_preds.predictions[0]
    labels = eval_preds.label_ids
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id) # Replace -100 with pad token id
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id) # Replace -100 with pad token id

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BLEU, ROUGE, and exact match score
    bleu_results = bleu.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    rouge_results = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    meteor_results = meteor.compute(predictions=decoded_preds, references=decoded_labels)
    # codebleu_results = codebleu.compute(predictions=decoded_preds, references=decoded_labels, lang="python")
    # exact_match = sum(pred == label for pred, label in zip(decoded_preds, decoded_labels)) / len(decoded_preds)

    return {
        'bleu': bleu_results['bleu'],
        'rouge1': rouge_results['rouge1'],
        'rouge2': rouge_results['rouge2'],
        'rougeL': rouge_results['rougeL'],
        'meteor': meteor_results['meteor'],
        # 'codebleu': meteor_results['codebleu'],
        # 'exact_match': exact_match,
    }

# Create eval trainer for seq2seq model
def create_eval_trainer_for_seq2seq(model, tokenizer, predict_with_generate=True):
  eval_args = Seq2SeqTrainingArguments(
      per_device_eval_batch_size=128,         # Evaluation batch size
      predict_with_generate=predict_with_generate,          # Whether to use generate to calculate generative metrics (ROUGE, BLEU)
      generation_max_length=256,
      fp16=torch.cuda.is_available(),       # Enable mixed-precision training if a CUDA GPU is available (faster, less memory)
      report_to=[]                          # disable wandb report
  )

  eval_trainer = Seq2SeqTrainer(
      model=model,
      args=eval_args,
      data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model), # Set up data collator for dynamic padding
      preprocess_logits_for_metrics=preprocess_logits_for_metrics_for_seq2seq,
      compute_metrics=compute_metrics,
  )
  return eval_trainer

# Evaluation functions seq2seq model: use Trainer.evaluate #
def evaluate_seq2seq_model(model, tokenizer, test_dataset):
  tokenized_test_set= test_dataset.map(lambda x: tokenize_function_for_seq2seq(x, tokenizer), batched=True, remove_columns=test_dataset.column_names)
  ## Create trainer
  eval_trainer = create_eval_trainer_for_seq2seq(model, tokenizer, predict_with_generate=True)
  metrics = eval_trainer.evaluate(tokenized_test_set, metric_key_prefix='test')
  return metrics

# Evaluation functions for autoregressive #
def evaluate_autoregressive_model(model, tokenizer, test_dataset):
  tokenized_test_set= test_dataset.map(lambda x: tokenize_function_for_auto_regressive(x, tokenizer), batched=False, remove_columns=test_dataset.column_names)
  tokenized_test_set = tokenized_test_set.remove_columns(['difficulty'])
  eval_dataloader = DataLoader(tokenized_test_set, shuffle=False, batch_size=1,
                               collate_fn=DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True))
  model.to(device)
  model.eval()
  decoded_preds = []
  labels = []
  for batch in tqdm(eval_dataloader):
    input_len=torch.nonzero(batch['labels'][0]!=-100).squeeze()[0]
    input_ids = batch['input_ids'][0][:input_len.item()].unsqueeze(0)

    label = tokenizer.decode(batch['input_ids'][0][input_len.item():])

    with torch.no_grad():
        gen_tokens = model.generate(
            input_ids.to(device),
            do_sample=True,
            temperature=0.9,
            max_new_tokens=256,
        )
        gen_text = tokenizer.decode(gen_tokens.to('cpu')[0])

    gen_text = gen_text.split('\n[CODE]\n')[0]

    decoded_preds.append(gen_text)
    labels.append(label)

  bleu_results = bleu.compute(predictions=decoded_preds, references=[[label] for label in labels])
  rouge_results = rouge.compute(predictions=decoded_preds, references=labels)
  meteor_results = meteor.compute(predictions=decoded_preds, references=labels)

  return {
      'test_bleu': bleu_results['bleu'],
      'test_rouge1': rouge_results['rouge1'],
      'test_rouge2': rouge_results['rouge2'],
      'test_rougeL': rouge_results['rougeL'],
      'test_meteor': meteor_results['meteor'],
  }

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## 4. Run evaluation

### 4.1. Comparison between different Seq2Seq on the entire test set (different difficulty levels)

In [8]:
model_comparison_results_df = pd.DataFrame(columns=['model', 'difficulty', 'bleu', 'rouge1', 'rouge2', 'rougeL', 'meteor'])

all_difficulty_lvls = ['all'] + list(set(cleaned_cpa_test_set['difficulty']))
def select_samples_based_on_difficulties(dataset, difficulty='all'):
  return dataset if (difficulty == 'all') else dataset.filter(lambda example: example['difficulty'] == difficulty)

for model_name in ["T5-base (finetuned)", "T5-base"]:
  print(f"Evaluating model: {model_name}")
  model = all_models[model_name]["model"]
  tokenizer = all_models[model_name]["tokenizer"]

  ## for each difficulty levels
  for lvl in all_difficulty_lvls:
    # Filter out the test set to only the current level
    test_dataset_lvl = select_samples_based_on_difficulties(cleaned_cpa_test_set, difficulty=lvl)

    ## Run evaluation
    if model_name.startswith("T5"):
      metrics = evaluate_seq2seq_model(model, tokenizer, test_dataset_lvl)
    elif model_name.startswith("GPT"):
      metrics = evaluate_autoregressive_model(model, tokenizer, test_dataset_lvl)


    model_comparison_results_df.loc[len(model_comparison_results_df)] = [model_name, lvl, metrics['test_bleu'], metrics['test_rouge1'], metrics['test_rouge2'], metrics['test_rougeL'], metrics['test_meteor']]


model_comparison_results_df = model_comparison_results_df.pivot(index="model", columns="difficulty", values=["bleu", "rouge1", "rouge2", "rougeL", "meteor"])
model_comparison_results_df = model_comparison_results_df.sort_index(axis=1, level=1)
model_comparison_results_df.columns = pd.MultiIndex.from_tuples([(d, m) for m, d in model_comparison_results_df.columns])
model_comparison_results_df

Evaluating model: T5-base (finetuned)


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Filter:   0%|          | 0/3765 [00:00<?, ? examples/s]

Map:   0%|          | 0/728 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3765 [00:00<?, ? examples/s]

Map:   0%|          | 0/310 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3765 [00:00<?, ? examples/s]

Map:   0%|          | 0/2727 [00:00<?, ? examples/s]

Evaluating model: T5-base


Map:   0%|          | 0/3765 [00:00<?, ? examples/s]

Map:   0%|          | 0/728 [00:00<?, ? examples/s]

Map:   0%|          | 0/310 [00:00<?, ? examples/s]

Map:   0%|          | 0/2727 [00:00<?, ? examples/s]

Unnamed: 0_level_0,all,all,all,all,all,competition,competition,competition,competition,competition,interview,interview,interview,interview,interview,introductory,introductory,introductory,introductory,introductory
Unnamed: 0_level_1,bleu,meteor,rouge1,rouge2,rougeL,bleu,meteor,rouge1,rouge2,rougeL,bleu,meteor,rouge1,rouge2,rougeL,bleu,meteor,rouge1,rouge2,rougeL
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
T5-base,2.7e-05,0.010727,0.036318,0.002785,0.033925,3e-06,0.009627,0.027273,0.002109,0.025698,1.7e-05,0.010241,0.034132,0.002344,0.031817,0.000177,0.012921,0.048156,0.004595,0.044805
T5-base (finetuned),0.143529,0.24718,0.317829,0.143645,0.266808,0.105458,0.20204,0.296873,0.125648,0.242936,0.139844,0.242647,0.316935,0.141857,0.26448,0.182422,0.283353,0.330613,0.158015,0.285584


### 4.2. Comparison between Autoregressive model

In [30]:
model_comparison_results_df = pd.DataFrame(columns=['model', 'difficulty', 'bleu', 'rouge1', 'rouge2', 'rougeL', 'meteor'])

all_difficulty_lvls = ['all'] + list(set(cleaned_cpa_test_set['difficulty']))
def select_samples_based_on_difficulties(dataset, difficulty='all'):
  return dataset if (difficulty == 'all') else dataset.filter(lambda example: example['difficulty'] == difficulty)

for model_name in ["GPT2-base"]:
  print(f"Evaluating model: {model_name}")
  model = all_models[model_name]["model"]
  tokenizer = all_models[model_name]["tokenizer"]

  ## for each difficulty levels
  for lvl in all_difficulty_lvls:
    # Filter out the test set to only the current level
    test_dataset_lvl = select_samples_based_on_difficulties(cleaned_cpa_test_set, difficulty=lvl)

    ## Run evaluation
    if model_name.startswith("T5"):
      metrics = evaluate_seq2seq_model(model, tokenizer, test_dataset_lvl)
    elif model_name.startswith("GPT"):
      metrics = evaluate_autoregressive_model(model, tokenizer, test_dataset_lvl.select(range(2)))

    model_comparison_results_df.loc[len(model_comparison_results_df)] = [model_name, lvl, metrics['test_bleu'], metrics['test_rouge1'], metrics['test_rouge2'], metrics['test_rougeL'], metrics['test_meteor']]


model_comparison_results_df = model_comparison_results_df.pivot(index="model", columns="difficulty", values=["bleu", "rouge1", "rouge2", "rougeL", "meteor"])
model_comparison_results_df = model_comparison_results_df.sort_index(axis=1, level=1)
model_comparison_results_df.columns = pd.MultiIndex.from_tuples([(d, m) for m, d in model_comparison_results_df.columns])
model_comparison_results_df

Evaluating model: GPT2-base


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

  0%|          | 0/2 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 1/2 [00:02<00:02,  2.72s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 2/2 [00:05<00:00,  2.71s/it]


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

  0%|          | 0/2 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 1/2 [00:02<00:02,  2.73s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 2/2 [00:05<00:00,  2.72s/it]


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

  0%|          | 0/2 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 1/2 [00:02<00:02,  2.72s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 2/2 [00:05<00:00,  2.71s/it]


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

  0%|          | 0/2 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 1/2 [00:02<00:02,  2.72s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 2/2 [00:05<00:00,  2.72s/it]


Unnamed: 0_level_0,all,all,all,all,all,competition,competition,competition,competition,competition,interview,interview,interview,interview,interview,introductory,introductory,introductory,introductory,introductory
Unnamed: 0_level_1,bleu,meteor,rouge1,rouge2,rougeL,bleu,meteor,rouge1,rouge2,rougeL,bleu,meteor,rouge1,rouge2,rougeL,bleu,meteor,rouge1,rouge2,rougeL
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
GPT2-base,0.015968,0.149404,0.134886,0.032333,0.094155,0.080324,0.251716,0.295198,0.094569,0.159749,0.015158,0.159197,0.138301,0.033981,0.095937,0.137755,0.327831,0.362906,0.181195,0.212304


### 4.3. Comparison on different decoding method

In [9]:
selected_model_name = "T5-base (finetuned)" ## you can set it to another model ##
model = all_models[selected_model_name]["model"]
tokenizer = all_models[selected_model_name]["tokenizer"]

### Tokenization ###
if selected_model_name.startswith("T5"):
  tokenized_test_set= cleaned_cpa_test_set.map(lambda x: tokenize_function_for_seq2seq(x, tokenizer), batched=True, remove_columns=cleaned_cpa_test_set.column_names)

eval_trainer = create_eval_trainer_for_seq2seq(model, tokenizer, predict_with_generate=True)

Map:   0%|          | 0/3765 [00:00<?, ? examples/s]

In [12]:
### evaluate with different generation schema
generation_schemas =  {
    "greedy": {"do_sample": False, "num_beams": 1},
    "beam_search": {"do_sample": False, "num_beams": 5, "early_stopping": True},
    "random_sampling": {"do_sample": True, "temperature": 0.6, "top_k": 0},
    "top_k_sampling": {"do_sample": True, "top_k": 50},
    "top_p_sampling": {"do_sample": True, "top_p": 0.9, "top_k": 0},
}

schema_results_df = pd.DataFrame(columns=["generation_method", "bleu", "rouge1", "rouge2", "rougeL", "meteor"])
for schema in generation_schemas:
  schema_config = generation_schemas[schema]
  metrics = eval_trainer.predict(tokenized_test_set ,metric_key_prefix="test", **schema_config).metrics
  schema_results_df.loc[len(schema_results_df)] = [schema, metrics['test_bleu'], metrics['test_rouge1'], metrics['test_rouge2'], metrics['test_rougeL'], metrics['test_meteor']]

schema_results_df

Unnamed: 0,generation_method,bleu,rouge1,rouge2,rougeL,meteor
0,greedy,0.143529,0.317829,0.143645,0.266808,0.24718
1,beam_search,0.151078,0.314411,0.138405,0.262196,0.254163
2,random_sampling,0.155126,0.340696,0.141335,0.266654,0.263644
3,top_k_sampling,0.152932,0.332511,0.116916,0.23943,0.27098
4,top_p_sampling,0.158862,0.335466,0.124047,0.247006,0.273418


In [45]:
# model_comparison_results_df.to_excel("model_comparison_results_df.xlsx")
# schema_results_df.to_excel("schema_results_df.xlsx")