In [1]:
%pip install -q --disable-pip-version-check \
    evaluate==0.4.0 \
    py7zr==0.20.4 \
    sentencepiece==0.1.99 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.4.0 \
    trl==0.7.2 \
    bert_score
%pip install -q    wandb bitsandbytes accelerate

## Set up all random seeds

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!wget https://github.com/wandb/edu/raw/main/llm-training-course/colab/utils.py

--2024-04-20 07:27:21--  https://github.com/wandb/edu/raw/main/llm-training-course/colab/utils.py
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/wandb/edu/main/llm-training-course/colab/utils.py [following]
--2024-04-20 07:27:21--  https://raw.githubusercontent.com/wandb/edu/main/llm-training-course/colab/utils.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8155 (8.0K) [text/plain]
Saving to: ‘utils.py.6’


2024-04-20 07:27:21 (95.4 MB/s) - ‘utils.py.6’ saved [8155/8155]



In [4]:
from google.colab import output
output.enable_custom_widget_manager()

In [5]:
PROJECT = "FlanT5-Lora"
MODEL_NAME = 'google/flan-t5-base'
DATASET = "MeQSum"
WANDB_ID = "NER_seq2seq_2"

In [6]:
import wandb
wandb.init(project=PROJECT, # the project I am working on
           tags=[MODEL_NAME, DATASET],
           notes ="Fine tuning FlanT5 with MeQSum Dataset. Prompt Instruction",
           id=WANDB_ID, resume='allow') # the Hyperparameters I want to keep track of

[34m[1mwandb[0m: Currently logged in as: [33majwadakil[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np
import random

# tqdm library makes the loops show a smart progress meter.
from tqdm import tqdm
tqdm.pandas()

In [8]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [9]:
TRAIN_PATH = '/content/drive/MyDrive/cs577_proj_dataset/train_tagged.jsonl'
VAL_PATH = '/content/drive/MyDrive/cs577_proj_dataset/validation_tagged.jsonl'
TEST_PATH = '/content/drive/MyDrive/cs577_proj_dataset/test_tagged.jsonl'

In [10]:
random_seed = 0
np.random.seed(random_seed)
torch.manual_seed(random_seed)
random.seed(random_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [11]:
from datasets import load_dataset

# Define the paths to your files

def create_hf_dataset(train_path, val_path, test_path):
    data_files = {}
    data_files["train"] = train_path
    data_files["test"]  = test_path
    data_files["validation"] = val_path

    # Load the datasets
    dataset = load_dataset('json', data_files=data_files)

    # Print the first example of the training dataset
    print(dataset)

    return dataset

# check the number of examples in each dataset

dataset = create_hf_dataset(
    TRAIN_PATH,
    VAL_PATH,
    TEST_PATH
)

DatasetDict({
    train: Dataset({
        features: ['idx', 'inputs', 'target', 'inputs_tagged'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['idx', 'inputs', 'target', 'inputs_tagged'],
        num_rows: 150
    })
    validation: Dataset({
        features: ['idx', 'inputs', 'target', 'inputs_tagged'],
        num_rows: 50
    })
})


In [12]:
dataset['train'].to_pandas()

Unnamed: 0,idx,inputs,target,inputs_tagged
0,0,SUBJECT: who and where to get cetirizine - D M...,Who manufactures cetirizine?,SUBJECT: who and where to get cetirizine - D M...
1,1,who makes bromocriptine i am wondering what co...,Who manufactures bromocriptine?,who makes <DIAGNOSTIC_PROCEDURE> bromocriptine...
2,2,SUBJECT: nulytely MESSAGE: Hello can you tell ...,"Who makes nulytely, and where can I buy it?",SUBJECT: nulytely MESSAGE: Hello can you tell ...
3,3,Williams' syndrome I would like to have my dau...,Where can I get genetic testing for william's ...,<DISEASE_DISORDER> williams' syndrome I would ...
4,4,ClinicalTrials.gov - Question - general inform...,Where can I get genetic testing for multiple m...,ClinicalTrials.gov - Question - general inform...
...,...,...,...,...
995,995,SUBJECT: after surgery of ear drum still same ...,What are the treatments for perforated eardrum?,SUBJECT: after <THERAPEUTIC_PROCEDURE> surgery...
996,996,SUBJECT: ClinicalTrials.gov - Question - speci...,What are the treatments for Glycogen storage d...,SUBJECT: <DIAGNOSTIC_PROCEDURE> clinicaltrials...
997,997,MESSAGE: I have numbness/tingling in my lower ...,Where can I find information and treatment for...,MESSAGE: I have numbness/tingling in my <BIOLO...
998,998,SUBJECT: sleep apnea MESSAGE: I was diagnosed ...,How long does swelling from sleep apnea take t...,SUBJECT: <DISEASE_DISORDER> sleep apnea MESSAG...


In [13]:
# with wandb.init(project=PROJECT, id="trial_run_normal_seq2seq", job_type="dataset", resume="allow"):
#    wbtrain = wandb.Table(data=dataset['train'].to_pandas())
#    wbvalidation = wandb.Table(data=dataset['validation'].to_pandas())
#    wbtest = wandb.Table(data=dataset['test'].to_pandas())
#    wandb.log({"meqsum_train": wbtrain})
#    wandb.log({"meqsum_validation": wbvalidation})
#    wandb.log({"meqsum_test": wbtest})

In [14]:
model_name='google/flan-t5-base'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [15]:
# def print_number_of_trainable_model_parameters(model, tag="original_model"):
#     trainable_model_params = 0
#     all_model_params = 0
#     for _, param in model.named_parameters():
#         all_model_params += param.numel()
#         if param.requires_grad:
#             trainable_model_params += param.numel()
#     with wandb.init(project=PROJECT, job_type="log_parameters", id=WANDB_ID, resume='allow'):
#       wandb.log({f'{tag}': {"trainable_model_params":trainable_model_params}})
#       wandb.log({f'{tag}': {"all_model_params":all_model_params}})
#       wandb.log({f'{tag}': {"percentage_of_trainable_model_parameters": 100 * trainable_model_params}} )

#     return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params}%"

# print(print_number_of_trainable_model_parameters(original_model))

In [16]:
# Define W&B Table to store generations
columns = ["index", "medical question", "prompt", "human_summary", "zero_shot_output"]
table = wandb.Table(columns=columns)

In [17]:
#START_PROMPT = 'Summarize the following medical question using context around it.\n\n'
#START_PROMPT = 'Read through the whole context and summarize the medical question\n\n'
#END_PROMPT = '\n\nQuestion Summary:'
#END_PROMPT = '\n\medical question summary:'
NER_START_PROMPT_1 = 'Read through the whole context and summarize the medical question focusing on tags supplied within <> brackets.\n\n'
NER_START_PROMPT_2 = 'Reading the context, shortly summarize the medical question focusing on tags within <>. Focus on <MEDICATION>,<DIAGNOSTIC_PROCEDURE> \
<BIOLOGICAL_ATTRIBUTE>,<SIGN_SYMPTOM>,<BIOLOGICAL_STRUCTURE>,<DISEASE_DISORDER> if present.\n\n'
NER_END_PROMPT_1= '\n\nQuestion Summary:'
NER_END_PROMPT_2= '\n\nmedical question summary:'

In [18]:
def tokenize_function(example):
    prompt = [NER_START_PROMPT_2 + question + NER_END_PROMPT_2 for question in example["inputs_tagged"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["target"], padding="max_length", truncation=True, return_tensors="pt").input_ids

    return example

# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['idx', 'inputs', 'target',])

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [19]:
# tokenized_datasets['train'].to_pandas().head().iloc[0]['input_ids']

In [20]:
# with wandb.init(project=PROJECT, job_type="dataset"):
#    wbtrain_tokenized = wandb.Table(data=tokenized_datasets['train'].to_pandas())
#    wbvalidation_tokenized = wandb.Table(data=tokenized_datasets['validation'].to_pandas())
#    wbtest_tokenized = wandb.Table(data=tokenized_datasets['test'].to_pandas())
#    wandb.log({"MeQSum_train_tokenized": wbtrain_tokenized})
#    wandb.log({"MeQSum_validation_tokenized": wbvalidation_tokenized})
#    wandb.log({"MeQSum_test_tokenized": wbtest_tokenized})

In [21]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (1000, 3)
Validation: (50, 3)
Test: (150, 3)
DatasetDict({
    train: Dataset({
        features: ['inputs_tagged', 'input_ids', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['inputs_tagged', 'input_ids', 'labels'],
        num_rows: 150
    })
    validation: Dataset({
        features: ['inputs_tagged', 'input_ids', 'labels'],
        num_rows: 50
    })
})


In [22]:
from types import SimpleNamespace
from pathlib import Path
from tqdm.notebook import tqdm
from datetime import datetime
import nltk

In [23]:
# from datasets import load_metric
# from bert_score import score


# # bert_score is not a part of the datasets library, so we'll use the bert_score package directly

# rouge = evaluate.load('rouge')
# bleu = evaluate.load("bleu")

# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     print(predictions[0].shape)
#     print(labels.shape)
#     print('Starting to batch decode: ')
#     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#     # Replace -100 in the labels as we can't decode them.
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     print('Batch Decoding finished: ')
#     # Split the predictions and labels into individual sentences
#     decoded_preds = [pred.strip() for pred in decoded_preds]
#     decoded_labels = [label.strip() for label in decoded_labels]

#     # print(decoded_preds)
#     # print(decoded_labels)

#     #print(type(inference_df['finetuned_model_summaries'].tolist()))

#     references_questions = [[target] for target in decoded_labels]
#     #prediction_questions = [[target] for target in inference_df['finetuned_model_summaries'].tolist()]
#     #print(references_questions)
#     # Compute BLEU score
#     bleu_results = bleu.compute(
#         predictions=decoded_preds,
#         references=references_questions
#     )

#     result = rouge.compute(predictions=decoded_preds, references=decoded_labels)["rougeL"]
#     bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
#     P, R, F1 = score(decoded_preds, decoded_labels, lang="en")

#     return {
#         "rougel_fmeasure": result,
#         "bleu": bleu_result["bleu"],
#         "bert_score_f1": F1.mean().item(),
#     }

In [24]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# config = SimpleNamespace(
#     # hyperparameters
#     learning_rate=1e-3,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     gradient_accumulation_steps=4,
#     num_train_epochs=100,
#     save_steps=5,
#     save_strategy='epoch', # we cannot set it to "no". Otherwise, the model cannot guess the best checkpoint.
#     eval_steps=5,
#     logging_steps=5,
#     evaluation_strategy='epoch',
#     warmup_steps=500,
#     save_total_limit=3,
#     load_best_model_at_end = True,
#     output_dir = f'./MeQSum-training-{str(int(time.time()))}'
# )


config = SimpleNamespace(
    # hyperparameters
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    num_train_epochs=30,
    save_steps=5,
    save_strategy='epoch', # we cannot set it to "no". Otherwise, the model cannot guess the best checkpoint.
    eval_steps=5,
    logging_steps=5,
    evaluation_strategy='epoch',
    warmup_steps=500,
    save_total_limit=2,
    load_best_model_at_end = True,
    output_dir = f'./MeQSum-training-{str(int(time.time()))}'
)

In [25]:
training_args = TrainingArguments(
    output_dir=config.output_dir,
    learning_rate=config.learning_rate,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
   num_train_epochs=config.num_train_epochs,
    save_steps=config.save_steps,
    save_strategy=config.save_strategy, # we cannot set it to "no". Otherwise, the model cannot guess the best checkpoint.
    eval_steps=config.eval_steps,
    logging_steps=config.logging_steps,
    evaluation_strategy=config.evaluation_strategy,
    warmup_steps=config.warmup_steps,
    save_total_limit=config.save_total_limit,
    load_best_model_at_end = config.load_best_model_at_end,
    report_to="wandb",
    run_name=f"Prompt_tuning_original_model-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [26]:
# free up GPU Memory
torch.cuda.empty_cache()

In [27]:
with wandb.init(project=PROJECT, id=WANDB_ID, resume='allow'):
  trainer.train()

VBox(children=(Label(value='0.003 MB of 0.012 MB uploaded\r'), FloatProgress(value=0.20508461479020168, max=1.…

Epoch,Training Loss,Validation Loss
0,45.725,46.380001
1,43.8625,44.209999
2,40.25,39.709999
4,26.9375,26.325001
5,15.2,8.35625
6,4.7047,4.35375
8,2.9969,1.873438
9,2.2852,1.10125
10,1.7156,0.551875
12,0.7559,0.22457


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,██▇▆▅▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▁▅▇▁█▅▄▅▄▄▆▃▄▅▄▅█▅▆▃█▅▅▁█▅▅▅▆▂
eval/samples_per_second,█▄▂█▁▄▅▄▅▅▃▆▅▄▅▄▁▄▃▆▁▄▄█▁▄▄▄▃▇
eval/steps_per_second,█▄▂█▁▄▅▄▅▅▃▆▅▄▅▅▁▄▃▆▁▄▄█▁▄▄▄▃▇
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,█▇█▆▅▄▃▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇███▇▇▇▆▆▅▅▅▄▄▃▃▃▂▂▁▁
train/loss,███▇▇▆▅▄▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁

0,1
eval/loss,0.06761
eval/runtime,1.2332
eval/samples_per_second,40.545
eval/steps_per_second,5.676
train/epoch,29.76
train/global_step,930.0
train/grad_norm,0.75781
train/learning_rate,0.0
train/loss,0.0863
train/total_flos,2.037837880885248e+16


In [28]:
trainer.state.best_model_checkpoint

'./MeQSum-training-1713598056/checkpoint-906'

In [29]:
trainer.state.best_model_checkpoint
!mkdir flan-dialogue-summary-checkpoint
custom_path = "/content/drive/MyDrive/cs577_proj_dataset/MODELS/flan-MeQSum-normal_train-checkpoint_lr_1e_4_NER_2/"
trainer.save_model(output_dir=custom_path)

mkdir: cannot create directory ‘flan-dialogue-summary-checkpoint’: File exists


In [30]:
# with wandb.init(project=PROJECT, job_type="models"):
#   artifact = wandb.Artifact("instruct_model", type="model")
#   artifact.add_dir(custom_path)
#   wandb.save(custom_path)
#   wandb.log_artifact(artifact)

In [31]:
instruct_model = AutoModelForSeq2SeqLM.from_pretrained(custom_path, torch_dtype=torch.bfloat16)

In [32]:
instruct_model = instruct_model.to("cuda")

## Evaluation of the model

In [33]:
from bert_score import score

In [34]:
def compute_rogue_metric(inference_df):
  rouge = evaluate.load('rouge')

  finetuned_model_results = rouge.compute(
    predictions=inference_df['finetuned_model_summaries'].tolist(),
    references=inference_df['human_baseline_summaries'].tolist(),
    use_aggregator=True,
    use_stemmer=True,
  )

  return finetuned_model_results

def compute_bleu_metric(inference_df):
  # Load the BLEU metric
  bleu = evaluate.load("bleu")
  #print(type(inference_df['finetuned_model_summaries'].tolist()))

  references_questions = [[target] for target in inference_df['human_baseline_summaries'].tolist()]
  #prediction_questions = [[target] for target in inference_df['finetuned_model_summaries'].tolist()]
  #print(references_questions)
  # Compute BLEU score
  bleu_results = bleu.compute(
      predictions=inference_df['finetuned_model_summaries'].tolist(),
      references=references_questions
  )

  # print('BLEU SCORE:')
  # print(bleu_results['score'])
  return bleu_results['bleu']

def compute_bert_score(inference_df):
  # Compute BERTScore
  P, R, F1 = score(
      inference_df['finetuned_model_summaries'].tolist(),
      inference_df['human_baseline_summaries'].tolist(), lang="en")

  # print('BERT SCORE:')
  # print('Precision:', P.mean().item())
  # print('Recall:', R.mean().item())
  # print('F1 Score:', F1.mean().item())

  return P.mean().item(), R.mean().item(), F1.mean().item()


In [35]:
def generate_test_set_inference(dataset, instruct_model, START_PROMPT, END_PROMPT):
  questions = dataset['test'][0:]['inputs']
  human_baseline_summaries = dataset['test'][0:]['target']

  finetuned_model_summaries = []

  for _, question in enumerate(tqdm(questions)):
      prompt = START_PROMPT + question + END_PROMPT
      input_ids = tokenizer(prompt, return_tensors="pt").input_ids
      input_ids = input_ids.to("cuda")

      instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
      original_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
      finetuned_model_summaries.append(original_model_text_output)

  zipped_summaries = list(zip(human_baseline_summaries,finetuned_model_summaries))

  df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries','finetuned_model_summaries'])

  print(df.head())

  return df


In [36]:
inference_df = generate_test_set_inference(dataset, instruct_model, NER_START_PROMPT_2, NER_END_PROMPT_2)

  0%|          | 0/150 [00:00<?, ?it/s]

                            human_baseline_summaries  \
0  How can i get rid of a lower lip birthmark per...   
1       Is Magnesium Silicofluoride safe for people?   
2                      Could RhoGAM damage the baby?   
3  Could hydroxychloroquine and methotrexate make...   
4  Is there a relationship between Gadolinium and...   

                           finetuned_model_summaries  
0   What are the treatments for lower lip birthmark?  
1  What are the treatments for moth infestation o...  
2  What are the causes of RH-related mental illne...  
3  What are the treatments for rheumatoid arthritis?  
4  What are the symptoms of Multiple Chemical Sen...  


In [37]:
inference_df.head()

Unnamed: 0,human_baseline_summaries,finetuned_model_summaries
0,How can i get rid of a lower lip birthmark per...,What are the treatments for lower lip birthmark?
1,Is Magnesium Silicofluoride safe for people?,What are the treatments for moth infestation o...
2,Could RhoGAM damage the baby?,What are the causes of RH-related mental illne...
3,Could hydroxychloroquine and methotrexate make...,What are the treatments for rheumatoid arthritis?
4,Is there a relationship between Gadolinium and...,What are the symptoms of Multiple Chemical Sen...


In [38]:
rogue_score = compute_rogue_metric(inference_df)
bleu_score = compute_bleu_metric(inference_df)
bert_score_precision, bert_score_recall, bert_score_f1  = compute_bert_score(inference_df)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
print(f'ROGUE SCORE: {rogue_score}')
print(f'BLEU SCORE: {bleu_score}')
print(f'BERT SCORE: PRECISION: {bert_score_precision}, RECALL: {bert_score_recall}, F1: {bert_score_f1}')

ROGUE SCORE: {'rouge1': 0.27600856079934744, 'rouge2': 0.10741795275255708, 'rougeL': 0.26235237645503284, 'rougeLsum': 0.26230764932797956}
BLEU SCORE: 0.07061317883905567
BERT SCORE: PRECISION: 0.9067013263702393, RECALL: 0.8911650776863098, F1: 0.8986525535583496


In [40]:
with wandb.init(project=PROJECT, id=WANDB_ID, resume="allow"):
  wandb.log({"rogue_score": rogue_score})
  wandb.log({"bleu_score": bleu_score})
  wandb.log({"bert_score_precision": bert_score_precision})
  wandb.log({"bert_score_recall": bert_score_recall})
  wandb.log({"bert_score_f1": bert_score_f1})


VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
bert_score_f1,▁
bert_score_precision,▁
bert_score_recall,▁
bleu_score,▁

0,1
bert_score_f1,0.89865
bert_score_precision,0.9067
bert_score_recall,0.89117
bleu_score,0.07061
eval/loss,0.06761
eval/runtime,1.2332
eval/samples_per_second,40.545
eval/steps_per_second,5.676
train/epoch,29.76
train/global_step,930.0


## Logging Test Set scores and inference to wandb

In [41]:
with wandb.init(project=PROJECT, id=WANDB_ID, job_type="dataset", resume="allow"):
   wbtest_inference = wandb.Table(data=inference_df)
   wandb.log({"meqsum_test_inference": wbtest_inference})

VBox(children=(Label(value='0.037 MB of 0.037 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
bert_score_f1,0.89865
bert_score_precision,0.9067
bert_score_recall,0.89117
bleu_score,0.07061
eval/loss,0.06761
eval/runtime,1.2332
eval/samples_per_second,40.545
eval/steps_per_second,5.676
train/epoch,29.76
train/global_step,930.0
