In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from datasets import Dataset 
from datasets import load_from_disk

# Load from arrow file directly
train = Dataset.from_file("/kaggle/input/endsem-eval-nlp/val.arrow")
test = Dataset.from_file("/kaggle/input/endsem-eval-nlp/test.arrow")
val = Dataset.from_file("/kaggle/input/endsem-eval-nlp/train.arrow")

In [None]:
print(train)
print(val)
print(test)
print(type(train))

In [None]:
train_df = train.to_pandas()
test_df= test.to_pandas()
val_df = val.to_pandas()

In [None]:
train_df.head(4)

In [None]:
train_df.iloc[0, 0]

In [None]:
train_df.iloc[0 ,  0]

In [None]:
train_df.iloc[3,  0]

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")
tokenizer.pad_token = tokenizer.eos_token


In [None]:
def preprocess(example):
    full_text = example['input_text'] + tokenizer.eos_token + example['target_text'] + tokenizer.eos_token
    tokenized = tokenizer(full_text, truncation=True, padding='max_length', max_length=512)
    return {
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask'],
        'labels': tokenized['input_ids']  # For language modeling
    }

### We are adding both `input_ids` and `labels` in the output of the preprocessing function, even though they have the same values (i.e., ``tokenized['input_ids'])``. This is because we are doing a next-token prediction task (language modeling). 

### In preprocessing, we are combining the input (like a prompt or training part) and the target (like the expected output or test part) into a single sequence. Then, we tokenize this combined sequence. Both input_ids and labels are set to this tokenized sequence. Later during training, the model will use input_ids as input and labels to calculate the loss by predicting the next tokens in the sequence.

In [None]:
processed = [preprocess(row) for _, row in train_df.iterrows()]
val_processed = [preprocess(row) for _, row in val_df.iterrows()]
test_processed = [preprocess(row) for _, row in test_df.iterrows()]
tokenized_dataset = Dataset.from_list(processed)
tokenized_val_dataset =  Dataset.from_list(val_processed)
tokenized_test_dataset = Dataset.from_list(test_processed)

In [None]:
print(processed[0])
print(len(tokenized_dataset))
print(model.config)
import os
os.environ["WANDB_DISABLED"] = "true"


In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results-small",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    logging_dir="./logs-small",
    eval_strategy="epoch",  # no evaluation during training
    save_strategy="no",        # no checkpoints saved
    report_to="none"           # disable reporting to tools like WandB
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,  # use the full dataset now
    eval_dataset=tokenized_val_dataset,
)

trainer.train()


In [None]:
## saving the model 
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

In [None]:
def generate_response(input_text):
    # Move model to GPU
    model.to('cuda')  # Ensure the model is on the GPU

    # Tokenize input text
    input_ids = tokenizer.encode(input_text + tokenizer.eos_token, return_tensors='pt')

    # Move input_ids to GPU
    input_ids = input_ids.to('cuda')  # Move input_ids to the same device as the model

    # Generate response from the model
    output = model.generate(input_ids, max_length=50, num_return_sequences=1, no_repeat_ngram_size=2, top_k=50, top_p=0.9, temperature=0.8)

    # Decode the response
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    # Filter out any unwanted tokens such as [SEP] or 'P:'
    response = response.replace('[SEP]', '').replace('P:', '').strip()

    return response

# Test with a custom sentence entered by the user
user_input = "Hello, i am sick and tired "  # Example, change this input to test with different sentences
response = generate_response(user_input)
print(f"Response: {response}")


In [None]:
eval_results = trainer.evaluate(tokenized_test_dataset)

# Print evaluation results
print("Evaluation Results on Test Set:")
print(eval_results)

In [None]:
pip install evaluate
pip install bert_score

In [None]:
import evaluate
bleu = evaluate.load("bleu")
bertscore = evaluate.load("bertscore")

def evaluate_model(test_dataset):
    predictions = []
    references = []

    for example in test_dataset:
        input_text = tokenizer.decode(example['input_ids'], skip_special_tokens=True)

        inputs = tokenizer(input_text + tokenizer.eos_token, return_tensors='pt', padding=True).to('cuda')

        output = model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=True,
            no_repeat_ngram_size=2,
            top_k=50,
            top_p=0.9,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )

        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        predictions.append(generated_text)

        # ✅ FIXED: decode label ids to text
        reference = tokenizer.decode(example['labels'], skip_special_tokens=True)
        references.append(reference)

    # Compute BLEU — needs tokenized references
    bleu_results = bleu.compute(
        predictions=predictions,
        references=[[ref.split()] for ref in references]
    )

    # Compute BERTScore — raw string references
    bertscore_results = bertscore.compute(
        predictions=predictions,
        references=references,
        lang='en'
    )

    print(f"BLEU Score: {bleu_results}")
    print(f"BERTScore: {bertscore_results}")

# Run evaluation
evaluate_model(tokenized_test_dataset)


In [None]:
pip install bert_score

In [None]:
import evaluate

In [None]:
from IPython.display import FileLink

# Assuming model.safetensors is in the current working directory
FileLink(r'/kaggle/working/trained_model.zip')


In [None]:
import shutil
from IPython.display import FileLink

# Zip the entire directory or just specific files
shutil.make_archive('/kaggle/working/trained_model', 'zip', './')  # zips everything in current directory

# Display download link
FileLink('model_files.zip')

In [4]:
pip install bert_score

Note: you may need to restart the kernel to use updated packages.


In [5]:
import logging
# Clear existing logging handlers (important in Jupyter)
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

# Now reconfigure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()]
)

In [6]:
import os
import torch
import nltk
import numpy as np
from datasets import Dataset, DatasetDict, load_from_disk, Dataset as HFDataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM, # *** Changed for DialoGPT ***
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
    # Seq2Seq specific classes removed if not needed elsewhere
)
import evaluate
import logging
from tqdm.auto import tqdm

2025-04-15 00:16:33,543 - INFO - NumExpr defaulting to 4 threads.
2025-04-15 00:16:34,089 - INFO - PyTorch version 2.5.1+cu124 available.
2025-04-15 00:16:34,091 - INFO - Polars version 1.9.0 available.
2025-04-15 00:16:34,092 - INFO - Duckdb version 1.1.3 available.
2025-04-15 00:16:34,094 - INFO - TensorFlow version 2.18.0 available.
2025-04-15 00:16:34,096 - INFO - JAX version 0.4.33 available.
2025-04-15 00:16:36.812723: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744676196.834543     930 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744676196.841236     930 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [8]:
# --- Basic Setup ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
nltk.download('punkt', quiet=True)

# --- Configuration ---
# *** Paths for DialoGPT and Re-ranker ***
# DIALOGPT_MODEL_DIR = "./kaggle/working/trained_model"  # <-- Directory containing your fine-tuned DialoGPT files
RERANKER_MODEL_NAME = "bert-base-uncased"
RERANKER_OUTPUT_DIR = "/kaggle/working/reranker_bert_dialogpt" # <-- Adjusted output dir name
DATASET_PATH = "/kaggle/input/endsem-eval-nlp" # Path to original dataset arrow files
DIALOGPT_MODEL_DIR = "/kaggle/working/trained_model"

dialogpt_tokenizer = AutoTokenizer.from_pretrained(DIALOGPT_MODEL_DIR, local_files_only=True)
dialogpt_model = AutoModelForCausalLM.from_pretrained(DIALOGPT_MODEL_DIR, local_files_only=True).to(device)


In [12]:

# Generation Params for Candidate Generation (DialoGPT)
NUM_RETURN_SEQUENCES = 5
NUM_BEAMS = 5
MAX_HISTORY_LENGTH = 450 # Max tokens for history part (adjust based on DialoGPT limits)
MAX_RESPONSE_LENGTH = 60 # Max tokens for the generated response part

# Re-ranker Training Params (Same as before)
RERANKER_MAX_LENGTH = 512 # Ample room for combined text + SEP
RERANKER_BATCH_SIZE = 16
RERANKER_EPOCHS = 2
RERANKER_LR = 3e-5
logging.info("Loading datasets and fine-tuned DialoGPT model/tokenizer...")

2025-04-15 00:22:51,400 - INFO - Loading datasets and fine-tuned DialoGPT model/tokenizer...


In [10]:

# --- Load Datasets and DialoGPT Components ---

try:
    # Load original datasets
    val_dataset_orig = Dataset.from_file('/kaggle/input/endsem-eval-nlp/train.arrow')
    test_dataset_orig = Dataset.from_file('/kaggle/input/endsem-eval-nlp/test.arrow')
    raw_datasets = DatasetDict({'validation': val_dataset_orig, 'test': test_dataset_orig})

    # *** Load DialoGPT model and tokenizer ***
    # dialogpt_tokenizer = AutoTokenizer.from_pretrained(DIALOGPT_MODEL_DIR)
    # dialogpt_model = AutoModelForCausalLM.from_pretrained(DIALOGPT_MODEL_DIR).to(device)
    dialogpt_model.eval() # Set to evaluation mode

    # *** Set Padding for DialoGPT (GPT2-style) ***
    if dialogpt_tokenizer.pad_token is None:
        logging.warning("DialoGPT tokenizer has no pad token, setting to eos_token.")
        dialogpt_tokenizer.pad_token = dialogpt_tokenizer.eos_token
    dialogpt_tokenizer.padding_side = "left" # Important for decoder-only generation

    # Load BERTScore metric
    bertscore_metric = evaluate.load("bertscore")

except Exception as e:
    logging.error(f"Error loading models/datasets: {e}", exc_info=True)
    exit()



In [14]:
# --- Step 2: Generate Candidates using DialoGPT for Validation Set ---
logging.info(f"Generating {NUM_RETURN_SEQUENCES} candidates using DialoGPT for validation set...")

validation_candidates = []
validation_histories_original = [] # Store original format for re-ranker input
validation_targets = []

for example in tqdm(raw_datasets['validation'], desc="Generating Validation Candidates (DialoGPT)"):
    history_original = example['input_text'] # Keep original for re-ranker input text
    target_text = example['target_text']
    validation_histories_original.append(history_original)
    validation_targets.append(target_text)

    # Prepare input for DialoGPT: Replace [SEP] with EOS and add final EOS
    history_formatted = history_original.replace("[SEP]", dialogpt_tokenizer.eos_token) + dialogpt_tokenizer.eos_token

    inputs = dialogpt_tokenizer(
        history_formatted,
        return_tensors="pt",
        max_length=MAX_HISTORY_LENGTH, # Truncate history if too long
        truncation=True,
        padding=False # Padding handled later or by generate if needed for batching (but doing one by one here)
    ).to(device)

    input_ids = inputs['input_ids']
    current_input_length = input_ids.shape[1]

    # Calculate max_length for generate: current input + desired response length
    generate_max_length = current_input_length + MAX_RESPONSE_LENGTH

    with torch.no_grad():
        outputs = dialogpt_model.generate(
            input_ids,
            max_length=generate_max_length,
            num_beams=NUM_BEAMS,
            num_return_sequences=NUM_RETURN_SEQUENCES,
            pad_token_id=dialogpt_tokenizer.pad_token_id,
            eos_token_id=dialogpt_tokenizer.eos_token_id,
            early_stopping=True,
            # top_k=50, # Optional sampling parameters if beam search is too slow/rigid
            # top_p=0.95,
            # temperature=0.7
        )

    # *** Decode only the generated part for DialoGPT ***
    generated_sequences = outputs[:, current_input_length:]
    decoded_outputs = dialogpt_tokenizer.batch_decode(generated_sequences, skip_special_tokens=True)

    # Clean up potential artifacts if needed (e.g., dangling EOS)
    cleaned_outputs = [text.replace(dialogpt_tokenizer.eos_token, "").strip() for text in decoded_outputs]
    validation_candidates.append(cleaned_outputs)

logging.info(f"Generated DialoGPT candidates for {len(validation_candidates)} validation examples.")

# --- Step 3: Prepare Re-ranker Training Data (Identical Logic) ---
logging.info("Preparing training data for the re-ranker...")

reranker_train_data = {'text': [], 'label': []}

for i in tqdm(range(len(validation_histories_original)), desc="Scoring DialoGPT Candidates"):
    history = validation_histories_original[i] # Use original history format
    candidates = validation_candidates[i]      # Use DialoGPT candidates
    target = validation_targets[i]

    # Calculate BERTScore for each candidate against the target
    try:
        scores = bertscore_metric.compute(
            predictions=candidates,
            references=[target] * len(candidates),
            lang="en", device=device, batch_size=8 # Adjust batch size if OOM
        )
        f1_scores = scores['f1']
    except Exception as e:
        logging.warning(f"BERTScore failed for example {i}. Skipping. Error: {e}")
        f1_scores = [0.0] * len(candidates)

    # Create training instances: (history + candidate, score)
    for j, candidate in enumerate(candidates):
        # Use original history format for re-ranker input
        combined_text = history + " [SEP] " + candidate # Use [SEP] consistently here
        reranker_train_data['text'].append(combined_text)
        reranker_train_data['label'].append(f1_scores[j] if f1_scores and j < len(f1_scores) else 0.0)

reranker_dataset = HFDataset.from_dict(reranker_train_data)
logging.info(f"Re-ranker dataset created with {len(reranker_dataset)} examples (from DialoGPT candidates).")

# --- Step 4: Train Re-ranker Model (Identical Code) ---
logging.info(f"Loading and training re-ranker model: {RERANKER_MODEL_NAME}")

# Load re-ranker tokenizer and model (for regression)
reranker_tokenizer = AutoTokenizer.from_pretrained(RERANKER_MODEL_NAME)
reranker_model = AutoModelForSequenceClassification.from_pretrained(
    RERANKER_MODEL_NAME,
    num_labels=1 # Regression
).to(device)

# Tokenize the re-ranker dataset
def tokenize_reranker(examples):
    tokenized = reranker_tokenizer(
        examples['text'],
        truncation=True,
        padding="max_length",
        max_length=RERANKER_MAX_LENGTH
    )
    tokenized['labels'] = [float(label) for label in examples['label']]
    return tokenized

tokenized_reranker_dataset = reranker_dataset.map(tokenize_reranker, batched=True, num_proc=2) # Use multiprocessing
tokenized_reranker_dataset = tokenized_reranker_dataset.remove_columns(['text'])

# Data collator for the re-ranker
reranker_data_collator = DataCollatorWithPadding(tokenizer=reranker_tokenizer)

# Training arguments for the re-ranker
reranker_training_args = TrainingArguments(
    output_dir=RERANKER_OUTPUT_DIR,
    num_train_epochs=RERANKER_EPOCHS,
    per_device_train_batch_size=RERANKER_BATCH_SIZE,
    learning_rate=RERANKER_LR,
    weight_decay=0.01,
    logging_dir=f"{RERANKER_OUTPUT_DIR}/logs",
    logging_steps=100,
    save_strategy="epoch",
    save_total_limit=1,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

# Trainer for the re-ranker
reranker_trainer = Trainer(
    model=reranker_model,
    args=reranker_training_args,
    train_dataset=tokenized_reranker_dataset,
    tokenizer=reranker_tokenizer,
    data_collator=reranker_data_collator,
)



2025-04-15 00:24:52,920 - INFO - Generating 5 candidates using DialoGPT for validation set...


Generating Validation Candidates (DialoGPT):   0%|          | 0/576 [00:00<?, ?it/s]

2025-04-15 00:28:14,852 - INFO - Generated DialoGPT candidates for 576 validation examples.
2025-04-15 00:28:14,853 - INFO - Preparing training data for the re-ranker...


Scoring DialoGPT Candidates:   0%|          | 0/576 [00:00<?, ?it/s]

2025-04-15 00:28:37,934 - INFO - Re-ranker dataset created with 2880 examples (from DialoGPT candidates).
2025-04-15 00:28:37,934 - INFO - Loading and training re-ranker model: bert-base-uncased
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map (num_proc=2):   0%|          | 0/2880 [00:00<?, ? examples/s]

  reranker_trainer = Trainer(


In [15]:
logging.info("Starting re-ranker training...")
reranker_trainer.train()
logging.info("Re-ranker training finished.")
reranker_trainer.save_model()
logging.info(f"Re-ranker model saved to {RERANKER_OUTPUT_DIR}")


# --- Step 5: Apply Re-ranking on Test Set using DialoGPT Candidates ---
logging.info("Applying re-ranking to the test set using DialoGPT candidates...")

# Load the trained re-ranker (Trainer should have loaded the best)
# Or load manually:
# reranker_tokenizer = AutoTokenizer.from_pretrained(RERANKER_OUTPUT_DIR)
# reranker_model = AutoModelForSequenceClassification.from_pretrained(RERANKER_OUTPUT_DIR).to(device)
reranker_model.eval()


final_predictions = []
ground_truths = []

for example in tqdm(raw_datasets['test'], desc="Processing Test Set (DialoGPT + Re-rank)"):
    history_original = example['input_text']
    target = example['target_text']
    ground_truths.append(target)

    # 1. Generate candidates using DialoGPT (same as validation loop)
    history_formatted = history_original.replace("[SEP]", dialogpt_tokenizer.eos_token) + dialogpt_tokenizer.eos_token
    inputs = dialogpt_tokenizer(history_formatted, return_tensors="pt", max_length=MAX_HISTORY_LENGTH, truncation=True).to(device)
    input_ids = inputs['input_ids']
    current_input_length = input_ids.shape[1]
    generate_max_length = current_input_length + MAX_RESPONSE_LENGTH

    with torch.no_grad():
        outputs = dialogpt_model.generate(
            input_ids, max_length=generate_max_length, num_beams=NUM_BEAMS,
            num_return_sequences=NUM_RETURN_SEQUENCES, pad_token_id=dialogpt_tokenizer.pad_token_id,
            eos_token_id=dialogpt_tokenizer.eos_token_id, early_stopping=True
        )
    generated_sequences = outputs[:, current_input_length:]
    candidates_decoded = dialogpt_tokenizer.batch_decode(generated_sequences, skip_special_tokens=True)
    candidates = [text.replace(dialogpt_tokenizer.eos_token, "").strip() for text in candidates_decoded]


    # 2. Prepare inputs for re-ranker
    reranker_inputs_text = [history_original + " [SEP] " + cand for cand in candidates]
    reranker_inputs_tokenized = reranker_tokenizer(
        reranker_inputs_text,
        return_tensors="pt", truncation=True, padding=True,
        max_length=RERANKER_MAX_LENGTH
    ).to(device)

    # 3. Get scores from re-ranker
    with torch.no_grad():
        reranker_outputs = reranker_model(**reranker_inputs_tokenized)
        scores = reranker_outputs.logits.squeeze(-1).cpu().numpy()

    # 4. Select best candidate
    best_candidate_index = np.argmax(scores)
    final_prediction = candidates[best_candidate_index]
    final_predictions.append(final_prediction)

logging.info("Re-ranking complete. Evaluating final DialoGPT + Re-ranked predictions...")


# Remember to compare this to the baseline DialoGPT performance (just using the top beam candidate without re-ranking)

2025-04-15 00:28:50,981 - INFO - Starting re-ranker training...


Step,Training Loss
100,0.0428


2025-04-15 00:34:03,741 - INFO - Re-ranker training finished.
2025-04-15 00:34:04,781 - INFO - Re-ranker model saved to /kaggle/working/reranker_bert_dialogpt
2025-04-15 00:34:04,782 - INFO - Applying re-ranking to the test set using DialoGPT candidates...


Processing Test Set (DialoGPT + Re-rank):   0%|          | 0/968 [00:00<?, ?it/s]

2025-04-15 00:41:58,222 - INFO - Re-ranking complete. Evaluating final DialoGPT + Re-ranked predictions...


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

ValueError: Predictions and/or references don't match the expected format.
Expected format:
Feature option 0: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}
Feature option 1: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')},
Input predictions: ['so', 'you', "'re", ..., 'a', 'job', '?'],
Input references: [['are', 'you', 'evaluated', 'at', 'work', 'by', 'anybody', 'to', 'see', 'if', 'you', "'re", 'in', 'a', 'job', 'you', 'should', 'be', '?']]

In [17]:
# --- Final Evaluation (Identical Code) ---
bleu_metric = evaluate.load("bleu")
bertscore_metric = evaluate.load("bertscore") # Reload or ensure it's available

# Post-process for metrics (Keep this part)
decoded_preds = [pred.strip() for pred in final_predictions]
decoded_labels = [label.strip() for label in ground_truths]
decoded_preds = [pred if pred else "<empty>" for pred in decoded_preds] # Handle empty predictions
decoded_labels = [label if label else "<empty>" for label in decoded_labels] # Handle empty labels

# --- CORRECTION FOR BLEU ---
# BLEU expects lists of strings for predictions, and list of lists of strings for references
# Do NOT pre-tokenize with nltk here
logging.info("Calculating BLEU score...")
references_for_bleu = [[label] for label in decoded_labels] # Each prediction corresponds to a list containing one reference string
try:
    bleu_result = bleu_metric.compute(predictions=decoded_preds, references=references_for_bleu)
    if bleu_result is None: # Handle potential None return
        logging.warning("BLEU score computation returned None. Setting BLEU to 0.")
        bleu_score = 0.0
    else:
        bleu_score = bleu_result.get("bleu", 0.0) # Get 'bleu' score, default to 0 if key missing
except Exception as e:
    logging.error(f"Error computing BLEU: {e}", exc_info=True)
    bleu_score = 0.0 # Set to 0 on error
# --- END CORRECTION ---


# BERTScore (This part expects lists of strings, so it should be correct)
logging.info("Calculating BERTScore...")
try:
    bertscore_result = bertscore_metric.compute(predictions=decoded_preds, references=decoded_labels, lang="en", device=device)
    if bertscore_result is None or "f1" not in bertscore_result:
        logging.warning("BERTScore computation returned None or missing 'f1'. Setting scores to 0.")
        bertscore_f1 = 0.0
        bertscore_precision = 0.0
        bertscore_recall = 0.0
    else:
        bertscore_f1 = np.mean(bertscore_result.get("f1", [0.0])) # Use .get with default
        bertscore_precision = np.mean(bertscore_result.get("precision", [0.0]))
        bertscore_recall = np.mean(bertscore_result.get("recall", [0.0]))
except Exception as e:
    logging.error(f"Error computing BERTScore: {e}", exc_info=True)
    bertscore_f1 = 0.0
    bertscore_precision = 0.0
    bertscore_recall = 0.0


final_metrics = {
    "reranked_dialogpt_bleu": round(bleu_score, 4),
    "reranked_dialogpt_bertscore_f1": round(bertscore_f1, 4),
    "reranked_dialogpt_bertscore_precision": round(bertscore_precision, 4),
    "reranked_dialogpt_bertscore_recall": round(bertscore_recall, 4),
}

logging.info(f"Final Evaluation Metrics (DialoGPT + Re-ranking): {final_metrics}")


2025-04-15 00:45:07,914 - INFO - Calculating BLEU score...
2025-04-15 00:45:08,062 - INFO - Calculating BERTScore...
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-04-15 00:45:17,679 - INFO - Final Evaluation Metrics (DialoGPT + Re-ranking): {'reranked_dialogpt_bleu': 0.002, 'reranked_dialogpt_bertscore_f1': 0.842, 'reranked_dialogpt_bertscore_precision': 0.8559, 'reranked_dialogpt_bertscore_recall': 0.83}
