# 5.4 Fine-Tuning T5 for Gloss Generation
This notebook fine-tunes `t5-small` on 500 samples from ISL_CLSRT to learn English-to-gloss conversion. The model is trained using HuggingFace Transformers.

In [1]:
# ✅ Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# ✅ Step 2: Install dependencies
!pip install -q transformers datasets evaluate accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# ✅ Step 3: Import libraries
import pandas as pd
from datasets import Dataset
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    TrainingArguments, Trainer,
    DataCollatorForSeq2Seq
)
import torch

In [4]:
# ✅ Step 4: Load and format dataset
df = pd.read_csv('/content/drive/MyDrive/IETGenAI-SLT/Chapter 4/isl_train_meta.csv')
df = df[['Sentences', 'gloss_sequence']].dropna().drop_duplicates().reset_index(drop=True)
df['input_text'] = 'translate English to gloss: ' + df['Sentences']
df = df.rename(columns={'gloss_sequence': 'target_text'})
dataset = Dataset.from_pandas(df[['input_text', 'target_text']])
dataset = dataset.train_test_split(test_size=0.1)

In [None]:
# ✅ Step 5: Tokenization
tokenizer = T5Tokenizer.from_pretrained('t5-small')

def tokenize_function(example):
    model_inputs = tokenizer(example["input_text"], max_length=64, truncation=True, padding="max_length")
    labels = tokenizer(example["target_text"], max_length=32, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [7]:
# ✅ Step 6: Define model and training arguments
model = T5ForConditionalGeneration.from_pretrained('t5-small')

args = TrainingArguments(
    output_dir="/content/t5-gloss-output",
    eval_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=1,
    logging_dir="/content/t5-logs",
)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [9]:
# ✅ Step 7: Trainer setup and training
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33melakkiya16[0m ([33melakkiya16-bits-pilani[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,1.22173
2,No log,0.932836
3,No log,0.70441
4,No log,0.592866
5,No log,0.571278


TrainOutput(global_step=55, training_loss=1.8061047640713779, metrics={'train_runtime': 226.7559, 'train_samples_per_second': 1.896, 'train_steps_per_second': 0.243, 'total_flos': 7274621829120.0, 'train_loss': 1.8061047640713779, 'epoch': 5.0})

In [10]:
# ✅ Step 8: Save fine-tuned model
model.save_pretrained('/content/drive/MyDrive/IETGenAI-SLT/Chapter 5/t5-gloss-finetuned')
tokenizer.save_pretrained('/content/drive/MyDrive/IETGenAI-SLT/Chapter 5/t5-gloss-finetuned')

('/content/drive/MyDrive/IETGenAI-SLT/Chapter 5/t5-gloss-finetuned/tokenizer_config.json',
 '/content/drive/MyDrive/IETGenAI-SLT/Chapter 5/t5-gloss-finetuned/special_tokens_map.json',
 '/content/drive/MyDrive/IETGenAI-SLT/Chapter 5/t5-gloss-finetuned/spiece.model',
 '/content/drive/MyDrive/IETGenAI-SLT/Chapter 5/t5-gloss-finetuned/added_tokens.json')

In [11]:
# ✅ Step 9: Inference from fine-tuned model
def generate_gloss(sentence):
    prompt = f"translate English to gloss: {sentence}"
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=32)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generate_gloss("Could you please talk slower?"))

Können Sie bitte langsamer sprechen?


# Task
The model is translating the input sentences instead of generating glosses. This is likely due to the limited dataset size (500 sentences), which is insufficient for the model to learn the complex mapping from English sentences to gloss sequences. The model is defaulting to a more common task it might have been pre-trained on, like translation.

To address this, I will:

1.  **Inspect the dataset**: Ensure the gloss sequences are correctly formatted and aligned with the English sentences.
2.  **Add evaluation metric**: Incorporate a suitable evaluation metric to monitor the model's performance during training.
3.  **Experiment with training hyperparameters**: Adjust hyperparameters to improve training.
4.  **Analyze model outputs during training**: Observe how the output changes over epochs.
5.  **Consider data augmentation (optional)**: Explore techniques to augment the training data if necessary.
6.  **Retrain the model**: Train the model with the adjusted configurations and data.
7.  **Evaluate the fine-tuned model**: Evaluate the final model's performance and manually inspect generated glosses.
8.  **Finish task**: Summarize the findings and the improved model's ability to generate glosses.

## Inspect the dataset

Carefully examine the input and target text in the training and test datasets to ensure the gloss sequences are correctly formatted and aligned with the English sentences.


In [12]:
print("First few examples from train split:")
for i in range(3):
    print(f"Example {i+1}:")
    print(f"  input_text: {tokenized_datasets['train'][i]['input_text']}")
    print(f"  target_text: {tokenized_datasets['train'][i]['target_text']}")
    print(f"  decoded input_ids: {tokenizer.decode(tokenized_datasets['train'][i]['input_ids'], skip_special_tokens=True)}")
    print(f"  decoded labels: {tokenizer.decode(tokenized_datasets['train'][i]['labels'], skip_special_tokens=True)}")
    print("-" * 20)

print("\nFirst few examples from test split:")
for i in range(3):
    print(f"Example {i+1}:")
    print(f"  input_text: {tokenized_datasets['test'][i]['input_text']}")
    print(f"  target_text: {tokenized_datasets['test'][i]['target_text']}")
    print(f"  decoded input_ids: {tokenizer.decode(tokenized_datasets['test'][i]['input_ids'], skip_special_tokens=True)}")
    print(f"  decoded labels: {tokenizer.decode(tokenized_datasets['test'][i]['labels'], skip_special_tokens=True)}")
    print("-" * 20)

First few examples from train split:
Example 1:
  input_text: translate English to gloss: why are you crying
  target_text:  YOU CRY WHY
  decoded input_ids: translate English to gloss: why are you crying
  decoded labels: YOU CRY WHY
--------------------
Example 2:
  input_text: translate English to gloss: how dare you
  target_text:  DARE YOU HOW
  decoded input_ids: translate English to gloss: how dare you
  decoded labels: DARE YOU HOW
--------------------
Example 3:
  input_text: translate English to gloss: i am feeling cold
  target_text: I FEEL COLD
  decoded input_ids: translate English to gloss: i am feeling cold
  decoded labels: I FEEL COLD
--------------------

First few examples from test split:
Example 1:
  input_text: translate English to gloss: let him take time
  target_text: LET HIM TAKE TIME
  decoded input_ids: translate English to gloss: let him take time
  decoded labels: LET HIM TAKE TIME
--------------------
Example 2:
  input_text: translate English to gloss: a

## Add evaluation metric

Incorporate a suitable evaluation metric to monitor the model's performance during training and assess how well it's learning the English-to-gloss conversion.


In [None]:
import evaluate
import numpy as np

metric = evaluate.load("bleu")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # BLEU expects a list of references for each prediction
    decoded_labels = [[label] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return result

In [14]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


## Experiment with training hyperparameters

Adjust hyperparameters like learning rate, batch size, or number of epochs to see if it improves training and prevents the model from defaulting to translation.


In [16]:
# ✅ Step 3: Experiment with training hyperparameters

# Update training arguments
args = TrainingArguments(
    output_dir="/content/t5-gloss-output-tuned",
    eval_strategy="epoch",
    learning_rate=1e-4,  # Lower learning rate
    per_device_train_batch_size=4,  # Smaller batch size
    per_device_eval_batch_size=4,
    num_train_epochs=10, # Increased epochs
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=1,
    logging_dir="/content/t5-logs-tuned",
    metric_for_best_model="bleu", # Metric to use for best model
)

# Re-instantiate Trainer with updated arguments and the compute_metrics function
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


## Retrain the model

Train the model with the adjusted configurations and data.


In [18]:
# ✅ Step 6: Define model and training arguments
model = T5ForConditionalGeneration.from_pretrained('t5-small')

args = TrainingArguments(
    output_dir="/content/t5-gloss-output-tuned",
    eval_strategy="epoch",
    learning_rate=1e-4,  # Lower learning rate
    per_device_train_batch_size=4,  # Smaller batch size
    per_device_eval_batch_size=4,
    num_train_epochs=10, # Increased epochs
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=1,
    logging_dir="/content/t5-logs-tuned",
    metric_for_best_model="bleu", # Metric to use for best model
)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Modify compute_metrics to handle generated sequences
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # BLEU expects a list of references for each prediction
    decoded_labels = [[label] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return result

# Re-instantiate Trainer with updated arguments and the modified compute_metrics function
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [24]:
# Modify compute_metrics to perform argmax on predictions (assuming they are logits)
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Assuming predictions is a tuple where the first element contains logits
    if isinstance(predictions, tuple):
        logits = predictions[0]
    else:
        # If not a tuple, assume predictions are logits directly
        logits = predictions

    # Perform argmax to get predicted token IDs
    predicted_ids = np.argmax(logits, axis=-1)

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    # Replace -100 in labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # BLEU expects a list of references for each prediction
    decoded_labels = [[label] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return result

# Re-instantiate Trainer with the modified compute_metrics function
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Initiate the training process
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Bleu,Precisions,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,No log,0.50312,0.0,"[0.41379310344827586, 0.10526315789473684, 0.1, 0.0]",0.841631,0.852941,29,34
2,No log,0.494047,0.0,"[0.39285714285714285, 0.0, 0.0, 0.0]",0.807118,0.823529,28,34
3,No log,0.462161,0.0,"[0.4, 0.05, 0.0, 0.0]",0.875173,0.882353,30,34
4,No log,0.442199,0.0,"[0.41379310344827586, 0.05263157894736842, 0.0, 0.0]",0.841631,0.852941,29,34
5,No log,0.445922,0.0,"[0.3793103448275862, 0.05263157894736842, 0.0, 0.0]",0.841631,0.852941,29,34
6,No log,0.432735,0.0,"[0.3793103448275862, 0.05263157894736842, 0.0, 0.0]",0.841631,0.852941,29,34
7,No log,0.420373,0.0,"[0.4, 0.1, 0.0, 0.0]",0.875173,0.882353,30,34
8,No log,0.427467,0.0,"[0.3793103448275862, 0.05263157894736842, 0.0, 0.0]",0.841631,0.852941,29,34
9,No log,0.42543,0.0,"[0.3793103448275862, 0.05263157894736842, 0.0, 0.0]",0.841631,0.852941,29,34
10,No log,0.424632,0.0,"[0.3793103448275862, 0.05263157894736842, 0.0, 0.0]",0.841631,0.852941,29,34


Trainer is attempting to log a value of "[0.41379310344827586, 0.10526315789473684, 0.1, 0.0]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.39285714285714285, 0.0, 0.0, 0.0]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.4, 0.05, 0.0, 0.0]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.41379310344827586, 0.05263157894736842, 0.0, 0.0]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a valu

TrainOutput(global_step=220, training_loss=0.40474742542613634, metrics={'train_runtime': 683.1289, 'train_samples_per_second': 1.259, 'train_steps_per_second': 0.322, 'total_flos': 14549243658240.0, 'train_loss': 0.40474742542613634, 'epoch': 10.0})

## Evaluate the fine-tuned model

Evaluate the final model's performance using the chosen metric and manually inspect generated glosses for correctness.


In [25]:
# Evaluate the model on the test dataset
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

# Define the generate_gloss function (already defined in a previous cell, but redefining for clarity in this step)
def generate_gloss(sentence):
    prompt = f"translate English to gloss: {sentence}"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device) # Move to model's device
    outputs = model.generate(**inputs, max_length=32)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Generate glosses for a few example sentences
example_sentences = [
    "Could you please talk slower?", # Sentence from previous inference
    "I am going to the market.",
    "What is your name?",
    "He is a good person."
]

print("\nGenerated Glosses:")
for sentence in example_sentences:
    gloss = generate_gloss(sentence)
    print(f"English: {sentence}")
    print(f"Gloss: {gloss}")
    print("-" * 20)

# Manual inspection of generated glosses will be done based on the output

Trainer is attempting to log a value of "[0.3793103448275862, 0.05263157894736842, 0.0, 0.0]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


Evaluation results: {'eval_loss': 0.42463159561157227, 'eval_bleu': 0.0, 'eval_precisions': [0.3793103448275862, 0.05263157894736842, 0.0, 0.0], 'eval_brevity_penalty': 0.8416308400672834, 'eval_length_ratio': 0.8529411764705882, 'eval_translation_length': 29, 'eval_reference_length': 34, 'eval_runtime': 1.7941, 'eval_samples_per_second': 5.574, 'eval_steps_per_second': 1.672, 'epoch': 10.0}

Generated Glosses:
English: Could you please talk slower?
Gloss: YOU PLEASE TALK SLOWER?
--------------------
English: I am going to the market.
Gloss: I GO TO THE MY MY MY MY.
--------------------
English: What is your name?
Gloss: HOW DO YOU WANT WHAT
--------------------
English: He is a good person.
Gloss: Er ist eine gute Person.
--------------------


## Summary:

### Data Analysis Key Findings

*   The initial dataset inspection confirmed that the English sentences and gloss sequences were correctly formatted and aligned, eliminating data formatting issues as the primary cause of the model's translation behavior.
*   Incorporating the BLEU metric revealed that the model achieved an `eval_bleu` score of 0.0 after training, indicating its inability to generate correct gloss sequences.
*   Manual inspection of the generated glosses showed that the model is still producing incorrect translations or nonsensical sequences, although some outputs contained elements of glossing.
*   Experimenting with hyperparameters (lower learning rate, smaller batch size, increased epochs) did not significantly improve the model's ability to perform gloss generation, as evidenced by the zero BLEU score.
*   A technical challenge was encountered and resolved in the `compute_metrics` function to correctly handle the model's output (logits) during evaluation by applying `argmax` to get predicted token IDs.

### Insights or Next Steps

*   The small dataset size (500 sentences) is likely the dominant factor preventing the model from learning the complex English-to-gloss mapping, overwhelming the impact of hyperparameter tuning. Significant data augmentation or acquiring a larger, high-quality dataset is crucial.
*   Investigate alternative model architectures or pre-training strategies specifically designed for low-resource or specialized sequence generation tasks, as the current T5 model appears to default to its pre-trained translation capabilities.


In [26]:
# ✅ Step 9.1: Generate glosses for all sentences and save to DataFrame
def generate_gloss(sentence):
    prompt = f"translate English to gloss: {sentence}"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_length=32)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

df['generated_gloss'] = df['Sentences'].apply(generate_gloss)

# Display the updated DataFrame with generated glosses
display(df.head())

Unnamed: 0,Sentences,target_text,input_text,generated_gloss
0,it does not make any difference to me,IT MAKE ANY DIFFERENCE ME DO NOT,translate English to gloss: it does not make a...,I CAN NOT DIFFERENCE
1,tell me truth,TELL TRUTH,translate English to gloss: tell me truth,I RÉAL
2,do me a favour,DO FAVOUR ME,translate English to gloss: do me a favour,DO MONEY FOUR
3,do not worry,DONOT WORRY,translate English to gloss: do not worry,DO NOT HELP ME
4,do not abuse him,HIM ABUSE DONOT,translate English to gloss: do not abuse him,YOU MISS him


In [27]:
# Define the path to save the CSV file
output_path = '/content/drive/MyDrive/IETGenAI-SLT/Chapter 5/isl_train_meta_with_generated_glosses.csv'

# Save the DataFrame to a CSV file
df.to_csv(output_path, index=False)

print(f"DataFrame saved to {output_path}")

DataFrame saved to /content/drive/MyDrive/IETGenAI-SLT/Chapter 5/isl_train_meta_with_generated_glosses.csv


In [None]:
# ✅ Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# ✅ Step 2: Install dependencies
!pip install -q transformers datasets evaluate accelerate

# ✅ Step 3: Import libraries
import pandas as pd
from datasets import Dataset
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    TrainingArguments, Trainer,
    DataCollatorForSeq2Seq
)
import torch

# ✅ Step 4: Load and format dataset
df = pd.read_csv('/content/drive/MyDrive/IETGenAI-SLT/Chapter 4/isl_train_meta.csv')
df = df[['Sentences', 'gloss_sequence']].dropna().drop_duplicates().reset_index(drop=True)
df['input_text'] = 'translate English to gloss: ' + df['Sentences']
df = df.rename(columns={'gloss_sequence': 'target_text'})
dataset = Dataset.from_pandas(df[['input_text', 'target_text']])
dataset = dataset.train_test_split(test_size=0.1)

# ✅ Step 5: Tokenization
tokenizer = T5Tokenizer.from_pretrained('t5-small')

def tokenize_function(example):
    model_inputs = tokenizer(example["input_text"], max_length=64, truncation=True, padding="max_length")
    labels = tokenizer(example["target_text"], max_length=32, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Add evaluation metric and compute_metrics function
import evaluate
import numpy as np

metric = evaluate.load("bleu")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Assuming predictions is a tuple where the first element contains logits
    if isinstance(predictions, tuple):
        logits = predictions[0]
    else:
        # If not a tuple, assume predictions are logits directly
        logits = predictions

    # Perform argmax to get predicted token IDs
    predicted_ids = np.argmax(logits, axis=-1)

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    # Replace -100 in labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # BLEU expects a list of references for each prediction
    decoded_labels = [[label] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return result

# Define model and training arguments (tuned)
model = T5ForConditionalGeneration.from_pretrained('t5-small')

args = TrainingArguments(
    output_dir="/content/t5-gloss-output-tuned",
    eval_strategy="epoch",
    learning_rate=1e-4,  # Lower learning rate
    per_device_train_batch_size=4,  # Smaller batch size
    per_device_eval_batch_size=4,
    num_train_epochs=10, # Increased epochs
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=1,
    logging_dir="/content/t5-logs-tuned",
    metric_for_best_model="bleu", # Metric to use for best model
)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Trainer setup and training (with tuned arguments and compute_metrics)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# Save fine-tuned model
model.save_pretrained('/content/drive/MyDrive/IETGenAI-SLT/Chapter 5/t5-gloss-finetuned')
tokenizer.save_pretrained('/content/drive/MyDrive/IETGenAI-SLT/Chapter 5/t5-gloss-finetuned')

# Inference from fine-tuned model and save generated glosses to DataFrame
def generate_gloss(sentence):
    prompt = f"translate English to gloss: {sentence}"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_length=32)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

df['generated_gloss'] = df['Sentences'].apply(generate_gloss)

# Evaluate the model on the test dataset
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

# Save the DataFrame to a CSV file
output_path = '/content/drive/MyDrive/IETGenAI-SLT/Chapter 5/isl_train_meta_with_generated_glosses.csv'
df.to_csv(output_path, index=False)

print(f"DataFrame saved to {output_path}")