In [2]:
# ==============================================================================
# 1. SETUP: Install necessary libraries
# ==============================================================================
# !pip install transformers[torch] datasets sentencepiece --quiet

print("✅ Libraries installed and ready.")


# ==============================================================================
# 2. DATA PREPARATION: Create our training and testing datasets
# ==============================================================================
import json

# For a real project, you would have hundreds or thousands of examples.
# We'll create two small files for this demonstration.
# train_data = [
#     {"question": "Thời gian 10 tháng trước tháng 2, 1130 là khi nào?", "answer": "Tháng 4, 1129"},
#     {"question": "What is 3 weeks after January 1, 2024?", "answer": "January 22, 2024"},
#     {"question": "5 ngày trước ngày 10 tháng 7 năm 2025 là ngày nào?", "answer": "Ngày 5 tháng 7 năm 2025"},
#     {"question": "What date is 2 months after February 29, 2024?", "answer": "April 29, 2024"},
#     {"question": "1 năm sau ngày 15 tháng 5, 2030 là ngày gì?", "answer": "Ngày 15 tháng 5, 2031"}
# ]

# test_data = [
#     {"question": "What is 4 days before March 1, 2024?", "answer": "February 26, 2024"},
#     {"question": "2 tuần sau ngày 20 tháng 12 năm 2025 là ngày nào?", "answer": "Ngày 3 tháng 1 năm 2026"}
# ]

# # Write the data to JSONL files (one JSON object per line)
# with open('train.jsonl', 'w', encoding='utf-8') as f:
#     for item in train_data:
#         f.write(json.dumps(item, ensure_ascii=False) + '\n')

# with open('test.jsonl', 'w', encoding='utf-8') as f:
#     for item in test_data:
#         f.write(json.dumps(item, ensure_ascii=False) + '\n')

# print("✅ train.jsonl and test.jsonl files created.")



# ==============================================================================
# 3. LOAD DATA, TOKENIZER, and MODEL
# ==============================================================================
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, Seq2SeqTrainingArguments

# Load the datasets from our files
raw_datasets = load_dataset('json', data_files={'train': 'date_train.jsonl', 'test': 'date_test.jsonl'})

# Define the model checkpoint we want to use. 't5-small' is great for fast fine-tuning.
model_checkpoint = "google/mt5-base"

# Load the tokenizer to preprocess the text
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Load the model itself
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

print("✅ Datasets, Tokenizer, and Model loaded.")


# ==============================================================================
# 4. PREPROCESSING: Convert text to tokens the model can understand
# ==============================================================================
# T5 models expect a prefix for the task. We'll create one.
prefix = ""

def preprocess_function(examples):
    """Tokenizes the questions and answers, ensuring answers are strings."""
    inputs = [prefix + doc for doc in examples["question"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    # The FIX is here: Ensure every item in the "answer" list is a string.
    # This prevents errors if your JSONL file has null/None values.
    answers = [str(ans) if ans is not None else "" for ans in examples["answer"]]

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(answers, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the preprocessing to our entire dataset
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

print("✅ Data preprocessing complete.")


# ==============================================================================
# 5. TRAINING: Fine-tune the model on our data
# ==============================================================================
# Define the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",           # Directory to save the model
    eval_strategy="epoch",      # Evaluate at the end of each epoch
    learning_rate=2e-5,               # The speed at which the model learns
    per_device_train_batch_size=4,    # Batch size for training
    per_device_eval_batch_size=4,     # Batch size for evaluation
    num_train_epochs=5,               # Number of times to go through the data
    weight_decay=0.01,                # Regularization to prevent overfitting
    save_total_limit=1,               # Only keep the best model
    predict_with_generate=True,       # Needed for text generation evaluation
)

# Create the Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)

# Start the training!
print("🚀 Starting training...")
trainer.train()
print("🎉 Training finished!")


# ==============================================================================
# 6. EVALUATION & INFERENCE: Use the trained model
# ==============================================================================
# Save the final model so you can use it later
final_model_path = "./final_date_calculator_model"
trainer.save_model(final_model_path)
print(f"✅ Model saved to {final_model_path}")

# Load our fine-tuned model and tokenizer using a pipeline for easy inference
from transformers import pipeline

print("\n\n🧪 Running inference with the fine-tuned model...")
question = "6 tháng sau tháng 9 năm 2025 là khi nào?"

# Load the trained model using the pipeline helper
date_calculator = pipeline("text2text-generation", model=final_model_path)
result = date_calculator(prefix + question)

print(f"\nQuestion: {question}")
print(f"Predicted Answer: {result[0]['generated_text']}")
# Expected output for the question above should be "Tháng 3 năm 2026"

✅ Libraries installed and ready.


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


✅ Datasets, Tokenizer, and Model loaded.


Map: 100%|██████████| 600/600 [00:00<00:00, 6691.21 examples/s]

✅ Data preprocessing complete.



  trainer = Trainer(


🚀 Starting training...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [1]:
import torch

print(f"cuDNN version: {torch.backends.cudnn.version()}")

cuDNN version: None


In [10]:
# ================================================================================
# Function to load and continue training
# ================================================================================
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from datasets import load_dataset

def continue_training(model_path, train_file, test_file, output_dir, num_train_epochs=3):
    """
    Loads an existing model and continues training.

    Args:
        model_path (str): Path to the existing model.
        train_file (str): Path to the training data file.
        test_file (str): Path to the testing data file.
        output_dir (str): Directory to save the continued training results.
        num_train_epochs (int): Number of epochs to train for.
    """
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

    # Load the datasets
    raw_datasets = load_dataset('json', data_files={'train': train_file, 'test': test_file})

    # Define the prefix for the task
    prefix = ""

    def preprocess_function(examples):
        """Tokenizes the questions and answers, ensuring answers are strings."""
        inputs = [prefix + doc for doc in examples["question"]]
        model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

        # Ensure every item in the "answer" list is a string
        answers = [str(ans) if ans is not None else "" for ans in examples["answer"]]

        # Setup the tokenizer for targets
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(answers, max_length=128, truncation=True, padding="max_length")

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    # Apply the preprocessing to our entire dataset
    tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

    # Define the training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        save_total_limit=1,
    )

    # Create the Trainer object
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        tokenizer=tokenizer,
    )

    # Start the training
    print("🚀 Continuing training...")
    trainer.train()
    print("🎉 Training finished!")

    # Save the final model
    trainer.save_model(output_dir)
    print(f"✅ Model saved to {output_dir}")

# Example usage
continue_training(
    model_path="./final_date_calculator_model",
    train_file="date_train.jsonl",
    test_file="date_test.jsonl",
    output_dir="./continued_training_results",
    num_train_epochs=250
)

Map: 100%|██████████| 600/600 [00:00<00:00, 9351.22 examples/s]
  trainer = Trainer(


🚀 Continuing training...


Epoch,Training Loss,Validation Loss
1,0.0489,0.046563
2,0.0483,0.046128
3,0.0474,0.045563
4,0.0465,0.04505
5,0.0462,0.044785
6,0.046,0.044569
7,0.0459,0.044239
8,0.045,0.043682
9,0.0445,0.043129
10,0.0445,0.042807


🎉 Training finished!
✅ Model saved to ./continued_training_results


In [1]:
import json
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the test dataset
def load_test_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]
    return data

test_data = load_test_data('date_test.jsonl')

# Run inference on the test dataset
predictions = []
ground_truths = []

# Open the output file
with open('output.txt', 'w', encoding='utf-8') as output_file:
    for item in tqdm(test_data):
        question = item['question']
        ground_truth = item['answer']
        result = date_calculator(prefix + question)
        predicted_answer = result[0]['generated_text']

        predictions.append(predicted_answer)
        ground_truths.append(ground_truth)

        # Write the question, answer, and predicted answer to the output file
        output_file.write(f"Question: {question}\n")
        output_file.write(f"Answer: {ground_truth}\n")
        output_file.write(f"Predicted: {predicted_answer}\n")
        output_file.write("\n")

# Evaluate the results
accuracy = accuracy_score(ground_truths, predictions)
precision = precision_score(ground_truths, predictions, average='weighted')
recall = recall_score(ground_truths, predictions, average='weighted')
f1 = f1_score(ground_truths, predictions, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

  0%|          | 0/600 [00:00<?, ?it/s]


NameError: name 'date_calculator' is not defined