# Fine-tuning Pretrained T5-small model with raw text

## Dependecies

In [15]:
from datasets import Dataset
import re
from transformers import Trainer, TrainingArguments

## Cleaning

In [16]:
RAW_TXT_PATH = "./dataset/webscrape/raw.txt"

In [17]:
raw_txt = open(RAW_TXT_PATH, "r").read()

In [18]:
raw_txt

'\n\n\n\n\nVisa/Consular Services\n\n2025/1/9\n\n\nJapanese\n\n\n\nVISA\n\nImportant Notice\n1. Digitization of "Certificate of Eligibility (COE)"\n\xa0 From 17th March, Japan Immigration accepts application for digitized COE. For details, please see this.\n\n2. Acceptance of Photocopy of Japanese Documents\xa0\n\xa0From 17th March, we accept photocopy of documents issued/prepared in Japan instead of the original. For details, please see this.\xa0\n\n3. Advanced procedures for Customs, Immigration and Quarantine\n1. From 29th April 2023, all entrants/returnees to Japan do not need to submit a valid vaccination certificate nor a negative certificate of COVID-19.\n2. Please use “Visit Japan Web” for smooth Immigration and Customs procedures when traveling to Japan.\n\xa0\nHow to Apply\n\xa0\n\n\nInquiries Concerning Visa\xa0\nGeneral Information\nList of Accredited Agencies\nFAQs on Japan Visa\nExtension of Re-entry Permit\nVisa Fee Exemption for Foreigners Visiting Three Tohoku Prefectu

In [19]:
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords if you haven't already
nltk.download('stopwords')

# Function to clean text
def clean_text(text):
    # Remove URLs (matches both http://, https:// and other URLs)
    text = re.sub(r'http[s]?://\S+', '', text)

    # Remove dates (matches common formats like '12/01/2021', '2021-12-01', etc.)
    text = re.sub(r'\b(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{4}[-/]\d{1,2}[-/]\d{1,2})\b', '', text)

    # Convert text to lowercase
    text = text.lower()

    # Remove non-alphabetic characters (this keeps only words)
    text = re.sub(r'[^a-z\s]', '', text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]

    # Join words back into a single string
    cleaned_text = ' '.join(words)

    return cleaned_text


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/krimssmirk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
cleaned_txt = clean_text(raw_txt)

In [21]:
cleaned_txt = "".join(cleaned_txt)

In [22]:
tokens = cleaned_txt.split()

In [23]:
" ".join(tokens[1:len(tokens)])

'services japanese visa important notice digitization certificate eligibility coe th march japan immigration accepts application digitized coe details please see acceptance photocopy japanese documents th march accept photocopy documents issuedprepared japan instead original details please see advanced procedures customs immigration quarantine th april entrantsreturnees japan need submit valid vaccination certificate negative certificate covid please use visit japan web smooth immigration customs procedures traveling japan apply inquiries concerning visa general information list accredited agencies faqs japan visa extension reentry permit visa fee exemption foreigners visiting three tohoku prefectures types visa requirements choose one visas suitable purpose travel check requirements see important note document submission temporary visitor stay within days maximum tourismdue rapid increase number visitors philippines examination visa applications tourism purposes may take several weeks

In [24]:
text = []
continuation = []

for i in range(0, len(tokens), 20):
    if i + 20 > len(tokens):
        text.append(" ".join(tokens[i:len(tokens)]))
        continuation.append(" ".join(tokens[len(tokens):]))
    else:
        text.append(" ".join(tokens[i:i+10]))
        continuation.append(" ".join(tokens[i+10:i+20]))

## Set Up the Model for Fine-Tuning

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load pre-trained T5 model and tokenizer
model_name = "t5-small"  # You can use "t5-base" or "t5-large" for better performance
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# The tokenizer's pad token is not always set by default
tokenizer.pad_token = tokenizer.eos_token


In [26]:
# dataset for text continuation (input text + continuation as target)
data = {
    'text': text,
    'continuation': continuation
}

# Load the dataset as a HuggingFace Dataset
dataset = Dataset.from_dict(data)

In [27]:
# Preprocess the dataset
def preprocess_function(examples):
    # Prepare the inputs for the model (encoder input and decoder output)
    # Use the column names for the text you want to use
    inputs = examples["text"]
    targets = examples["continuation"]
    
    # Tokenize the inputs and targets (add decoder_input_ids by shifting the targets)
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    
    # Add decoder_input_ids
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)
# Split the dataset into training and validation sets
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.2)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./t5-generation",  # where to save the model
    evaluation_strategy="epoch",  # evaluate after each epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Start the fine-tuning process
trainer.train()


Map: 100%|██████████| 159/159 [00:00<00:00, 1187.35 examples/s]


Epoch,Training Loss,Validation Loss
1,1.4004,0.71807
2,0.7211,0.674137


KeyboardInterrupt: 

In [None]:
# Save the model and tokenizer to a specified directory
model_save_path = "./trained_model"
tokenizer_save_path = "./t5_tokenizer"

# Save the trained model
trainer.save_model(model_save_path)

# Save the tokenizer
tokenizer.save_pretrained(tokenizer_save_path)

print("Model and tokenizer saved!")


Model and tokenizer saved!
