# Fine-tuning Pretrained Roberta Base model with raw text

## Dependecies

In [37]:
from datasets import Dataset
import re
from transformers import RobertaForMaskedLM, RobertaTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling

## Cleaning

In [38]:
RAW_TXT_PATH = "./dataset/webscrape/raw.txt"

In [39]:
raw_txt = open(RAW_TXT_PATH, "r").read()

In [40]:
raw_txt

'\n\n\n\n\nVisa/Consular Services\n\n2025/1/9\n\n\nJapanese\n\n\n\nVISA\n\nImportant Notice\n1. Digitization of "Certificate of Eligibility (COE)"\n\xa0 From 17th March, Japan Immigration accepts application for digitized COE. For details, please see this.\n\n2. Acceptance of Photocopy of Japanese Documents\xa0\n\xa0From 17th March, we accept photocopy of documents issued/prepared in Japan instead of the original. For details, please see this.\xa0\n\n3. Advanced procedures for Customs, Immigration and Quarantine\n1. From 29th April 2023, all entrants/returnees to Japan do not need to submit a valid vaccination certificate nor a negative certificate of COVID-19.\n2. Please use ‚ÄúVisit Japan Web‚Äù for smooth Immigration and Customs procedures when traveling to Japan.\n\xa0\nHow to Apply\n\xa0\n\n\nInquiries Concerning Visa\xa0\nGeneral Information\nList of Accredited Agencies\nFAQs on Japan Visa\nExtension of Re-entry Permit\nVisa Fee Exemption for Foreigners Visiting Three Tohoku Pref

In [41]:
def clean_text(text):
    # Remove URLs (e.g., http://www.example.com or https://example.com)
    text = re.sub(r'http[s]?://\S+', '', text)

    # Remove email addresses (e.g., user@example.com)
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', text)

    # Remove dates (in the format yyyy/m/d or yyyy-mm-dd)
    text = re.sub(r'\d{4}[/-]\d{1,2}[/-]\d{1,2}', '', text)

    # Remove time expressions (e.g., 12:30, 10:45 AM, etc.)
    text = re.sub(r'\b\d{1,2}[:]\d{2}([ ]?[APM]{2})?\b', '', text)

    # Remove numbering, including those in parentheses (e.g., 1., 2., (1), (2), etc.)
    text = re.sub(r'\d+\.', '', text)
    text = re.sub(r'\(\d+\)', '', text)

    # Remove all types of parentheses (round, square, curly, angle brackets)
    text = re.sub(r'[\(\)\[\]\{\}<>\'\"‚Äú‚Äù‚Äò‚Äô]', '', text)

    # Remove non-ASCII characters (like `\xa0` which is a non-breaking space in HTML)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # Remove extra punctuation marks (e.g., !!!, ???, etc.)
    text = re.sub(r'[^\w\s]', '', text)

    # Remove extra whitespaces, newlines, and tab characters
    text = re.sub(r'\s+', ' ', text)

    # Strip leading and trailing whitespaces
    text = text.strip()

    return text

In [42]:
cleaned_txt = clean_text(raw_txt)

In [43]:
cleaned_txt = cleaned_txt.split(".")

In [44]:
# Create a dataset object from the cleaned text
dataset = Dataset.from_dict({'text': cleaned_txt})

In [45]:
# Load the pre-trained RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize the dataset for text generation task
def tokenize_function(examples):
    # Tokenize the text and create labels that are shifted by one token
    inputs = tokenizer(examples['text'], return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Shift the input tokens to create the labels (for causal language modeling)
    inputs["labels"] = inputs["input_ids"].clone()
    return inputs


tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 10.26 examples/s]


## Set Up the Model for Fine-Tuning

RoBERTa is designed for Masked Language Modeling (MLM), so it will predict the masked tokens during training. We will load the pre-trained model and fine-tune it for MLM.

In [46]:
# Load pre-trained RoBERTa model
model = RobertaForMaskedLM.from_pretrained('roberta-base')

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    logging_steps=500,
)

In [52]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=3,              # Number of training epochs
    per_device_train_batch_size=8,   # Batch size for training
    logging_dir='./logs',            # Directory for logs
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="no",        # Disable evaluation during training
    # Add eval_steps or eval_dataset if evaluation is desired
)



In [53]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,  # For Masked Language Modeling
    mlm_probability=0.15  # Probability of masking a token
)


In [54]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)

In [55]:
# Fine-tune the model
trainer.train()

Step,Training Loss


TrainOutput(global_step=3, training_loss=1.3605448404947917, metrics={'train_runtime': 15.9393, 'train_samples_per_second': 0.188, 'train_steps_per_second': 0.188, 'total_flos': 789796389888.0, 'train_loss': 1.3605448404947917, 'epoch': 3.0})

In [56]:
# save
trainer.model.save_pretrained("./fine_tuned_roberta")
tokenizer.save_pretrained('./fine_tuned_roberta')

('./fine_tuned_roberta/tokenizer_config.json',
 './fine_tuned_roberta/special_tokens_map.json',
 './fine_tuned_roberta/vocab.json',
 './fine_tuned_roberta/merges.txt',
 './fine_tuned_roberta/added_tokens.json')