In [15]:
from transformers import (
    RobertaTokenizer, 
    RobertaForMaskedLM, 
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    RobertaForSequenceClassification,
)
from datasets import load_dataset
import numpy as np

### Pre-Training on Corpus

In [13]:
# Start from the public 'roberta-base' checkpoint: the tokenizer is reused for
# every tokenization step below, and the masked-LM model is the one that the
# pre-training Trainer optimizes.
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model_mlm = RobertaForMaskedLM.from_pretrained('roberta-base')

Loading weights: 100%|██████████| 202/202 [00:00<00:00, 1304.41it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
RobertaForMaskedLM LOAD REPORT from: roberta-base
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [14]:
# Load the raw pre-training corpus from CSV.
# NOTE(review): 'placeholder_path/data' is a stand-in and does not exist —
# point data_files at the real corpus file before running this cell.
pretrained_dataset = load_dataset(
    'csv',
    data_files='placeholder_path/data',
)

FileNotFoundError: Unable to find 'C:/Users/HP/CS175/news-bias-classifier\placeholder_path/data'

In [None]:
# Tokenization applied to each record (or batch of records) of a dataset
def tokenize(data):
    """Encode the 'text' field of `data` with the shared RoBERTa tokenizer.

    Args:
        data: A mapping with a 'text' entry. When used through
            `Dataset.map(..., batched=True)` this is a batch of texts.

    Returns:
        The tokenizer output, with each sequence truncated to at most 512
        tokens and padded up to exactly 512.
    """
    encoded = roberta_tokenizer(
        data['text'],
        truncation=True,
        max_length=512,
        padding='max_length',
    )
    return encoded

In [10]:
# Example output of the tokenizer: even a short sentence produces 512 ids,
# because padding='max_length' fills the remainder with the pad id.
print(
    roberta_tokenizer(
        "Monkey is a monkey but there is a rat rat",
        truncation=True,
        max_length=512,
        padding='max_length',
    )
)

{'input_ids': [0, 17312, 5282, 16, 10, 25684, 53, 89, 16, 10, 12378, 12378, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
# Tokenize the descriptions for RoBERTa training; the raw 'text' column is
# dropped once it has been encoded.
tokenized_pretrained_dataset = pretrained_dataset.map(
    tokenize,
    batched=True,
    remove_columns=['text'],
)

In [None]:
# Collator for the masked-LM objective: randomly masks tokens in each batch
# (mlm_probability=0.15).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=roberta_tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Parameters used by the masked-LM pre-training Trainer.
training_args = TrainingArguments(
    output_dir='./domain-roberta-mlm',
    # Optimization
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    # Logging and checkpointing
    logging_steps=500,
    save_steps=10000,
    save_total_limit=2,
)

# Trainer for the masked-LM pre-training stage.
# load_dataset('csv', ...) without `split=` returns a DatasetDict, and .map()
# preserves that container, so the rows live under the 'train' key. The Trainer
# needs the Dataset itself, not the dict of splits.
trainer = Trainer(
    model=roberta_model_mlm,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_pretrained_dataset['train'],
)

In [None]:
# Run masked-LM pre-training with the current `trainer`.
trainer.train()

In [None]:
# Persist the pre-trained model and its tokenizer into the same directory so
# that it can later be reloaded with from_pretrained().
for artifact in (roberta_model_mlm, roberta_tokenizer):
    artifact.save_pretrained('./domain-roberta-mlm')

### Train for Classification

In [None]:
# Load the domain pre-trained checkpoint as a sequence classifier with
# 7 output labels.
roberta_model_classifier = RobertaForSequenceClassification.from_pretrained(
    './domain-roberta-mlm',
    num_labels=7,
)

In [None]:
# Load the labelled dataset and carve out an evaluation split.
# A single CSV passed through data_files is loaded entirely into 'train', so
# without an explicit split a later ['test'] lookup would raise KeyError.
# NOTE(review): the 80/20 ratio and the seed are assumptions — adjust as
# needed. The path is still a placeholder and must point at the real file.
# NOTE(review): assumes the CSV has a 'text' column plus a label column the
# Trainer recognises ('label' / 'labels') — confirm against the data.
classifier_dataset = load_dataset(
    'csv',
    data_files='placeholder_path/data',
    split='train',
).train_test_split(test_size=0.2, seed=42)

# Tokenize the descriptions for RoBERTa training (applied to both splits).
tokenized_classifier_dataset = classifier_dataset.map(
    tokenize,
    batched=True,
    remove_columns=['text'],
)

In [None]:
# Parameters used by the classification fine-tuning Trainer.
training_args = TrainingArguments(
    output_dir='./domain-roberta-classifier',
    # Optimization
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch; the matching strategies allow
    # the best checkpoint to be restored when training finishes.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # Logging
    logging_steps=500,
)

# Trainer for the classification fine-tuning stage.
# The masked-LM collator must NOT be used here: it masks input tokens and
# writes token-level MLM labels into the batch, which is wrong for sequence
# classification. With no data_collator given, the Trainer falls back to its
# default collator; inputs are already padded to a fixed length by tokenize().
trainer = Trainer(
    model=roberta_model_classifier,
    args=training_args,
    train_dataset=tokenized_classifier_dataset['train'],
    eval_dataset=tokenized_classifier_dataset['test'],
)

In [None]:
# Run classification fine-tuning with the current `trainer`.
trainer.train()

In [None]:
# Persist the fine-tuned classifier and its tokenizer into the same directory.
for artifact in (roberta_model_classifier, roberta_tokenizer):
    artifact.save_pretrained('./domain-roberta-classifier')