# RoBERTa Base - without MordinezNLP
As a training data I'm using Amazon Review Full
It is available under: https://s3.amazonaws.com/fast-ai-nlp/amazon_review_polarity_csv.tgz


Builded using: https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb

# LanguageModel

In [3]:
!wget https://s3.amazonaws.com/fast-ai-nlp/amazon_review_polarity_csv.tgz
!tar -xvf amazon_review_polarity_csv.tgz

'wget' is not recognized as an internal or external command,
operable program or batch file.
tar: Error opening archive: Failed to open 'amazon_review_polarity_csv.tgz'


In [2]:
import pandas as pd
from tokenizers import ByteLevelBPETokenizer
from tqdm.notebook import tqdm

## Load text data from CSV

In [3]:
ds_train = pd.read_csv("./amazon_review_polarity_csv/train.csv", header=None)

In [4]:
ds_test = pd.read_csv("./amazon_review_polarity_csv/test.csv", header=None)

In [5]:
ds_test.head()

Unnamed: 0,0,1,2
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


## Save data to TXT files

In [None]:
!mkdir ds

In [6]:
with open("./ds/ds_bpe_roberta_base_train.txt", "w", encoding="utf8") as f:
    used_lines = set()
    for index, line in tqdm(ds_train.iterrows(), total=len(ds_train), desc="Reading train set"):
        if len(str(line[1])) > 0:
            used_lines.add(str(line[1]) + "\n")
        if len(str(line[2])) > 0:
            used_lines.add(str(line[2]) + "\n")
        
    for index, line in tqdm(ds_test.iterrows(), total=len(ds_test), desc="Reading test set"):
        if len(str(line[1])) > 0:
            used_lines.add(str(line[1]) + "\n")
        if len(str(line[2])) > 0:
            used_lines.add(str(line[2]) + "\n")
            
    for line in tqdm(used_lines, desc='Saving lines to file'):
        f.write(line)

HBox(children=(FloatProgress(value=0.0, description='Reading train set', max=3600000.0, style=ProgressStyle(deâ€¦




HBox(children=(FloatProgress(value=0.0, description='Reading test set', max=400000.0, style=ProgressStyle(descâ€¦




HBox(children=(FloatProgress(value=0.0, description='Saving lines to file', max=6896445.0, style=ProgressStyleâ€¦




## Build tokenizer

In [7]:
tokenizer = ByteLevelBPETokenizer()

In [8]:
tokenizer.train(files=["./ds/ds_bpe_roberta_base_train.txt"], vocab_size=22_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

In [9]:
!mkdir baseRoBERTa
tokenizer.save_model("baseRoBERTa")

['baseRoBERTa\\vocab.json', 'baseRoBERTa\\merges.txt']

## Create BERT tokenizer

In [10]:
from transformers import RobertaConfig, RobertaTokenizerFast

In [11]:
tokenizer = RobertaTokenizerFast.from_pretrained("./baseRoBERTa", max_len=512, use_fast=True)

In [12]:
from transformers import RobertaForMaskedLM

## Build dataset

In [16]:
from datasets import load_dataset

In [17]:
# !head -3500000 ./ds/ds_bpe_roberta_base_train.txt > ./ds/ds_bpe_roberta_base_train_sm.txt
dataset = load_dataset('text', data_files={'train': ['./ds/ds_bpe_roberta_base_train_sm.txt']})

Using custom data configuration default


Downloading and preparing dataset text/default-296c63d50c1e7a9c (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to C:\Users\Marcin Borzymowski\.cache\huggingface\datasets\text\default-296c63d50c1e7a9c\0.0.0\daf90a707a433ac193b369c8cc1772139bb6cca21a9c7fe83bdd16aad9b9b6ab...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''â€¦

Dataset text downloaded and prepared to C:\Users\Marcin Borzymowski\.cache\huggingface\datasets\text\default-296c63d50c1e7a9c\0.0.0\daf90a707a433ac193b369c8cc1772139bb6cca21a9c7fe83bdd16aad9b9b6ab. Subsequent calls will reuse this data.


In [18]:
text_column_name = "text" if "text" in dataset["train"].column_names else column_names[0]; text_column_name

'text'

In [19]:
def encode(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

In [20]:
tokenized_datasets = dataset.map(
    encode,
    batched=True,
    remove_columns=[text_column_name],
    load_from_cache_file=True,
)

HBox(children=(FloatProgress(value=0.0, max=3500.0), HTML(value='')))




In [29]:
from transformers import DataCollatorForLanguageModeling

In [30]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

## Create model

In [22]:
config = RobertaConfig(
    vocab_size=22_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=4,
    type_vocab_size=1,
)

In [23]:
model = RobertaForMaskedLM(config=config)

In [24]:
model.num_parameters()

46258672

## Training args

In [25]:
import torch
torch.cuda.is_available()

True

In [26]:
from transformers import Trainer, TrainingArguments

In [27]:
training_args = TrainingArguments(
    output_dir="./baseRoBERTa",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=13,
    save_steps=10_000,
    save_total_limit=3,
    do_train=True,
    no_cuda=False,
    logging_steps=5000
)

## Build trainer

In [31]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"]
)

## Train

In [32]:
trainer.train()

Step,Training Loss


KeyboardInterrupt: 

In [30]:
trainer.save_model("./baseROBERTa_LM")

In [2]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./baseROBERTa_LM",
    tokenizer="./baseROBERTa_LM"
)

Some weights of RobertaModel were not initialized from the model checkpoint at ./baseROBERTa_save and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
fill_mask("Plane <mask>.")

[{'sequence': '<s>Plane book.</s>',
  'score': 0.018549844622612,
  'token': 354,
  'token_str': 'Ä book'},
 {'sequence': '<s>Plane design.</s>',
  'score': 0.00946250930428505,
  'token': 1489,
  'token_str': 'Ä design'},
 {'sequence': '<s>Plane works.</s>',
  'score': 0.009184008464217186,
  'token': 980,
  'token_str': 'Ä works'},
 {'sequence': '<s>Plane story.</s>',
  'score': 0.008807054720818996,
  'token': 643,
  'token_str': 'Ä story'},
 {'sequence': '<s>Plane fun.</s>',
  'score': 0.008298331871628761,
  'token': 759,
  'token_str': 'Ä fun'}]

# Classifier

In [5]:
from transformers import RobertaTokenizerFast

In [6]:
tokenizer = RobertaTokenizerFast.from_pretrained("baseROBERTa_LM")

In [7]:
ds_train.iloc[0]

0                                                    3
1                                   more like funchuck
2    Gave this to my dad for a gag gift after direc...
Name: 0, dtype: object

In [8]:
import torch

In [9]:
class AmazonDS(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.df)
        
    def __getitem__(self, idx):
        tokenized = tokenizer(str(self.df.iloc[idx][1]) + " " + str(self.df.iloc[idx][2]), truncation=True, padding="max_length", max_length=512)
        item = {
            'input_ids': torch.tensor(tokenized['input_ids']),
            'attention_mask': torch.tensor(tokenized['attention_mask']),
            'labels': torch.tensor([int(self.df.iloc[idx][0])])
        }
        
        return item

In [10]:
classification_ds_train = AmazonDS(ds_train, tokenizer)
classification_ds_test = AmazonDS(ds_test, tokenizer)

In [11]:
classification_ds_test.__getitem__(0)

{'input_ids': tensor([    0,    81,   729,   372,    80,    88,  3941,   852,   265,   564,
          2235,   897,   316,  1442,   323,  9957,   339,   768,  4062,    16,
           353,   285,   679,  8246,   291,   465,  1042,  2697,   301,   381,
          1300,   563,  7910,   797,   651,  3867,   671, 12066,   488,   819,
           519,   381, 12910,   853,     5,  1052,  1753,   486,   360,  3090,
            30,   570,   352,   268,  2311,   589,  6054,  3867,   283,    16,
          1719,    17,  3287,    16,  3867,  3496,  3345,    24,    26,  8643,
            18,  5863,  1690,    16,  5539,   488,   291,  1879,   444,   539,
           285,   697,    18,  4552,  2785,   298,   651,   738, 12815,   373,
           285, 11583,   288,  2128,   549,   488,   430,   268,   534,    18,
          1753, 14692,   749,    19,  6044,  6323,     5,     2,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,   

In [12]:
from transformers import RobertaForSequenceClassification, RobertaConfig

In [13]:
# config = RobertaConfig.from_pretrained("./baseROBERTa_save", num_lables=6)
model = RobertaForSequenceClassification.from_pretrained("./baseROBERTa_save", num_labels=6)

Some weights of the model checkpoint at ./baseROBERTa_save were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./baseROBERTa_save and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

In [14]:
from sklearn.metrics import precision_recall_fscore_support
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='samples')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [15]:
from transformers import  Trainer, TrainingArguments

In [16]:
training_args = TrainingArguments(
    output_dir='./baseROBERTa_classification',          # output directory
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                # strength of weight decay
    num_train_epochs=2,
    per_device_train_batch_size=18,
    per_device_eval_batch_size=18,
    save_steps=5_000,
    save_total_limit=2,
    do_train=True,
    do_eval=True,
    no_cuda=False
)

In [17]:
trainer = Trainer(
    model=model,                         # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=classification_ds_train,         # training dataset
    eval_dataset=classification_ds_test,            # evaluation dataset
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

Step,Training Loss
500,1.2974
1000,1.1201
1500,1.0896
2000,1.0586
2500,1.0613
3000,1.0427
3500,1.043
4000,1.0262
4500,1.0213
5000,1.0165


In [19]:
print('done')

done


In [20]:
trainer.evaluate()

ValueError: Samplewise metrics are not available outside of multilabel classification.