# RoBERTa Base - with MordinezNLP
As a training data I'm using Amazon Review Full
It is available under: https://s3.amazonaws.com/fast-ai-nlp/amazon_review_polarity_csv.tgz


Builded using: https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb

# LanguageModel

In [1]:
!pwd

/home/mborzymowski/AI/Priv/MordinezNLP/benchmarks


In [2]:
!wget https://s3.amazonaws.com/fast-ai-nlp/amazon_review_polarity_csv.tgz
!tar -xvf amazon_review_polarity_csv.tgz

--2021-02-10 11:18:13--  https://s3.amazonaws.com/fast-ai-nlp/amazon_review_polarity_csv.tgz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.83.46
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.83.46|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 688339454 (656M) [application/x-tar]
Saving to: 'amazon_review_polarity_csv.tgz'


2021-02-10 11:19:56 (6.43 MB/s) - 'amazon_review_polarity_csv.tgz' saved [688339454/688339454]

amazon_review_polarity_csv/
amazon_review_polarity_csv/train.csv
amazon_review_polarity_csv/readme.txt
amazon_review_polarity_csv/test.csv


In [1]:
import pandas as pd
from tqdm.notebook import tqdm

## Load text data from CSV

In [2]:
ds_train = pd.read_csv("./amazon_review_polarity_csv/train.csv", header=None)

In [3]:
ds_test = pd.read_csv("./amazon_review_polarity_csv/test.csv", header=None)

In [4]:
ds_test.head()

Unnamed: 0,0,1,2
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


## Save data to TXT files

In [7]:
!mkdir ds

In [8]:
with open("./ds/ds_base_train.txt", "w", encoding="utf8") as f:
    used_lines = set()
    for index, line in tqdm(ds_train.iterrows(), total=len(ds_train), desc="Reading train set"):
        if len(str(line[1])) > 0:
            used_lines.add(str(line[1]) + "\n")
        if len(str(line[2])) > 0:
            used_lines.add(str(line[2]) + "\n")
        
    for index, line in tqdm(ds_test.iterrows(), total=len(ds_test), desc="Reading test set"):
        if len(str(line[1])) > 0:
            used_lines.add(str(line[1]) + "\n")
        if len(str(line[2])) > 0:
            used_lines.add(str(line[2]) + "\n")
            
    for line in tqdm(used_lines, desc='Saving lines to file'):
        f.write(line)

HBox(children=(FloatProgress(value=0.0, description='Reading train set', max=3600000.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Reading test set', max=400000.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Saving lines to file', max=6896445.0, style=ProgressStyle…




## Use MordinezNLP

In [None]:
!pip install MordinezNLP=0.1.0-a8

In [None]:
from MordinezNLP.processors import BasicProcessor

In [None]:
bp = BasicProcessor()

In [None]:
with open("./ds/ds_base_train.txt", "r", encoding="utf8") as f1:
    with open("./ds/ds_base_train_mordineznlp.txt", "w", encoding="utf8") as f2:
        post_processed_texts = bp.process(f1.readlines(), language='en')
        
        for line in tqdm(post_processed_texts):
            f2.write(line + "\n")

In [None]:
!head -10 ./ds/ds_base_train.txt
print("\n\n")
!head -10 ./ds/ds_base_train_mordineznlp.txt

In [None]:
# fix "0" bug in basic processor in MordinezNLP==0.1.0-a8
special_tokens = bp.get_special_tokens()
special_tokens.remove('0')
special_tokens

## Build tokenizer

In [None]:
from tokenizers import ByteLevelBPETokenizer

In [9]:
bpe = ByteLevelBPETokenizer()

In [10]:
bpe.train(
    files=["./ds/ds_base_train_mordineznlp.txt"], 
    vocab_size=22_000, 
    min_frequency=2, 
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ] + special_tokens
)

In [None]:
bpe.token_to_id("<email>")

In [11]:
!mkdir roberta_mordineznlp
bpe.save_model("roberta_mordineznlp")

['baseRoBERTa/vocab.json', 'baseRoBERTa/merges.txt']

## Create roBERTatokenizer

In [5]:
from transformers import RobertaConfig, RobertaTokenizerFast

In [6]:
tokenizer = RobertaTokenizerFast.from_pretrained("./roberta_mordineznlp", max_len=512, use_fast=True)

In [None]:
tokenizer.add_special_tokens({
    'additional_special_tokens': [
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ] + special_tokens
})

In [None]:
tokenizer.special_tokens_map

In [None]:
tokenizer.tokenize("Hi <unk> <cls> <email> email currency, big bang, yesterday, loooooooooooolz")

In [7]:
from transformers import RobertaForMaskedLM

## Build dataset

In [4]:
from datasets import load_dataset

In [5]:
!head -3500000 ./ds/ds_base_train_mordineznlp.txt > ./ds/ds_base_train_mordineznlp_sm.txt
dataset = load_dataset('text', data_files={'train': ['./ds/ds_base_train_mordineznlp_sm.txt']})

Using custom data configuration default
Reusing dataset text (/home/mborzymowski/.cache/huggingface/datasets/text/default-e2020fc267dc2c54/0.0.0/daf90a707a433ac193b369c8cc1772139bb6cca21a9c7fe83bdd16aad9b9b6ab)


In [6]:
text_column_name = "text" if "text" in dataset["train"].column_names else column_names[0]; text_column_name

'text'

In [7]:
def encode(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

In [8]:
tokenized_datasets = dataset.map(
    encode,
    batched=True,
    remove_columns=[text_column_name],
    load_from_cache_file=True,
)

Loading cached processed dataset at /home/mborzymowski/.cache/huggingface/datasets/text/default-e2020fc267dc2c54/0.0.0/daf90a707a433ac193b369c8cc1772139bb6cca21a9c7fe83bdd16aad9b9b6ab/cache-4ae5f20bf687614e.arrow


In [9]:
from transformers import DataCollatorForLanguageModeling

In [10]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

## Create model

In [11]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="5,6"

In [12]:
config = RobertaConfig(
    vocab_size=22_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=4,
    type_vocab_size=1,
)

In [13]:
model = RobertaForMaskedLM(config=config)

In [14]:
model.num_parameters()

46258672

## Training args

In [15]:
import torch
torch.cuda.is_available()

True

In [16]:
from transformers import Trainer, TrainingArguments

In [17]:
training_args = TrainingArguments(
    output_dir="./roberta_lm_mordineznlp",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=42,
    save_steps=10_000,
    save_total_limit=3,
    do_train=True,
    no_cuda=False,
    logging_steps=5000
)

## Build trainer

In [18]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"]
)

## Train

In [19]:
trainer.train()



Step,Training Loss
5000,6.2825
10000,4.7897
15000,4.2553
20000,4.0047
25000,3.8335
30000,3.7232
35000,3.6492
40000,3.609




TrainOutput(global_step=41667, training_loss=4.241367944056448, metrics={'train_runtime': 39705.6104, 'train_samples_per_second': 1.049, 'total_flos': 497373241344000000, 'epoch': 1.0})

In [20]:
trainer.save_model("./baseROBERTa_LM")

In [21]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./baseROBERTa_LM",
    tokenizer="./baseROBERTa_LM"
)

Some weights of RobertaModel were not initialized from the model checkpoint at ./baseROBERTa_LM and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
fill_mask("Plane <mask>.")

[{'sequence': 'Plane book.',
  'score': 0.020834563300013542,
  'token': 350,
  'token_str': ' book'},
 {'sequence': 'Plane Review.',
  'score': 0.009379935450851917,
  'token': 4874,
  'token_str': ' Review'},
 {'sequence': 'Plane Book.',
  'score': 0.009224461391568184,
  'token': 2140,
  'token_str': ' Book'},
 {'sequence': 'Plane read.',
  'score': 0.008015546947717667,
  'token': 433,
  'token_str': ' read'},
 {'sequence': 'Plane garbage.',
  'score': 0.0077307019382715225,
  'token': 2778,
  'token_str': ' garbage'}]

# Classifier

In [5]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="3,4,5,6"

In [6]:
from transformers import RobertaTokenizerFast

In [7]:
tokenizer = RobertaTokenizerFast.from_pretrained("baseROBERTa_LM")

In [8]:
ds_train.iloc[0]

0                                                    2
1                       Stuning even for the non-gamer
2    This sound track was beautiful! It paints the ...
Name: 0, dtype: object

In [9]:
import torch

In [10]:
class AmazonDS(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.df)
        
    def __getitem__(self, idx):
        tokenized = tokenizer(str(self.df.iloc[idx][1]) + " " + str(self.df.iloc[idx][2]), truncation=True, padding="max_length", max_length=512)
        item = {
            'input_ids': torch.tensor(tokenized['input_ids']),
            'attention_mask': torch.tensor(tokenized['attention_mask']),
            'labels': torch.tensor([int(self.df.iloc[idx][0])-1])
        }
        
        return item

In [11]:
classification_ds_train = AmazonDS(ds_train, tokenizer)
classification_ds_test = AmazonDS(ds_test, tokenizer)

In [12]:
classification_ds_test.__getitem__(0)

{'input_ids': tensor([    0,  1150,   728,   926,  4559,  4739,   482,   404,   297,   268,
          3321,  5005,   297,   569,  5506,    18,   284,   365,  3012,   289,
           315,   728,   322, 11554,   290,   284,   777,  3308,  1526,    18,
          1403,   284,   682,   300,   262,   466,  4526,   299,  1018,   442,
           878,   651,    18,   353,   799,  4526,   462,  1596,   496,   279,
          1309,   438,  6423,   300,   268,  6053,    18,   544,   728,   462,
           270,    83,  7881, 10984,    18,   697, 19371,   379,   401,   311,
           278,  2177,    57,  3317,  7269,   290,  2167,   462,  2045,    18,
          1673,   297,   900,   355,  6680,  9811,    18,   544,   302,   262,
          9296,   302,   306,   728,   300,   373,   350,    18,  2783,   693,
           769,   779,   299,  1020,   302,   462,  2940,   442,    18, 17246,
           284,   666,   315,    16,   497,  1932,  2028,    16,  2428,    16,
          1582,    16,   741,    16,  4

In [13]:
from transformers import RobertaForSequenceClassification, RobertaConfig

In [14]:
# config = RobertaConfig.from_pretrained("./baseROBERTa_save", num_lables=6)
model = RobertaForSequenceClassification.from_pretrained("./baseROBERTa_LM", num_labels=2)

Some weights of the model checkpoint at ./baseROBERTa_LM were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./baseROBERTa_LM and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.o

In [15]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [16]:
from transformers import  Trainer, TrainingArguments

In [18]:
training_args = TrainingArguments(
    output_dir='./baseROBERTa_classification',          # output directory
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                # strength of weight decay
    num_train_epochs=3,
    per_device_train_batch_size=70,
    per_device_eval_batch_size=128,
    save_steps=5_000,
    save_total_limit=3,
    do_train=True,
    do_eval=True,
    no_cuda=False,
    logging_steps=2000,
    eval_steps=5000
)

In [19]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=classification_ds_train,         # training dataset
    eval_dataset=classification_ds_test,            # evaluation dataset
    compute_metrics=compute_metrics
)

In [20]:
trainer.train()



Step,Training Loss
2000,0.2094
4000,0.1483
6000,0.1356
8000,0.1281
10000,0.1238
12000,0.1201
14000,0.1103
16000,0.1038
18000,0.1023
20000,0.101




TrainOutput(global_step=38574, training_loss=0.10856790810108012, metrics={'train_runtime': 44999.815, 'train_samples_per_second': 0.857, 'total_flos': 1534021875302400000, 'epoch': 3.0})

In [21]:
print('done')

done


In [22]:
trainer.save_model("./baseROBERTa_classification")

In [23]:
trainer.evaluate()

{'eval_loss': 0.1097908467054367,
 'eval_accuracy': 0.96315,
 'eval_f1': 0.9630021937640876,
 'eval_precision': 0.9668803741897763,
 'eval_recall': 0.959155,
 'eval_runtime': 418.032,
 'eval_samples_per_second': 956.865,
 'epoch': 3.0}

### Big thanks to theBlue.ai