In [1]:
import multiprocessing
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import transformers
import csv
import math
import glob

from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoConfig
from transformers import BertForMaskedLM, DistilBertForMaskedLM
from transformers import BertTokenizer, DistilBertTokenizer
# from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from tokenizers import BertWordPieceTokenizer
from transformers.models.roberta.modeling_roberta import RobertaModel


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# HYPERPARAMS
# Seeds: SEED_SPLIT drives the train/validation split, SEED_TRAIN the Trainer.
SEED_SPLIT = SEED_TRAIN = 0

# Every title is padded/truncated to this many tokens during tokenization.
MAX_SEQ_LEN = 75

# Per-device batch sizes for training and evaluation.
TRAIN_BATCH_SIZE = EVAL_BATCH_SIZE = 16

# Optimizer / schedule settings handed to TrainingArguments.
LEARNING_RATE = 2e-5
LR_WARMUP_STEPS = 100
WEIGHT_DECAY = 0.01

# Directory the fine-tuned model is saved to after training.
save_path = './RoBERTa_for_ebay'

In [3]:
# load data
data_path_VAL = "Listing_Titles.tsv"

# Upper bound on the number of data rows read (the header line is not counted).
MAX_ROWS = 50_000

# `nrows` stops parsing once MAX_ROWS rows have been read; a callable
# `skiprows` would instead be evaluated for every remaining line of the file.
dtf_mlm = pd.read_csv(
    data_path_VAL,
    sep="\t",
    dtype=str,
    keep_default_na=False,
    na_values=[""],          # only empty fields are treated as missing
    quoting=csv.QUOTE_NONE,  # quote characters in titles are literal text
    nrows=MAX_ROWS,
)

# Train/Valid Split
df_train, df_valid = train_test_split(
    dtf_mlm, test_size=0.15, random_state=SEED_SPLIT
)

# A bare expression in the middle of a cell is never displayed, so print it.
print(f"train rows: {len(df_train)}, valid rows: {len(df_valid)}")

# Convert to Dataset object.
# preserve_index=False keeps the shuffled pandas index from being stored as
# an extra `__index_level_0__` column next to `Title`.
train_dataset = Dataset.from_pandas(
    df_train[['Title']].dropna(), preserve_index=False
)
valid_dataset = Dataset.from_pandas(
    df_valid[['Title']].dropna(), preserve_index=False
)


  if _pandas_api.is_sparse(col):
  if _pandas_api.is_sparse(col):


In [4]:
'''
bert-base-uncased  # 12-layer, 768-hidden, 12-heads, 109M parameters
distilbert-base-uncased  # 6-layer, 768-hidden, 12-heads, 65M parameters
'''

# Checkpoint to continue masked-LM training from
# (earlier candidate: 'bert-base-german-cased').
# NOTE: loading this checkpoint reports the `lm_head.*` weights as newly
# initialized (see the cell output), so the MLM head starts out untrained.
bert_type = 'xlm-roberta-large-finetuned-conll03-german'

# The Auto* factories are used for both tokenizer and model, so no
# per-architecture class selection is needed here.
TokenizerClass, ModelClass = AutoTokenizer, AutoModelForMaskedLM

tokenizer = TokenizerClass.from_pretrained(bert_type)
model = ModelClass.from_pretrained(bert_type)

Some weights of XLMRobertaForMaskedLM were not initialized from the model checkpoint at xlm-roberta-large-finetuned-conll03-german and are newly initialized: ['lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def tokenize_function(row):
    """Tokenize the ``Title`` field of ``row`` with the notebook-level tokenizer.

    Sequences are truncated and padded to exactly ``MAX_SEQ_LEN`` tokens, and
    the special-tokens mask is requested together with the encoded inputs.
    When called through ``Dataset.map(..., batched=True)``, ``row`` is a batch
    of examples rather than a single one.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': MAX_SEQ_LEN,
        'return_special_tokens_mask': True,
    }
    return tokenizer(row['Title'], **encode_options)
  
# Both splits are tokenized the same way: in batches, dropping the original
# columns so that only the tokenizer outputs remain.
# NOTE: this rebinds train_dataset / valid_dataset; re-running the cell
# without reloading the data fails because `Title` has already been removed.
column_names = train_dataset.column_names
map_options = dict(
    batched=True,
    # num_proc=multiprocessing.cpu_count(),     # Turned off when tokenizer is fast
    remove_columns=column_names,
)

train_dataset = train_dataset.map(tokenize_function, **map_options)
valid_dataset = valid_dataset.map(tokenize_function, **map_options)


                                                                    

In [6]:
# Collator that prepares masked-LM batches (mlm_probability=0.15).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Number of batches per epoch on one device. A trailing partial batch is still
# a step, so round up instead of truncating.
steps_per_epoch = math.ceil(len(train_dataset) / TRAIN_BATCH_SIZE)

training_args = TrainingArguments(
    output_dir='./bert-news',           # checkpoints written during training
    logging_dir='./LMlogs',
    num_train_epochs=1,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    warmup_steps=LR_WARMUP_STEPS,
    # `save_steps` is intentionally not passed: with save_strategy='epoch'
    # checkpoints are written at epoch boundaries and a step interval is ignored.
    save_total_limit=3,
    weight_decay=WEIGHT_DECAY,
    learning_rate=LEARNING_RATE,
    # Evaluate and save on the same schedule, as required for
    # load_best_model_at_end.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    greater_is_better=False,            # lower loss is better
    seed=SEED_TRAIN
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)

trainer.train()
trainer.save_model(save_path) #save your custom model


  0%|          | 0/2657 [00:00<?, ?it/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 19%|█▉        | 500/2657 [06:52<29:00,  1.24it/s]

{'loss': 0.26, 'learning_rate': 1.6871333594055535e-05, 'epoch': 0.19}


 38%|███▊      | 1000/2657 [13:52<22:59,  1.20it/s]

{'loss': 0.1513, 'learning_rate': 1.2960500586624952e-05, 'epoch': 0.38}


 56%|█████▋    | 1500/2657 [22:04<29:48,  1.55s/it]

{'loss': 0.1298, 'learning_rate': 9.049667579194369e-06, 'epoch': 0.56}


 75%|███████▌  | 2000/2657 [32:28<15:33,  1.42s/it]

{'loss': 0.1217, 'learning_rate': 5.1388345717637865e-06, 'epoch': 0.75}


 94%|█████████▍| 2500/2657 [41:05<02:03,  1.27it/s]

{'loss': 0.117, 'learning_rate': 1.2280015643332032e-06, 'epoch': 0.94}


                                                   
100%|██████████| 2657/2657 [46:14<00:00,  1.44s/it]

{'eval_loss': 0.10505039244890213, 'eval_runtime': 160.4512, 'eval_samples_per_second': 46.743, 'eval_steps_per_second': 2.923, 'epoch': 1.0}


100%|██████████| 2657/2657 [46:26<00:00,  1.05s/it]


{'train_runtime': 2786.0126, 'train_samples_per_second': 15.255, 'train_steps_per_second': 0.954, 'train_loss': 0.15334676556628496, 'epoch': 1.0}


In [7]:
# Baseline: evaluate the original `bert_type` checkpoint (reloaded from the
# hub, not the fine-tuned weights) on the validation split.
# NOTE: this rebinds the notebook-level `tokenizer`, `model` and `trainer`.
tokenizer = AutoTokenizer.from_pretrained(
    bert_type,
    use_fast=True,
    do_lower_case=True,
)
model = AutoModelForMaskedLM.from_pretrained(bert_type)

# No TrainingArguments are passed, so this Trainer runs with its defaults.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)

eval_results = trainer.evaluate()

print('Evaluation results: ', eval_results)
# Perplexity is the exponential of the mean evaluation loss.
baseline_perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {baseline_perplexity:.3f}")
print('----------------\n')


Some weights of XLMRobertaForMaskedLM were not initialized from the model checkpoint at xlm-roberta-large-finetuned-conll03-german and are newly initialized: ['lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 938/938 [02:09<00:00,  7.23it/s]

Evaluation results:  {'eval_loss': 0.7896931767463684, 'eval_runtime': 130.0929, 'eval_samples_per_second': 57.651, 'eval_steps_per_second': 7.21}
Perplexity: 2.203
----------------






In [8]:
# Evaluate every saved model whose path matches `save_path` (a plain path or a
# glob pattern). Matches are sorted so the order is deterministic.
model_paths = sorted(glob.glob(save_path))
if not model_paths:
    # glob returns nothing for a missing path; report it rather than letting
    # the cell finish silently without evaluating anything.
    print(f"No saved model found at {save_path!r}; nothing to evaluate.")

for modelpath in model_paths:
    print('Model: ', modelpath)
    tokenizer = AutoTokenizer.from_pretrained(modelpath, use_fast=True, do_lower_case=True)
    model = AutoModelForMaskedLM.from_pretrained(modelpath)

    # No TrainingArguments are passed, so this Trainer runs with its defaults.
    trainer = Trainer(
        model=model,
        data_collator=data_collator,
        eval_dataset=valid_dataset,
        tokenizer=tokenizer,
    )

    eval_results = trainer.evaluate()

    print('Evaluation results: ', eval_results)
    # Perplexity is the exponential of the mean evaluation loss.
    print(f"Perplexity: {math.exp(eval_results['eval_loss']):.3f}")
    print('----------------\n')


Model:  ./RoBERTa_for_ebay


100%|██████████| 938/938 [01:44<00:00,  8.94it/s]

Evaluation results:  {'eval_loss': 0.10671079158782959, 'eval_runtime': 104.9714, 'eval_samples_per_second': 71.448, 'eval_steps_per_second': 8.936}
Perplexity: 1.113
----------------




