In [1]:
import pandas as pd
import numpy as np
from transformers import AutoModelForMaskedLM, AutoTokenizer, LineByLineTextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import seaborn as sns
import tqdm
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import torch
from scipy import spatial
import scipy
import os
os.environ["WANDB_DISABLED"] = "true"
tqdm.pandas()

In [2]:
def RemoveNewLines(text):
    text = text.replace("\n", " ")
    return text

In [3]:
for data in pd.read_csv("../input/news-summarization/data.csv", chunksize=100000):
    data = data[~data["Content"].isnull()]
    data["Content"] = data["Content"].apply(RemoveNewLines)
    break

In [4]:
with open("corpus.txt", "w") as f:
    for text in data["Content"].values:
        f.write(text+"\n")

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("../input/huggingface-roberta-variants/distilroberta-base/distilroberta-base")
model = AutoModelForMaskedLM.from_pretrained("../input/huggingface-roberta-variants/distilroberta-base/distilroberta-base").to(device)

In [6]:
dataset = LineByLineTextDataset(tokenizer=tokenizer,
                                file_path="corpus.txt",
                                block_size=128)



In [7]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)

In [8]:
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./",
        overwrite_output_dir=True,
        num_train_epochs=5,
        per_device_train_batch_size=64,
        save_steps=200,
        save_total_limit=2,
    ),
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running training *****
  Num examples = 100008
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 7815


Step,Training Loss
500,1.7204
1000,1.6662
1500,1.653
2000,1.6215
2500,1.6113
3000,1.5928
3500,1.5775
4000,1.5586
4500,1.5458
5000,1.5457


Saving model checkpoint to ./checkpoint-200
Configuration saved in ./checkpoint-200/config.json
Model weights saved in ./checkpoint-200/pytorch_model.bin
Saving model checkpoint to ./checkpoint-400
Configuration saved in ./checkpoint-400/config.json
Model weights saved in ./checkpoint-400/pytorch_model.bin
Saving model checkpoint to ./checkpoint-600
Configuration saved in ./checkpoint-600/config.json
Model weights saved in ./checkpoint-600/pytorch_model.bin
Deleting older checkpoint [checkpoint-200] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-800
Configuration saved in ./checkpoint-800/config.json
Model weights saved in ./checkpoint-800/pytorch_model.bin
Deleting older checkpoint [checkpoint-400] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000/config.json
Model weights saved in ./checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [checkpoint-600] due to args.save_total_limit
Saving mode

TrainOutput(global_step=7815, training_loss=1.5744660068153191, metrics={'train_runtime': 5118.7426, 'train_samples_per_second': 97.688, 'train_steps_per_second': 1.527, 'total_flos': 1.657905204206592e+16, 'train_loss': 1.5744660068153191, 'epoch': 5.0})