In [1]:
import os
import re
import json
import torch
import tokenizers
import transformers

import pandas as pd

from tqdm import tqdm
from pathlib import Path

In [2]:
def basicPreprocess(text):
    """Lower-case ``text`` and collapse "non-word character + spaces" runs.

    Every occurrence of one non-word character (``\\W``) that is immediately
    followed by one or more space characters is replaced by a single space.
    A non-word character that is NOT followed by a space (e.g. the comma in
    ``"a,b"``) is left untouched, as is a lone single space.

    Args:
        text: the raw string to normalise.

    Returns:
        The normalised, lower-cased string.
    """
    lowered = text.lower()
    return re.sub(r'\W +', ' ', lowered)

In [3]:
# Load data/clean_df.csv into a DataFrame; of its columns only "text" is
# used by the later cells.
complete_df = pd.read_csv("data/clean_df.csv")

In [4]:
# Shuffle the rows once, with a fixed seed, so that the positional 90/10
# train/eval split performed further down is identical on every
# Restart & Run All. A single shuffle is sufficient; shuffling twice adds
# no extra randomness.
data = complete_df.sample(frac = 1, random_state = 42)

In [5]:
# Discard every row that contains a missing value (rebinding instead of
# mutating in place; the resulting frame is the same).
data = data.dropna()

In [6]:
# Keep only the "text" column, normalise every document with
# basicPreprocess, then flatten embedded newlines into spaces.
#
# The string accessor is required here: Series.replace("\n", " ") only
# replaces cells whose ENTIRE value equals "\n", so newlines inside a
# document would survive and later split one document across several lines
# of the line-per-document train/eval files.
data = (
    data["text"]
    .apply(basicPreprocess)
    .str.replace("\n", " ", regex = False)
)

In [7]:
# Directory that receives one .txt file per document (written by the next
# cell) and is globbed for input files when the tokenizer is trained.
txt_files_dir = "text_splits"

In [8]:
# Dump every document into its own file: text_splits/0.txt, 1.txt, ...
# These files are the training input for the tokenizer.
#
# The directory is created up front so a fresh checkout does not fail on the
# first open(); the context manager guarantees the handle is closed even if
# a write raises; UTF-8 is forced so the result does not depend on the
# platform's default encoding.
os.makedirs(txt_files_dir, exist_ok = True)
for i, row in enumerate(tqdm(data.to_list())):
    file_name = os.path.join(txt_files_dir, str(i) + '.txt')
    with open(file_name, 'w', encoding = 'utf-8') as f:
        f.write(row)

100%|██████████| 40152/40152 [00:19<00:00, 2069.92it/s]


## Initialize Tokenizer

In [9]:
# Every .txt file found (recursively) under txt_files_dir is used as
# tokenizer training input.
paths = [str(x) for x in Path(txt_files_dir).glob("**/*.txt")]

tokenizer = tokenizers.ByteLevelBPETokenizer()

# Target vocabulary size, special tokens included.
vocab_size = 5000

# "<mask>" has to be part of the trained vocabulary: the tokenizer is later
# used with a masked-language-model data collator, which substitutes the
# mask token's id into the inputs. Without it the mask token has no id of
# its own inside the vocabulary the model is sized for.
tokenizer.train(files = paths, vocab_size = vocab_size, min_frequency = 50, special_tokens = [
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

## Train Test Split

In [10]:
# Directory that holds the line-per-document train.txt / eval.txt files.
lm_data_dir = "lm_data"

In [11]:
# Fraction of the (already shuffled) documents that goes into train.txt;
# the remainder goes into eval.txt.
train_split = 0.9
train_data_size = int(len(data)*train_split)

# Make sure the output directory exists before writing into it.
os.makedirs(lm_data_dir, exist_ok = True)

# Positional split: the first train_data_size rows are training data, the
# rest evaluation data. .iloc makes the positional intent explicit.
splits = {
    'train.txt': data.iloc[:train_data_size],
    'eval.txt': data.iloc[train_data_size:],
}

# One document per line; UTF-8 is forced so the files do not depend on the
# platform's default encoding.
for split_name, split_rows in splits.items():
    with open(os.path.join(lm_data_dir, split_name), 'w', encoding = 'utf-8') as f:
        for item in split_rows.tolist():
            f.write("%s\n" % item)

## Model and Tokenizer Instantiation

In [12]:
# Persist the trained tokenizer under models/COVID; the recorded output
# lists the two files produced there (vocab.json and merges.txt).
# NOTE(review): this directory-style save() call looks like it targets an
# older `tokenizers` release; newer releases may need save_model() instead.
# Confirm against the installed version, and that models/COVID already exists.
tokenizer.save("models/COVID")

['models/COVID/vocab.json', 'models/COVID/merges.txt']

In [13]:
# Full paths of the two corpus files that live inside lm_data_dir.
train_path, eval_path = (
    os.path.join(lm_data_dir, split_file)
    for split_file in ("train.txt", "eval.txt")
)

In [14]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


# Rebuild the byte-level BPE tokenizer from the vocab/merges files that were
# saved under models/COVID.
vocab_file = 'models/COVID/vocab.json'
merges_file = 'models/COVID/merges.txt'
tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)

In [15]:
# Attach a BERT-style post-processor built from the "</s>" and "<s>" tokens
# and their vocabulary ids. The "</s>" pair is passed first and the "<s>"
# pair second (presumably separator then start-of-sequence — confirm against
# the BertProcessing documentation).
end_pair = ("</s>", tokenizer.token_to_id("</s>"))
start_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = tokenizers.processors.BertProcessing(
    end_pair,
    start_pair,
)

In [16]:
# Small RoBERTa configuration: 3 hidden layers, 4 attention heads.
# The embedding table is sized from the same `vocab_size` variable that was
# used to train the tokenizer, so the two cannot drift apart when one of
# them is edited.
# NOTE(review): RoBERTa-style models usually reserve extra position slots
# beyond the longest sequence; 512 is ample for the block_size of 128 used
# below, but confirm before raising block_size towards 512.
config = transformers.RobertaConfig(
    vocab_size = vocab_size,
    max_position_embeddings = 512,
    num_attention_heads = 4,
    num_hidden_layers = 3,
    type_vocab_size = 1,
)

In [17]:
# Build the masked-LM model directly from `config`; it is constructed from
# the configuration alone, not loaded from a pretrained checkpoint.
model = transformers.RobertaForMaskedLM(config)

In [18]:
# Rebind `tokenizer` to the transformers fast RoBERTa tokenizer loaded from
# the files saved in models/COVID. From here on this object replaces the
# ByteLevelBPETokenizer instance configured in the two preceding cells.
tokenizer = transformers.RobertaTokenizerFast.from_pretrained("models/COVID")

In [19]:
# Training dataset built from the training corpus file. `train_path` is the
# path computed earlier from lm_data_dir, so the location is defined in one
# place only instead of being repeated as a literal here.
# block_size is the per-example length limit handed to the dataset.
dataset = transformers.LineByLineTextDataset(
    tokenizer = tokenizer,
    file_path = train_path,
    block_size = 128,
)

In [20]:
# Batch collator configured for masked-language-model training
# (mlm=True) with a masking probability of 0.15.
data_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [21]:
# Training hyper-parameters. Checkpoints are written to models/COVID (the
# directory that also holds the tokenizer files) every 10,000 steps, and
# save_total_limit is set to 2.
# NOTE(review): `per_gpu_train_batch_size` here, and `prediction_loss_only`
# passed to Trainer below, look like they target an older `transformers`
# release; newer releases may expect `per_device_train_batch_size` and take
# `prediction_loss_only` via TrainingArguments. Confirm before upgrading.
training_args = transformers.TrainingArguments(
    output_dir="models/COVID",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_gpu_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
)

# Only a training dataset is supplied; no evaluation dataset is passed, so
# the eval.txt file written earlier is not used by this trainer.
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)

In [22]:
# Start training. In the recorded output below the run was stopped by a
# KeyboardInterrupt after the step-3000 log line (about epoch 5.3 of the
# 10 configured), so the logged losses come from a partial run.
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=565.0, style=ProgressStyle(description_wi…

{"loss": 6.880603775978089, "learning_rate": 4.5575221238938055e-05, "epoch": 0.8849557522123894, "step": 500}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=565.0, style=ProgressStyle(description_wi…

{"loss": 6.51109097480774, "learning_rate": 4.115044247787611e-05, "epoch": 1.7699115044247788, "step": 1000}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=565.0, style=ProgressStyle(description_wi…

{"loss": 6.38016730594635, "learning_rate": 3.672566371681416e-05, "epoch": 2.6548672566371683, "step": 1500}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=565.0, style=ProgressStyle(description_wi…

{"loss": 6.297516198158264, "learning_rate": 3.230088495575221e-05, "epoch": 3.5398230088495577, "step": 2000}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=565.0, style=ProgressStyle(description_wi…

{"loss": 6.216467774391174, "learning_rate": 2.7876106194690264e-05, "epoch": 4.424778761061947, "step": 2500}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=565.0, style=ProgressStyle(description_wi…

{"loss": 6.155025197029114, "learning_rate": 2.345132743362832e-05, "epoch": 5.3097345132743365, "step": 3000}




KeyboardInterrupt: 

In [None]:
# Save the model into models/COVID, the same directory used for the
# tokenizer files and as the Trainer's output_dir. This cell carries no
# execution count in the saved notebook.
trainer.save_model("models/COVID")