In [1]:
import re
import tokenizers
import transformers

import pandas as pd

from collections import Counter

In [2]:
def basicPreprocess(text):
    """Normalise raw text for word counting.

    The input is lower-cased, then every run of characters that is not an
    ASCII letter or digit (punctuation, whitespace, newlines, ...) is
    collapsed into a single space.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string. Leading/trailing separator runs become a
        single space each; they are not stripped.
    """
    non_alnum_run = re.compile(r"[^a-zA-Z0-9]+")
    return non_alnum_run.sub(' ', text.lower())

In [3]:
# Only the first 20,000 rows are used, so let the parser stop there (`nrows`)
# instead of loading the entire CSV into memory and slicing it afterwards.
complete_df = pd.read_csv("data/clean_df.csv", nrows = 20000)

In [4]:
# Shuffle the rows once (shuffling a second time adds nothing) with a fixed seed
# so the notebook is reproducible, then drop every row that has a missing value
# in any column. Chaining .dropna() avoids mutating a frame in place.
data = complete_df.sample(frac = 1, random_state = 42).dropna()
# The unshuffled frame is no longer needed; release it.
del complete_df

In [5]:
# Normalise every abstract. basicPreprocess already turns each run of
# non-alphanumeric characters (newlines included) into a single space, so no
# extra newline handling is required. The former Series.replace("\n", " ") call
# only matched cells whose *entire* value was "\n" (it is not a substring
# replacement) and therefore never changed anything.
data = data["abstract"].apply(basicPreprocess)

In [6]:
# Build one corpus string from all abstracts. They are joined with a space so
# that the last word of one abstract and the first word of the next remain
# separate tokens when the text is split later; str.join also builds the result
# in a single pass instead of growing a string with repeated +=.
text = ' '.join(data.values)
# The per-abstract Series is no longer needed; release it.
del data

In [7]:
# Tally how many times each whitespace-delimited word occurs in the corpus.
counter = Counter()
counter.update(text.split())
# The corpus string is no longer needed once the counts exist.
del text

In [8]:
# Keep the words whose frequency lies strictly between 100 and 10000.
vocab = [word for word, freq in counter.items() if 100 < freq < 10000]

In [9]:
# Number of words that passed the frequency filter.
len(vocab)

4385

In [10]:
# The tokenizer and the LM-head model are loaded from the same SciBERT checkpoint.
checkpoint = 'allenai/scibert_scivocab_uncased'
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
model = transformers.AutoModelWithLMHead.from_pretrained(checkpoint)

In [11]:
# Display the configuration of the freshly loaded model.
model.config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 31090
}

In [12]:
# Register the corpus-specific words with the tokenizer; add_tokens returns how
# many of them were actually new to its vocabulary.
num_added = tokenizer.add_tokens(vocab)
# The new tokens get ids beyond the model's original embedding matrix, so the
# matrix must be grown to the tokenizer's new size before training; otherwise
# looking up a new id indexes past the end of the embeddings.
model.resize_token_embeddings(len(tokenizer))
# Keep the count of added tokens as the cell's displayed value.
num_added

336

In [13]:
# The word list has already been passed to tokenizer.add_tokens; release it.
del vocab

In [14]:
# Build the language-modelling dataset from lm_data/train.txt with the
# tokenizer defined above, using a block size of 64.
dataset = transformers.LineByLineTextDataset(
    block_size=64,
    file_path="lm_data/train.txt",
    tokenizer=tokenizer,
)

In [15]:
# Batch collator for masked-language-model training: MLM is enabled with a
# masking probability of 0.15.
data_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [20]:
# Training configuration: 2 epochs with a per-device batch size of 16.
# Checkpoints are written to models/COVID-scibert (overwriting existing output)
# every 10,000 steps, with save_total_limit set to 2.
training_args = transformers.TrainingArguments(
    output_dir="models/COVID-scibert",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
)

# Wire the model, training arguments, dataset and collator into a Trainer.
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
    prediction_loss_only=True,
)

In [21]:
# Run training with the Trainer configured above; the returned TrainOutput is
# displayed as the cell result.
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=2.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=2259.0, style=ProgressStyle(description_w…

{"loss": 2.2454069347381593, "learning_rate": 4.446657813191678e-05, "epoch": 0.2213368747233289, "step": 500}
{"loss": 2.0640436964035036, "learning_rate": 3.893315626383356e-05, "epoch": 0.4426737494466578, "step": 1000}
{"loss": 2.0424425210952757, "learning_rate": 3.339973439575033e-05, "epoch": 0.6640106241699867, "step": 1500}
{"loss": 2.008446457386017, "learning_rate": 2.786631252766711e-05, "epoch": 0.8853474988933157, "step": 2000}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=2259.0, style=ProgressStyle(description_w…

{"loss": 1.92195778632164, "learning_rate": 2.233289065958389e-05, "epoch": 1.1066843736166445, "step": 2500}
{"loss": 1.847716299533844, "learning_rate": 1.6799468791500664e-05, "epoch": 1.3280212483399734, "step": 3000}
{"loss": 1.8034544098377228, "learning_rate": 1.1266046923417443e-05, "epoch": 1.5493581230633025, "step": 3500}
{"loss": 1.7897509193420411, "learning_rate": 5.732625055334219e-06, "epoch": 1.7706949977866313, "step": 4000}
{"loss": 1.7694911324977876, "learning_rate": 1.99203187250996e-07, "epoch": 1.9920318725099602, "step": 4500}




TrainOutput(global_step=4518, training_loss=1.942776378829588)