In [1]:
import os
import re
import transformers

import pandas as pd

from collections import Counter

In [2]:
def basicPreprocess(text):
    """Normalise a piece of text for word counting.

    The text is lower-cased, then every run of characters that is not an
    ASCII letter or digit (punctuation, whitespace, newlines, non-ASCII
    characters) is collapsed into a single space.

    Parameters
    ----------
    text : str
        Raw text, e.g. one abstract.

    Returns
    -------
    str
        The normalised text. Leading/trailing separators are kept as a
        single space; they are not stripped.
    """
    return re.sub(r"[^a-zA-Z0-9]+", ' ', text.lower())

In [3]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
complete_df = pd.read_csv("data/clean_df.csv")

In [4]:
# Shuffle the rows once with a fixed seed so that Restart & Run All gives the
# same row order every time; a second .sample(frac=1) adds nothing to a
# uniform shuffle. dropna() with no arguments then drops every row that has
# a missing value in any column.
# NOTE(review): only the "abstract" column is read in the following cell --
# confirm whether dropna(subset=["abstract"]) was the real intent.
data = complete_df.sample(frac=1, random_state=42).dropna()
# Release the original frame; later cells work on `data` only.
del complete_df

In [5]:
# Normalise every abstract. basicPreprocess already turns each run of
# non-alphanumeric characters (newlines included) into a single space, so no
# extra newline handling is needed. The previous trailing
# .replace("\n", " ") was a Series.replace, which matches whole cell values
# rather than substrings, and therefore never changed anything.
data = data["abstract"].apply(basicPreprocess)

In [6]:
# Concatenate all abstracts into one corpus string, separated by a space.
# Without the separator the last word of one abstract is glued to the first
# word of the next, which creates spurious tokens and (because the row order
# is shuffled) makes the word counts depend on the shuffle.
# str.join also builds the string in one pass instead of repeated `+=`.
text = ' '.join(data.values)
# The Series is no longer needed once the corpus string exists.
del data

In [7]:
# Count how often each whitespace-separated word occurs in the corpus string.
counter = Counter(text.split())
# The corpus string is no longer needed once it has been counted.
del text

In [8]:
# Keep the words whose corpus frequency lies strictly between 100 and 10000,
# in the order in which the counter first saw them.
vocab = [word for word, freq in counter.items() if 100 < freq < 10000]

In [9]:
# Number of candidate words that passed the frequency filter.
len(vocab)

6737

In [10]:
# Tokenizer and language-model head are loaded from the same pretrained
# checkpoint, so the checkpoint name is spelled out once.
scibert_checkpoint = 'allenai/scibert_scivocab_uncased'
tokenizer = transformers.AutoTokenizer.from_pretrained(scibert_checkpoint)
# NOTE(review): the device is hard-coded to 'cuda', so this cell requires a GPU.
model = transformers.AutoModelWithLMHead.from_pretrained(scibert_checkpoint).to('cuda')

In [11]:
# Display the pretrained model's configuration (note vocab_size) before any tokens are added.
model.config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 31090
}

In [12]:
# Tokenizer size before any domain-specific tokens are added.
print(len(tokenizer))

31090


In [13]:
# Register the frequency-filtered words as additional tokens, then print the
# new tokenizer size. In the recorded run the size grew from 31090 to 31941,
# i.e. by fewer entries than len(vocab) (6737) -- presumably the remaining
# words were already in the pretrained vocabulary; TODO confirm.
tokenizer.add_tokens(vocab)
print(len(tokenizer))

31941


In [14]:
# Resize the model's token embeddings to the enlarged tokenizer size, then
# display the config again to check that vocab_size now equals len(tokenizer).
model.resize_token_embeddings(len(tokenizer)) 
model.config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 31941
}

In [15]:
# The word list has been added to the tokenizer; drop it to free memory.
del vocab

In [16]:
# Persist the extended tokenizer in its own directory.
# os.makedirs(..., exist_ok=True) is used instead of os.mkdir so that the
# cell also works when the parent 'models' directory does not exist yet and
# does not raise FileExistsError when the notebook is re-run.
tokenizer_dir = 'models/COVID-scibert-tokenizer'
os.makedirs(tokenizer_dir, exist_ok=True)
# Last expression of the cell: the tuple of written file paths is displayed.
tokenizer.save_pretrained(tokenizer_dir)

('models/COVID-scibert-tokenizer/vocab.txt',
 'models/COVID-scibert-tokenizer/special_tokens_map.json',
 'models/COVID-scibert-tokenizer/added_tokens.json')

In [17]:
# Training dataset for language-model fine-tuning, tokenised with the
# extended tokenizer and a block size of 64.
# NOTE(review): lm_data/train.txt is not written by any cell shown in this
# notebook, so it must already exist -- confirm how it is generated.
dataset = transformers.LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="lm_data/train.txt",
    block_size=64,
)

In [18]:
# Collator for masked-language-model training (mlm=True) with a masking
# probability of 0.15.
data_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [19]:
# Training configuration: 5 epochs, batch size 16 per device, checkpoints
# written to models/COVID-scibert (overwriting any previous contents).
# NOTE(review): the recorded run ended at global_step 11295, so with
# save_steps=10_000 presumably only one intermediate checkpoint is written
# and save_total_limit=3 never takes effect -- confirm this is intended.
training_args = transformers.TrainingArguments(
    output_dir="models/COVID-scibert",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=3,
)

# Bind the model, the masked-LM collator and the dataset together.
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)

In [20]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=2259.0, style=ProgressStyle(description_w…

{"loss": 2.304284627914429, "learning_rate": 4.778663125276671e-05, "epoch": 0.2213368747233289, "step": 500}
{"loss": 2.1008607831001282, "learning_rate": 4.557326250553343e-05, "epoch": 0.4426737494466578, "step": 1000}
{"loss": 2.063936607837677, "learning_rate": 4.335989375830013e-05, "epoch": 0.6640106241699867, "step": 1500}
{"loss": 2.0148767976760866, "learning_rate": 4.114652501106684e-05, "epoch": 0.8853474988933157, "step": 2000}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=2259.0, style=ProgressStyle(description_w…

{"loss": 1.9426649825572968, "learning_rate": 3.893315626383356e-05, "epoch": 1.1066843736166445, "step": 2500}
{"loss": 1.8798881018161773, "learning_rate": 3.671978751660027e-05, "epoch": 1.3280212483399734, "step": 3000}
{"loss": 1.83592178606987, "learning_rate": 3.450641876936698e-05, "epoch": 1.5493581230633025, "step": 3500}
{"loss": 1.8167849580049515, "learning_rate": 3.229305002213369e-05, "epoch": 1.7706949977866313, "step": 4000}
{"loss": 1.7918421647548675, "learning_rate": 3.00796812749004e-05, "epoch": 1.9920318725099602, "step": 4500}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=2259.0, style=ProgressStyle(description_w…

{"loss": 1.7348681960105896, "learning_rate": 2.786631252766711e-05, "epoch": 2.213368747233289, "step": 5000}
{"loss": 1.7202716085910796, "learning_rate": 2.565294378043382e-05, "epoch": 2.434705621956618, "step": 5500}
{"loss": 1.7061260747909546, "learning_rate": 2.3439575033200534e-05, "epoch": 2.6560424966799467, "step": 6000}
{"loss": 1.691216767191887, "learning_rate": 2.1226206285967244e-05, "epoch": 2.8773793714032756, "step": 6500}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=2259.0, style=ProgressStyle(description_w…

{"loss": 1.6468331438302994, "learning_rate": 1.9012837538733954e-05, "epoch": 3.098716246126605, "step": 7000}
{"loss": 1.6359560004472733, "learning_rate": 1.6799468791500664e-05, "epoch": 3.3200531208499338, "step": 7500}
{"loss": 1.6247947722673417, "learning_rate": 1.4586100044267376e-05, "epoch": 3.5413899955732626, "step": 8000}
{"loss": 1.5908580974340438, "learning_rate": 1.2372731297034086e-05, "epoch": 3.7627268702965915, "step": 8500}
{"loss": 1.584194672346115, "learning_rate": 1.0159362549800798e-05, "epoch": 3.9840637450199203, "step": 9000}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=2259.0, style=ProgressStyle(description_w…

{"loss": 1.558923286676407, "learning_rate": 7.945993802567508e-06, "epoch": 4.20540061974325, "step": 9500}
{"loss": 1.5409786819219589, "learning_rate": 5.732625055334219e-06, "epoch": 4.426737494466578, "step": 10000}




{"loss": 1.5378912506103515, "learning_rate": 3.51925630810093e-06, "epoch": 4.648074369189907, "step": 10500}
{"loss": 1.5192760183811187, "learning_rate": 1.3058875608676407e-06, "epoch": 4.869411243913236, "step": 11000}




TrainOutput(global_step=11295, training_loss=1.7594639224993438)

In [21]:
# Save the fine-tuned model to its own directory, separate from the
# checkpoint output_dir ("models/COVID-scibert").
# NOTE(review): the extended tokenizer was saved separately under
# models/COVID-scibert-tokenizer; presumably both are needed to reload the
# model with matching vocabulary -- confirm.
trainer.save_model("models/COVID-scibert-latest")