In [1]:
import os
import re
import json
import torch
import tokenizers
import transformers

import pandas as pd

from tqdm import tqdm
from pathlib import Path

In [2]:
def basicPreprocess(text):
    """Lower-case ``text`` and squeeze "non-word character + spaces" runs.

    Every match of the pattern ``\\W +`` (one non-word character followed by
    one or more spaces) is replaced by a single space. Non-word characters
    that are not followed by a space are left untouched.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    lowered = text.lower()
    return re.sub(r'\W +', ' ', lowered)

In [3]:
# Load the cleaned dataset; later cells only read its "abstract" column.
complete_df = pd.read_csv("data/clean_df.csv")

In [4]:
# Shuffle all rows once. A single sample(frac=1) already yields a random
# permutation, so shuffling twice adds nothing. The fixed seed makes the row
# order -- and therefore the train/eval split sliced from it later --
# reproducible across kernel restarts.
SHUFFLE_SEED = 42
data = complete_df.sample(frac=1, random_state=SHUFFLE_SEED)

In [5]:
# Discard incomplete rows: dropna() without arguments removes every row that
# has a missing value in any column.
data = data.dropna()

In [6]:
# Keep only the abstract column, normalise it with basicPreprocess, then turn
# embedded newlines into spaces so each abstract stays on a single line when it
# is later written one-item-per-line.
# Series.replace("\n", " ") only matches cells whose *entire* value is "\n";
# the .str accessor performs the intended substring replacement.
data = (
    data["abstract"]
    .apply(basicPreprocess)
    .str.replace("\n", " ", regex=False)
)

In [7]:
# Directory that receives one .txt file per abstract; the same directory is
# globbed for *.txt files when the tokenizer is trained.
txt_files_dir = "text_splits"

In [8]:
# Write each preprocessed abstract to its own file, "<position>.txt", inside
# txt_files_dir.
os.makedirs(txt_files_dir, exist_ok=True)  # open() below fails if the directory is missing
for i, row in enumerate(tqdm(data.to_list())):
    file_name = os.path.join(txt_files_dir, str(i)+'.txt')
    # The context manager closes the handle even if write() raises, and the
    # explicit encoding keeps the files independent of the platform default.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(row)

100%|██████████| 40152/40152 [00:06<00:00, 6416.26it/s]


## Initialize Tokenizer

In [9]:
# Collect every .txt file under txt_files_dir (searched recursively) as the
# tokenizer's training corpus.
paths = [str(txt_path) for txt_path in Path(txt_files_dir).glob("**/*.txt")]

vocab_size = 5000

# Special tokens, in the order they are handed to the trainer.
special_tokens = ["<s>", "<pad>", "</s>", "<unk>"]

# Train a byte-level BPE tokenizer from scratch on those files.
tokenizer = tokenizers.ByteLevelBPETokenizer()
tokenizer.train(
    files=paths,
    vocab_size=vocab_size,
    min_frequency=5,
    special_tokens=special_tokens,
)

## Train Test Split

In [10]:
# Directory holding the train.txt / eval.txt language-model split files.
lm_data_dir = "lm_data"

In [11]:
# Fraction of the (already shuffled) abstracts that goes into the training
# file; the remainder goes into the evaluation file.
train_split = 0.9
train_data_size = int(len(data)*train_split)

os.makedirs(lm_data_dir, exist_ok=True)  # open() below fails if the directory is missing

# .iloc makes the positional slicing explicit: the index of `data` is a
# shuffled set of labels, not 0..n-1.
split_rows_by_file = {
    'train.txt': data.iloc[:train_data_size],
    'eval.txt': data.iloc[train_data_size:],
}

# One abstract per line in each file; the explicit encoding keeps the output
# independent of the platform default.
for split_file, split_rows in split_rows_by_file.items():
    with open(os.path.join(lm_data_dir, split_file), 'w', encoding='utf-8') as f:
        f.writelines("%s\n" % item for item in split_rows.tolist())

## Model and Tokenizer Instantiation

In [12]:
# Persist the trained tokenizer; the recorded output shows vocab.json and
# merges.txt being written under models/COVID.
# NOTE(review): newer `tokenizers` releases appear to expose directory saving
# as save_model() -- confirm against the installed version.
tokenizer.save("models/COVID")

['models/COVID/vocab.json', 'models/COVID/merges.txt']

In [13]:
# Paths of the two split files (train.txt / eval.txt) inside lm_data_dir.
train_path = os.path.join(lm_data_dir,"train.txt")
eval_path = os.path.join(lm_data_dir,"eval.txt")

In [14]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Rebuild the byte-level BPE tokenizer from the vocabulary and merges files
# stored under models/COVID.
vocab_file = 'models/COVID/vocab.json'
merges_file = 'models/COVID/merges.txt'
tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)

In [15]:
# Look up the ids of the two boundary tokens, then install a BERT-style
# post-processor built from them. BertProcessing takes the pairs positionally
# as (sep, cls), hence "</s>" first and "<s>" second.
sep_pair = ("</s>", tokenizer.token_to_id("</s>"))
cls_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = tokenizers.processors.BertProcessing(
    sep_pair,
    cls_pair,
)

In [16]:
# Architecture of the RoBERTa model that is instantiated from this config.
# NOTE(review): RoBERTa position ids are offset by the padding index, so
# max_position_embeddings=512 presumably leaves room for slightly fewer than
# 512 tokens per sequence -- confirm if longer inputs are expected.
config = transformers.RobertaConfig(
    # Reuse the size the tokenizer was trained with instead of repeating the
    # literal, so the embedding table cannot silently diverge from it.
    vocab_size=vocab_size,
    max_position_embeddings=512,
    num_attention_heads=4,
    num_hidden_layers=3,
    type_vocab_size=1,
)

In [17]:
# Instantiate the masked-LM model directly from the config object (no
# from_pretrained call here).
model = transformers.RobertaForMaskedLM(config)

In [18]:
# Load the tokenizer files saved under models/COVID through transformers' fast
# RoBERTa tokenizer. This rebinds `tokenizer`, so from here on it no longer
# refers to the ByteLevelBPETokenizer object set up in the preceding cells.
tokenizer = transformers.RobertaTokenizerFast.from_pretrained("models/COVID")

In [19]:
# Training dataset read from the train split file. The path comes from
# train_path (built from lm_data_dir) so it stays in sync with where the split
# was written instead of duplicating the location as a string literal.
dataset = transformers.LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=train_path,
    block_size=128,
)

In [20]:
# Batch collator for language-model training, configured for masked-LM
# (mlm=True) with mlm_probability=0.15.
data_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [21]:
# Run configuration: two epochs, batches of 64 per device, a checkpoint every
# 10,000 steps with at most two kept. Output goes to models/COVID, the same
# directory that holds the tokenizer files.
training_args = transformers.TrainingArguments(
    output_dir="models/COVID",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)

# Only a training dataset is supplied; no evaluation dataset is passed.
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)

In [22]:
# Run training with the trainer configured above; the returned TrainOutput is
# displayed as the cell result.
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=2.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=565.0, style=ProgressStyle(description_wi…

{"loss": 6.896225288391113, "learning_rate": 2.7876106194690264e-05, "epoch": 0.8849557522123894, "step": 500}



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=565.0, style=ProgressStyle(description_wi…

{"loss": 6.5980025482177735, "learning_rate": 5.752212389380531e-06, "epoch": 1.7699115044247788, "step": 1000}




TrainOutput(global_step=1130, training_loss=6.724204959278613)

In [23]:
# Save the trained model to models/COVID, the directory the embedding cells
# load from.
trainer.save_model("models/COVID")

## Pipeline for embeddings

In [24]:
# Reload the saved weights as a plain RobertaModel for feature extraction.
# Fall back to the CPU when no GPU is visible instead of failing on a
# hard-coded 'cuda'.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = transformers.RobertaModel.from_pretrained('models/COVID').to(device)

In [25]:
# Load the tokenizer from the same directory as the model so both come from
# the same saved artefacts.
tokenizer = transformers.RobertaTokenizerFast.from_pretrained('models/COVID')

In [26]:
def clean(sentence):
    """Return the model's output vector at the first token position of ``sentence``.

    The sentence is encoded with the module-level ``tokenizer`` (special
    tokens included) and run through the module-level ``model`` as a batch of
    one with gradients disabled. The first element of the model output is then
    indexed at batch item 0, position 0.

    Args:
        sentence: Text to embed.

    Returns:
        list[float]: The selected vector as plain Python floats.
    """
    # Follow whatever device the model's weights live on instead of assuming
    # 'cuda', so the input can never end up on a different device.
    device = next(model.parameters()).device
    token_ids = tokenizer.encode(sentence, add_special_tokens=True)
    batch = torch.tensor(token_ids).to(device).unsqueeze(0)
    with torch.no_grad():
        hidden_states = model(batch)[0]
    # Copy back only the single vector that is returned, not the whole output.
    return hidden_states[0, 0].cpu().tolist()

In [27]:
# Smoke test: embed one example question and display the resulting vector.
# NOTE(review): the training text was lower-cased by basicPreprocess, while
# this query is passed through unmodified -- confirm whether queries should be
# preprocessed the same way.
clean("What is up with COVID?")

[-0.45296841859817505,
 -2.192584276199341,
 -2.3641011714935303,
 1.3041210174560547,
 0.45958980917930603,
 -0.07980245351791382,
 -0.7769390940666199,
 -1.1012500524520874,
 -0.23653286695480347,
 -0.4653877019882202,
 1.4165536165237427,
 -0.35655665397644043,
 0.3430941104888916,
 0.20337629318237305,
 0.1553642302751541,
 1.7377806901931763,
 0.031772878021001816,
 0.11276957392692566,
 -2.8868908882141113,
 0.6627795696258545,
 -1.2882540225982666,
 -1.0054717063903809,
 0.27778470516204834,
 -0.8777236342430115,
 0.1578001081943512,
 1.3092198371887207,
 0.7750704884529114,
 -2.489347457885742,
 0.22451040148735046,
 0.48058828711509705,
 0.31907519698143005,
 -1.0217267274856567,
 1.6087541580200195,
 -0.1600455492734909,
 0.012304580770432949,
 -0.6671186089515686,
 0.26725465059280396,
 -1.8650914430618286,
 0.8604501485824585,
 -0.9128777980804443,
 -0.46022912859916687,
 1.3813767433166504,
 1.0863120555877686,
 -0.11816200613975525,
 2.436188220977783,
 1.108336091041565,