In [None]:
import wget
from sentence_transformers import SentenceTransformer, util
import torch
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import time
from transformers import AutoTokenizer, AutoModel
from transformers import BertTokenizer, BertModel

In [66]:
def tokenize(model, tokenizer, sentence):
    
    # Tokens comes from a process that splits the input into sub-entities with interesting linguistic properties.
    tokens = tokenizer.tokenize(sentence.lower())

    # This is not sufficient for the model, as it requires integers as input, 
    # not a problem, let's convert tokens to ids.
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    #print(tokens_ids)

    # Add the required special tokens
    tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)
    #print(tokens_ids)
    
    # We need to convert to a Deep Learning framework specific format, let's use PyTorch for now.
    tokens_pt = torch.tensor([tokens_ids])

    # The length of the tokens can not be more than 512
    if len(tokens_pt[0]) > 512:
        tokens_pt = torch.tensor([tokens_pt[0][0:512].tolist()])
        
    # Now we're ready to go through BERT with out input
    outputs, pooled = model(tokens_pt)
    return pooled

In [67]:
string = "Until now we have talked about term matching methods for information retrieval. While these methods are good they do have some issues. One issue is when the user is unfamiliar with what terms are used in relevant documents. A user might search for what percentage of people infected by covid die?, while the more 'correct' way might be What is the Covid case fatality rate. These two sentences have more or less the same semantic meaning, but can lead to different results when using term matching. What if we could try to extract the semantic meaning of a sentence, and match it with the query? For this task we will introduce sentence embedding models."
string2 = "Before looking more into sentence embedding models we will look at a simpler task; word embedding. The point of this task is to map a word into a vector. This vector should represents the meaning of the word. When the meaning of a word is represented as a vector it can be easily interpreted by a computer. One can understand the main idea of word embedding by a simple equation "
string3 = "During the global covid-19 pandemic it has been important for researchers, politicians and others to be able to access the newest and most relevant research relating to COVID-19. The Semantic Scholar team at the Allen Institute for AI has partnered with leading research groups and released a dataset called CORD-19 \cite{Wang2020CORD19TC}. The dataset aims to keep a corpus of COVID-19 research articles so researchers can apply recent advances in natural language processing to aid in the global effort against the pandemic. The CORD-19 dataset contains research articles relating to COVID-19. This includes articles on similar viruses like SARS and MERS, pandemics and other relevant topics. The dataset is updated daily. "
tokenize(covid_model, covid_tokenizer, string+string2+string3+string+string2)


tensor([[-0.0102, -0.3737, -0.9746,  0.6022,  0.8261, -0.3076,  0.2930,  0.2730,
         -0.7471, -0.9276, -0.4815,  0.8626,  0.2386,  0.7950,  0.4367, -0.6525,
         -0.1136, -0.3221,  0.0427, -0.2792,  0.5534,  0.9999, -0.7850,  0.4110,
          0.1783,  0.9109, -0.2236,  0.4989,  0.5344,  0.1912, -0.2670,  0.3970,
         -0.5468, -0.2149, -0.8931, -0.4516,  0.2251,  0.0723, -0.4800, -0.0644,
         -0.6586,  0.2833,  0.9620, -0.4194,  0.8487,  0.0230, -0.9996,  0.2169,
         -0.3985,  0.7609,  0.9244,  0.8573,  0.3377,  0.5523,  0.4771, -0.6528,
          0.1571,  0.2534, -0.4018,  0.0375, -0.2473,  0.4947, -0.8407,  0.0777,
          0.8858,  0.8041, -0.4474, -0.5743, -0.1939,  0.1055,  0.4081,  0.0219,
         -0.4331, -0.5163,  0.8123,  0.1588,  0.0098,  1.0000, -0.0470, -0.6111,
          0.8224,  0.9279,  0.0841, -0.7137,  0.7931, -1.0000,  0.3229, -0.1720,
         -0.5004,  0.3240,  0.5632, -0.2917,  0.7976,  0.1278, -0.8568, -0.4486,
         -0.2825, -0.8283, -

In [3]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [4]:
scibert_tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
scibert_model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")

In [5]:
covid_tokenizer = AutoTokenizer.from_pretrained("gsarti/covidbert-nli")
covid_model = AutoModel.from_pretrained("gsarti/covidbert-nli")

In [6]:
path = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\dataset\\CORD-19\\"
docs = pd.read_csv(path + "crod_19_only_rel.csv")
docs.shape
docs.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,...,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id,body_text
0,3,2b73a28n,348055649b6b8cf2b9a376498df9bf41f7123605,PMC,Role of endothelin-1 in lung disease,10.1186/rr44,PMC59574,11686871.0,no-cc,Endothelin-1 (ET-1) is a 21 amino acid peptide...,...,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Respir Res,,,,document_parses/pdf_json/348055649b6b8cf2b9a37...,document_parses/pmc_json/PMC59574.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,,from xenopus laevis [16] . eta receptors in no...
1,5,zjufx4fo,b2897e1277f56641193a6db73825f707eed3e4c9,PMC,Sequence requirements for RNA strand transfer ...,10.1093/emboj/20.24.7220,PMC125340,11742998.0,green-oa,Nidovirus subgenomic mRNAs contain a leader se...,...,"Pasternak, Alexander O.; van den Born, Erwin; ...",The EMBO Journal,,,,document_parses/pdf_json/b2897e1277f56641193a6...,document_parses/pmc_json/PMC125340.xml.json,http://europepmc.org/articles/pmc125340?pdf=re...,,the genetic information of rna viruses is orga...
2,7,8zchiykl,5806726a24dc91de3954001effbdffd7a82d54e2,PMC,The 21st International Symposium on Intensive ...,10.1186/cc1013,PMC137274,11353930.0,no-cc,The 21st International Symposium on Intensive ...,...,"Ball, Jonathan; Venn, Richard",Crit Care,,,,document_parses/pdf_json/5806726a24dc91de39540...,document_parses/pmc_json/PMC137274.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,,this year's symposium was dominated by the res...
3,10,5tkvsudh,9d4e3e8eb092d5ed282d0aa4aadcaa8b7165b5e9,PMC,Conservation of polyamine regulation by transl...,10.1093/emboj/19.8.1907,PMC302018,10775274.0,no-cc,Regulation of ornithine decarboxylase in verte...,...,"Ivanov, Ivaylo P.; Matsufuji, Senya; Murakami,...",EMBO J,,,,document_parses/pdf_json/9d4e3e8eb092d5ed282d0...,document_parses/pmc_json/PMC302018.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,,the ef®ciency of +1 ribosomal frameshifting at...
4,12,tvxpckxo,d09b79026117ec9faebba46a8d13aa9b23ec751e,PMC,A Method to Identify p62's UBA Domain Interact...,10.1251/bpo66,PMC302190,14702098.0,no-cc,The UBA domain is a conserved sequence motif a...,...,"Pridgeon, Julia W.; Geetha, Thangiah; Wooten, ...",Biol Proced Online,,,,document_parses/pdf_json/d09b79026117ec9faebba...,document_parses/pmc_json/PMC302190.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,,p62 is a novel cellular protein which was init...


In [None]:
docs.head(2)

## Bert embeddings

### Title

In [None]:
path = "C:/Users/User/Documents/NTNU/NLP/CORD-19/Embeddings/BERT/786/title/"
for idx, row in docs.iterrows():
    BERT_embeddings_title = {}
    sha = row["sha"]
    cord_uid = row["cord_uid"]
    BERT_embeddings_title[row["sha"]] = tokenize(bert_model, bert_tokenizer, row["title"])
    if (idx % 10000) == 0:
        print(idx)
    file = path + cord_uid + ".txt"
    with open(file, "wb") as fp:   #Pickling
        pickle.dump(BERT_embeddings_title, fp)

In [None]:
path = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\dataset\\CORD-19\\embeddings\\"

with open(path + "BERT_embedding_title.txt", "wb") as fp:   #Pickling
    pickle.dump(BERT_embeddings_title, fp)
    

### Abstract

In [None]:
BERT_embeddings_abstract = {}
for idx, row in docs.iterrows():
    BERT_embeddings_abstract[row["sha"]] = tokenize(bert_model, bert_tokenizer, row["abstract"])
    if (idx % 10000) == 0:
        print(idx)

In [None]:
path = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\dataset\\CORD-19\\embeddings\\"

with open(path + "BERT_embedding_abstract.txt", "wb") as fp:   #Pickling
    pickle.dump(BERT_embeddings_abstract, fp)

## SciBert embeddings

### Title

In [None]:
SciBERT_embeddings_title = {}
for idx, row in docs.iterrows():
    SciBERT_embeddings_title[row["sha"]] = tokenize(scibert_model, scibert_tokenizer, row["title"])
    if (idx % 10000) == 0:
        print(idx)

In [None]:
path = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\dataset\\CORD-19\\embeddings\\"

with open(path + "SciBERT_embedding_title.txt", "wb") as fp:   #Pickling
    pickle.dump(SciBERT_embeddings_title, fp)

### Abstract

In [None]:
SciBERT_embeddings_abstract = {}
for idx, row in docs.iterrows():
    SciBERT_embeddings_abstract[row["sha"]] = tokenize(scibert_model, scibert_tokenizer, row["abstract"])
    if (idx % 10000) == 0:
        print(idx)

In [None]:
path = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\dataset\\CORD-19\\embeddings\\"

with open(path + "SciBERT_embedding_abstract.txt", "wb") as fp:   #Pickling
    pickle.dump(SciBERT_embeddings_abstract, fp)

## CordBert embeddings

### Title

In [None]:
CordBERT_embeddings_title = {}
for idx, row in docs.iterrows():
    CordBERT_embeddings_title[row["sha"]] = tokenize(covid_model, covid_tokenizer, row["title"])
    if (idx % 10000) == 0:
        print(idx)

In [None]:
path = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\dataset\\CORD-19\\embeddings\\"

with open(path + "CordBERT_embedding_title.txt", "wb") as fp:   #Pickling
    pickle.dump(CordBERT_embeddings_title, fp)

### Abstract

In [None]:
CordBERT_embeddings_abstract = {}
for idx, row in docs.iterrows():
    CordBERT_embeddings_abstract[row["sha"]] = tokenize(covid_model, covid_tokenizer, row["abstract"])
    if (idx % 10000) == 0:
        print(idx)

In [None]:
path = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\dataset\\CORD-19\\embeddings\\"

with open(path + "CordBERT_embedding_abstract.txt", "wb") as fp:   #Pickling
    pickle.dump(CordBERT_embeddings_abstract, fp)

## Embedding dim=786

In [7]:
def isNaN(num):
    return num != num

In [69]:
path = "C:/Users/User/Documents/NTNU/NLP/CORD-19/Embeddings/786/"
n_models = 3
n_fields = 2
dim = 786

start = time.time()
m = start

for idx, row in docs.iterrows():
    embedding = {}
    if (idx>21999):
        sha = row["sha"]
        cord_uid = row["cord_uid"]
        embedding["sha"] = sha
        embedding["cord_uid"] = cord_uid
        embedding["dim"] = dim
        embedding["models"] = {}

        models = [bert_model, scibert_model, covid_model]
        model_name = ['bert-base-uncased', "allenai/scibert_scivocab_uncased", "gsarti/covidbert-nli"]
        tokenizers = [bert_tokenizer ,scibert_tokenizer, covid_tokenizer]
        fields = ["title", "abstract"]

        for i in range(n_models):
            embedding["models"][model_name[i]] = {}
            for j in range(n_fields):
                field = row[fields[j]]

                res = isinstance(fields[j], str) 
                if (not res) or isNaN(field):
                    field = row["title"]
              
                
                embedd = tokenize(models[i], tokenizers[i], field)
                embedding["models"][model_name[i]][fields[j]] = embedd
                torch.cuda.empty_cache()

        ## Timing the loops
        if (idx % 1000) == 0:
            print(idx)
            print("Time (1000 iterations):", round(time.time() - m,1))
            m = time.time()

        file = path + cord_uid + ".txt"
        with open(file, "wb") as fp:   #Pickling
            pickle.dump(embedding, fp)

22000
Time (1000 iterations): 2.9
23000
Time (1000 iterations): 2901.4
24000
Time (1000 iterations): 3052.3
25000
Time (1000 iterations): 3092.0
26000
Time (1000 iterations): 3273.8
27000
Time (1000 iterations): 3098.5
28000
Time (1000 iterations): 3359.0
29000
Time (1000 iterations): 3308.5
30000
Time (1000 iterations): 2962.8
31000
Time (1000 iterations): 3120.6
32000
Time (1000 iterations): 3144.9
33000
Time (1000 iterations): 2534.2
34000
Time (1000 iterations): 2870.9
35000
Time (1000 iterations): 2798.4
36000
Time (1000 iterations): 2563.8
37000
Time (1000 iterations): 2757.5


In [73]:
path = "C:/Users/User/Documents/NTNU/NLP/CORD-19/Embeddings/786/"

s = time.time()
for idx, row in docs.iterrows():
    cord_uid = row["cord_uid"]
    file = path + cord_uid + ".txt"
    with open(file, "rb") as fp:   # Unpickling
        embedding = pickle.load(fp)
        print(embedding["models"]["bert-base-uncased"]["abstract"])
    if (idx >10):
        break
print(time.time()-s)

tensor([[-7.2691e-01, -2.3869e-01, -8.6512e-01,  2.0167e-01,  6.9648e-01,
          8.9377e-02, -6.6326e-01,  4.8240e-02, -8.8421e-01, -9.9984e-01,
         -6.2786e-01,  9.2084e-01,  8.9756e-01, -1.6120e-01,  3.8130e-01,
         -3.1916e-01,  3.6126e-01,  8.2679e-03,  8.8782e-02,  9.3996e-01,
          1.6620e-02,  9.9996e-01, -3.2176e-01,  1.8783e-01,  1.3297e-02,
          9.6083e-01, -5.8463e-01,  5.0327e-01,  7.3296e-01,  3.4668e-01,
          3.3784e-01,  9.8785e-02, -9.1816e-01,  2.8031e-01, -9.1155e-01,
         -9.1367e-01,  2.9899e-01, -3.0932e-01,  4.1385e-01,  5.0384e-03,
         -3.7188e-01,  1.5383e-01,  9.9998e-01, -6.1542e-01,  4.3890e-01,
          7.4744e-02, -9.9930e-01, -2.8538e-02, -4.3428e-01,  9.3243e-01,
          7.4099e-01,  9.5736e-01, -1.0811e-02,  2.1400e-01,  2.5949e-01,
         -6.3800e-01, -4.2442e-01, -1.1899e-01, -6.0226e-02, -2.6048e-01,
         -2.3630e-01, -8.0318e-05, -8.0428e-01, -7.9991e-01,  9.3150e-01,
          7.1275e-01,  9.5864e-02, -4.

In [74]:
path = "C:/Users/User/Documents/NTNU/NLP/CORD-19/Embeddings/786/"

s = time.time()
for idx, row in docs.iterrows():
    cord_uid = row["cord_uid"]
    file = path + cord_uid + ".txt"
    with open(file, "rb") as fp:   # Unpickling
        embedding = pickle.load(fp)
        print(embedding["models"]["bert-base-uncased"]["abstract"])
    if (idx >10):
        break
print(time.time()-s)

tensor([[-7.2691e-01, -2.3869e-01, -8.6512e-01,  2.0167e-01,  6.9648e-01,
          8.9377e-02, -6.6325e-01,  4.8241e-02, -8.8421e-01, -9.9984e-01,
         -6.2787e-01,  9.2084e-01,  8.9756e-01, -1.6120e-01,  3.8130e-01,
         -3.1916e-01,  3.6126e-01,  8.2668e-03,  8.8783e-02,  9.3996e-01,
          1.6621e-02,  9.9996e-01, -3.2176e-01,  1.8783e-01,  1.3297e-02,
          9.6083e-01, -5.8463e-01,  5.0327e-01,  7.3296e-01,  3.4668e-01,
          3.3784e-01,  9.8785e-02, -9.1816e-01,  2.8031e-01, -9.1155e-01,
         -9.1367e-01,  2.9900e-01, -3.0932e-01,  4.1385e-01,  5.0376e-03,
         -3.7188e-01,  1.5383e-01,  9.9998e-01, -6.1542e-01,  4.3890e-01,
          7.4743e-02, -9.9930e-01, -2.8537e-02, -4.3428e-01,  9.3243e-01,
          7.4099e-01,  9.5736e-01, -1.0810e-02,  2.1400e-01,  2.5949e-01,
         -6.3800e-01, -4.2442e-01, -1.1899e-01, -6.0226e-02, -2.6048e-01,
         -2.3630e-01, -7.9607e-05, -8.0428e-01, -7.9991e-01,  9.3150e-01,
          7.1275e-01,  9.5864e-02, -4.

In [None]:
path = "C:/Users/User/Documents/NTNU/NLP/CORD-19/Embeddings/786/"
n_models = 2
n_fields = 2
dim = 786

start = time.time()
m = start

for idx, row in docs.iterrows():
    embedding = {}
    if (idx>3000):
        sha = row["sha"]
        cord_uid = row["cord_uid"]
        embedding["sha"] = sha
        embedding["cord_uid"] = cord_uid
        embedding["dim"] = dim
        embedding["models"] = {}

        models = [bert_model,scibert_model]#, covid_model]
        model_name = ['bert-base-uncased', "gsarti/covidbert-nli"]#, "allenai/scibert_scivocab_uncased"]
        tokenizers = [bert_tokenizer ,scibert_tokenizer]#, covid_tokenizer]
        fields = ["title", "abstract"]

        for i in range(n_models):
            embedding["models"][model_name[i]] = {}
            for j in range(n_fields):
                field = row[fields[j]]

                res = isinstance(fields[j], str) 
                if (not res) or isNaN(field):
                    field = row["title"]
                if len(field) > 512:
                    field = field[0:512]
                #print(fields[j],field,)
                embedd = tokenize(models[i], tokenizers[i], field)
                embedding["models"][model_name[i]][fields[j]] = embedd
                torch.cuda.empty_cache()

        ## Timing the loops
        if (idx % 1000) == 0:
            print(idx)
            print("Time (1000 iterations):", round(time.time() - m,1))
            m = time.time()

        file = path + cord_uid + ".txt"
        with open(file, "wb") as fp:   #Pickling
            pickle.dump(embedding, fp)

In [None]:
"""
{
sha: sha,
cord_uid: cord_uid,
dim: dim
models: {model_name1: {title: [...],
                       abstract: [...]},
         model_name2: {title: [...],
                       abstract: [...]},
        ...                   
}
}


"""

In [None]:
torch.cuda.empty_cache()