In [None]:
# Install medcat
! pip install medcat==1.2.7

In [None]:
import gensim
import pandas as pd
import numpy as np
from tokenizers import ByteLevelBPETokenizer
from gensim.models import Word2Vec

In [None]:
# Directory holding the downloaded notes and the intermediate text files.
# Keep the trailing slash: paths below are built by plain string concatenation.
DATA_DIR = "./data/"

In [None]:
!mkdir ./data
!mkdir ./models
!wget https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/noteevents.csv -P ./data/

### Meta Annotations with MedCAT

To train meta-annotations (e.g. Experiencer, Negation...) we need two additional models:
- Tokenizer: to tokenize the text
- Embeddings: Word2Vec or any other type of embeddings that will be used for meta annotations. 

For meta-annotations we will use a custom BiLSTM model with simulated attention that works very well with sub-word tokenizers and embeddings created using Word2Vec or BERT (for simplicity we will use w2v here). All of this is also available for download (see the next tutorial), so we only need to rebuild the tokenizer/embeddings if our use-case comes from a very specific domain. 

In [None]:
# To train the tokenizer we will use all the data we have from our dummy dataset.
# The notes are read from the CSV stored under DATA_DIR; its `text` column is
# what the following cells use as training material.
df = pd.read_csv(DATA_DIR + "noteevents.csv")
# Preview the first rows as the cell output
df.head()

In [None]:
# The tokenizers from huggingface require us to save all the text used for
# training into one/multiple text files.
# The `with` block guarantees the file is flushed and closed even if a row
# fails, and the explicit UTF-8 encoding keeps the output independent of the
# platform's default encoding.
with open(DATA_DIR + "tok_data.txt", 'w', encoding='utf-8') as f:
  # Rows without text (NaN) are skipped: they carry nothing to train on and
  # would otherwise break the string handling below.
  for text in df['text'].dropna().values:
    # We'll remove new lines, so that we have one document in one line
    text = text.strip().replace("\n", ' ')
    f.write(text.lower())  # Lowercase text to remove noise
    f.write("\n")

In [None]:
# Create, train and save the tokenizer
# The byte-level BPE tokenizer is trained on the one-document-per-line corpus
# file under DATA_DIR, using the library's default training settings.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(DATA_DIR + "tok_data.txt")
# NOTE(review): what `save` writes for this path (a single file vs. separate
# vocab/merges files) depends on the installed `tokenizers` version - confirm
# the result matches what the downstream loader expects.
tokenizer.save("./models/bbpe")

In [None]:
# Now we tokenize all the text we have and train word2vec
# Note that if you have a very large dataset, use iterators that
# read the text line by line from the file, do not load the whole file
# into memory.
# The corpus is opened in a `with` block so the handle is closed as soon as
# tokenization finishes, and it is decoded explicitly as UTF-8 rather than
# with the platform's default encoding.
with open(DATA_DIR + "tok_data.txt", 'r', encoding='utf-8') as f:
  # One list of sub-word tokens per line of the file (one document per line)
  data = [tokenizer.encode(line).tokens for line in f]
w2v = Word2Vec(data, vector_size=300, min_count=1)

In [None]:
# Check that word2vec trained; Ġ - for this tokenizer denotes start of word (a space)
# Displays the tokens most similar to 'Ġcancer' according to the trained vectors.
w2v.wv.most_similar('Ġcancer')

In [None]:
# Now we just have to create the embeddings matrix: entry i holds the vector
# for the token whose id in the tokenizer vocabulary is i.
# The vector width is read from the trained model instead of repeating the
# literal 300, and a seeded generator makes the fallback vectors reproducible
# across runs.
emb_rng = np.random.default_rng(42)
embeddings = []
for i in range(tokenizer.get_vocab_size()):
  word = tokenizer.id_to_token(i)
  if word in w2v.wv:
    embeddings.append(w2v.wv[word])
  else:
    # Assign a random vector (uniform in [0, 1), as before) if the token did
    # not receive a word2vec embedding
    embeddings.append(emb_rng.random(w2v.wv.vector_size))

In [None]:
# Save the embeddings
# np.save is given the path directly so NumPy opens and closes the file
# itself; a bare open() passed inline is never explicitly closed.
np.save("./models/embeddings.npy", np.array(embeddings))