In [1]:
# Install medcat
! pip install medcat==1.5.0
try:
    from medcat.cat import CAT
except:
    print("WARNING: Runtime will restart automatically and please run other cells thereafter.")
    exit()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting medcat==1.3.0
 Downloading medcat-1.3.0-py3-none-any.whl (133 kB)
[K |████████████████████████████████| 133 kB 6.8 MB/s 
Collecting xxhash==3.0.0
 Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K |████████████████████████████████| 212 kB 48.2 MB/s 
[?25hCollecting gensim~=4.1.2
 Downloading gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K |████████████████████████████████| 24.1 MB 2.0 MB/s 
[?25hCollecting aiofiles~=0.8.0
 Downloading aiofiles-0.8.0-py3-none-any.whl (13 kB)
Collecting datasets~=2.2.2
 Downloading datasets-2.2.2-py3-none-any.whl (346 kB)
[K |████████████████████████████████| 346 kB 47.7 MB/s 
[?25hCollecting py2neo==2021.2.3
 Downloading py2neo-2021.2.3-py2.py3-none-any.whl (177 kB)
[K |████████████████████████████████| 177 kB 48.7 MB/s 
Collecting transformers~=4.19.2
 Down



In [1]:
import gensim
import pandas as pd
import numpy as np
from tokenizers import ByteLevelBPETokenizer
from gensim.models import Word2Vec

In [2]:
# Directory (with trailing slash) that later cells prefix to data file names.
DATA_DIR = "./data/"

In [3]:
!mkdir ./data
!mkdir ./models
!wget https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/noteevents.csv -P ./data/

mkdir: cannot create directory ‘./data’: File exists
mkdir: cannot create directory ‘./models’: File exists
--2022-08-25 11:42:26-- https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/noteevents.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7171226 (6.8M) [text/plain]
Saving to: ‘./data/noteevents.csv.1’


2022-08-25 11:42:27 (129 MB/s) - ‘./data/noteevents.csv.1’ saved [7171226/7171226]



### Meta Annotations with MedCAT

To train meta-annotations (e.g. Experiencer, Negation...) we need two additional models:
- Tokenizer: to tokenize the text
- Embeddings: Word2Vec or any other type of embeddings that will be used for meta annotations. 

For meta-annotations we will use a custom BiLSTM model with simulated attention that works very well with sub-word tokenizers and embeddings created using Word2Vec or BERT (for simplicity we will use w2v here). All of this is also available for download (check the next tutorial), and we only need to rebuild the tokenizer/embeddings if our use-case is from a very specific domain. 

In [4]:
# The whole dummy dataset serves as training material for the tokenizer.
notes_csv = DATA_DIR + "noteevents.csv"
df = pd.read_csv(notes_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,subject_id,chartdate,category,text
0,0,0,01/01/2086,Urology,"CHIEF COMPLAINT: , Blood in urine.,HISTORY OF ..."
1,1,0,01/01/2086,Emergency Room Reports,"CHIEF COMPLAINT: , Blood in urine.,HISTORY OF ..."
2,2,0,01/01/2086,General Medicine,"CHIEF COMPLAINT: , Blood in urine.,HISTORY OF ..."
3,3,0,01/01/2086,General Medicine,"CHIEF COMPLAINT:, Followup on hypertension an..."
4,4,0,01/01/2086,Consult - History and Phy.,"CHIEF COMPLAINT: , Blood in urine.,HISTORY OF ..."


In [5]:
# The tokenizers from huggingface require us to save all the text used for
# training into one/multiple text files. Here every note is written to a
# single file, one document per line.
# The `with` block guarantees the file is flushed and closed even if a row
# fails to process part-way through the loop.
with open(DATA_DIR + "tok_data.txt", 'w') as f:
    for text in df['text'].values:
        # We'll remove new lines, so that we have one document in one line
        text = text.strip().replace("\n", ' ')
        f.write(text.lower())  # Lowercase text to remove noise
        f.write("\n")

In [6]:
# Build a byte-level BPE tokenizer, fit it on the text dump, then store it.
tok_corpus_path = DATA_DIR + "tok_data.txt"
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(tok_corpus_path)
tokenizer.save("./models/bbpe")

In [7]:
# Now we tokenize all the text we have and train word2vec.
# Note that if you have a very large dataset, use iterators that
# read the text line by line from the file, do not load the whole file
# into memory.
# The `with` block closes the file once all lines have been tokenized.
with open(DATA_DIR + "tok_data.txt", 'r') as f:
    # One list of sub-word tokens per line (i.e. per document) of the file.
    data = [tokenizer.encode(line).tokens for line in f]
w2v = Word2Vec(data, vector_size=300, min_count=1)

In [8]:
# Check that word2vec was trained by querying the tokens most similar to a known one.
# For this tokenizer, Ġ denotes the start of a word (a space).
w2v.wv.most_similar('Ġcancer')

[('Ġmetastatic', 0.7546937465667725),
 ('Ġcolon', 0.7531586289405823),
 ('Ġbreast', 0.7017560601234436),
 ('Ġcarcinoma', 0.6899590492248535),
 ('Ġaugmentation', 0.6884581446647644),
 ('Ġca', 0.6584445834159851),
 ('Ġfamily', 0.657872200012207),
 ('Ġmesothelioma', 0.6546629071235657),
 ('Ġfather', 0.6540331244468689),
 ('Ġmother', 0.6450846791267395)]

In [9]:
# Now we just have to create the embeddings matrix: one row per tokenizer id,
# in id order.
embeddings = []
# Take the dimensionality from the trained model instead of repeating the
# literal, so the fallback vectors always match the word2vec vectors.
emb_dim = w2v.wv.vector_size
# Seeded generator so the random fallback vectors are reproducible across runs.
rng = np.random.default_rng(42)
for i in range(tokenizer.get_vocab_size()):
    word = tokenizer.id_to_token(i)
    if word in w2v.wv:
        embeddings.append(w2v.wv[word])
    else:
        # Assign a random vector (uniform in [0, 1)) to tokens for which
        # word2vec has no embedding.
        embeddings.append(rng.random(emb_dim))

In [10]:
# Save the embeddings. Passing the path lets np.save open and close the file
# itself, rather than leaving an unclosed handle behind.
np.save("./models/embeddings.npy", np.array(embeddings))