In [21]:
import os

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.utils.vocab import Vocab
from medcat.prepare_cdb import PrepareCDB

In [22]:
# Folder that holds the tutorial input files and the models saved later in this notebook.
# Keep the trailing slash: file paths are built with plain string concatenation.
DATA_DIR = "./data/"

In [23]:
!wget https://raw.githubusercontent.com/CogStack/MedCAT/master/tutorial/data/cdb_simple.csv -P ./data
!wget https://raw.githubusercontent.com/CogStack/MedCAT/master/tutorial/data/cdb_advanced.csv -P ./data
!wget https://raw.githubusercontent.com/CogStack/MedCAT/master/tutorial/data/vocab_data.txt -P ./data

--2020-09-09 17:19:47--  https://raw.githubusercontent.com/CogStack/MedCAT/master/tutorial/data/cdb_simple.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.128.133, 151.101.64.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 49 [text/plain]
Saving to: ‘./data/cdb_simple.csv’


2020-09-09 17:19:47 (3.09 MB/s) - ‘./data/cdb_simple.csv’ saved [49/49]

--2020-09-09 17:19:47--  https://raw.githubusercontent.com/CogStack/MedCAT/master/tutorial/data/cdb_advanced.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.128.133, 151.101.64.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 193 [text/plain]
Saving to: ‘./data/cdb_advanced.csv’


2020-09-09 17:19:47 (12.8 MB/s) - ‘./data/cd

# Building a Vocabulary

The first of the two required models when running MedCAT is a Vocabulary model (Vocab). The model is used for two things: (1) Spell checking; and (2) Word Embedding. 

The Vocab is very simple and you can easily build it from a file that is structured as below:
```
<token>\t<word_count>\t<vector_embedding_separated_by_spaces>
```
`token` - Usually a word or subword if you are using Byte Pair Encoding or something similar.

`word_count` - The count for this word in your dataset or in any large dataset (wikipedia also works nicely).

`vector_embedding_separated_by_spaces` - Precalculated vector embedding, e.g. from Word2Vec, BERT or any other embedding model.

---
An example with 3-dim embedding would be:
```
house	34444	 0.3232 0.123213 1.231231
dog	14444	0.76762 0.76767 1.45454
.
.
.
```
The file is essentially a TSV, but it must not contain a header row.

---

**NOTE**: If spelling is important for your use-case, take care that there are no misspelled words in the Vocab.

In [24]:
# Let's have a look at an example: a small vocabulary with only 2 words (the ones from above)
# has been saved into a text file. Let's create a vocabulary from these two words.
file_path = DATA_DIR + "vocab_data.txt"

# Start from an empty Vocab and add the words found in the file
vocab = Vocab()
vocab.add_words(path=file_path)

**And that is everything, with this we have built our vocab and no further training is necessary.**

---

A couple of useful functions of the vocab are presented below

In [25]:
# To see the words in the vocab (`vocab.vocab` is the underlying dict, keyed by word)
vocab.vocab.keys()

dict_keys(['house', 'dog'])

In [26]:
# If you want to add words manually (one by one) use add_word: pass the word,
# its count and its embedding (3-dim here, like the vectors loaded from the file).
# NOTE(review): replace=True presumably overwrites an existing entry for the word - confirm.
vocab.add_word("test", cnt=31, vec=[1.42, 1.44, 1.55], replace=True)
# Display the keys again to confirm that the new word is present
vocab.vocab.keys()

dict_keys(['house', 'dog', 'test'])

In [27]:
# To get the embedding vector of a word (displayed as an array) use:
vocab.vec("house")

array([0.3232  , 0.123213, 1.231231])

In [28]:
# Or to get the count: indexing the vocab by a word gives that word's count
vocab['house']

34444

In [29]:
# To check whether a word is in the vocab:
"house" in vocab

True

### Before we save the vocab model, we need to create the unigram table for negative sampling

In [30]:
# Build the unigram table that is used for negative sampling.
# This is necessary after each change of the vocabulary (when we add new words)
vocab.make_unigram_table()

### Save the model

In [31]:
# Write the vocab to a file inside DATA_DIR
vocab.save_dict(DATA_DIR + "tmp-vocab.dat")

### Load the model

In [32]:
# Create a fresh, empty Vocab and load the saved file from DATA_DIR into it
vocab = Vocab()
vocab.load_dict(DATA_DIR + "tmp-vocab.dat")

# Building the vocab from a stream of text data

If we only have a large body of text (e.g. wikipedia), we can easily build our own vocabulary.

In [33]:
# Open the raw text file that the vocabulary will be built from.
# NOTE(review): data.txt is not one of the files fetched by the wget cell at the top,
# so it has to be placed in DATA_DIR by hand before this cell is run.
# NOTE(review): nothing in this cell closes the handle; it stays open across cells
# because it is passed to make_vocab.make() further down.
f = open(DATA_DIR + "data.txt")

In [34]:
from medcat.utils.make_vocab import MakeVocab

In [35]:
# Helper that builds a vocabulary from text; the result is available as make_vocab.vocab
make_vocab = MakeVocab()

In [36]:
# make() is given "./out/" as its output folder. Create the folder up front so the
# cell also runs on a fresh checkout (assumption: make() does not create it itself;
# the call is harmless if it does).
os.makedirs("./out/", exist_ok=True)

# Build the vocabulary from the open text stream `f` (the numbers printed below
# are presumably a progress counter). No later cell reads from `f`, so close the
# handle here - even if make() raises - instead of leaking it.
try:
    make_vocab.make(f, "./out/")
finally:
    f.close()

0
10000
20000
30000
40000


In [37]:
# Look up the vector for "house" before add_vectors() has been called.
# No output was recorded for this cell - presumably the lookup gives None at this point.
make_vocab.vocab.vec("house")

In [38]:
# Add word vectors to the vocab from ./out/data.txt (presumably the text written there
# by make() - confirm). The call returns a gensim Word2Vec model, which is what gets
# displayed as this cell's output.
make_vocab.add_vectors("./out/data.txt")

<gensim.models.word2vec.Word2Vec at 0x7ff89dfee3a0>

In [39]:
# The same lookup now gives an embedding vector for "house"
make_vocab.vocab.vec("house")

array([-4.59955297e-02,  3.33680883e-02, -1.12155275e-02, -6.99009150e-02,
        1.37163609e-01, -3.03632747e-02, -7.00904056e-02, -1.19212449e-01,
        8.54373425e-02, -5.51070757e-02,  4.47482988e-02,  3.92590985e-02,
       -6.99465675e-03,  1.01291731e-01,  6.23297840e-02,  4.88474369e-02,
       -4.87998547e-03, -7.88650438e-02, -2.55356263e-03,  1.06156459e-02,
        9.02190283e-02,  1.31216303e-01, -7.54631907e-02,  7.71637261e-02,
        1.04633376e-01,  9.54619236e-03,  5.77104613e-02, -8.75939205e-02,
       -1.30764535e-02, -4.09990139e-02, -4.35646251e-02,  8.04079697e-02,
        1.60963237e-02, -5.73624112e-03, -3.73984799e-02, -1.26232700e-02,
       -8.32567886e-02, -5.49194142e-02, -4.66786474e-02,  1.32484734e-01,
       -7.62983561e-02, -1.63496863e-02, -3.20202820e-02, -4.89284936e-03,
       -2.40638927e-02, -1.79406349e-02,  2.63882009e-03,  9.64811817e-02,
        2.35552359e-02, -3.23801152e-02, -3.46994884e-02,  8.71317238e-02,
       -5.81941493e-02,  

In [40]:
# Continue with the vocabulary built from text: `vocab` now refers to it instead of
# the small hand-made one, and this is the object passed to PrepareCDB below.
vocab = make_vocab.vocab

# Building the Concept Database (CDB)

The second model we are going to need when using MedCAT is the ConceptDB (CDB). This database holds a list of all concepts that we would like to detect and link to. For a lot of medical use-cases we would use giant databases like the UMLS or SNOMED. However, MedCAT can be used with any database, no matter how big or small it is.

To prepare the CDB we start off with a CSV with the following structure:
```
cui,str
1,kidney failure
7,CoVid 2
7,coronavirus
```
This is the most basic version of the CSV file, it has only:

`cui` - The concept unique identifier, this is simply an `ID` in your database

`str` - String/Name of that concept. It is important to write all possible names and abbreviations for a concept of interest.

If you have one concept that has multiple different names (like the one above with cui=7), you can simply add multiple rows with the same concept ID and MedCAT will merge them during the build phase.

## The Full CSV Specification
```
cui,str,onto,tty,tui,sty,desc,is_unique,examples
1,Kidney Failure|failure of kidneys|KF,SNOMED,PN,T047,Disease,Description of the concept,,The patient was diagnosed with kidney failure
.
.
.
```
These fields are optional; any of them can be included in or left out of your CSV.

`onto` - Source ontology, e.g. HPO, SNOMED, HPC,...

`tty` - Term type, e.g. PN - Primary Name. Primary names are important and I would always recommend adding this field when creating your CDB.

`tui` - Semantic type identifier - e.g. T047 (taken from UMLS). A list of all semantic types can be found [here](https://metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt). 

`sty` - Semantic type - e.g. Disease

`desc` - Description of this concept

`examples` - Examples of this concept in a sentence (use short examples, not whole documents).


***Note***: If one concept has multiple names, you can also separate the different names by a "|" - pipe - symbol 

In [41]:
# Let's try building the concept database from a simple CSV
prep_cdb = PrepareCDB(vocab=vocab)

# Create a list of paths to the CSV files that will be used to build our CDB
paths = [DATA_DIR + 'cdb_simple.csv']
cdb = prep_cdb.prepare_csvs(paths)

Done: 0


In [42]:
# Show, for every cui, the set of names collected for it from the CSV
print(cdb.cui2original_names)

{'1': {'kidney failure'}, '7': {'CoVid 2', 'coronavirus'}}


In [43]:
# A new PrepareCDB instance is created for this second build
prep_cdb = PrepareCDB(vocab=vocab)

# To build from the advanced CSV example
paths = [DATA_DIR + 'cdb_advanced.csv']
cdb = prep_cdb.prepare_csvs(paths)

Done: 0


**That is all, nothing else is necessary to build the CDB**

---

Some useful functions of the cdb are below

In [44]:
# To display all cuis and names in the db - note that MedCAT merged the names,
# so every name ends up in the set belonging to its cui
print(cdb.cui2original_names)

{'1': {'KF', 'Failure of Kidneys', 'K. Failure', 'Kidney Failure'}}


In [45]:
# We have a link from tui to cuis (each tui maps to a set of cuis)
print(cdb.tui2cuis)

{'T047': {'1'}}


In [46]:
# Or vice versa - from cui to tui (one tui string per cui)
print(cdb.cui2tui)

{'1': 'T047'}


In [47]:
# Description is also there
# NOTE(review): the recorded output ends in "\n\nnan" - it looks like an empty desc
# cell in the CSV was turned into the string "nan" and appended; confirm whether
# that is intended.
print(cdb.cui2desc)

{'1': 'Description of the concept\n\nnan'}


### Save the model

In [48]:
# Write the concept database to a file inside DATA_DIR
cdb.save_dict(DATA_DIR + "cdb.dat")

### Load the model

In [49]:
# Create an empty CDB and load the saved file from DATA_DIR into it
cdb = CDB()
cdb.load_dict(DATA_DIR + "cdb.dat")

# End

This is everything you need to create your own MedCAT models. In the next notebook you will see how to train and use these models to annotate documents. 