# Now let's start extracting concepts from unstructured text!


In [None]:
# Install medcat into the local environment
! pip install medcat==1.2.7

**Restart the runtime if on colab, sometimes necessary after installing models**

In [None]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns

from matplotlib import pyplot as plt
from medcat.cat import CAT

In [None]:
# Directory that holds the downloaded inputs and every file this notebook writes
DATA_DIR = "./data/"
# Location of the pre-built model pack inside DATA_DIR
model_pack_path = f"{DATA_DIR}medmen_wstatus_2021_oct.zip"

In [None]:
# Download the models and required data
!wget https://medcat.rosalind.kcl.ac.uk/media/medmen_wstatus_2021_oct.zip -P ./data/
!wget https://raw.githubusercontent.com/CogStack/MedCATtutorials/main/notebooks/introductory/data/pt_notes.csv -P ./data/

## Loading the MedCAT modelpack

In [None]:
# Load the downloaded model pack and create CAT - the main class from medcat used for
# concept annotation. The resulting `cat` object is reused by the cells that follow.
cat = CAT.load_model_pack(model_pack_path)


## Testing Named Entity Recognition + Linking (NER+L)

In [None]:
# Annotate one short example sentence; `doc` is reused by the next few cells
text = "He was diagnosed with kidney failure"
doc = cat(text)
# Show the entities that were detected in the sentence
print(doc.ents)

In [None]:
# To see all entities and corresponding meta-data for the same example sentence
# (here the raw string is passed to get_entities instead of reusing `doc`)
cat.get_entities("He was diagnosed with kidney failure", only_cui=False)

In [None]:
# Print every detected entity next to the CUI (concept ID) it was linked to
for entity in doc.ents:
    entity_cui = entity._.cui
    print(entity, " - ", entity_cui)

In [None]:
# Print every detected entity next to the type_ids the CDB stores for its CUI
# (.get returns None when the CDB has no entry for that CUI)
for entity in doc.ents:
    entity_type_ids = cat.cdb.cui2type_ids.get(entity._.cui)
    print(entity, " - ", entity_type_ids)

In [None]:
# We can also show the entities in a nicer way using displacy from spaCy.
# `doc` is the annotated example sentence created a few cells above.
from spacy import displacy
displacy.render(doc, style='ent', jupyter=True)

## Load the data

In [None]:
# Load the dataset - the one we saved in the last step of Part 2.
# If using MIMIC-III this is a very big dataset, be careful with memory requirements.
notes_csv_path = DATA_DIR + "pt_notes.csv"
data = pd.read_csv(notes_csv_path)

In [None]:
# Preview the first rows of the dataset.
# In this step we are only interested in the *text* column.
data.head()

## Run unsupervised training

In [None]:
# Unsupervised training is the first step when using MedCAT.
# The process is explained in full in the paper:
# https://doi.org/10.1016/j.artmed.2021.102083

# CDB statistics before training (the medmen cdb already comes with some training)
cat.cdb.print_stats()

# Every document in the *text* column is used for training;
# with a large number of documents this can take quite some time.
train_texts = data['text'].values
print()
print(f"There are {len(data['text'])} documents to train...")

cat.train(train_texts, progress_print=100)

# CDB statistics after training, for comparison with the numbers printed above
cat.cdb.print_stats()

## Save the new MedCAT modelpack

In [None]:
# Save the model after unsupervised training into DATA_DIR/unsupervised_trained_model_pack.
# The returned name is kept because a later cell uses it to build the path of the saved .zip.
model_pack_name = cat.create_model_pack(DATA_DIR + "unsupervised_trained_model_pack")

# Annotate Documents

For this task we will annotate documents using the UMLS unsupervised model created above.

The Type_ids in this CDB correspond to the Semantic Type(TUI) found within UMLS

In [None]:
# Point at the .zip written by create_model_pack and load it, replacing the previous `cat`
model_pack_dir = DATA_DIR + "unsupervised_trained_model_pack/"
model_pack_path_2 = model_pack_dir + model_pack_name + ".zip"
cat = CAT.load_model_pack(model_pack_path_2)

We are not interested in all medical concepts that exist in UMLS, that is why we will filter by Semantic Type (TUI) to only:
1. T047 - Disease or Syndrome
2. T048 - Mental or Behavioral Dysfunction

It is possible to filter after the detection process, but it is always better to define the filter as part of MedCAT.

A full list of semantic types in UMLS is available [here](https://metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt).

In [None]:
# IMPORTANT: restrict linking to the CUIs that belong to the selected TUIs/type ids
# |T047|Disease or Syndrome
# |T048|Mental or Behavioral Dysfunction
type_ids_filter = ['T047', 'T048']
type_id2cuis = cat.cdb.addl_info['type_id2cuis']
# Union of the CUIs listed under each selected type id
cui_filters = set().union(*(type_id2cuis[type_id] for type_id in type_ids_filter))
cat.cdb.config.linking['filters']['cuis'] = cui_filters
print(f"The size of the cdb is now: {len(cui_filters)}")


For each row in our dataframe `data` we want to get all diseases that appear in the `text` column. In UMLS a disease is defined by a CUI, so we want to have the following output after the annotation is done:
```
cui_location = {<CUI>: [<row_id>, <row_id>, ...], ...}
```

## Get entities from unstructured text
We can annotate a portion of text.

In [None]:
# Annotate a single sentence; the type_ids filter set above is still active on `cat`
annotated_text = cat.get_entities("This patient suffers from diabetes.")

In [None]:
# Display the annotation result produced by the previous cell
annotated_text

There is one concept Entity extracted from the sample text above. Feel free to try it out for yourself by altering the text.

Don't forget that we applied the type_ids filter for [T047 & T048] so only concepts which fall within these categories will be shown.

## Use Multiprocessing

We can annotate documents one by one using the `annotated_document = cat(text)` option, but it is much faster to use the `multiprocessing` method from medcat.

When using multi_processing medcat requires the following format for input documents:
```
in_data = [(doc_id, doc_text), (doc_id, doc_text), ...]
```
The output is (return from medcat):
```
out_data = {doc_id: {'entities': {...}, 'text': <...>}, doc_id: {'entities': {...}, 'text': <...>}, ...}
```

NOTE: If using the full UMLS CDB and a lot of processors (e.g. 32) it is very memory demanding (~ 32GB). The annotation process on 32 processors and 1M large text documents takes ~1h. 

In [None]:
# Let's test the multi processing function first
# Input format: a list of (doc_id, doc_text) tuples
in_data = [(1, "He was a diabetic patient")]
results = cat.multiprocessing(in_data, nproc=2)
results_pipe = cat.multiprocessing_pipe(in_data, nproc=2)
# Both entry points are expected to return the same annotations for the same input
assert results == results_pipe
results

The returned list of `entities` contains the following:

`acc` - Confidence score for this detection

`cui` - ID of the detected entity in the CDB (in our case UMLS)

`pretty_name` - The pretty name for this entity linked with the CUI

`detected_name` - What exact source value was detected

`type_ids` - The category code

`types` - Description label of the type_ids

`start` - The start character for the entity in the original string

`end` - End character for the entity in the original string

`id` - Internal ID, each entity inside a document has a unique ID


__Optional parameters which can also be set:__

The following can also be set to be returned during the creation of the MedCAT CDB within the model pack

`icd10` - If we are using a medical CDB, we'll also get ICD10 codes

`umls` - If the CDB was something other than UMLS, we would get the potential link to UMLS.

`snomed` - If we are using a medical CDB this would link to the equivalent SNOMED concept

In [None]:
# Number of (rows, columns) in the dataset before any filtering
data.shape

In [None]:
# Drop rows that are not worth annotating: keep only texts longer than 10 characters.
# str() is applied first so that non-string values are measured by their string form.
is_long_enough = data["text"].map(lambda note: len(str(note)) > 10)
data = data[is_long_enough]

In [None]:
# Format the df to match: in_data = [(doc_id, doc_text), (doc_id, doc_text), ...]
# The doc_id is the index label of the row in `data`.
# Built with zip rather than an iterrows loop so that no loop variables leak into the
# notebook namespace (the previous loop variable `id` shadowed the builtin id()).
in_data = list(zip(data.index, data['text']))

In [None]:
# Set a batch size to control for the variability between document sizes
batch_size_chars = 500000 # Batch size (BS) in number of characters

# Run model over all documents in in_data; the annotations are stored in `results`
results = cat.multiprocessing(in_data,  # Formatted data
                              batch_size_chars = batch_size_chars,
                              nproc=8) # Number of processors

To batch on the number of documents, you can use `multiprocessing_pipe` alternatively, which also supports Windows platforms:

In [None]:
# Set the batch size to the number of documents
batch_size = 10 # Batch size (BS) in number of documents

# Run model
# NOTE: this reassigns `results`, so from here on it only holds the annotations of the
# first 200 documents of in_data, not those produced by the previous cell.
results = cat.multiprocessing_pipe(in_data[:200], # Formatted data
                                   batch_size = batch_size,
                                   nproc=1) # Increase it when having more cores available

For a quick sanity check, let's inspect a document and the annotations that our unsupervised model has produced.

In [None]:
# Check one of the returned results, just in case:
# show the text of the first row (by position) of the filtered dataset
data.iloc[0]['text']

In [None]:
# See all annotations based on the filter set above.
# Notice that entities not present in the filter are filtered out.
# `results` is keyed by document id (the index label of the row in `data`), so look up the
# label of the first row instead of assuming that a document with id 0 survived the
# length filter applied earlier.
first_doc_id = data.index[0]
for annotation in results[first_doc_id]['entities'].values():
    print(annotation['cui'], annotation['pretty_name'])
    print()

For an unsupervised model, not too bad right?

Later we'll explore how we can boost the model's performance even further by providing annotation labels created via the annotation tool: MedCATtrainer!

In [None]:
# If we want to convert a CUI back to its name (C0041834 is used as the example CUI)
cat.cdb.cui2preferred_name['C0041834']

In [None]:
# To see all names stored in the concept db for the example CUI
cat.cdb.cui2names['C0041834']

In [None]:
# If we want to see the type_ids stored for the example CUI
cat.cdb.cui2type_ids['C0041834']

Now that you have created a structured annotation dataset, you can use it for any number of downstream applications!

But for now let's continue on with the task and explore the annotations by finding the UMLS cui and type_ids (TUI as it is from UMLS) and their corresponding list of documents in which they are mentioned.

In [None]:
# Map from CUI to the documents that mention it: {"cui": [<doc_id>, <doc_id>, ...], ..}
cui_location = dict()
# Same kind of map, keyed by type_ids instead of CUI
type_ids_location = dict()

In [None]:
# For the cui and their corresponding documents.
# The map is rebuilt from scratch so that re-running this cell does not append every
# document a second time, and the loop variable is named `doc_id` so that it does not
# overwrite the spaCy `doc` created earlier in the notebook.
cui_location = {}
for doc_id, doc_result in results.items():
    for annotation in doc_result['entities'].values():
        # One entry per annotation: a document is listed once for each mention of the CUI
        cui_location.setdefault(annotation['cui'], []).append(doc_id)


In [None]:
# For the type_ids and their corresponding documents.
# Remember that a cui may map to more than one type_ids (one to many mapping), and that
# many cuis share the same type_id: documents are therefore accumulated under every
# type_id of the cui instead of replacing what earlier cuis contributed.
type_ids_location = {}
for cui, doc_ids in cui_location.items():
    for type_id in cat.cdb.cui2type_ids[cui]:
        # extend() copies the ids, so the lists stored in cui_location are never shared
        type_ids_location.setdefault(type_id, []).extend(doc_ids)

In [None]:
# Save the data so that we don't have to do the annotation again.
# `with` closes (and flushes) each file as soon as the dump has finished.
with open(DATA_DIR + "cui_location.dat", 'wb') as out_file:
    pickle.dump(cui_location, out_file)
with open(DATA_DIR + "type_ids_location.dat", 'wb') as out_file:
    pickle.dump(type_ids_location, out_file)

In [None]:
# Load the maps saved by the previous cell.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open(DATA_DIR + "cui_location.dat", 'rb') as in_file:
    cui_location = pickle.load(in_file)
with open(DATA_DIR + "type_ids_location.dat", 'rb') as in_file:
    type_ids_location = pickle.load(in_file)

## Visualise the annotation frequency
Let's explore the annotation counts visually to double check that everything makes sense.

In [None]:
# We are going to count the number of subjects (patients) for each CUI
#   cui_subjects:        cui -> list of subject ids, one entry per recorded location
#   cui_subjects_unique: cui -> set of distinct subject ids
cui_subjects = {}
cui_subjects_unique = {}
for cui, doc_ids in cui_location.items():
    for doc_id in doc_ids:
        # The document ids are index *labels* of `data`. After short texts were filtered
        # out, labels no longer match row positions, so the lookup must be label-based
        # (.at) rather than positional (.iat).
        subject_id = data.at[doc_id, 'subject_id']
        cui_subjects.setdefault(cui, []).append(subject_id)
        cui_subjects_unique.setdefault(cui, set()).add(subject_id)

In [None]:
# One row per CUI with the number of distinct subjects it was found in
df_cui_nsubjects = pd.DataFrame(
    [(cui, len(subjects)) for cui, subjects in cui_subjects_unique.items()],
    columns=('cui', 'nsubjects'),
)

Add a couple of columns that can be useful

In [None]:
# Most frequent CUIs first
df_cui_nsubjects = df_cui_nsubjects.sort_values('nsubjects', ascending=False)

# Add type_ids for each CUI ('unk' when the CDB has no entry for the CUI)
df_cui_nsubjects['type_ids'] = df_cui_nsubjects['cui'].map(
    lambda cui: cat.cdb.cui2type_ids.get(cui, 'unk')
)

# Add name for each CUI ('unk' when the CDB has no preferred name for the CUI)
df_cui_nsubjects['name'] = df_cui_nsubjects['cui'].map(
    lambda cui: cat.cdb.cui2preferred_name.get(cui, 'unk')
)

# Add the percentage column, relative to all distinct subjects present in `data`
total_subjects = len(data['subject_id'].unique())
df_cui_nsubjects['perc_subjects'] = (df_cui_nsubjects['nsubjects'] / total_subjects) * 100

# Renumber the rows so that the index follows the sorted order
df_cui_nsubjects = df_cui_nsubjects.reset_index(drop=True)

In [None]:
# Inspect the first rows of the newly created df (sorted by nsubjects, highest first)
df_cui_nsubjects.head()

In [None]:
# Bar chart of the 30 most frequent diseases: one bar per disease name,
# bar length = percentage of patients in which it was found
sns.reset_defaults()
sns.set(
    style="whitegrid",
    palette='pastel',
    rc={'figure.figsize':(5,12)},
)
fig, ax = plt.subplots()
top_diseases = df_cui_nsubjects.iloc[0:30]
sns.barplot(
    data=top_diseases,
    x="perc_subjects",
    y="name",
    label="Disorder Name",
    color="b",
    ax=ax,
)
ax.set(xlim=(0, 70), ylabel="Disease Name", xlabel="Percentage of patients with disease")
plt.show()

# Preparing data for supervised training (MedCATtrainer)

As we want to do some fine-tuning on the models and also some meta-annotations (e.g. Status and Temporality), we'll take the top CUIs from each TUI (the top 40, set by `size` in the code below) and for each take 2 examples. We do this because randomly choosing documents would most likely give us only the concepts at the very top by frequency (especially when there is a large drop between the top concepts).

The required output format for supervised training via MedCATtrainer is CSV with `columns=(name, text)`

In [None]:
# Reminder of the table the training examples are selected from
df_cui_nsubjects.head()

In [None]:
# For each of the top `size` diseases / mental disorders take `nexamples` examples of
# documents/text where they appear
nexamples = 2
size = 40


def _sample_doc_ids(type_id):
    """Return randomly drawn document ids for the most frequent CUIs of one type id.

    For each of the first `size` CUIs in df_cui_nsubjects (already sorted by frequency)
    whose type_ids contain `type_id`, `nexamples` ids are drawn from cui_location[cui].
    Draws are made with replacement, so the same document can be picked more than once.
    """
    has_type = df_cui_nsubjects['type_ids'].apply(lambda x: type_id in x)
    doc_ids = []
    for cui in df_cui_nsubjects[has_type]['cui'].values[0:size]:
        candidates = np.array(cui_location[cui])
        doc_ids.extend(candidates[np.random.randint(0, len(candidates), nexamples)])
    return doc_ids


loc_047_top = _sample_doc_ids('T047')
loc_048_top = _sample_doc_ids('T048')

# The sampled ids are index *labels* of `data`; since rows were filtered out earlier they
# no longer match row positions, so select with .loc (label-based) instead of .iloc.
data_047_top = data.loc[loc_047_top]
data_048_top = data.loc[loc_048_top]

In [None]:
# Combine everything: stack the documents sampled for T047 and T048 into one dataframe
train_data = pd.concat([data_047_top, data_048_top])

In [None]:
# Preview the combined training documents
train_data.head()

In [None]:
# Shape the frame into the (name, text) layout: the 'Unnamed: 0_x' column becomes `name`,
# only the two required columns are kept and the rows are renumbered from 0
train_data = (
    train_data
    .rename(columns={'Unnamed: 0_x': "name"})[['name', 'text']]
    .reset_index(drop=True)
)
train_data.head()

In [None]:
# Keep a single row per distinct text, in case the same document was sampled twice
train_data = train_data.drop_duplicates(subset=['text'])

# Write the result as CSV (without the index) so it can be imported into MedCATtrainer
train_data_path = DATA_DIR + "train_data.csv"
train_data.to_csv(train_data_path, index=False)

End of Tutorial