In [1]:
! pip install medcat==0.3.3.1
# Get the scispacy model
! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_md-0.2.4.tar.gz

Collecting medcat==0.3.3.1
[?25l  Downloading https://files.pythonhosted.org/packages/1d/98/0d428ff2eb81fb18b91d56020692ab1ee9e9d4b541c2945833bcd15c23c4/medcat-0.3.3.1-py3-none-any.whl (56kB)
[K     |████████████████████████████████| 61kB 1.8MB/s 
[?25hCollecting torchvision~=0.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/7e/90/6141bf41f5655c78e24f40f710fdd4f8a8aff6c8b7c6f0328240f649bdbe/torchvision-0.5.0-cp36-cp36m-manylinux1_x86_64.whl (4.0MB)
[K     |████████████████████████████████| 4.0MB 7.1MB/s 
Collecting tokenizers~=0.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/73/de/ec55e2d5a8720557b25100dd7dd4a63108a44b6b303978ce2587666931cf/tokenizers-0.6.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 46.7MB/s 
[?25hCollecting gensim~=3.7
[?25l  Downloading https://files.pythonhosted.org/packages/2b/e0/fa6326251692056dc880a64eb22117e03269906ba55a6864864d24ec8b4e/gensim-3.8.3-cp36-cp36m-manylinux1_x86_64

**If you are running on Colab, restart the runtime now; this is sometimes necessary after installing models.**

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from spacy import displacy

from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.utils.vocab import Vocab

  import pandas.util.testing as tm


In [2]:
# Local folder that holds the downloaded model files.
DATA_DIR = "./data/"
# DATA_DIR already ends with "/", so the file names are appended directly.
vocab_path = f"{DATA_DIR}vocab.dat"
cdb_path = f"{DATA_DIR}cdb-medmen.dat"

In [3]:
# Download the models and required data 
!wget https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat -P ./data/
!wget https://s3-eu-west-1.amazonaws.com/zkcl/cdb-medmen.dat -P ./data/
###!wget https://raw.githubusercontent.com/CogStack/MedCAT/master/tutorial/data/pt_notes.csv -P ./data/

--2020-06-17 21:37:19--  https://s3-eu-west-1.amazonaws.com/zkcl/vocab.dat
Resolving s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)... 52.218.21.170
Connecting to s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)|52.218.21.170|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 274445907 (262M) [application/x-www-form-urlencoded]
Saving to: ‘./data/vocab.dat’


2020-06-17 21:37:29 (28.5 MB/s) - ‘./data/vocab.dat’ saved [274445907/274445907]

--2020-06-17 21:37:30--  https://s3-eu-west-1.amazonaws.com/zkcl/cdb-medmen.dat
Resolving s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)... 52.218.49.68
Connecting to s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)|52.218.49.68|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 180335834 (172M) [application/x-www-form-urlencoded]
Saving to: ‘./data/cdb-medmen.dat’


2020-06-17 21:37:37 (27.2 MB/s) - ‘./data/cdb-medmen.dat’ saved [180335834/180335834]

--2020-06-17 21:37:38

## Loading models and preparing MedCAT

In [4]:
# Instantiate the two MedCAT resources first, then fill each one from its
# downloaded file.
cdb = CDB()      # Concept Database
vocab = Vocab()  # Vocabulary

cdb.load_dict(cdb_path)
vocab.load_dict(vocab_path)

In [5]:
# Build the CAT object - the main class from medcat used for concept
# annotation - from the concept database and the vocabulary.
cat = CAT(cdb=cdb, vocab=vocab)

# These parameters are usually set via environment variables; here they are set
# explicitly on the annotator. Each option is described in the MedCAT
# repository: https://github.com/CogStack/MedCAT
annotator = cat.spacy_cat
annotator.PREFER_FREQUENT = True
annotator.PREFER_ICD10 = False
annotator.WEIGHTED_AVG = True
annotator.MIN_CONCEPT_LENGTH = 3  # Ignore concepts (diseases) <= 3 characters
annotator.MIN_ACC = 0.2  # Confidence cut-off; everything below it will not be displayed

## Testing NER+L (named entity recognition and linking)

In [None]:
# First sample abstract (vitamin D3 and breast cancer trial). The literal is
# split one sentence per line; adjacent string literals are concatenated by
# Python, so the resulting text is a single line exactly as before.
text = (
    "Epidemiologic and preclinical data suggest a potential role for vitamin D in breast cancer treatment and prevention. "
    "However, results of prospective randomized trials are inconsistent. "
    "The objective of this study was to assess the effects of high-dose cholecalciferol (vitamin D3) on breast tumour proliferation and apoptosis. "
    "We conducted a prospective, randomized, phase 2, double-blinded pre-surgical window of opportunity trial. "
    "Newly diagnosed breast cancer patients were randomized to receive 40,000 IU of vitamin D3 per day or placebo for 2 to 6 weeks prior to breast surgery. "
    "The primary outcome was the relative change in proliferation (Ki67) and apoptosis (cleaved caspase 3 apoptotic assay [CC3]) in primary breast cancer cells pre and post treatment. "
    "Of 83 patients randomized, 80 completed the study (43 (53.8%) vitamin D and 37 (46.3%) placebo). "
    "Mean duration of drug intake was 19 days (range 9-28 days). "
    "There were no significant differences between the control arm and the vitamin D arm in percent changes of either Ki67 index (1.6% vs. 16.7%, p = 0.25) or CC3 (- 55.9% vs. - 45.9%, p = 0.28). "
    "Serum 25-hydroxyvitamin D (25-OHD) levels were 3 times higher in the vitamin D arm (62 nmol/L vs. 246 nmol/L, p < 0.001). "
    "Adverse effects were minimal and all classified as grade 1. "
    "Despite significantly higher levels of serum 25-OHD in the vitamin D-treated group, this was not associated with any significant effects on tumour proliferation or apoptosis. "
    "These findings are consistent with the lack of benefit observed in prospective prevention trials."
)

# Run the MedCAT pipeline on the abstract and list the entities it detected.
doc = cat(text)
print(doc.ents)

(Epidemiologic, preclinical data, suggest, potential role, vitamin D, breast, cancer treatment, prevention, results, prospective, randomized trials, inconsistent, objective, study, assess, effects of, high-dose, vitamin D3, breast, tumour, proliferation, apoptosis, conducted, prospective, randomized, phase 2, double-blinded, pre-surgical, window of opportunity, trial, Newly diagnosed, breast, cancer patients, randomized, receive, vitamin D3, per day, placebo, weeks, prior to, breast, surgery, primary outcome, relative, change in, proliferation, Ki67, apoptosis, cleaved, caspase 3, apoptotic, assay, primary breast cancer, cells, post treatment, patients, randomized, completed, study, vitamin D, placebo, Mean duration, drug, intake, days, range, days, no significant differences, control arm, vitamin D, percent changes, index, Serum, 25-hydroxyvitamin D, levels, times, higher, vitamin D, Adverse effects, classified, grade 1, significantly higher, levels of, serum, vitamin D, treated, grou

In [None]:
# Print every detected entity together with its CUI (ID), its TUI and the
# name that the concept database maps that TUI to.
for entity in doc.ents:
    tui = entity._.tui
    print(entity, entity._.cui, tui, cdb.tui2name[tui], sep="  -  ")

Epidemiologic  -  C0014508  -  T169  -  Functional Concept
preclinical data  -  C1516606  -  T170  -  Intellectual Product
suggest  -  C1705535  -  T078  -  Idea or Concept
potential role  -  C3245505  -  T080  -  Qualitative Concept
vitamin D  -  C0042866  -  T109  -  Organic Chemical
breast  -  C0006141  -  T023  -  Body Part, Organ, or Organ Component
cancer treatment  -  C0920425  -  T061  -  Therapeutic or Preventive Procedure
prevention  -  C1456501  -  T080  -  Qualitative Concept
results  -  C0243095  -  T033  -  Finding
prospective  -  C0205556  -  T080  -  Qualitative Concept
randomized trials  -  C0206035  -  T062  -  Research Activity
inconsistent  -  C0442809  -  T080  -  Qualitative Concept
objective  -  C0018017  -  T170  -  Intellectual Product
study  -  C0031928  -  T062  -  Research Activity
assess  -  C1516048  -  T052  -  Activity
effects of  -  C1704420  -  T080  -  Qualitative Concept
high-dose  -  C0178602  -  T081  -  Quantitative Concept
vitamin D3  -  C0006674

In [None]:
# Print only the semantic type (TUI) next to each entity.
for entity in doc.ents:
    print(entity, entity._.tui, sep="  -  ")

Epidemiologic  -  T169
preclinical data  -  T170
suggest  -  T078
potential role  -  T080
vitamin D  -  T109
breast  -  T023
cancer treatment  -  T061
prevention  -  T080
results  -  T033
prospective  -  T080
randomized trials  -  T062
inconsistent  -  T080
objective  -  T170
study  -  T062
assess  -  T052
effects of  -  T080
high-dose  -  T081
vitamin D3  -  T109
breast  -  T191
tumour  -  T191
proliferation  -  T043
apoptosis  -  T043
conducted  -  T169
prospective  -  T080
randomized  -  T062
phase 2  -  T062
double-blinded  -  T062
pre-surgical  -  T061
window of opportunity  -  T080
trial  -  T062
Newly diagnosed  -  T080
breast  -  T023
cancer patients  -  T101
randomized  -  T062
receive  -  T080
vitamin D3  -  T109
per day  -  T079
placebo  -  T122
weeks  -  T079
prior to  -  T079
breast  -  T191
surgery  -  T061
primary outcome  -  T080
relative  -  T080
change in  -  T169
proliferation  -  T043
Ki67  -  T116
apoptosis  -  T043
cleaved  -  T067
caspase 3  -  T116
apoptotic  - 

In [None]:
# We can also show the entities in a nicer way using displaCy from spaCy.
# displacy is imported once, in the imports cell at the top of the notebook.
displacy.render(doc, style='ent', jupyter=True)

In [6]:
# Second sample abstract (QuitIT smoking-cessation app pilot trial).
# The original literal contained a raw line break after "episodes out of 10. ",
# which a double-quoted string literal cannot span. The literal is now built
# from adjacent per-sentence strings and the line break is written as an
# explicit "\n" escape, so the cell parses and the text keeps its break.
text2 = (
    "Although smoking cessation apps have become popular, few have been tested in randomized clinical trials or undergone formative evaluation with target users. "
    "We developed a cessation app targeting tobacco-dependent cancer patients. "
    "Game design and behavioral rehearsal principles were incorporated to help smokers identify, model, and practice coping strategies to avoid relapse to smoking. "
    "In this randomized pilot trial, we examined feasibility (recruitment and retention rates), acceptability (patient satisfaction), quitting self-confidence, and other cessation-related indices to guide the development of a larger trial. "
    "We randomized 42 English-speaking cancer patients scheduled for surgical treatment to either the Standard Care (SC; telecounseling and cessation pharmacotherapies) or the experimental QuitIT study arm (SC and QuitIT game). "
    "Gameplay parameters were captured in-game; satisfaction with the game was assessed at 1-month follow-up. "
    "We report study screening, exclusion, and refusal reasons; compare refusal and attrition by key demographic and clinical variables; and report tobacco-related outcomes. "
    "Follow-up data were collected from 65% (13/20) patients in the QuitIT and 61% (11/18) in SC arms. "
    "Study enrollees were 71% (27/38) females, 92% (35/38) white people, and 95% (36/38) non-Hispanic people. "
    "Most had either lung (12/38, 32%) or gastrointestinal (9/38, 24%) cancer. "
    "Those dropping out were less likely than completers to have used a tablet (P<.01) and have played the game at all (P=.02) and more likely to be older (P=.05). "
    "Of 20 patients in the QuitIT arm, 40% (8/20) played the game (system data). "
    "There were no differences between those who played and did not play by demographic, clinical, technology use, and tobacco-related variables. "
    "Users completed an average of 2.5 (SD 4.0) episodes out of 10. \n"
    "A nonsignificant trend was found for increased confidence to quit in the QuitIT arm (d=0.25, 95% CI -0.56 to 1.06), and more participants were abstinent in the QuitIT group than in the SC arm (4/13, 30%, vs 2/11, 18%). "
    "Satisfaction with gameplay was largely positive, with most respondents enjoying use, relating to the characters, and endorsing that gameplay helped them cope with actual smoking urges. "
    "Recruitment and retention difficulties suggest that the perihospitalization period may be a less than ideal time for delivering a smoking cessation app intervention. "
    "Framing of the app as a \"game\" may have decreased receptivity as participants may have been preoccupied with hospitalization demands and illness concerns. "
    "Less tablet experience and older age were associated with participant dropout. "
    "Although satisfaction with the gameplay was high, 60% (12/20) of QuitIT participants did not play the game. "
    "Paying more attention to patient engagement, changing the intervention delivery period, providing additional reward and support for use, and improving cessation app training may bolster feasibility for a larger trial."
)

# Run the MedCAT pipeline on the abstract and list the entities it detected.
doc2 = cat(text2)
print(doc2.ents)

(smoking cessation, apps, tested, randomized clinical trials, undergone, formative, evaluation, target, users, developed, cessation, targeting, tobacco, dependent, cancer patients, Game, design, behavioral, principles, incorporated, help, smokers, identify, model, practice, coping strategies, avoid, relapse, smoking, randomized, pilot trial, examined, feasibility, recruitment, retention, rates, acceptability, patient satisfaction, self-confidence, cessation, related, indices, guide, development, larger, trial, randomized, English, speaking, cancer patients, scheduled, surgical treatment, Standard Care, cessation, experimental, study arm, game, Gameplay, parameters, captured, game, satisfaction, game, assessed, 1-month, follow-up, report, study, screening, exclusion, refusal, reasons, compare, demographic, clinical variables, report, tobacco, related, outcomes, Follow-up, data were collected, patients, arms, Study, females, white, non-Hispanic, people, lung, gastrointestinal, cancer, dr

In [7]:
# We can also show the entities in a nicer way using displaCy from spaCy.
# displacy is imported once, in the imports cell at the top of the notebook.
displacy.render(doc2, style='ent', jupyter=True)

In [8]:
# Third sample abstract (exercise program during adjuvant breast cancer
# treatment). The literal is split one sentence per line; adjacent string
# literals are concatenated by Python, so the resulting text is unchanged.
text3 = (
    "Lack of physical activity (PA), weight gain, and overweight have been associated with increased risk of recurrence and mortality after breast cancer diagnosis. "
    "We evaluated the feasibility of implementing an individualized exercise program and nutritional counseling during adjuvant treatment of localized invasive breast cancer. "
    "Sixty-one patients eligible for adjuvant chemotherapy were randomized 2:1 to receive a 6-month program of weekly aerobic exercises associated with nutritional counseling (n = 41) or usual care with nutritional counseling (n = 20, one withdrawal). "
    "The primary endpoints were the proportion of patients compliant with two weekly supervised sessions and their overall adherence (i.e., proportion of supervised and unsupervised sessions completed versus planned sessions). "
    "Ten percent of patients in the intervention group were compliant with the two weekly supervised sessions for 6 months, but the overall median adherence rate was 85% of supervised and non-supervised sessions completed. "
    "Non-adherence was mainly due to intrinsic reasons (medical, organizational, psychological barriers). "
    "Adherence was positively associated with education and baseline PA level and inversely associated with baseline weight and tumor grade. "
    "No statistically significant benefits were observed in the intervention group, even if overall PA level and body composition improved and anthropometrics were maintained over time (p < 0.05). "
    "Overall, there was good adherence with the 6-month exercise program during adjuvant treatment for breast cancer, despite poor compliance to twice-weekly supervised sessions. "
    "This study highlights the need for flexible exercise modalities and innovative experimental design to reach patients who would most adhere and benefit from intervention."
)

# Run the MedCAT pipeline on the abstract and list the entities it detected.
doc3 = cat(text3)
print(doc3.ents)

(Lack of physical activity, weight gain, overweight, associated with, increased, risk of recurrence, mortality, breast cancer diagnosis, evaluated, feasibility, implementing, individualized, exercise, program, nutritional, counseling, adjuvant treatment, localized, invasive breast cancer, patients, eligible, adjuvant chemotherapy, randomized, receive, 6-month, program, weekly, aerobic exercises, associated with, nutritional, counseling, usual care, nutritional, counseling, withdrawal, primary endpoints, proportion of patients, compliant, weekly, supervised, sessions, overall, adherence, proportion, supervised, sessions, completed, planned, sessions, percent, patients, intervention group, compliant, weekly, supervised, sessions, 6 months, overall, median, rate, supervised, sessions, completed, mainly, due to, reasons, medical, organizational, psychological, barriers, Adherence, positively, associated with, education, baseline, level, inversely associated, baseline, weight, tumor grade, 

In [9]:
# Show the entities of the third abstract using displaCy from spaCy.
# displacy is imported once, in the imports cell at the top of the notebook.
displacy.render(doc3, style='ent', jupyter=True)