In [53]:
from medcat.neo.data_preparation import *
from medcat.neo.neo_connector import NeoConnector
import pandas as pd
import json 
from medcat.cat import CAT

# Helper for sending requests to neo.
# Connects to the local Bolt endpoint as user 'neo4j'. No password is passed here;
# judging by the "Password:" prompt in the cell output it is requested interactively.
neo = NeoConnector('bolt://localhost:7687/', user='neo4j')

  from tqdm.autonotebook import tqdm


Password:········


# Download data

In [113]:
!wget https://medcat.rosalind.kcl.ac.uk/media/twds/neo/patients.csv
!wget https://medcat.rosalind.kcl.ac.uk/media/twds/neo/documents.csv
!wget https://medcat.rosalind.kcl.ac.uk/media/medmen_wstatus_2021_oct.zip

--2021-12-07 17:45:57--  https://medcat.rosalind.kcl.ac.uk/media/twds/neo/patients.csv
Resolving medcat.rosalind.kcl.ac.uk (medcat.rosalind.kcl.ac.uk)... 193.61.202.225
Connecting to medcat.rosalind.kcl.ac.uk (medcat.rosalind.kcl.ac.uk)|193.61.202.225|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3237 (3.2K) [text/csv]
Saving to: ‘patients.csv.1’


2021-12-07 17:45:57 (14.9 MB/s) - ‘patients.csv.1’ saved [3237/3237]

--2021-12-07 17:45:57--  https://medcat.rosalind.kcl.ac.uk/media/twds/neo/documents.csv
Resolving medcat.rosalind.kcl.ac.uk (medcat.rosalind.kcl.ac.uk)... 193.61.202.225
Connecting to medcat.rosalind.kcl.ac.uk (medcat.rosalind.kcl.ac.uk)|193.61.202.225|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3590186 (3.4M) [text/csv]
Saving to: ‘documents.csv.1’


2021-12-07 17:45:57 (14.8 MB/s) - ‘documents.csv.1’ saved [3590186/3590186]

--2021-12-07 17:45:58--  https://medcat.rosalind.kcl.ac.uk/media/medmen_wstatus_2021_oct.z

# Annotate documents

In [71]:
# Read the documents and pair every documentId with its text.
# With a large number of documents this list would be replaced by a generator.
df_docs = pd.read_csv('./documents.csv')
id_text_rows = df_docs[['documentId', 'text']].values
data = list(map(tuple, id_text_rows))

In [72]:
# Load the MedCAT model pack and annotate the (documentId, text) pairs in `data`
# using 10 processes.
cat = CAT.load_model_pack('./medmen_wstatus_2021_oct.zip')
docs = cat.multiprocessing(data, nproc=10)
# Persist the annotations. The context manager guarantees the file is flushed and
# closed before it is read back when the annotations are imported.
with open("./annotations.json", 'w') as annotations_file:
    json.dump(docs, annotations_file)

Unziping the model pack and loading models.


Annotated until now: 0 docs; Current BS: 1088 docs; Elapsed time: 0.00 minutes


# Create indexes

In [75]:
# The index-creation queries come pre-defined from the data_preparation helper
for index_query in get_index_queries():
    try:
        neo.execute(index_query)
    except Exception as err:
        # Best effort: report the failure (e.g. the index already exists) and carry on
        print(err)

[Schema.EquivalentSchemaRuleAlreadyExists] An equivalent index already exists, 'Index( id=3, name='patientId', type='GENERAL BTREE', schema=(:Patient {patientId}), indexProvider='native-btree-1.0' )'.
[Schema.EquivalentSchemaRuleAlreadyExists] An equivalent index already exists, 'Index( id=4, name='conceptId', type='GENERAL BTREE', schema=(:Concept {conceptId}), indexProvider='native-btree-1.0' )'.
[Schema.EquivalentSchemaRuleAlreadyExists] An equivalent index already exists, 'Index( id=5, name='documentId', type='GENERAL BTREE', schema=(:Document {documentId}), indexProvider='native-btree-1.0' )'.


# Import Patients

In [76]:
# NOTE: the generated CSV must end up in /var/lib/neo4j/import/, otherwise the
# import query executed in the next cell will not work. By default that folder
# requires admin privileges to write to; the easiest option is to set output_dir
# to some other location and copy the file over with sudo afterwards. Alternatively,
# change the permissions on the folder so that Python can write into it.
df_pts = pd.read_csv('./patients.csv')
q = create_patients_csv(df_pts, output_dir='/var/lib/neo4j/import/')

In [77]:
# Run the import query `q` returned by create_patients_csv in the previous cell
neo.execute(q)

# Import Concepts

In [78]:
# Collect one (conceptId, name, type) row per concept; every CUI in the CDB is used, always
data = [['conceptId', 'name', 'type']]
cuis = set()
for cui in cat.cdb.cui2names:
    cuis.add(cui)
    # Take the first type id listed for the concept, or 'unk' when it has none
    type_id = next(iter(cat.cdb.cui2type_ids[cui]), 'unk')
    # Fall back to the raw type id when no readable name is registered for it
    type_name = cat.cdb.addl_info['type_id2name'].get(type_id, type_id)
    data.append([str(cui), str(cat.cdb.get_name(cui)), str(type_name)])
# The first entry of `data` is the header, the rest are the rows
header, *rows = data
df_concepts = pd.DataFrame(rows, columns=header)
q = create_concepts_csv(df_concepts, output_dir='/var/lib/neo4j/import/')

In [79]:
# Run the import query `q` returned by create_concepts_csv in the previous cell
neo.execute(q)

# [ONLY IF USING SNOMED - SKIP THIS SECTION FOR MEDMENTIONS] Import SNOMED ontology

In [50]:
# Flatten the parent -> children mapping into (child, parent) rows,
# leaving out any concept that is listed as its own child
child_parent_rows = [
    [child, parent]
    for parent, children in cat.cdb.addl_info['pt2ch'].items()
    for child in children
    if child != parent
]
data = pd.DataFrame(child_parent_rows, columns=['child', 'parent'])

In [51]:
# NOTE: the generated CSV has to be saved/copied into /var/lib/neo4j/import/,
# otherwise the query executed in the next cell will not work.
# `data` is the (child, parent) DataFrame built in the previous cell.
q = create_concept_ontology_csv(data, output_dir='/var/lib/neo4j/import/')

In [52]:
# Run the import query `q` returned by create_concept_ontology_csv in the previous cell
neo.execute(q)

# Import documents

In [80]:
# Re-read the documents (the same file is also loaded in the annotation section) and
# write the CSV for the import; `q` is the query that is executed in the next cell.
# NOTE: as for the other imports, the CSV must be located in /var/lib/neo4j/import/.
df_docs = pd.read_csv('./documents.csv')
q = create_documents_csv(df_docs, output_dir='/var/lib/neo4j/import/')

In [81]:
# Run the import query `q` returned by create_documents_csv in the previous cell
neo.execute(q)

# Make patient2document

In [82]:
# NOTE: the generated CSV has to be saved/copied into /var/lib/neo4j/import/,
# otherwise the query executed in the next cell will not work.
# The document-to-patient links are derived from the same `df_docs` frame.
q = create_document2patient_csv(df_docs, output_dir='/var/lib/neo4j/import/')

In [83]:
# Run the import query `q` returned by create_document2patient_csv in the previous cell
neo.execute(q)

# Import annotations

In [84]:
# Lookup tables from document id to patient id and to chart date.
# Both tables normalise the key the same way, str(int(documentId)), so they always
# agree with each other even if `.values` upcasts the ids to floats (which would
# otherwise produce keys such as '12.0' in one table and '12' in the other).
# NOTE(review): string keys presumably match the document ids in annotations.json - confirm.
doc2pt = {
    str(int(doc)): patient
    for doc, patient in df_docs[['documentId', 'patientId']].values
}
doc2time = {
    str(int(doc)): chartdate
    for doc, chartdate in df_docs[['documentId', 'chartdate']].values
}

In [88]:
# Read the saved annotations back; the context manager closes the file handle
# instead of leaving it open until garbage collection.
with open('./annotations.json', 'r') as annotations_file:
    docs = json.load(annotations_file)
# get_data_from_docs returns a table whose first entry is the header row
data = get_data_from_docs(docs, doc2pt, doc2time)
data = pd.DataFrame(data[1:], columns=data[0])
# NOTE: the generated CSV must be located in /var/lib/neo4j/import/ for the import to work
q = create_document2concept_csv(data, output_dir='/var/lib/neo4j/import/')

In [89]:
# Run the import query `q` returned by create_document2concept_csv in the previous cell
neo.execute(q)

# Queries

In [96]:
# Return up to 10 patients that have both sleep apnea (C0520679) and fever (C0015967).
# The call returns the matching patients together with the query that was used.
# NOTE(review): the effect of ignore_meta=True is not visible here - see NeoConnector.
patients, q = neo.get_all_patients(concepts=['C0520679', 'C0015967'], limit=10, ignore_meta=True)

In [98]:
# Print the query text returned by get_all_patients
print(q)

WITH ['C0520679','C0015967'] AS cs MATCH (c:Concept)<-[:HAS ]-(:Document)<-[:HAS]-(pt:Patient)
            WHERE c.conceptId in cs
            WITH pt, size(cs) as inputCnt, count(DISTINCT c) as cnt
            WHERE cnt = inputCnt
             RETURN pt LIMIT 10


In [99]:
# Display the patients returned by the query above
patients

['1', '32', '281', '92', '295', '284', '133', '269', '166', '160']

In [111]:
# Get all concepts from one patient (patientId '166'); the call returns the result
# together with the query, like get_all_patients.
# NOTE(review): bucket_size_seconds=10**10 is presumably chosen large enough to put all
# of the patient's annotations into a single time bucket - confirm against NeoConnector.
stream, q = neo.get_all_concepts_from(patient_id='166', bucket_size_seconds=10**10)

In [112]:
# Tabulate the returned entities and display the first 20 rows in timestamp order
entities_df = pd.DataFrame(stream['entities'])
entities_df.sort_values('timestamp').head(20)

Unnamed: 0,name,conceptId,type,documentId,start,end,contextSimilarity,timestamp,count
0,Actual Positive Comfort,C2712134,Finding,1176,1388,1399,0.228076,3124224000,2
314,Male Circumcision,C0008819,Therapeutic or Preventive Procedure,1175,2654,2665,0.364103,3124224000,1
313,Complete,C0205197,Qualitative Concept,1175,3657,3665,0.520146,3124224000,1
312,Prostate-Specific Antigen,C0138741,Immunologic Factor,1175,3489,3514,0.443968,3124224000,5
311,Onset of (contextual qualifier),C0332162,Qualitative Concept,1175,287,292,0.360686,3124224000,1
310,respiratory,C0521346,Functional Concept,1175,1499,1510,0.368566,3124224000,1
309,Signs and Symptoms,C0037088,Sign or Symptom,1175,152,170,1.0,3124224000,2
308,abdominal organ,C0446633,"Body Part, Organ, or Organ Component",1175,1929,1945,1.0,3124224000,1
307,Finding,C0243095,Finding,1175,3575,3583,0.563521,3124224000,3
306,Respiratory Diaphragm,C0011980,"Body Part, Organ, or Organ Component",1175,1631,1644,0.202807,3124224000,1
