# If in DEV mode

In [None]:
import sys
# Put the local MedCAT checkout at the front of sys.path so that the imports
# below resolve to it first (development mode only).
sys.path.insert(0, "/home/wish/projects/MedCAT/")

In [None]:
%load_ext autoreload
%autoreload 2

# Beginning

In [None]:
from medcat.neo.data_preparation import *
from medcat.neo.neo_connector import NeoConnector
import pandas as pd
import os
import pickle

# Create NEO

In [None]:
# Connect to the local Neo4j instance over Bolt as user 'neo4j'.
# NOTE(review): no password is passed here -- presumably NeoConnector obtains
# it some other way; confirm against NeoConnector.
neo = NeoConnector('bolt://localhost:7687/', user='neo4j')

# Create INDICES

The indices are needed to optimize the data load and some queries.

In [None]:
# Run every statement returned by get_index_queries(). A failing statement is
# printed instead of raised, so the remaining ones are still executed.
for index_query in get_index_queries():
    try:
        neo.execute(index_query)
    except Exception as err:
        print(err)

# Make a Patients CSV

This notebook loads MIMIC-III data. If you have access, download the PATIENTS and noteevents CSVs and you will be able to follow the notebook.

In [None]:
# Patients CSV from MIMIC-III, reshaped into the columns used for the import:
# DOB is parsed to datetimes, an empty `ethnicity` column is appended, and the
# MIMIC column names are mapped to patientId / sex / dob.
df = (
    pd.read_csv("/home/wish/data/mimic/PATIENTS.csv")
    .assign(
        DOB=lambda frame: pd.to_datetime(frame['DOB']),
        # Ethnicity is not known; if it is, fill it in here instead of None.
        ethnicity=lambda frame: [None] * len(frame),
    )
    .rename(columns={'SUBJECT_ID': 'patientId', 'GENDER': 'sex', 'DOB': 'dob'})
)

In [None]:
# The created CSV has to be saved/copied into /var/lib/neo4j/import,
# otherwise the query execution below will not work.
# `q` holds the value returned by create_patients_csv; it is executed next.
q = create_patients_csv(df, output_dir='/home/wish/data/neo/mimic/')

In [None]:
# Execute the query returned by create_patients_csv.
neo.execute(q)

# Make a CUIs CSV

In [None]:
from medcat.cdb import CDB

In [None]:
# Load the CDB that was used to create the annotations you will import later.
# The concept and ontology CSVs below are built from this object.
cdb = CDB.load("/home/wish/data/models/phase1_inital_16_04.dat")

In [None]:
# Build one (conceptId, name, type) row per concept in the CDB.
concept_columns = ['conceptId', 'name', 'type']
concept_rows = []
cuis = set()
for cui in cdb.cui2names:  # Use all concepts, always
    cuis.add(cui)
    candidate_type_ids = list(cdb.cui2type_ids[cui])
    # Take the first type id when there is one, otherwise the 'unk' marker.
    if candidate_type_ids:
        type_id = candidate_type_ids[0]
    else:
        type_id = 'unk'
    # Fall back to the raw type id when it has no entry in type_id2name.
    type_name = str(cdb.addl_info['type_id2name'].get(type_id, type_id))
    concept_rows.append([str(cui), str(cdb.get_name(cui)), type_name])
data = pd.DataFrame(concept_rows, columns=concept_columns)

In [None]:
# The created CSV has to be saved/copied into /var/lib/neo4j/import,
# otherwise the query execution below will not work.
# `q` holds the value returned by create_concepts_csv; it is executed next.
q = create_concepts_csv(data, output_dir='/home/wish/data/neo/mimic/')

In [None]:
# Execute the query returned by create_concepts_csv.
neo.execute(q)

# Make the snomed ontology

This will add an IS_A relation to neo4j representing the SNOMED ontology.

In [None]:
# Turn the parent -> children mapping stored in the CDB into (child, parent)
# rows, leaving out entries where a concept is listed as its own child.
edge_rows = []
for parent, children in cdb.addl_info['pt2ch'].items():
    edge_rows.extend([child, parent] for child in children if child != parent)
data = pd.DataFrame(edge_rows, columns=['child', 'parent'])

In [None]:
# The created CSV has to be saved/copied into /var/lib/neo4j/import,
# otherwise the query execution below will not work.
# `q` holds the value returned by create_concept_ontology_csv; it is executed next.
q = create_concept_ontology_csv(data, output_dir='/home/wish/data/neo/mimic/')

In [None]:
# Execute the query returned by create_concept_ontology_csv.
neo.execute(q)

# Make documents CSV

In [None]:
# Add all documents (without text) to the database.
# Only the `row_id` column of this frame is used when building the documents CSV.
df = pd.read_csv("/home/wish/data/mimic/raw/noteevents.csv")

In [None]:
# One row per distinct row_id; the set removes duplicate ids.
unique_row_ids = set(df['row_id'].values)
data = pd.DataFrame(list(unique_row_ids), columns=['documentId'])

In [None]:
# The created CSV has to be saved/copied into /var/lib/neo4j/import,
# otherwise the query execution below will not work.
# `q` holds the value returned by create_documents_csv; it is executed next.
q = create_documents_csv(data, output_dir='/home/wish/data/neo/mimic/')

In [None]:
# Execute the query returned by create_documents_csv.
neo.execute(q)

# Make patient2document

In [None]:
# Add the patient to document relation for all pts and docs.
# The `row_id` and `subject_id` columns of this frame are used below.
df = pd.read_csv("/home/wish/data/mimic/raw/noteevents.csv")

In [None]:
# Header row first, then one [patientId, documentId] row per note; each note
# row supplies (row_id, subject_id), which is swapped into header order.
data = [['patientId', 'documentId']]
data.extend([p, d] for d, p in df[['row_id', 'subject_id']].values)

In [None]:
# The first row of `data` is the header; the remaining rows become the records.
data = pd.DataFrame(data[1:], columns=data[0])

In [None]:
# The created CSV has to be saved/copied into /var/lib/neo4j/import,
# otherwise the query execution below will not work.
# `q` holds the value returned by create_document2patient_csv; it is executed next.
q = create_document2patient_csv(data, output_dir='/home/wish/data/neo/mimic/')

In [None]:
# Execute the query returned by create_document2patient_csv.
neo.execute(q)

# Make annotations CSV

In [None]:
# This requires a couple of extra dictionaries that can be created from
# PATIENTS.csv or noteevents.csv. They were created earlier, which is why they
# are only loaded here.
# NOTE: unpickling can execute arbitrary code -- only load pickles you created
# yourself or otherwise trust.
# The files are opened with `with` so the handles are closed right after loading.
with open('/home/wish/data/timecat/mimic/doc2pt.pickle', 'rb') as f:
    doc2pt = pickle.load(f)
# The timestamp taken from noteevents.csv
with open('/home/wish/data/timecat/mimic/doc2time.pickle', 'rb') as f:
    doc2time = pickle.load(f)
# Convert the keys of both dictionaries to strings
# (presumably to match the document ids in the annotated output -- TODO confirm).
doc2time = {str(k): v for k, v in doc2time.items()}
doc2pt = {str(k): v for k, v in doc2pt.items()}

In [None]:
# Output folder from cat.multiprocessing; only files whose names start with
# 'part_' are picked up.
data_dir = "/home/wish/data/timecat/mimic/annotated_november_2021/"
paths = [entry for entry in os.listdir(data_dir) if entry.startswith('part_')]

In [None]:
# For every annotated part file: load the pickled documents, convert them to a
# table with get_data_from_docs, write the relation CSV and keep the value
# returned by create_document2concept_csv so it can be executed afterwards.
queries = []
for path in paths:
    print(f'Working on: {path}')
    # Open with `with` so the file handle is closed after each part, and build
    # the path with os.path.join so it does not depend on data_dir ending in '/'.
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(os.path.join(data_dir, path), 'rb') as f:
        docs = pickle.load(f)
    data = get_data_from_docs(docs, doc2pt, doc2time)
    # The first row of `data` is the header; the remaining rows are the records.
    data = pd.DataFrame(data[1:], columns=data[0])

    # The created CSV has to be saved/copied into /var/lib/neo4j/import,
    # otherwise the query execution below will not work.
    q = create_document2concept_csv(data, output_dir='/home/wish/data/neo/mimic/',
                                    base_name='rel_' + path.replace('.pickle', ''))
    queries.append(q)
    print('done\n')

In [None]:
# Execute the collected queries in order, printing each one's position first.
for query_number, query in enumerate(queries):
    print("Running q: {}".format(query_number))
    neo.execute(query)

# Query the database for patients and concepts

In [None]:
# Ask for patients using the two concept ids below, with limit=1000.
# The call returns a pair; the second element is kept as `q`.
# NOTE(review): whether a patient must have all or any of the listed concepts
# is not visible here -- check NeoConnector.get_all_patients.
pts, q = neo.get_all_patients(concepts=['86406008', '20639004'], limit=1000)

In [None]:
# Display the first value returned by get_all_patients.
pts

In [None]:
# Fetch the concepts for patient '20807', restricted by the meta requirement
# metaTime == 'Recent'. The call returns a pair kept as `stream` and `q`.
# NOTE(review): the exact meaning of bucket_size_seconds=None and min_count=0
# is defined by NeoConnector.get_all_concepts_from -- confirm there.
stream, q = neo.get_all_concepts_from(patient_id='20807', limit=10000, bucket_size_seconds=None, min_count=0,
                              meta_requirements={'metaTime': 'Recent'})

In [None]:
# Print the second value returned by get_all_concepts_from
# (presumably the generated query text -- confirm).
print(q)

In [None]:
# Show the 'entities' entry of the returned stream as a table.
pd.DataFrame(stream['entities'])