In [1]:
import pandas as pd

# Notebook configuration: input file locations and the diagnosis code of interest.
# The paths are relative, so they resolve against the kernel's working directory.
# Each file is a gzip-compressed CSV (read below with compression="gzip").
_diagnoses_csv = '../data/zipped/DIAGNOSES_ICD.csv.gz'
_notes_csv = '../data/zipped/NOTEEVENTS.csv.gz'
_patient_csv = '../data/zipped/PATIENTS.csv.gz'
_prescriptions_csv = '../data/zipped/PRESCRIPTIONS.csv.gz'
# Kept as a string (no decimal point) because it is compared for equality with the
# `icd9_code` column of the diagnoses table when the cohort is selected.
_icd9_code = '31401' # (ADHD ICD-9 Code)

In [2]:
# Load Data
def _read_table(csv_path):
    """Read one gzip-compressed CSV and normalise its column names.

    Parameters
    ----------
    csv_path : str
        Path to a ``.csv.gz`` file.

    Returns
    -------
    pandas.DataFrame
        The table, with every column name stripped of surrounding whitespace
        and lower-cased.
    """
    # low_memory=False makes pandas infer each column's dtype from the whole file
    # instead of chunk by chunk. Chunked inference can leave a column such as
    # icd9_code holding a mix of ints and strings, which would make the later
    # string comparison against the ICD-9 code silently miss rows.
    table = pd.read_csv(csv_path, compression="gzip", low_memory=False)
    table.columns = table.columns.str.strip().str.lower()
    return table


diagnoses_icd_df = _read_table(_diagnoses_csv)
noteevents_df = _read_table(_notes_csv)
prescription_df = _read_table(_prescriptions_csv)
patient_df = _read_table(_patient_csv)

In [3]:
# Filter by ADHD
# Compare on a string-cast copy of the column: if the CSV parser left icd9_code as
# a mix of ints and strings, a plain `== _icd9_code` would skip the int-typed rows.
icd9_as_text = diagnoses_icd_df["icd9_code"].astype(str).str.strip()
adhd_df = diagnoses_icd_df.loc[icd9_as_text == _icd9_code]

# Combine patient with diagnosis, with prescription, and notes
# (inner joins: only subjects present on both sides are kept).
patients_with_adhd = pd.merge(adhd_df, patient_df, on="subject_id")
patients_with_prescription = pd.merge(patients_with_adhd, prescription_df, on="subject_id")
patients_with_adhd_notes = pd.merge(patients_with_adhd, noteevents_df, on="subject_id")

# Filter by hallucinations & OCD
# Each term is searched once; na=False treats a missing note text as "no match"
# so the boolean mask never contains NaN (which would make the indexing raise).
# NOTE(review): these are case-insensitive substring matches, so "ocd" also hits
# any longer token containing those letters - confirm this is acceptable.
note_text = patients_with_adhd_notes["text"]
mentions_hallucinations = note_text.str.contains("hallucinations", case=False, na=False)
mentions_ocd = note_text.str.contains("ocd", case=False, na=False)

# One row per patient: the first matching note row for each subject_id.
p_adhd_hallucinations = patients_with_adhd_notes[mentions_hallucinations].drop_duplicates(subset=["subject_id"])
p_adhd_ocd = patients_with_adhd_notes[mentions_ocd].drop_duplicates(subset="subject_id")
# Patients whose notes mention EITHER term (same rows as matching "hallucinations|ocd").
p_adhd_conditions = patients_with_adhd_notes[mentions_hallucinations | mentions_ocd].drop_duplicates(subset="subject_id")

# Tag every prescription row with the patient-level flags, then keep one row per
# patient. drop_duplicates keeps the first row, so the surviving prescription
# columns (e.g. `drug`) describe whichever prescription happened to come first.
patients_with_prescription["hallucinations"] = patients_with_prescription["subject_id"].isin(p_adhd_hallucinations["subject_id"])
patients_with_prescription["ocd"] = patients_with_prescription["subject_id"].isin(p_adhd_ocd["subject_id"])
patients_with_prescription = patients_with_prescription.drop_duplicates(subset="subject_id")

print(f"Hallucinations: {len(p_adhd_hallucinations)}")
print(f"OCD: {len(p_adhd_ocd)}")
# NOTE: despite the "&" in the label, this is the number of patients with
# hallucinations OR OCD mentioned (the union), not patients with both.
print(f"Hallucinations & OCD: {len(p_adhd_conditions)}")

Hallucinations: 15
OCD: 5
Hallucinations & OCD: 19


In [6]:
# Drop unneeded columns
# (raises KeyError if any listed column is absent from the merged frame).
patients_with_prescription_tagged = patients_with_prescription.drop(
    columns=[
        "row_id_x", "row_id_y", "hadm_id_x", "seq_num", "icd9_code",
        "dod_ssn", "expire_flag", "row_id", "hadm_id_y",
        "dob", "dod", "dod_hosp", "startdate", "enddate",
        "gsn", "ndc", "route", "form_unit_disp", "form_val_disp",
        "dose_unit_rx", "dose_val_rx", "prod_strength", "formulary_drug_cd",
        "drug_name_poe", "drug_name_generic", "drug_type", "icustay_id",
    ]
)

# Make gender boolean (Male = True, Female = False).
# Vectorised comparison instead of a row-wise apply: same result, much faster, and
# still valid when the frame has no rows. Any value other than "M" becomes False.
patients_with_prescription_tagged["gender"] = patients_with_prescription_tagged["gender"] == "M"
# adhd_only is True exactly when neither condition flag is set.
patients_with_prescription_tagged["adhd_only"] = ~(
    patients_with_prescription_tagged["hallucinations"]
    | patients_with_prescription_tagged["ocd"]
)

# Display the tagged table (pandas truncates long frames in the rendered output).
patients_with_prescription_tagged

Unnamed: 0,subject_id,gender,drug,hallucinations,ocd,adhd_only
0,303,True,Hydromorphone,False,False,True
48,715,False,DiphenhydrAMINE,False,True,False
99,1590,True,Morphine Sulfate IR,False,False,True
216,2945,False,Olanzapine,True,True,False
238,2170,True,Potassium Chloride,False,False,True
...,...,...,...,...,...,...
18141,94075,True,Famotidine,False,False,True
18176,97974,False,Ritonavir (Oral Solution),False,False,True
18321,96463,True,Propofol,True,False,False
18339,98177,True,0.9% Sodium Chloride,False,False,True
