# Extracts every note belonging to patients diagnosed with ICD-9 code 314.01 (ADHD) into a CSV file.

In [None]:
import pandas as pd

# Configuration: the diagnosis code to filter on and the files involved.
_icd9_code = '31401'  # ICD-9 314.01 (ADHD), written without the decimal point

# The zipped source tables all live under one directory.
_zipped_dir = '../data/zipped/'
_patient_csv = _zipped_dir + 'PATIENTS.csv.gz'
_admission_csv = _zipped_dir + 'ADMISSIONS.csv.gz'
_notes_csv = _zipped_dir + 'NOTEEVENTS.csv.gz'
_diagnoses_csv = _zipped_dir + 'DIAGNOSES_ICD.csv.gz'

# NOTE(review): this file name mentions V4511 while the code filtered on is
# 31401 (ADHD) -- confirm the name is intended before relying on it.
_discharge_summary_csv = 'ICD9-V4511_Patients_DischargeSummary.csv.gz'

In [35]:
# Load the diagnoses table and the clinical notes.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which rules out mixed-type columns; the string
# comparison on icd9_code below depends on that.
diagnoses_icd_df = pd.read_csv(_diagnoses_csv, compression='gzip', low_memory=False)
noteevents_df = pd.read_csv(_notes_csv, low_memory=False)

# Normalize the column names (trim whitespace, lower-case) so the lookups
# below do not depend on how the CSV headers were written.
# Both steps go through the .str accessor: an Index has no .lower() method.
diagnoses_icd_df.columns = diagnoses_icd_df.columns.str.strip().str.lower()
noteevents_df.columns = noteevents_df.columns.str.strip().str.lower()

# 1. Keep only the diagnosis rows that carry the ADHD code
filtered_df = diagnoses_icd_df.loc[diagnoses_icd_df["icd9_code"] == _icd9_code]

# 2. Keep every note belonging to a patient who has such a diagnosis
arr_subject_id = filtered_df["subject_id"].tolist()
f_adhd_df = noteevents_df.loc[noteevents_df['subject_id'].isin(arr_subject_id)]

# 3. Keep only the columns that are needed
notes_col = ["subject_id", "category", "description", "text"]
notes_df = f_adhd_df[notes_col]

# Sanity check: selecting columns must not change the number of rows
print(len(f_adhd_df))
print(len(notes_df))

4325
4325


In [None]:
# Preview a small sample: the first 10 notes
notes_df.head(10)

In [None]:
# Preview a medium sample: the first 100 notes
notes_df.head(100)

In [None]:
# Show the complete set of notes (every row)
notes_df.iloc[:]