<a href="https://colab.research.google.com/github/AareanaReza/CS598-DLH-Final-Project/blob/main/DataPreProc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CS 598 DLH Project: Diagnoses Prediction

### Mount Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; every data
# path used in this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

### Paper and GitHub Citations

Zaghir J, Rodrigues-Jr JF, Goeuriot L, Amer-Yahia S. Real-world Patient Trajectory Prediction from Clinical Notes Using Artificial Neural Networks and UMLS-Based Extraction of Concepts. J Healthc Inform Res. 2021 Jun 5;5(4):474-496. doi: 10.1007/s41666-021-00100-z. PMID: 35419508; PMCID: PMC8982755.

Paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8982755/#CR22

GitHub: https://github.com/JamilProg/patient_trajectory_prediction




### Dependencies

In [None]:
import gzip
import csv
from itertools import islice
import re
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
import sys
import os
import numpy as np
import pandas as pd
import seaborn as sns
import random

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline


### Data Download Instructions



To download NOTEEVENTS.csv, ADMISSIONS.csv, and DIAGNOSES_ICD.csv, you will need to request access and download the files from this link: https://physionet.org/content/mimiciii/1.4/

You will also need to install QuickUMLS, which requires file access from the National Library of Medicine; see the instructions: https://github.com/Georgetown-IR-Lab/QuickUMLS

## Preprocessing Code

Path Variables

In [None]:
# Root folder on Google Drive for this project's preprocessing data
# (trailing slash included so file names can be appended directly).
DATA_PREPROCESSING_PATH = '/content/drive/MyDrive/CS598-DLH-Final-Project/Data-Preprocessing/'
# Gzipped NOTEEVENTS table inside the Original-Data folder.
NOTEEVENTS_CSV_GZ = f'{DATA_PREPROCESSING_PATH}Original-Data/NOTEEVENTS.csv.gz'
# Output-Data folder under the same root.
outpath = f'{DATA_PREPROCESSING_PATH}Output-Data/'


In [None]:
# Uncompressed NOTEEVENTS table; this is the file loaded for the statistics below.
NOTEEVENTS_CSV = DATA_PREPROCESSING_PATH + 'Original-Data/NOTEEVENTS.csv'

In [None]:
# Path of the ADMISSIONS table in the Original-Data folder.
ADMISSIONS_CSV = DATA_PREPROCESSING_PATH + 'Original-Data/ADMISSIONS.csv'

In [None]:
# Path of the DIAGNOSES_ICD table in the Original-Data folder.
DIAGNOSES_ICD = DATA_PREPROCESSING_PATH + 'Original-Data/DIAGNOSES_ICD.csv'

### Visualization of Data Statistics

Notes Events Data

In [None]:
# Plot appearance: start from seaborn's defaults, then apply the "white"
# style with a default figure size of 10 x 6.
sns.reset_defaults()
sns.set(style="white", rc={'figure.figsize': (10, 6)})

In [None]:
# Load the full NOTEEVENTS table into memory. low_memory=False turns off
# pandas' chunked parsing, which can otherwise infer mixed dtypes per column.
noteevents = pd.read_csv(NOTEEVENTS_CSV, low_memory = False)

In [None]:
# Second name for the unfiltered frame (a reference, not a copy). Later cells
# rebind `noteevents` to filtered frames, so this name keeps pointing at the
# original rows for the before/after length comparisons.
noteevents_orig = noteevents

In [None]:
# Column names, dtypes and non-null counts of the raw table.
noteevents.info()

In [None]:
# Character length of every note. str() is applied first so that non-string
# entries (e.g. NaN) still get a length instead of raising.
lns = [len(str(x)) for x in noteevents['TEXT']]
# sns.distplot is deprecated; histplot is its replacement for a plain
# histogram (no KDE curve).
sns.histplot(lns, kde=False)
plt.xlabel('Document length')
plt.show()

In [None]:
# Sort lengths (in place, ascending)
lns.sort()
# Take 5% as the removal size (applied to each tail)
rm_size = int(len(lns) / 100) * 5

# Plot again without the shortest and longest documents. The upper bound is
# written as len(lns) - rm_size because lns[rm_size:-rm_size] would be an
# empty list when rm_size is 0 (fewer than 100 documents).
# sns.distplot is deprecated; histplot is its histogram replacement.
sns.histplot(lns[rm_size:len(lns) - rm_size], kde=False)
plt.xlabel('Document length')
plt.show()

In [None]:
# Drop the shortest/longest notes. Rather than removing exact rows, the 5%
# tails of the sorted length list are turned into two cutoff lengths:
# the largest length in the bottom tail and the smallest in the top tail.
# This relies on `lns` having been sorted ascending beforehand.
min_ln = max(lns[:rm_size])
max_ln = min(lns[-rm_size:])

# Strict comparisons: notes whose length equals a cutoff are removed as well,
# which is why this is only an approximation of "trim 5% from each side".
within_cutoffs = [min_ln < len(str(text)) < max_ln for text in noteevents['TEXT']]
noteevents = noteevents[within_cutoffs]
noteevents.head()

In [None]:
# Row counts after the document-length filter vs. the unfiltered table.
print(f"Length after cleaning : {len(noteevents)}")
print(f"Length of the original: {len(noteevents_orig)}")

In [None]:
# Histogram of the number of notes per SUBJECT_ID after the length filter.
# sns.distplot is deprecated; histplot is its histogram replacement.
sns.histplot(noteevents['SUBJECT_ID'].value_counts().values, kde=False)
plt.xlabel('Documents per patient')
plt.show()

In [None]:
# Again a bit of clean-up, let's remove the bottom/top 1% of patients based on the number of
# documents they have.
# docs_per_pt maps SUBJECT_ID (index) -> number of notes (value).
docs_per_pt = noteevents['SUBJECT_ID'].value_counts()
# np.sort returns a sorted *copy*. Sorting `docs_per_pt.values` in place would
# reorder the Series' own data under an unchanged index, so every SUBJECT_ID
# would end up paired with some other patient's count (and the call fails
# outright when pandas hands out a read-only array).
docs_per_pt_vals = np.sort(docs_per_pt.to_numpy())
docs_per_pt_vals

In [None]:
# 1% of the number of patients, then the matching count cutoffs: the largest
# count in the bottom 1% and the smallest count in the top 1%. This expects
# docs_per_pt_vals to be sorted ascending.
# Note: rm_size, min_ln and max_ln are rebound here; they previously held the
# document-length values.
rm_size = len(docs_per_pt_vals) // 100
min_ln = max(docs_per_pt_vals[:rm_size])
max_ln = min(docs_per_pt_vals[-rm_size:])

In [None]:
# Display the lower documents-per-patient cutoff.
min_ln

In [None]:
# Keep patients with strictly more than 10 and fewer than 20 notes.
# NOTE(review): these bounds are fixed numbers, not the min_ln / max_ln
# percentile cutoffs -- confirm that is intended.
keep_subject_id = {patient for patient, n_docs in docs_per_pt.items() if 10 < n_docs < 20}
# One boolean per note row: does the note belong to a kept patient?
in_kept_patients = [patient in keep_subject_id for patient in noteevents['SUBJECT_ID'].values]
noteevents_rm_docs_per_pt = noteevents[in_kept_patients]
noteevents_rm_docs_per_pt.head()

In [None]:
# Row counts after the documents-per-patient filter vs. the unfiltered table.
print(f"Length after cleaning : {len(noteevents_rm_docs_per_pt)}")
print(f"Length of the original: {len(noteevents_orig)}")

In [None]:
# Histogram of notes per SUBJECT_ID among the kept patients.
# sns.distplot is deprecated; histplot is its histogram replacement.
sns.histplot(noteevents_rm_docs_per_pt['SUBJECT_ID'].value_counts().values, kde=False)
plt.xlabel('Documents per patient')
plt.show()

Extract Sample of Data

In [None]:
# One-column frame holding the kept SUBJECT_IDs.
# NOTE(review): keep_subject_id is a set, so the row order here follows set
# iteration order; the seeded sample taken from this frame depends on that
# order -- confirm it is stable enough for the intended reproducibility.
subjects = pd.DataFrame(keep_subject_id)

In [None]:
# Number of patients available for sampling.
len(subjects)

In [None]:
# Sample from subject ids: draw 1% of the rows at random. random_state fixes
# the seed so the same rows are drawn for the same input frame.
sub_samples = subjects.sample(frac=0.01, random_state=4132023)

In [None]:
# Number of sampled patients.
len(sub_samples)

In [None]:
# Sampled subject IDs as a plain Python list (first column of sub_samples).
keep_samp = sub_samples.iloc[:,0].values.tolist()

In [None]:
# Keep only the notes of the sampled patients. keep_samp is a list, so testing
# each row against it directly rescans the list for every note; a set gives
# constant-time lookups and selects exactly the same rows.
sampled_ids = set(keep_samp)
sampled_notes = noteevents_rm_docs_per_pt[
    [subject_id in sampled_ids for subject_id in noteevents_rm_docs_per_pt['SUBJECT_ID'].values]
]

In [None]:
# Histogram of notes per SUBJECT_ID within the sample.
# sns.distplot is deprecated; histplot is its histogram replacement.
sns.histplot(sampled_notes['SUBJECT_ID'].value_counts().values, kde=False)
plt.xlabel('Documents per patient')
plt.show()

In [None]:
# Move ROW_ID from a regular column into the index. The result is bound to the
# same name, so running this cell a second time raises KeyError because ROW_ID
# is no longer a column.
sampled_notes = sampled_notes.set_index(['ROW_ID'])

In [None]:
# Column names, dtypes and non-null counts of the sampled notes.
sampled_notes.info()

In [None]:
# Write the sampled notes to Drive. index=False leaves the index out of the
# file, and ROW_ID was moved into the index beforehand, so the written CSV has
# no ROW_ID column. NOTE(review): confirm dropping ROW_ID is intended.
sampled_notes.to_csv('/content/drive/MyDrive/CS598-DLH-Final-Project/Data-Preprocessing/Original-Data/SAMPLEDNOTEEVENTS.csv', index=False)

In [None]:
# Path of the sampled-notes CSV (the same location the sample is written to).
SAMPLEDNOTEEVENTS = '/content/drive/MyDrive/CS598-DLH-Final-Project/Data-Preprocessing/Original-Data/SAMPLEDNOTEEVENTS.csv'

In [None]:
# Read the sampled-notes file back and inspect its columns as a sanity check
# on what was written.
test = pd.read_csv(SAMPLEDNOTEEVENTS, low_memory=False)
test.info()

### Data Cleaning & DL Preparation

Call the noteEvents_preproc Python file

In [None]:
# Run noteEvents_preproc.py in a separate python3 process via the shell; it
# does not share variables with this notebook.
!python3 /content/drive/MyDrive/CS598-DLH-Final-Project/patient_trajectory_prediction/data_cleaning/noteEvents_preproc.py

In [None]:
# Path of output.csv in the Output-Data folder.
# Presumably written by noteEvents_preproc.py -- TODO confirm.
OUTPUT = '/content/drive/MyDrive/CS598-DLH-Final-Project/Data-Preprocessing/Output-Data/output.csv'

In [None]:
# Load output.csv into a DataFrame.
outputdata = pd.read_csv(OUTPUT, low_memory=False)

In [None]:
# Turn the row index into the first data column. The file written from this
# frame uses index=False and header=False, so this column is the only way the
# row number reaches that file.
# reset_index() is used without inplace, and the guard makes the cell safe to
# re-run: without it a second run would insert another index column.
if 'new column name' not in outputdata.columns:
    outputdata = outputdata.reset_index().rename(columns={'index': 'new column name'})

In [None]:
# Preview the first rows, including the added index column.
outputdata.head()

In [None]:
# Write the frame as 1.csv in the QuickUMLS chunkssmall folder, with neither a
# header row nor the pandas index.
# Presumably this folder is the input of quickUMLS_getCUI.py -- TODO confirm.
outputdata.to_csv('/content/drive/MyDrive/CS598-DLH-Final-Project/QuickUMLS/data/chunkssmall/1.csv', index = False, header = False)

Call the quickUMLS_getCUI.py file

In [None]:
# Run quickUMLS_getCUI.py in a separate python3 process via the shell.
!python3 /content/drive/MyDrive/CS598-DLH-Final-Project/patient_trajectory_prediction/concept_annotation/quickUMLS_getCUI.py

Call the quickumls_processing.py file

In [None]:
# Run quickumls_processing.py in a separate python3 process, passing the path
# of concatenated_output.csv as its single command-line argument.
!python3 /content/drive/MyDrive/CS598-DLH-Final-Project/patient_trajectory_prediction/concept_annotation/quickumls_processing.py /content/drive/MyDrive/CS598-DLH-Final-Project/QuickUMLS/data/outputchunkssmall/concatenated_output.csv

In [None]:
# Load the post-processed output; header=None means the file has no header
# row, so pandas labels the columns 0, 1, 2, ...
test = pd.read_csv('/content/drive/MyDrive/CS598-DLH-Final-Project/Data-Preprocessing/Output-Data/post_processed_output.csv', low_memory=False,header = None)
# Replace missing values in the third column with 0 and store it as integers.
# The column is assigned by label rather than through `.iloc[:, 2] = ...`:
# whole-column assignment via iloc writes into the existing array on current
# pandas, which keeps the old float dtype and silently undoes astype(int).
test[test.columns[2]] = test.iloc[:, 2].fillna(0).astype(int)
test.info()


In [None]:
# Save the adjusted frame without a header row and without the pandas index.
test.to_csv('/content/drive/MyDrive/CS598-DLH-Final-Project/Data-Preprocessing/Output-Data/post_process_output_no_index.csv', header = False, index = False)

Call the 01_data_preparation.py file

In [None]:
# Run 01_data_preparation.py in a separate python3 process via the shell.
!python3 /content/drive/MyDrive/CS598-DLH-Final-Project/patient_trajectory_prediction/PyTorch_scripts/diagnoses_prediction/01_data_preparation.py

## Training Code, Evaluation Code, Pretrained Model

In [None]:
# Run the 02_FFN_diagprediction.py script in a separate python3 process.
!python3 /content/drive/MyDrive/CS598-DLH-Final-Project/patient_trajectory_prediction/PyTorch_scripts/diagnoses_prediction/02_FFN_diagprediction.py

In [None]:
# Run the 02_GRU_train_GPU.py script in a separate python3 process.
!python3 /content/drive/MyDrive/CS598-DLH-Final-Project/patient_trajectory_prediction/PyTorch_scripts/diagnoses_prediction/02_GRU_train_GPU.py

In [None]:
# Run the 03_GRU_test.py script in a separate python3 process.
!python3 /content/drive/MyDrive/CS598-DLH-Final-Project/patient_trajectory_prediction/PyTorch_scripts/diagnoses_prediction/03_GRU_test.py

## Ablation Study