# **Start by uploading the DIAGNOSES_ICD file and getting all patients for ICD_CODE=430**

In [1]:
import pandas as pd
#from google.colab import files

_discharge_summary_csv = 'ICD9-V4511_Patients_DischargeSummary.csv.gz'
_diagnoses_csv = '../data/zipped/DIAGNOSES_ICD.csv.gz'
_notes_csv = '../data/zipped/NOTEEVENTS.csv.gz'
_admission_csv = '../data/zipped/ADMISSIONS.csv.gz'
_patient_csv = '../data/zipped/PATIENTS.csv.gz'
_icd9_code = '31401' #V4511
#d1 = files.upload()



In [None]:
# The uploaded file is 'DIAGNOSES_ICD.csv.gz', so use this name in pd.read_csv
# and specify the compression as 'gzip'
diagnoses_icd_df = pd.read_csv(_diagnoses_csv, compression='gzip')

diagnoses_icd_df.info()
diagnoses_icd_df.iloc[0]
print(len(diagnoses_icd_df))

In [61]:
# **Step 2: Extract SUBJECT_IDs for ICD-9 Code 'V4511'**
if diagnoses_icd_df is not None:
    # Ensure column names match exactly (fix case issues)
    diagnoses_icd_df.columns = diagnoses_icd_df.columns.str.upper()

    # **Vectorized Filtering (MUCH Faster than Loops)**
    filtered_df = diagnoses_icd_df.loc[diagnoses_icd_df["ICD9_CODE"] == _icd9_code]

    # Extract SUBJECT_ID and HADM_ID as lists
    arr_subject_id = filtered_df["SUBJECT_ID"].tolist()
    arr_hadm_id = filtered_df["HADM_ID"].tolist()

    # **Step 3: Print Results**
    print(f"✅ Found {len(arr_subject_id)} records for ICD-9 Code '{_icd9_code}'.")
    print("📌 SUBJECT_IDs:", arr_subject_id[:10])  # Show only first 10 IDs for readability


✅ Found 141 records for ICD-9 Code '31401'.
📌 SUBJECT_IDs: [303, 715, 1590, 2945, 2170, 4966, 4966, 6374, 5571, 6391]


In [62]:
    # Ensure 'ICD9_CODE' exists in the dataset
    if "ICD9_CODE" in diagnoses_icd_df.columns and "SUBJECT_ID" in diagnoses_icd_df.columns and "HADM_ID" in diagnoses_icd_df.columns:
        # **Vectorized Filtering (MUCH Faster)**
        filtered_df = diagnoses_icd_df[diagnoses_icd_df["ICD9_CODE"] == _icd9_code]

        # Extract SUBJECT_ID and HADM_ID as lists
        arr_subject_id = filtered_df["SUBJECT_ID"].tolist()
        arr_hadm_id = filtered_df["HADM_ID"].tolist()

        # **Step 3: Print Results**
        print(f"✅ Found {len(arr_subject_id)} records for ICD-9 Code '{_icd9_code}'.")
        print("📌 First 10 SUBJECT_IDs:", arr_subject_id[:10])  # Show only first 10 IDs for readability
    else:
        print("❌ Error: One or more required columns ('ICD9_CODE', 'SUBJECT_ID', 'HADM_ID') are missing in the dataset.")

✅ Found 141 records for ICD-9 Code '31401'.
📌 First 10 SUBJECT_IDs: [303, 715, 1590, 2945, 2170, 4966, 4966, 6374, 5571, 6391]


In [63]:
import pandas as pd

# Sample DataFrame (replace this with your actual DataFrame)
# df = pd.read_csv("your_file.csv")  # Uncomment if loading from a CSV file

# 1. Check if DataFrame is empty
if diagnoses_icd_df.empty:
    print("The DataFrame is empty. Please check your data source.")
else:
    print(f"DataFrame shape: {diagnoses_icd_df.shape}")

# 2. Print all column names to see if 'subject_id' exists
print("Column Names:", diagnoses_icd_df.columns.tolist())

# 3. Check for typos, hidden spaces, or similar column names
similar_columns = [col for col in diagnoses_icd_df.columns if 'subject' in col.lower()]
print("Similar columns found:", similar_columns)

# 4. Strip spaces and rename columns if needed
diagnoses_icd_df.columns = diagnoses_icd_df.columns.str.strip()

# 5. Convert all column names to lowercase for consistency (optional)
diagnoses_icd_df.columns = diagnoses_icd_df.columns.str.lower()

# 6. Try accessing 'subject_id' safely
column_name = 'subject_id'
if column_name in diagnoses_icd_df.columns:
    subject_id_values = diagnoses_icd_df[column_name]
    print(f"Successfully accessed '{column_name}' column.")
else:
    print(f"Column '{column_name}' not found. Available columns: {diagnoses_icd_df.columns.tolist()}")

# 7. Alternative safe access using .get() to prevent KeyError
subject_id_values = diagnoses_icd_df.get(column_name, "Column Not Found")
print(subject_id_values)


DataFrame shape: (651047, 5)
Column Names: ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'ICD9_CODE']
Similar columns found: ['SUBJECT_ID']
Successfully accessed 'subject_id' column.
0           109
1           109
2           109
3           109
4           109
          ...  
651042    97503
651043    97503
651044    97503
651045    97503
651046    97503
Name: subject_id, Length: 651047, dtype: int64


In [64]:
import pandas as pd

# Sample DataFrame (replace this with your actual DataFrame)
# df = pd.read_csv("your_file.csv")  # Uncomment if loading from a CSV file
# Define df as your diagnoses_icd_df DataFrame
df = diagnoses_icd_df

# 1. Check if DataFrame is empty
if df.empty:
    print("The DataFrame is empty. Please check your data source.")
else:
    print(f"DataFrame shape: {df.shape}")

# 2. Print all column names to see if 'subject_id' exists
print("Column Names:", df.columns.tolist())

# 3. Check for typos, hidden spaces, or similar column names
similar_columns = [col for col in df.columns if 'subject' in col.lower()]
print("Similar columns found:", similar_columns)

# 4. Strip spaces and rename columns if needed
df.columns = df.columns.str.strip()

# 5. Convert all column names to lowercase for consistency (optional)
df.columns = df.columns.str.lower()

# 6. Try accessing 'subject_id' safely
column_name = 'subject_id'
if column_name in df.columns:
    subject_id_values = df[column_name]
    print(f"Successfully accessed '{column_name}' column.")
else:
    print(f"Column '{column_name}' not found. Available columns: {df.columns.tolist()}")

# 7. Alternative safe access using .get() to prevent KeyError
subject_id_values = df.get(column_name, "Column Not Found")
print(subject_id_values)

DataFrame shape: (651047, 5)
Column Names: ['row_id', 'subject_id', 'hadm_id', 'seq_num', 'icd9_code']
Similar columns found: ['subject_id']
Successfully accessed 'subject_id' column.
0           109
1           109
2           109
3           109
4           109
          ...  
651042    97503
651043    97503
651044    97503
651045    97503
651046    97503
Name: subject_id, Length: 651047, dtype: int64


# **You can use one of the following ways to upload the NOTEEVENTS or any file in colab**

# **Method:1**

In [None]:
# upload file from local drive and open it for reading (NOTEEVENTS file)
# read the uploaded file in a dataframe
# Use the correct filename: 'NOTEEVENTS.csv.gz' and specify compression as 'gzip'
noteevents_df = pd.read_csv(_notes_csv, compression='gzip', low_memory=False)

In [None]:
'''
from google.colab import drive
drive.mount('/content/drive')

!cp "/content/drive/MyDrive/NOTEEVENTS.csv" .
noteevents_df = pd.read_csv('/content/NOTEEVENTS.csv')
noteevents_df.info()
noteevents_df.iloc[0]
'''

# **Method:2**

In [None]:
noteevents_df = pd.read_csv(_notes_csv, low_memory=False)
noteevents_df.info()
noteevents_df.iloc[0]

# **Filter the data in files to create a new relevant dataframe**



In [65]:
# Add the import statement at the beginning of the cell where 'pd' is used.
import pandas as pd # Import the pandas library and assign it to the alias 'pd'

icd9_V4511_patients_discharge_summary_df = pd.DataFrame(columns=['SUBJECT_ID', 'CATEGORY', 'DESCRIPTION', 'TEXT'])
patients_dict = {"SUBJECT_ID":[],"CATEGORY":[], "DESCRIPTION":[], "TEXT":[]};

print(icd9_V4511_patients_discharge_summary_df)
print(f"\n{patients_dict}")

Empty DataFrame
Columns: [SUBJECT_ID, CATEGORY, DESCRIPTION, TEXT]
Index: []

{'SUBJECT_ID': [], 'CATEGORY': [], 'DESCRIPTION': [], 'TEXT': []}


In [66]:
for i in range(0, len(noteevents_df)):
  if((noteevents_df.loc[i, 'SUBJECT_ID'] in arr_subject_id)):
   patients_dict["SUBJECT_ID"].append(noteevents_df.loc[i, 'SUBJECT_ID'])
   patients_dict["CATEGORY"].append(noteevents_df.loc[i, 'CATEGORY'])
   patients_dict["DESCRIPTION"].append(noteevents_df.loc[i, 'DESCRIPTION'])
   patients_dict["TEXT"].append(noteevents_df.loc[i, 'TEXT'])

In [None]:
# First, execute this cell to define and populate noteevents_df. If not outputting data to screen add display(noteevents_df)
'''
import pandas as pd
noteevents_df = pd.read_csv('../../data/NOTEEVENTS.csv.gz')
noteevents_df.info()
noteevents_df.iloc[0]
'''
display(noteevents_df) #If no output from before add this.


In [67]:
# Then execute the cell that uses the dataframe
for i in range(0, len(noteevents_df)):
  if((noteevents_df.loc[i, 'SUBJECT_ID'] in arr_subject_id)):
   patients_dict["SUBJECT_ID"].append(noteevents_df.loc[i, 'SUBJECT_ID'])
   patients_dict["CATEGORY"].append(noteevents_df.loc[i, 'CATEGORY'])
   patients_dict["DESCRIPTION"].append(noteevents_df.loc[i, 'DESCRIPTION'])
   patients_dict["TEXT"].append(noteevents_df.loc[i, 'TEXT'])

In [68]:
# Make sure to run the cell where you create and populate arr_subject_id first.
# For example, the code below likely created it:
arr_subject_id=[]
arr_hadm_id=[]
for row in range(0, len(diagnoses_icd_df)):
  if(diagnoses_icd_df.loc[row, 'icd9_code']=='99591'): #V4511 = Comma
    arr_subject_id.append(diagnoses_icd_df.loc[row, 'subject_id'])


# Then execute the cell that uses the dataframe
for i in range(0, len(noteevents_df)):
  if((noteevents_df.loc[i, 'SUBJECT_ID'] in arr_subject_id)):
   patients_dict["SUBJECT_ID"].append(noteevents_df.loc[i, 'SUBJECT_ID'])
   patients_dict["CATEGORY"].append(noteevents_df.loc[i, 'CATEGORY'])
   patients_dict["DESCRIPTION"].append(noteevents_df.loc[i, 'DESCRIPTION'])
   patients_dict["TEXT"].append(noteevents_df.loc[i, 'TEXT'])

In [69]:
patients_df = pd.DataFrame(patients_dict)

# **Download the created dataframe (whole or a part of it)**

In [72]:
# to keep only certain (say for eg. 10) number of rows of a dataframe
patients_df[:100]  # this will fetch the top 100 rows

patients_df[-100:] # this will fetch the last 100 rows

patients_df[-10:] # this will fetch the last 10 rows

patients_df[::] # this will fetch all the rows

Unnamed: 0,SUBJECT_ID,CATEGORY,DESCRIPTION,TEXT
0,29355,Discharge summary,Report,Admission Date: [**2154-7-2**] D...
1,6391,Discharge summary,Report,Admission Date: [**2170-5-7**] D...
2,88857,Discharge summary,Report,Admission Date: [**2109-7-26**] ...
3,88857,Discharge summary,Report,Admission Date: [**2109-9-30**] ...
4,88857,Discharge summary,Report,Admission Date: [**2109-10-30**] ...
...,...,...,...,...
117936,76418,Physician,Physician Resident Progress Note,"Chief Complaint: delirium, GI blood loss anemi..."
117937,76418,Physician,Physician Resident Progress Note,"Chief Complaint: delirium, GI blood loss anemi..."
117938,73713,Physician,Physician Resident Progress Note,TITLE:\n Chief Complaint:\n 24 Hour Events...
117939,73713,Physician,Physician Resident Progress Note,TITLE:\n Chief Complaint:\n 24 Hour Events...


In [None]:
import pandas as pd
# ... (other code to create patients_dict) ...

# Create the DataFrame
patients_df = pd.DataFrame(patients_dict)

# Now you can access it
# to keep only certain (say for eg. 10) number of rows of a dataframe
patients_df[:10]  # this will fetch the top 10 rows

patients_df[-10:] # this will fetch the last 10 rows

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
def _plot_series(series, series_name, series_index=0):
  palette = list(sns.palettes.mpl_palette('Dark2'))
  counted = (series['index']
                .value_counts()
              .reset_index(name='counts')
              .rename({'index': 'index'}, axis=1)
              .sort_values('index', ascending=True))
  xs = counted['index']
  ys = counted['counts']
  plt.plot(xs, ys, label=series_name, color=palette[series_index % len(palette)])

fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')
df_sorted = _df_0.sort_values('index', ascending=True)
#df_sorted = patients_df.sort_values('index', ascending=True)
_plot_series(df_sorted, '')
sns.despine(fig=fig, ax=ax)
plt.xlabel('index')
_ = plt.ylabel('count()')

In [None]:
#if you using Macbook or Linux based systems download the patients_df dataframe in .csv format
patients_df.to_csv(r'ICD9-V4511_Patients_DischargeSummary.csv', index = False)

#from google.colab import files
#files.download('ICD9-V4511_Patients_DischargeSummary.csv')

# if you want to download only the top 10 of the patients_df dataframe rows as a csv file do the following
#patients_df[:10].to_csv(r'ICD9-V4511_Patients_DischargeSummary.csv', index = False)

In [None]:
#if you are using a Windows based system download the patients_df dataframe in excel format
patients_df.to_excel("ICD9-430_Patients_DischargeSummary.xlsx")
# if you want to download only the top 10 of the patients_df dataframe rows as a csv file do the following
patients_df[:10].to_excel("ICD9-430_Patients_DischargeSummary.xlsx")

In [None]:
noteevents_df['CATEGORY'].unique()

In [None]:
len(noteevents_df[noteevents_df['CATEGORY'] == 'Discharge summary'])

In [None]:
len(noteevents_df['CATEGORY'] == 'Echo')

In [None]:
diagnoses_df = pd.read_csv(_diagnoses_csv, compression='gzip')
diagnoses_df

In [None]:
admission_df = pd.read_csv(_admission_csv, compression='gzip')
admission_df

In [None]:
patient_df = pd.read_csv(_patient_csv, compression='gzip')
patient_df

In [None]:
notes_df = pd.read_csv(_notes_csv, compression='gzip', low_memory=False)
notes_df

In [None]:
cardio_codes = ['410']  # Myocardial infarction, Hypertension #'401'
neuro_codes = ['434']  # Stroke, Epilepsy #'345'

# Filter the dataset for these codes
cardio_df = diagnoses_df[diagnoses_df['ICD9_CODE'].astype(str).str.startswith(tuple(cardio_codes))]
neuro_df = diagnoses_df[diagnoses_df['ICD9_CODE'].astype(str).str.startswith(tuple(neuro_codes))]

# Combine both datasets
filtered_df = pd.concat([cardio_df, neuro_df])

# Display the result
print(filtered_df.head())

In [None]:
# Load clinical notes
#notes_df = pd.read_csv("NOTEEVENTS.csv")

# Merge with filtered diagnoses to get relevant notes
notes_filtered_df = filtered_df.merge(notes_df, on="SUBJECT_ID", how="left")

# Display some notes
print(notes_filtered_df[['TEXT']].head())

new = notes_filtered_df[notes_filtered_df['CATEGORY'] == 'Discharge summary']
len(new)

In [None]:
new = notes_filtered_df[notes_filtered_df['CATEGORY'] == 'Radiology']
len(new)

In [None]:
len(new)

In [None]:
import spacy

In [None]:
nlp = spacy.load("en_core_web_sm")


In [None]:
def extract_entities (text):
  doc = nlp(text)
  return [(ent.text, ent.label_) for ent in doc.ents]


In [None]:
for _, row in new.iterrows():
  text = row['TEXT']
  entities = extract_entities(text)
  doc = nlp(text)
  for entity, label in entities:
    print(f"Entity: {entity}, Label: {label}")
  print('*'*100)

In [None]:
# Entity Visualizer :
from spacy import displacy
for i in range(len(doc)):
  displacy.render(doc[i], style='ent', jupyter=True)
  print('*'*100)