In [None]:
import numpy as np
import pandas as pd
import json
import os
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
# %matplotlib inline
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 50)

# --- Sample admission table (kept your original structure) ---
admission_table = {
    'Patient 1': {
        'PatientID': 'A1234-B456',
        'Admission ID': [12, 34, 15],
        'AdmissionStartDate': [
            '2019-01-03 9:34:55',
            '2019-02-03 10:50:55',
            '2019-04-03 12:34:55',
        ],
        'AdmissionEndDate': [
            '2019-01-07 8:45:43',
            '2019-03-04 1:50:32',
            '2019-04-03 5:38:18',
        ],
    },
    'Patient 2': {
        'PatientID': 'B1234-C456',
        'Admission ID': [13, 34],
        'AdmissionStartDate': [
            '2018-01-03 9:34:55',
            '2018-02-03 10:50:55',
        ],
        'AdmissionEndDate': [
            '2018-01-07 8:45:43',
            '2018-03-04 1:50:32',
        ],
    },
}

# Turn the nested dict into one flat DataFrame: build one frame per patient
# (the scalar PatientID is repeated for each of that patient's admissions),
# stack them in dict order, and number the rows 0..n-1. The 'Patient N'
# labels are not kept in the result.
admission_table = pd.concat(
    [pd.DataFrame(record) for record in admission_table.values()],
    ignore_index=True,
)

# --- Diagnosis dictionaries (kept your original data) ---
# Diagnosis record for patient 'A1234-B456'. The list-valued fields hold one
# entry per admission, in the same order as 'Admission ID' (12, 34, 15).
# 'PrimaryDiagnosisCode' and 'DiagnosisCodeDescription' each hold one inner
# list per admission, and for every admission the two inner lists have the
# same length (3, 5 and 6 entries respectively).
# NOTE(review): every admission is labelled 'ICD-9', but codes such as
# 'E11.64' / 'I25.812' look like ICD-10-CM format while '780.96' / '401.9'
# look like ICD-9-CM — confirm the intended coding system.
Patient_1 = {
    'PatientID': 'A1234-B456',
    'Admission ID': [12, 34, 15],
    'PrimaryDiagnosisCode': [
        # admission 12
        ['E11.64', 'I25.812', 'I25.10'],
        # admission 34
        ['E11.64', 'I25.812', 'I25.10', '780.96', '784.0'],
        # admission 15
        ['E11.64', 'I25.812', 'I25.10', '786.50', '401.9', '789.00']
    ],
    'CodingSystem': ['ICD-9', 'ICD-9', 'ICD-9'],
    'DiagnosisCodeDescription': [
        # admission 12
        ['Type 2 diabetes mellitus with hypoglycemia',
         'Atherosclerosis of bypass graft of coronary artery of transplanted heart without angina pectoris',
         'Atherosclerotic heart disease of native coronary artery without angina pectoris'],
        # admission 34
        ['Type 2 diabetes mellitus with hypoglycemia',
         'Atherosclerosis of bypass graft of coronary artery of transplanted heart without angina pectoris',
         'Atherosclerotic heart disease of native coronary artery without angina pectoris',
         'Generalized Pain', 'Dizziness and giddiness'],
        # admission 15
        ['Type 2 diabetes mellitus with hypoglycemia',
         'Atherosclerosis of bypass graft of coronary artery of transplanted heart without angina pectoris',
         'Atherosclerotic heart disease of native coronary artery without angina pectoris',
         'Chest pain, unspecified', 'Essential hypertension, unspecified',
         'Abdominal pain, unspecified site']
    ]
}

# Diagnosis record for patient 'B1234-C456'. The list-valued fields hold one
# entry per admission, in the same order as 'Admission ID' (13, 34).
# 'PrimaryDiagnosisCode' and 'DiagnosisCodeDescription' each hold one inner
# list per admission, and for every admission the two inner lists have the
# same length (3 and 4 entries respectively).
# NOTE(review): both admissions are labelled 'ICD-9', but all of these codes
# ('M05.59', 'Z13.85', 'O99.35', 'D37.0') look like ICD-10-CM format —
# confirm the intended coding system.
Patient_2 = {
    'PatientID': 'B1234-C456',
    'Admission ID': [13, 34],
    'PrimaryDiagnosisCode': [
        # admission 13
        ['M05.59', 'Z13.85', 'O99.35'],
        # admission 34
        ['M05.59', 'Z13.85', 'O99.35', 'D37.0']
    ],
    'CodingSystem': ['ICD-9', 'ICD-9'],
    'DiagnosisCodeDescription': [
        # admission 13
        ['Rheumatoid polyneuropathy with rheumatoid arthritis of multiple sites',
         'Encounter for screening for nervous system disorders',
         'Diseases of the nervous system complicating pregnancy, childbirth, and the puerperium'],
        # admission 34
        ['Rheumatoid polyneuropathy with rheumatoid arthritis of multiple sites',
         'Encounter for screening for nervous system disorders',
         'Diseases of the nervous system complicating pregnancy, childbirth, and the puerperium',
         'Neoplasm of uncertain behavior of lip, oral cavity and pharynx']
    ]
}

# --- Functions ---

def process_ehr(patient_dict1, patient_dict2, *more_patient_dicts):
    """
    Combine patient diagnosis dicts into one long-format DataFrame.

    Each dict is turned into a DataFrame (scalar values such as 'PatientID'
    are repeated for every admission of that patient), the frames are stacked
    in argument order, and the list-valued 'PrimaryDiagnosisCode' and
    'DiagnosisCodeDescription' columns are exploded together so that every
    diagnosis ends up on its own row.

    Parameters
    ----------
    patient_dict1, patient_dict2 : dict
        Patient records with the keys 'PrimaryDiagnosisCode' and
        'DiagnosisCodeDescription' holding one list per admission; the other
        keys become ordinary columns.
    *more_patient_dicts : dict
        Optional further patient records with the same layout. Omitting them
        gives exactly the two-patient behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per (admission, diagnosis) with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Raised by pandas when, for some admission, the code list and the
        description list do not have the same number of elements.
    """
    patient_dicts = (patient_dict1, patient_dict2, *more_patient_dicts)

    # dict(...) takes a shallow copy so the caller's mapping is never handed
    # to pandas directly.
    frames = [pd.DataFrame(dict(patient)) for patient in patient_dicts]
    pt_diagnosis_table = pd.concat(frames, ignore_index=True)

    # Explode both list columns in lockstep; ignore_index renumbers the rows
    # instead of repeating the pre-explode admission index.
    return pt_diagnosis_table.explode(
        ['PrimaryDiagnosisCode', 'DiagnosisCodeDescription'],
        ignore_index=True,
    )

def hash_key(df):
    """
    Return a copy of df with a leading 'HashKey' column.

    The key is '<PatientID text before the first "-">-<Admission ID>', e.g.
    PatientID 'A1234-B456' with Admission ID 12 gives 'A1234-12'. The input
    frame is not modified. The resulting column order is printed before the
    reordered frame is returned.
    """
    keyed = df.copy()

    # Build the two halves of the key separately, then join them with '-'.
    patient_prefix = keyed['PatientID'].apply(lambda pid: pid.partition('-')[0])
    admission_text = keyed['Admission ID'].astype(str)
    keyed['HashKey'] = patient_prefix + '-' + admission_text

    # HashKey goes first; every other column keeps its relative position.
    trailing = [name for name in keyed.columns if name != 'HashKey']
    cols = ['HashKey', *trailing]
    print(cols)
    return keyed[cols]

# --- Run processing ---
diagnosis_table = process_ehr(Patient_1, Patient_2)

# preview the exploded diagnoses before the key column is added
print("\nDiagnosis table (top 8 rows):", diagnosis_table.head(8), sep="\n")

# prefix both tables with their HashKey column (diagnoses first, then admissions)
diagnosis_table = hash_key(diagnosis_table)
admission_table = hash_key(admission_table)

# parse the admission start/end strings into datetimes (format inferred by pandas)
admission_table = admission_table.assign(
    AdmissionStartDate=pd.to_datetime(admission_table['AdmissionStartDate']),
    AdmissionEndDate=pd.to_datetime(admission_table['AdmissionEndDate']),
)

# column dtypes / non-null counts of the admission table
print("\nAdmission table info:")
admission_table.info()

# Admission ID 34 appears under both patients, so this slice mixes their rows
print("\nDiagnoses for Admission ID == 34:")
print(diagnosis_table.loc[diagnosis_table['Admission ID'].eq(34)].head(20))



Diagnosis table (top 8 rows):
    PatientID  Admission ID PrimaryDiagnosisCode CodingSystem  \
0  A1234-B456            12               E11.64        ICD-9   
1  A1234-B456            12              I25.812        ICD-9   
2  A1234-B456            12               I25.10        ICD-9   
3  A1234-B456            34               E11.64        ICD-9   
4  A1234-B456            34              I25.812        ICD-9   
5  A1234-B456            34               I25.10        ICD-9   
6  A1234-B456            34               780.96        ICD-9   
7  A1234-B456            34                784.0        ICD-9   

                            DiagnosisCodeDescription  
0         Type 2 diabetes mellitus with hypoglycemia  
1  Atherosclerosis of bypass graft of coronary ar...  
2  Atherosclerotic heart disease of native corona...  
3         Type 2 diabetes mellitus with hypoglycemia  
4  Atherosclerosis of bypass graft of coronary ar...  
5  Atherosclerotic heart disease of native corona... 