In [1]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [62]:
data_dir = "Data/"

# Path to data
patients_path =  data_dir +'patients.csv'
admissions_path = data_dir +'admissions.csv'
diagnoses_path = data_dir +'diagnoses_icd.csv'
lab_events_path = data_dir +'labevents_sample.csv'
d_icd_labs_path =data_dir +'d_labitems.csv'
d_icd_diagnoses_path = data_dir +'d_icd_diagnoses.csv'
d_icd_procedures_path = data_dir +'d_icd_procedures.csv'
procedures_path =data_dir + 'procedures_icd.csv'
notes_path =data_dir + 'Notes.csv'

# Load the data
patients = pd.read_csv(patients_path, usecols=['subject_id', 'gender'])
admissions = pd.read_csv(admissions_path, usecols=['subject_id', 'hadm_id', 'race'])
diagnoses = pd.read_csv(diagnoses_path, usecols=['subject_id', 'hadm_id', 'icd_code'])
d_icd_diagnoses = pd.read_csv(d_icd_diagnoses_path, usecols=['icd_code', 'long_title'])
lab_events = pd.read_csv(lab_events_path, usecols=['subject_id', 'hadm_id', 'itemid', 'valuenum', 'ref_range_lower','ref_range_upper','flag'])
d_icd_labs = pd.read_csv(d_icd_labs_path, usecols=['itemid', 'label'])
procedures = pd.read_csv(procedures_path, usecols=['subject_id', 'hadm_id', 'icd_code'])
d_icd_procedures = pd.read_csv(d_icd_procedures_path, usecols=['icd_code', 'long_title'])
notes = pd.read_csv(notes_path, usecols=['subject_id', 'hadm_id', 'Symptoms', 'allergies'])

Merging the dataframes

In [None]:
print("Merging dataframes...")

# Merge diagnoses with d_icd_diagnoses
diagnoses = diagnoses.merge(d_icd_diagnoses[['icd_code', 'long_title']], 
                            on='icd_code', how='left')
print(f"Diagnoses after merge: {diagnoses.shape}")
print(diagnoses.head())

# Merge procedures with d_icd_procedures
procedures = procedures.merge(d_icd_procedures[['icd_code', 'long_title']], 
                              on='icd_code', how='left')
print(f"Procedures after merge: {procedures.shape}")
print(procedures.head())

# Merge lab_events with d_icd_labs (d_labitems)
lab_events_with_desc = lab_events.merge(d_icd_labs[['itemid', 'label']], 
                                       on='itemid', how='left')
print(f"Lab events after merge: {lab_events_with_desc.shape}")

# Process lab events (abnormal results only, with range status)
print("Processing lab events...")
lab_events_with_desc['lab_result'] = (
    lab_events_with_desc['itemid'].astype(str) + '_' + 
    lab_events_with_desc['label'].fillna('Unknown')
)
lab_events_with_desc = lab_events_with_desc.dropna(subset=['hadm_id', 'flag'])
print(f"Filtered lab events (non-null flag): {lab_events_with_desc.shape}")

if lab_events_with_desc.empty:
    print("WARNING: No abnormal lab events found.")
    lab_grouped = pd.DataFrame(columns=['hadm_id', 'labs'])
else:
    # Categorize lab results as Below, Above, or Unknown
    def classify_range(row):
        if pd.notnull(row['valuenum']) and pd.notnull(row['ref_range_lower']) and row['valuenum'] < row['ref_range_lower']:
            return 'Below'
        elif pd.notnull(row['valuenum']) and pd.notnull(row['ref_range_upper']) and row['valuenum'] > row['ref_range_upper']:
            return 'Above'
        return 'Unknown'

    lab_events_with_desc['range_status'] = lab_events_with_desc.apply(classify_range, axis=1)
    lab_events_with_desc['lab_result'] = (
        lab_events_with_desc['itemid'].astype(str) + '_' + 
        lab_events_with_desc['label'].fillna('Unknown') + '_' + 
        lab_events_with_desc['range_status']
    )
    lab_events_with_desc = lab_events_with_desc[lab_events_with_desc['range_status'] != 'Unknown']
    print(f"Lab events after filtering Unknown status: {lab_events_with_desc.shape}")


Merging dataframes...
Diagnoses after merge: (6484228, 4)
   subject_id   hadm_id icd_code  \
0    10000032  22595853     5723   
1    10000032  22595853    78959   
2    10000032  22595853     5715   
3    10000032  22595853    07070   
4    10000032  22595853      496   

                                          long_title  
0                                Portal hypertension  
1                                      Other ascites  
2      Cirrhosis of liver without mention of alcohol  
3  Unspecified viral hepatitis C without hepatic ...  
4  Chronic airway obstruction, not elsewhere clas...  
Procedures after merge: (859708, 4)
   subject_id   hadm_id icd_code  \
0    10000032  22595853     5491   
1    10000032  22841357     5491   
2    10000032  25742920     5491   
3    10000068  25022803     8938   
4    10000117  27988844  0QS734Z   

                                          long_title  
0                    Percutaneous abdominal drainage  
1                    Percutaneou

In [46]:
# Group lab events by hadm_id
lab_grouped = (lab_events_with_desc.groupby('hadm_id')['lab_result']
               .apply(lambda x: list(x.unique()))
               .reset_index()
               .rename(columns={'lab_result': 'labs'}))
print(f"Lab grouped shape: {lab_grouped.shape}")
print(lab_grouped.head())

Lab grouped shape: (1381, 2)
      hadm_id                                               labs
0  20010003.0  [51221_Hematocrit_Below, 51222_Hemoglobin_Belo...
1  20015927.0  [51221_Hematocrit_Below, 51222_Hemoglobin_Belo...
2  20019162.0  [51221_Hematocrit_Below, 51222_Hemoglobin_Belo...
3  20023045.0  [51221_Hematocrit_Below, 51222_Hemoglobin_Belo...
4  20023531.0  [50912_Creatinine_Above, 50931_Glucose_Above, ...


In [47]:
procedures['combined_title'] = (
    procedures['icd_code'].astype(str) + '_' + 
    procedures['long_title'].fillna('Unknown')
)

# Group procedures by hadm_id, collecting unique combined titles
procedures_grouped = (procedures.groupby('hadm_id')['combined_title']
                      .apply(lambda x: list(x.unique()))
                      .reset_index()
                      .rename(columns={'combined_title': 'procedures'}))

# Print shape and sample of grouped procedures
print(f"Procedures grouped shape: {procedures_grouped.shape}")
print(procedures_grouped.head())

Procedures grouped shape: (287504, 2)
    hadm_id                                         procedures
0  20000041                      [8154_Total knee replacement]
1  20000045  [3E0436Z_Introduction of Nutritional Substance...
2  20000069  [0KQM0ZZ_Repair Perineum Muscle, Open Approach...
3  20000102  [7359_Other manually assisted delivery, 7309_O...
4  20000147  [02100Z9_Bypass Coronary Artery, One Artery fr...


In [73]:
# Group diagnoses by hadm_id
diagnoses_grouped = (diagnoses.groupby('hadm_id')['icd_code'] #icd_code
                     .apply(lambda x: list(x.unique()))
                     .reset_index()
                     .rename(columns={'icd_codeicd_code': 'diagnoses'}))
print(f"Diagnoses grouped shape: {diagnoses_grouped.shape}")
print(procedures_grouped.head())

Diagnoses grouped shape: (545497, 2)
    hadm_id                                         procedures
0  20000041                      [8154_Total knee replacement]
1  20000045  [3E0436Z_Introduction of Nutritional Substance...
2  20000069  [0KQM0ZZ_Repair Perineum Muscle, Open Approach...
3  20000102  [7359_Other manually assisted delivery, 7309_O...
4  20000147  [02100Z9_Bypass Coronary Artery, One Artery fr...


In [66]:
notes = pd.read_csv(notes_path, usecols=['hadm_id', 'Symptoms', 'allergies'])

# Preprocess Symptoms and allergies to ensure they are lists with prefixed, formatted items
def format_items(value, prefix):
    if pd.isna(value) or value is None or value == '':
        return []
    try:
        if isinstance(value, str):
            # Split by comma, clean, and format each item
            items = [item.strip().lower().replace(' ', '_') for item in value.split(',') if item.strip() and item.lower() != 'none']
            return [f"{prefix}{item}" for item in items]
        if isinstance(value, list):
            # Clean and format list items
            items = [item.strip().lower().replace(' ', '_') for item in value if isinstance(item, str) and item.strip() and item.lower() != 'none']
            return [f"{prefix}{item}" for item in items]
        print(f"Unexpected value type for {prefix}: {value} (type: {type(value)})")
        return []
    except Exception as e:
        print(f"Error processing {prefix} value {value}: {e}")
        return []

# Apply formatting and ensure no NaN values remain
notes['Symptoms'] = notes['Symptoms'].apply(lambda x: format_items(x, 'Symptom_'))
notes['allergies'] = notes['allergies'].apply(lambda x: format_items(x, 'Allergy_'))

# Group by hadm_id, keeping Symptoms and allergies separate
notes_grouped = notes.groupby('hadm_id').agg({
    'Symptoms': lambda x: list(set(item for sublist in x for item in sublist if isinstance(sublist, list))),
    'allergies': lambda x: list(set(item for sublist in x for item in sublist if isinstance(sublist, list)))
}).reset_index()

# Print shape and sample of grouped notes
print(f"\nNotes grouped shape: {notes_grouped.shape}")
print("Sample of grouped notes:")
print(notes_grouped.head())


Notes grouped shape: (331793, 3)
Sample of grouped notes:
    hadm_id                                           Symptoms  \
0  20000019  [Symptom_nausea/vomiting, Symptom_flank_pain, ...   
1  20000024               [Symptom_weakness, Symptom_diarrhea]   
2  20000034                                                 []   
3  20000041                              [Symptom_l_knee_pain]   
4  20000057  [Symptom_ankle_pain_(s/p_mechanical_fall), Sym...   

                                           allergies  
0  [Allergy_no_known_allergies_/_adverse_drug_rea...  
1                                  [Allergy_aspirin]  
2  [Allergy_no_known_allergies_/_adverse_drug_rea...  
3                                    [Allergy_latex]  
4  [Allergy_no_known_allergies_/_adverse_drug_rea...  


In [67]:
# Admissions with patients
admissions_patients = admissions.merge(patients[['subject_id', 'gender']], on='subject_id', how='left')
print(f"Admissions with patients shape: {admissions_patients.shape}")

Admissions with patients shape: (546028, 4)


In [68]:
# Create base dataframe with all hadm_ids
transactions_df = pd.DataFrame({'hadm_id': admissions['hadm_id'].unique()})

# Merge all grouped data
print("Combining data for transactions...")
transactions_df = transactions_df.merge(admissions_patients[['hadm_id', 'gender', 'race']], on='hadm_id', how='left')
transactions_df = transactions_df.merge(diagnoses_grouped, on='hadm_id', how='left')
transactions_df = transactions_df.merge(procedures_grouped, on='hadm_id', how='left')
transactions_df = transactions_df.merge(lab_grouped, on='hadm_id', how='left')
transactions_df = transactions_df.merge(notes_grouped, on='hadm_id', how='left')
print(f"Transactions dataframe shape: {transactions_df.shape}")

Combining data for transactions...
Transactions dataframe shape: (546028, 8)


In [70]:
# Generate transactions
transactions = []
for _, row in transactions_df.iterrows():
    transaction = []
    
    # Helper function to safely add items
    def add_items(items, prefix=''):
        if isinstance(items, list):
            transaction.extend([f"{prefix}{item}" for item in items if pd.notna(item) and str(item).strip()])
    
    # Diagnoses
    if 'diagnoses' in row:
        add_items(row['diagnoses'], 'D_')
    
    # Procedures
    if 'procedures' in row:
        add_items(row['procedures'], 'P_')
    
    # Lab results
    if 'labs' in row:
        add_items(row['labs'], 'L_')
    
    # Gender
    if 'gender' in row and pd.notna(row['gender']):
        transaction.append(f"Gender_{row['gender']}")
    
    # Race
    if 'race' in row and pd.notna(row['race']):
        transaction.append(f"Race_{row['race'].replace(' ', '_')}")
    
    # Symptoms
    if 'Symptoms' in row:
        add_items(row['Symptoms'])
    
    # Allergies
    if 'allergies' in row:
        add_items(row['allergies'])
    
    transactions.append(transaction)

# Print sample transactions
print("\nSample transactions:")
for i, t in enumerate(transactions[:5], 1):
    print(f"Transaction {i}: {t[:10]}...")


Sample transactions:
Transaction 1: ['D_5723_Portal hypertension', 'D_78959_Other ascites', 'D_5715_Cirrhosis of liver without mention of alcohol', 'D_07070_Unspecified viral hepatitis C without hepatic coma', 'D_496_Chronic airway obstruction, not elsewhere classified', 'D_29680_Bipolar disorder, unspecified', 'D_30981_Posttraumatic stress disorder', 'D_V1582_Personal history of tobacco use', 'P_5491_Percutaneous abdominal drainage', 'L_51514_Urobilinogen_Above']...
Transaction 2: ['D_07071_Unspecified viral hepatitis C with hepatic coma', 'D_78959_Other ascites', 'D_2875_Thrombocytopenia, unspecified', 'D_2761_Hyposmolality and/or hyponatremia', 'D_496_Chronic airway obstruction, not elsewhere classified', 'D_5715_Cirrhosis of liver without mention of alcohol', 'D_V08_Asymptomatic human immunodeficiency virus [HIV] infection status', 'D_3051_Tobacco use disorder', 'P_5491_Percutaneous abdominal drainage', 'L_51516_WBC_Above']...
Transaction 3: ['D_07054_Chronic hepatitis C without m

In [71]:
# Create a DataFrame with hadm_id and transactions
transactions_df_out = pd.DataFrame({
    'hadm_id': transactions_df['hadm_id'],
    'transaction': transactions
})

# Save to CSV
transactions_df_out.to_csv("transactions.csv", index=False)
print("Transactions saved to transactions.csv")

Transactions saved to transactions.csv


In [None]:
# Step 1: Initialize encoder and get item frequencies
encoder = TransactionEncoder()
encoder.fit(transactions)  # Fit without transforming
item_counts = pd.Series(encoder.columns_).value_counts()

# Step 2: Keep only items appearing >= min_freq times
min_freq = 10  # Adjust based on your data
frequent_items = item_counts[item_counts >= min_freq].index.tolist()

# Step 3: Filter transactions to include only frequent items
filtered_transactions = [
    [item for item in txn if item in frequent_items] 
    for txn in transactions
]

# Step 4: Now one-hot encode the filtered transactions
onehot = encoder.fit_transform(transactions, sparse=True)
df_onehot = pd.DataFrame.sparse.from_spmatrix(onehot, columns=encoder.columns_)

  df_onehot = pd.DataFrame.sparse.from_spmatrix(onehot, columns=encoder.columns_)


In [None]:
frequent_itemsets = fpgrowth(df_onehot, min_support=0.1, use_colnames=True)

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.4)

print("Frequent Itemsets:\n", frequent_itemsets)
print("\nAssociation Rules:\n", rules)

In [None]:
frequent_itemsets.to_csv("frequent_itemsets.csv", index=False)
rules.to_csv("association_rules.csv", index=False)