<a href="https://colab.research.google.com/github/AhmadiJahid/CS525-Data-Mining/blob/ar/CS525.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder
import matplotlib.pyplot as plt
import os
from datetime import datetime

In [None]:
# Pandas display configuration: never elide columns, and allow up to 100 rows
# to be printed before a frame is truncated.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load LABEVENTS (lab test results); only the columns used below are read.
lab_events = pd.read_csv(
    "labevents_sample.csv",
    usecols=['subject_id', 'hadm_id', 'itemid', 'value', 'flag'],
)
print(f"Loaded {len(lab_events)} lab events")

# Load D_LABITEMS (lab test dictionary: itemid -> human-readable label)
lab_items = pd.read_csv("d_labitems.csv", usecols=['itemid', 'label'])
print(f"Loaded {len(lab_items)} lab items")

# Load diagnoses (ICD codes).
# ICD codes are identifiers, not numbers: force them to be read as text so
# that type inference can never turn a code such as "0389" into the integer
# 389 (dropping the leading zero) when a chunk happens to be all-numeric.
diagnoses = pd.read_csv(
    "diagnoses_icd.csv",
    usecols=['subject_id', 'hadm_id', 'icd_code'],
    dtype={'icd_code': str},
)
print(f"Loaded {len(diagnoses)} diagnoses")

# Attach the lab test label to every lab event (left join keeps events whose
# itemid has no dictionary entry; their label is NaN).
lab_events = lab_events.merge(lab_items, on="itemid", how="left")
print("\nSample of lab events with labels:")
print(lab_events.head())

# String version of the diagnosis code used when building transaction items
# (missing codes become the literal string 'nan').
diagnoses['ICD9_CODE'] = diagnoses['icd_code'].astype(str)

# Lab results --> Diagnosis

In [None]:
# Number of hospital visits (transactions) handed to the miner: only the first
# MAX_TRANSACTIONS merged visits are kept.
MAX_TRANSACTIONS = 50


def _build_items(lab_results, icd_codes):
    """Return one visit's transaction: lab items followed by 'DIAG_<code>' items.

    Items are converted to strings, null entries are skipped, and duplicates
    are removed while preserving first-seen order (a transaction is a set of
    items, so a repeated lab test adds no information).
    """
    combined = list(lab_results) + [f'DIAG_{code}' for code in icd_codes]
    cleaned = (str(item) for item in combined if pd.notnull(item))
    return list(dict.fromkeys(cleaned))


# Keep only lab events that carry a flag and belong to a hospital admission.
# .copy() makes the filtered frame independent so the column added below is an
# unambiguous write rather than a write to a possible view.
lab_events = lab_events.dropna(subset=['flag', 'hadm_id']).copy()
print(f"Remaining lab events after dropping NaN: {len(lab_events)}")

# Lab item identifier: "<itemid>_<label>" (empty label when it is missing)
lab_events['lab_result'] = lab_events['itemid'].astype(str) + '_' + lab_events['label'].fillna('')

# One list of lab items per hospital visit
lab_grouped = lab_events.groupby('hadm_id')['lab_result'].apply(list).reset_index()
print(f"Number of unique hospital visits with lab results: {len(lab_grouped)}")

# One list of diagnosis codes per hospital visit
diagnoses_grouped = diagnoses.groupby('hadm_id')['ICD9_CODE'].apply(list).reset_index()
print(f"Number of unique hospital visits with diagnoses: {len(diagnoses_grouped)}")

# Keep only visits that have both lab results and diagnoses
diagnosis_lab_grouped = lab_grouped.merge(diagnoses_grouped, on="hadm_id", how="inner")
print(f"Number of hospital visits with both lab results and diagnoses: {len(diagnosis_lab_grouped)}")

# Combine lab results and diagnoses into a single item list per visit.
# A plain comprehension (instead of a row-wise DataFrame.apply) also behaves
# sensibly when the merge produced no rows.
diagnosis_lab_grouped['Items'] = [
    _build_items(labs, codes)
    for labs, codes in zip(diagnosis_lab_grouped['lab_result'], diagnosis_lab_grouped['ICD9_CODE'])
]

# Final transaction dataset: an independent copy of the first MAX_TRANSACTIONS
# visits, so later edits never write through to diagnosis_lab_grouped.
lab_diagnosis_transaction = (
    diagnosis_lab_grouped[['hadm_id', 'Items']].head(MAX_TRANSACTIONS).copy()
)

print(len(lab_diagnosis_transaction))

print("\nSample of prepared transactions:")
print(lab_diagnosis_transaction.head())

Remaining lab events after dropping NaN: 96695
Number of unique hospital visits with lab results: 1382
Number of unique hospital visits with diagnoses: 545497
Number of hospital visits with both lab results and diagnoses: 1382
50

Sample of prepared transactions:
      hadm_id                                              Items
0  20010003.0  [51221_Hematocrit, 51222_Hemoglobin, 51265_Pla...
1  20015927.0  [51221_Hematocrit, 51222_Hemoglobin, 51248_MCH...
2  20019162.0  [51221_Hematocrit, 51222_Hemoglobin, 51265_Pla...
3  20023045.0  [51221_Hematocrit, 51222_Hemoglobin, 51279_Red...
4  20023531.0  [50912_Creatinine, 50931_Glucose, 50970_Phosph...


In [None]:
# Thresholds used by the mining step:
#   min_support    - passed to fpgrowth() as the minimum itemset support
#   min_confidence - passed to association_rules() as the confidence threshold
# NOTE(review): the transaction set is truncated to 50 visits, so a support of
# 0.01 is below a single transaction (1/50 = 0.02) - confirm this is intended.
min_support, min_confidence = 0.01, 0.1

In [None]:
"""Run FP-Growth algorithm and generate association rules"""
# Upper bound on the number of items in a frequent itemset. Without a bound,
# a min_support below one transaction makes every subset of every transaction
# "frequent", and the number of itemsets grows exponentially with the
# transaction length, so the run does not finish in practice.
MAX_ITEMSET_LEN = 3

# One-hot encode the transactions: one row per visit, one boolean column per item
te = TransactionEncoder()
te_ary = te.fit(lab_diagnosis_transaction['Items']).transform(lab_diagnosis_transaction['Items'])
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)
print(f"Number of unique items: {len(te.columns_)}")
print(f"Number of transactions: {len(df_encoded)}")

# Make the degenerate-threshold case visible instead of silently hanging
if min_support * len(df_encoded) < 1:
    print(
        f"Warning: min_support={min_support} is below one transaction out of "
        f"{len(df_encoded)}; every observed itemset counts as frequent."
    )

print("\nRunning FP-Growth algorithm...")
print(f"Maximum itemset length: {MAX_ITEMSET_LEN}")

# Run FP-Growth (use_colnames=True keeps item names instead of column indices)
frequent_itemsets = fpgrowth(
    df_encoded,
    min_support=min_support,
    use_colnames=True,
    max_len=MAX_ITEMSET_LEN,
)
print(f"\nFound {len(frequent_itemsets)} frequent itemsets")

# Display top 10 frequent itemsets
print("\nTop 10 frequent itemsets:")
print(frequent_itemsets.sort_values('support', ascending=False).head(10))

print("\nGenerating association rules...")
print(f"Minimum confidence: {min_confidence}")

# Generate rules that reach the confidence threshold
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
print(f"\nGenerated {len(rules)} association rules")

# Sort rules by lift, strongest first; later cells rely on this ordering when
# they take the first N rows.
rules = rules.sort_values('lift', ascending=False)

# Display top 10 rules
print("\nTop 10 association rules by lift:")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10))


Number of unique items: 582
Number of transactions: 50

Running FP-Growth algorithm...


In [None]:
# Scatter plots of the pairwise relationships between support, confidence and
# lift for the first `top_n` rows of `rules`.
top_n = 20
top_rules = rules.head(top_n)

# (x column, y column, x label, y label) for each of the three panels
panels = [
    ('support', 'confidence', 'Support', 'Confidence'),
    ('support', 'lift', 'Support', 'Lift'),
    ('confidence', 'lift', 'Confidence', 'Lift'),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, (x_col, y_col, x_label, y_label) in zip(axes, panels):
    ax.scatter(top_rules[x_col], top_rules[y_col])
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(f'{x_label} vs {y_label}')

plt.tight_layout()
plt.show()

In [None]:
# `os` and `datetime` are already imported in the notebook's first cell.
from google.colab import files  # Colab-specific; only available on a Colab runtime

# Rules that go from lab results to diagnoses: every antecedent item is a lab
# item (no 'DIAG_' prefix) and every consequent item is a diagnosis item.
# This frame was referenced below but never defined, so a fresh
# "Restart & Run All" failed with a NameError.
# NOTE(review): definition inferred from the 'DIAG_' item prefix and the
# section title - adjust if a different filter was intended.
_is_lab_to_diag = pd.Series(
    [
        all(not str(item).startswith('DIAG_') for item in antecedent)
        and all(str(item).startswith('DIAG_') for item in consequent)
        for antecedent, consequent in zip(rules['antecedents'], rules['consequents'])
    ],
    index=rules.index,
    dtype=bool,
)
# `rules` is already sorted by lift, so the filtered frame is too.
lab_diagnosis_rules = rules.loc[_is_lab_to_diag]

# Define the output directory inside /content
output_dir = "/content/rule_outputs"

if not os.path.isdir(output_dir):
    # exist_ok guards against the directory appearing between check and create
    os.makedirs(output_dir, exist_ok=True)
    print(f"Created output directory: {output_dir}")

# Single timestamp so file names and the summary header agree
now = datetime.now()
timestamp = now.strftime("%Y%m%d_%H%M%S")

# File paths
all_rules_file = os.path.join(output_dir, f"all_rules_{timestamp}.csv")
lab_diagnosis_file = os.path.join(output_dir, f"lab_diagnosis_rules_{timestamp}.csv")
summary_file = os.path.join(output_dir, f"rules_summary_{timestamp}.txt")

# Save files
rules.to_csv(all_rules_file, index=False)
lab_diagnosis_rules.to_csv(lab_diagnosis_file, index=False)

rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']

# Explicit encoding: item names come from free-text lab labels
with open(summary_file, 'w', encoding='utf-8') as f:
    f.write("Association Rules Analysis Summary\n")
    f.write(f"Generated on: {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n")
    f.write(f"Total number of rules: {len(rules)}\n")
    f.write(f"Number of lab-diagnosis rules: {len(lab_diagnosis_rules)}\n\n")
    f.write("Top 10 rules by lift:\n")
    f.write(rules[rule_columns].head(10).to_string())
    f.write("\n\nTop 10 lab-diagnosis rules by lift:\n")
    f.write(lab_diagnosis_rules[rule_columns].head(10).to_string())

print("Files saved! Preparing download links...")

# Download to local machine
files.download(all_rules_file)
files.download(lab_diagnosis_file)
files.download(summary_file)


# Diagnosis --> Procedure

In [None]:
# Load diagnosis and procedure data.
# ICD codes are identifiers: read them as text so type inference can never
# turn an all-numeric code with a leading zero into an integer.
# Note: this re-reads diagnoses_icd.csv and replaces the `diagnoses` frame, so
# from here on its ICD9_CODE column carries the 'DX_' prefix.
procedures = pd.read_csv(
    "procedures_icd.csv",
    usecols=['subject_id', 'hadm_id', 'icd_code'],
    dtype={'icd_code': str},
)
diagnoses = pd.read_csv(
    "diagnoses_icd.csv",
    usecols=['subject_id', 'hadm_id', 'icd_code'],
    dtype={'icd_code': str},
)

# Prefix the codes so a diagnosis and a procedure that share the same code
# text remain distinct items ('DX_...' vs 'PR_...').
diagnoses['ICD9_CODE'] = 'DX_' + diagnoses['icd_code'].astype(str)
procedures['ICD9_CODE'] = 'PR_' + procedures['icd_code'].astype(str)

In [None]:
# Group the *prefixed* codes (ICD9_CODE: 'DX_...' / 'PR_...') per admission.
# Grouping the raw icd_code column discarded the prefixes, so a diagnosis and
# a procedure with the same code text became one indistinguishable item.
# The series is renamed to 'icd_code' so the merged frame keeps its
# icd_code_x / icd_code_y column names.
diagnosis_grouped = (
    diagnoses.groupby('hadm_id')['ICD9_CODE'].apply(list).rename('icd_code').reset_index()
)

# Same for procedures
procedure_grouped = (
    procedures.groupby('hadm_id')['ICD9_CODE'].apply(list).rename('icd_code').reset_index()
)

# Keep only admissions that have both diagnoses and procedures
# (icd_code_x = diagnoses, icd_code_y = procedures)
diagnosis_procedure_grouped = pd.merge(diagnosis_grouped, procedure_grouped, on='hadm_id', how='inner')

# Combine diagnoses and procedures into a single transaction (list concatenation per row)
diagnosis_procedure_grouped['Items'] = diagnosis_procedure_grouped['icd_code_x'] + diagnosis_procedure_grouped['icd_code_y']
print(diagnosis_procedure_grouped.head())

# Independent copy so later edits never write through to the grouped frame
diagnosis_procedure_transaction = diagnosis_procedure_grouped[['hadm_id', 'Items']].copy()


      hadm_id  \
0  20000045.0   
1  20000069.0   
2  20000102.0   
3  20000235.0   
4  20000239.0   

                                                                                                           icd_code_x  \
0         [A419, N390, C7951, C787, K56699, C779, K5100, B9620, K1230, D630, C679, E8339, Z87891, Z8616, G893, D6481]   
1                                                                                                 [O701, Z370, Z3A37]   
2                                                                                                 [64421, V235, V270]   
3  [5722, 42833, 5724, 5856, 6826, 1122, 5715, 3970, 4280, 2841, 45621, 5723, V4511, 45981, 42731, V5861, 2449, 2749]   
4              [I120, N186, I5022, I69354, E1140, Z992, F329, E785, I447, N400, K219, Z794, E875, I25119, N289, D649]   

                 icd_code_y  \
0                 [3E0436Z]   
1        [0KQM0ZZ, 10E0XZZ]   
2              [7359, 7309]   
3  [3723, 8856, 4523, 3995]   
4              

In [None]:
# Print the diagnosis+procedure transaction table (columns: hadm_id, Items).
print(diagnosis_procedure_transaction)

           hadm_id                                              Items
0       20000045.0  [A419, N390, C7951, C787, K56699, C779, K5100,...
1       20000069.0              [O701, Z370, Z3A37, 0KQM0ZZ, 10E0XZZ]
2       20000102.0                    [64421, V235, V270, 7359, 7309]
3       20000235.0  [5722, 42833, 5724, 5856, 6826, 1122, 5715, 39...
4       20000239.0  [I120, N186, I5022, I69354, E1140, Z992, F329,...
...            ...                                                ...
140732  29999415.0                   [55320, V4283, 2449, 5363, 5369]
140733  29999444.0  [0389, 5845, 59010, 29680, V4365, 71590, 0088,...
140734  29999616.0  [I5021, I240, I4892, I420, I952, I480, I340, F...
140735  29999625.0  [I614, T83511A, J9600, I5033, J150, N390, I481...
140736  29999693.0  [E6601, Z6842, I10, E785, Z23, K7581, M170, M4...

[140737 rows x 2 columns]
