In [None]:

# Final Optimized Medication Prediction Pipeline with GPU Support + Frequency Filtering + Hyperparameter Tuning
import pandas as pd
import numpy as np
import ast
from itertools import chain
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score, hamming_loss, accuracy_score, classification_report
from sklearn.multioutput import ClassifierChain
from catboost import CatBoostClassifier

# Step 1: Read the source CSV into the working DataFrame
file_path = "checking_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Step 2: Safely parse all multi-label columns
def safe_eval_list(x):
    """Parse a cell containing a Python literal into a list of strings.

    The value is stringified and handed to ``ast.literal_eval``.

    Args:
        x: Raw cell value (any type; it is converted with ``str`` first).

    Returns:
        * a list with every element converted to ``str`` when the literal is a
          list or tuple;
        * a one-element list holding ``str(value)`` for any other literal;
        * an empty list when the text is not a valid Python literal
          (for example a NaN cell, which stringifies to ``"nan"``).
    """
    try:
        val = ast.literal_eval(str(x))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the failures literal_eval raises for malformed input are treated
        # as "no codes"; KeyboardInterrupt / SystemExit must still propagate.
        return []
    if isinstance(val, (list, tuple)):
        return [str(item) for item in val]
    return [str(val)]

# Columns stored as Python-literal lists are parsed with safe_eval_list.
for list_col in ("CONDITION_CODES", "PROCEDURE_CODES", "MEDICATION_CODES"):
    df[list_col] = df[list_col].apply(safe_eval_list)


def _numeric_allergy_codes(raw):
    """Split a comma-separated cell and keep only the all-digit tokens (stripped)."""
    tokens = (piece.strip() for piece in str(raw).split(','))
    return [token for token in tokens if token.isdigit()]


# Allergy codes are comma-separated text rather than a Python literal.
df["ALLERGIES_CODE"] = df["ALLERGIES_CODE"].apply(_numeric_allergy_codes)

# Step 3: Restrict targets to the 200 most frequent medication codes
# Count every code occurrence across all rows, then keep the 200 largest counts.
med_code_counts = pd.Series(chain.from_iterable(df["MEDICATION_CODES"])).value_counts()
top_med_codes = med_code_counts.nlargest(200).index


def _keep_top_meds(meds):
    """Return the codes from `meds` that are among the retained top codes."""
    return [m for m in meds if m in top_med_codes]


df["MEDICATION_CODES"] = df["MEDICATION_CODES"].apply(_keep_top_meds)

# Rows left without any retained medication code carry no target and are removed.
has_medication = df["MEDICATION_CODES"].map(len) > 0
df = df[has_medication]

# Step 4: Drop rows with a missing value in any of the listed feature columns.
# `df` at this point is the result of a boolean-mask selection, and calling
# dropna(inplace=True) on such a frame can raise pandas' chained-assignment
# warning; reassigning the returned frame yields the same rows without
# mutating a derived frame in place.
df = df.dropna(subset=["AGE", "Weight_kg", "Height_cm", "GENDER", "RACE", "ETHNICITY", "BP_CITY"])

# Step 5: Multi-hot encode the medication lists (prediction targets)
mlb_med = MultiLabelBinarizer()
y = mlb_med.fit_transform(df["MEDICATION_CODES"])

# Step 6: Multi-hot encode each list-valued input column.
# Every column gets its own binarizer, fitted and then applied to that same column.
mlb_proc = MultiLabelBinarizer()
mlb_proc.fit(df["PROCEDURE_CODES"])
X_proc = mlb_proc.transform(df["PROCEDURE_CODES"])

mlb_cond = MultiLabelBinarizer()
mlb_cond.fit(df["CONDITION_CODES"])
X_cond = mlb_cond.transform(df["CONDITION_CODES"])

mlb_alg = MultiLabelBinarizer()
mlb_alg.fit(df["ALLERGIES_CODE"])
X_alg = mlb_alg.transform(df["ALLERGIES_CODE"])

# Step 7: Compress each multi-hot block with truncated SVD.
# The component count is capped at the number of columns actually present
# (at most 50 for procedures/conditions, 30 for allergies).
# NOTE(review): these projections are fitted on every row, i.e. before the
# train/test split further down, so held-out rows influence the learned
# components — confirm this is acceptable for the reported metrics.
n_proc_components = min(50, X_proc.shape[1])
svd_proc = TruncatedSVD(n_components=n_proc_components, random_state=42)
X_proc_reduced = svd_proc.fit_transform(X_proc)

n_cond_components = min(50, X_cond.shape[1])
svd_cond = TruncatedSVD(n_components=n_cond_components, random_state=42)
X_cond_reduced = svd_cond.fit_transform(X_cond)

n_alg_components = min(30, X_alg.shape[1])
svd_alg = TruncatedSVD(n_components=n_alg_components, random_state=42)
X_alg_reduced = svd_alg.fit_transform(X_alg)

# Step 8: Integer-encode the categorical columns and collect the numeric ones
label_cols = ["BP_CITY", "MARITAL", "RACE", "ETHNICITY", "GENDER", "ENCOUNTER_CODE"]

label_encoders = {}
encoded_columns = []
for col in label_cols:
    # Values are cast to str before encoding, so a missing value becomes its
    # string form and is encoded as a category of its own.
    as_text = df[col].astype(str)
    encoder = LabelEncoder().fit(as_text)
    label_encoders[col] = encoder
    encoded_columns.append(encoder.transform(as_text))

# One integer column per categorical feature, in label_cols order.
X_cat = np.column_stack(encoded_columns)

numeric_cols = ["AGE", "Weight_kg", "Height_cm"]
X_num = df[numeric_cols].values

# Assemble the design matrix: reduced multi-hot blocks, then categorical codes,
# then numeric columns, joined column-wise.
feature_blocks = [X_proc_reduced, X_cond_reduced, X_alg_reduced, X_cat, X_num]
X = np.concatenate(feature_blocks, axis=1)

# Step 9: Hold out 20% of the rows for evaluation (seeded for repeatability)
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

# Step 10: Keep only target columns that take more than one value in training
y_train_valid_idx = [
    label_pos
    for label_pos, label_column in enumerate(y_train.T)
    if np.unique(label_column).size > 1
]
y_train, y_test = y_train[:, y_train_valid_idx], y_test[:, y_train_valid_idx]

# Shrink the binarizer's class list to the surviving columns so that
# inverse_transform lines up with the filtered target matrices.
mlb_med.classes_ = mlb_med.classes_[y_train_valid_idx]

# Step 11: CatBoost base learner with fixed hyperparameters, trained on GPU
# NOTE(review): no hyperparameter search is run here; the values are hard-coded.
# NOTE(review): confirm the `devices` string format against the CatBoost docs
# and that the listed GPUs exist on the machine running this notebook.
catboost_params = {
    "iterations": 600,
    "depth": 8,
    "learning_rate": 0.03,
    "task_type": 'GPU',
    "devices": '0,1',
    "verbose": 100,
}
tuned_model = CatBoostClassifier(**catboost_params)

# Wrap the base learner in a classifier chain and fit it on the training targets.
model = ClassifierChain(tuned_model)
model.fit(X_train, y_train)

# Step 12: Predict on the held-out rows and report per-label metrics
y_pred = model.predict(X_test)

# Convert both indicator matrices back to tuples of medication codes.
y_true_label = mlb_med.inverse_transform(y_test)
y_pred_label = mlb_med.inverse_transform(y_pred)

report_text = classification_report(y_test, y_pred, zero_division=0)
print("\nClassification Report:")
print(report_text)

# Step 13: Sample Predictions
# Positions *within the test split* of the first ten rows that have at least
# one true medication label.
sample_input_idx = np.where(y_test.sum(axis=1) > 0)[0][:10]

# Row i of y_test / y_pred is NOT row i of df: the split shuffles rows.
# Replaying the split on plain row positions, with the same test_size and
# random_state as the train_test_split call that produced y_test, recovers
# which df row each test row came from. Keep both arguments in sync with it.
_, test_row_positions = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)
assert len(test_row_positions) == y_test.shape[0], "replayed split does not match y_test"

# Select the df rows that actually correspond to the sampled test rows, so the
# displayed features belong to the labels shown next to them.
sample_df = df.iloc[test_row_positions[sample_input_idx]].copy().reset_index(drop=True)
sample_df["Actual_Medication_Codes"] = [", ".join(y_true_label[i]) for i in sample_input_idx]
sample_df["Predicted_Medication_Codes"] = [", ".join(y_pred_label[i]) for i in sample_input_idx]


0:	learn: 0.5922968	total: 1.29s	remaining: 12m 50s
100:	learn: 0.0125095	total: 11.1s	remaining: 54.7s
200:	learn: 0.0106590	total: 21.3s	remaining: 42.2s
300:	learn: 0.0096368	total: 31.3s	remaining: 31.1s
400:	learn: 0.0085921	total: 41.6s	remaining: 20.6s
500:	learn: 0.0078775	total: 53.8s	remaining: 10.6s
599:	learn: 0.0073341	total: 1m 7s	remaining: 0us
0:	learn: 0.5798768	total: 64.8ms	remaining: 38.8s
100:	learn: 0.0033867	total: 8.77s	remaining: 43.3s
200:	learn: 0.0023643	total: 18.2s	remaining: 36.2s
300:	learn: 0.0018647	total: 27.5s	remaining: 27.4s
400:	learn: 0.0014773	total: 37.3s	remaining: 18.5s
500:	learn: 0.0011868	total: 47.6s	remaining: 9.41s
599:	learn: 0.0010167	total: 57.9s	remaining: 0us
0:	learn: 0.5832059	total: 63.8ms	remaining: 38.2s
100:	learn: 0.0005846	total: 8.87s	remaining: 43.8s
200:	learn: 0.0002065	total: 19.1s	remaining: 37.9s
300:	learn: 0.0001105	total: 30.7s	remaining: 30.5s
400:	learn: 0.0000761	total: 40.8s	remaining: 20.2s
500:	learn: 0.0000

In [3]:
# Aggregate multi-label metrics on the held-out split.
print("\n✅ Evaluation Metrics")
metric_rows = (
    ("Micro F1 Score:", f1_score(y_test, y_pred, average="micro")),
    ("Hamming Loss:", hamming_loss(y_test, y_pred)),
    ("Subset Accuracy (Exact Match):", accuracy_score(y_test, y_pred)),
)
for metric_label, metric_value in metric_rows:
    print(metric_label, metric_value)



Evaluation Metrics
Micro F1 Score: 0.8512833327853221
Hamming Loss: 0.010499031056511825
Subset Accuracy (Exact Match): 0.6229234361319195


In [6]:
# Show the available columns, then actual vs. predicted codes side by side.
comparison_cols = ["Actual_Medication_Codes", "Predicted_Medication_Codes"]
print("\n📋 Sample Predictions:")
print(sample_df.columns)
print(sample_df.loc[:, comparison_cols].head(15))


📋 Sample Predictions:
Index(['ENCOUNTER', 'PATIENT', 'DATE', 'ENCOUNTER_CODE',
       'ENCOUNTER_DESCRIPTION', 'CONDITION_CODES', 'CONDITION_DESCRIPTIONS',
       'MEDICATION_CODES', 'MEDICATION_DESCRIPTIONS', 'PROCEDURE_CODES',
       'PROCEDURE_DESCRIPTIONS', 'Height_cm', 'Weight_kg', 'BP_CITY', 'STATE',
       'ZIP', 'ALLERGIES', 'ALLERGIES_CODE', 'BIRTHDATE', 'DEATHDATE', 'SSN',
       'DRIVERS', 'PASSPORT', 'PREFIX', 'FIRST', 'LAST', 'SUFFIX', 'MAIDEN',
       'MARITAL', 'RACE', 'ETHNICITY', 'GENDER', 'BIRTHPLACE', 'ADDRESS',
       'AGE', 'BMI', 'ENCOUNTER_LENGTH', 'PROCEDURE_LENGTH',
       'CONDITION_LENGTH', 'Actual_Medication_Codes',
       'Predicted_Medication_Codes'],
      dtype='object')
   Actual_Medication_Codes Predicted_Medication_Codes
0                  1020137                     608680
1           313782, 861467             849574, 861467
2   608680, 745679, 895994     608680, 745679, 895994
3                  1049630                     834060
4                

In [3]:
import joblib

# Persist the fitted pipeline pieces, written in this order:
# the classifier chain, the medication (target) binarizer, the three input
# binarizers, the categorical LabelEncoders, and the SVD projections.
artifacts = [
    (model, "medication_chain_catboost.pkl"),
    (mlb_med, "mlb_medication.pkl"),
    (mlb_proc, "mlb_procedure.pkl"),
    (mlb_cond, "mlb_condition.pkl"),
    (mlb_alg, "mlb_allergy.pkl"),
    (label_encoders, "label_encoders.pkl"),
    (svd_proc, "svd_proc.pkl"),
    (svd_cond, "svd_cond.pkl"),
]
for artifact, artifact_path in artifacts:
    joblib.dump(artifact, artifact_path)

# Kept as the final bare expression so the cell still displays its return value.
joblib.dump(svd_alg, "svd_alg.pkl")


['svd_alg.pkl']