<a href="https://colab.research.google.com/github/2403A51L33/PfDS-PROJECT/blob/main/FULL%20CODE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os
import zipfile
import glob
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

try:
    import xgboost as xgb
    HAS_XGB = True
except Exception:
    HAS_XGB = False

try:
    import lightgbm as lgb
    HAS_LGB = True
except Exception:
    HAS_LGB = False

import shap
import joblib
import matplotlib.pyplot as plt

RANDOM_STATE = 42
TEST_SIZE = 0.2

DATASET_PATH = "/content/realistic_drug_labels_side_effects.csv"

def find_csv_in_zip(zip_path):
    if not os.path.exists(zip_path):
        return None
    extract_dir = os.path.splitext(zip_path)[0] + "_extracted"
    os.makedirs(extract_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path, "r") as z:
        z.extractall(extract_dir)
    csvs = glob.glob(os.path.join(extract_dir, "**", "*.csv"), recursive=True)
    return csvs[0] if csvs else None

def find_first_csv_in_dir(search_dir="/mnt/data"):
    csvs = glob.glob(os.path.join(search_dir, "**", "*.csv"), recursive=True)
    return csvs[0] if csvs else None

def load_dataset():
    if os.path.exists(DATASET_PATH):
        print("Found CSV at specified path:", DATASET_PATH)
        return pd.read_csv(DATASET_PATH)
    raise FileNotFoundError("No CSV found. Place dataset in /mnt/data or set DATASET_PATH correctly.")

df = load_dataset()
print("Dataset shape:", df.shape)

TARGET_COL = "side_effect_severity"
if TARGET_COL not in df.columns:
    for alt in ["approval_status", "target", "label"]:
        if alt in df.columns:
            TARGET_COL = alt
            print("Using alternate target:", TARGET_COL)
            break
    else:
        raise ValueError(f"Target column '{TARGET_COL}' not found. Columns: {df.columns.tolist()}")

X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL].copy().astype("category").cat.codes

numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

for col in numeric_cols[:]:
    if X[col].nunique() <= 20:
        numeric_cols.remove(col)
        categorical_cols.append(col)

print("Numeric cols:", numeric_cols)
print("Categorical cols:", categorical_cols)

numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols)
], remainder="drop")

rf = RandomForestClassifier(
    n_estimators=400, max_depth=None, class_weight="balanced",
    n_jobs=-1, random_state=RANDOM_STATE
)
estimators = [("rf", rf)]

if HAS_XGB:
    xgb_c = xgb.XGBClassifier(
        n_estimators=400, learning_rate=0.05, max_depth=6,
        subsample=0.9, colsample_bytree=0.8, use_label_encoder=False,
        eval_metric="mlogloss", random_state=RANDOM_STATE
    )
    estimators.append(("xgb", xgb_c))

if HAS_LGB:
    lgb_c = lgb.LGBMClassifier(
        n_estimators=400, learning_rate=0.05, num_leaves=63,
        feature_fraction=0.9, bagging_fraction=0.8,
        random_state=RANDOM_STATE, n_jobs=-1
    )
    estimators.append(("lgbm", lgb_c))

if len(estimators) < 2:
    estimators.append(("rf2", RandomForestClassifier(
        n_estimators=300, random_state=RANDOM_STATE+1, class_weight="balanced"
    )))

meta = LogisticRegression(max_iter=1000, class_weight="balanced")

stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator=meta,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    n_jobs=-1,
    passthrough=False
)

full_pipeline = ImbPipeline([
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=RANDOM_STATE)),
    ("stack", stack_model)
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

print("\nTraining stacked ensemble model...")
full_pipeline.fit(X_train, y_train)

y_pred = full_pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1m = f1_score(y_test, y_pred, average="macro")

print("\nClassification Report (Test):")
print(classification_report(y_test, y_pred, digits=4))
print(f"Final Test Accuracy: {acc:.4f}")
print(f"Final Test F1 (macro): {f1m:.4f}")
print("\nBaseline GradientBoosting F1 ≈ 0.3419 → Check if improved!")

os.makedirs("models", exist_ok=True)
joblib.dump(full_pipeline, "models/stacked_pipeline_fixed.joblib")
print("\nSaved model to models/stacked_pipeline_fixed.joblib")

print("\nComputing SHAP explainability (tree-based model)...")
try:
    rf_model = full_pipeline.named_steps["stack"].estimators_[0]
    X_train_proc = full_pipeline.named_steps["preprocessor"].transform(X_train)
    background = shap.utils.sample(X_train_proc, 100)
    explainer = shap.TreeExplainer(rf_model)
    X_test_proc = full_pipeline.named_steps["preprocessor"].transform(X_test)
    sample = shap.utils.sample(X_test_proc, min(50, X_test_proc.shape[0]))
    shap_values = explainer.shap_values(sample)
    shap.summary_plot(shap_values, sample, show=False)
    plt.tight_layout()
    plt.savefig("shap_summary.png", bbox_inches="tight")
    print("Saved SHAP summary plot → shap_summary.png")
except Exception as e:
    print("SHAP skipped due to error:", e)

print("\n Script finished successfully.")

Found CSV at specified path: /content/realistic_drug_labels_side_effects.csv
Dataset shape: (1436, 15)
Numeric cols: ['approval_year', 'dosage_mg', 'price_usd']

Training stacked ensemble model...

Classification Report (Test):
              precision    recall  f1-score   support

           0     0.3942    0.4184    0.4059        98
           1     0.3176    0.2784    0.2967        97
           2     0.2727    0.2903    0.2812        93

    accuracy                         0.3299       288
   macro avg     0.3282    0.3290    0.3280       288
weighted avg     0.3292    0.3299    0.3289       288

Final Test Accuracy: 0.3299
Final Test F1 (macro): 0.3280

Baseline GradientBoosting F1 ≈ 0.3419 → Check if improved!

Saved model to models/stacked_pipeline_fixed.joblib

Computing SHAP explainability (tree-based model)...
SHAP skipped due to error: Additivity check failed in TreeExplainer! Please ensure the data matrix you passed to the explainer is the same shape that the model was tra