In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ------------------------------------------------------------
# 1) Load Lung Cancer dataset (TRY OpenML first, else fallback)
# ------------------------------------------------------------
def load_lung_cancer():
    # ---- Option A: OpenML (preferred) ----
    try:
        from sklearn.datasets import fetch_openml
        X, y = fetch_openml(name="lung-cancer", as_frame=True, return_X_y=True)
        df = X.copy()
        df["target"] = y
        print("Loaded: OpenML lung-cancer")
        return df, "target"
    except Exception as e:
        print("OpenML load failed, trying UCI fallback...")

    # ---- Option B: UCI fallback (small classic dataset; contains '?' missing values) ----
    # UCI Lung Cancer dataset is often mirrored; this is a common raw mirror:
    # If this mirror is blocked in your environment, the code will still tell you what failed.
    import requests
    from io import StringIO

    uci_url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/lung-cancer.csv"
    try:
        csv_text = requests.get(uci_url, timeout=60).text
        # This dataset usually has class in the first column followed by attributes
        df = pd.read_csv(StringIO(csv_text), header=None)
        df = df.replace("?", np.nan)

        # first column is target
        df = df.rename(columns={0: "target"})
        print("Loaded: UCI lung-cancer (GitHub mirror)")
        return df, "target"
    except Exception as e:
        raise RuntimeError(
            "Could not load Lung Cancer dataset from OpenML or the UCI mirror. "
            "If your network blocks downloads, upload the dataset CSV and I’ll adapt the code instantly."
        )

df, target_col = load_lung_cancer()

# ------------------------------------------------------------
# 2) Separate X and y
# ------------------------------------------------------------
y = df[target_col]
X = df.drop(columns=[target_col])

# If target is non-numeric labels, encode to integers
if y.dtype == "object" or str(y.dtype).startswith("string"):
    y = y.astype(str).str.strip()
    y = pd.Series(LabelEncoder().fit_transform(y), name="target")

# ------------------------------------------------------------
# 3) Identify numeric/categorical columns (robust)
# ------------------------------------------------------------
# Convert numeric-looking columns safely; keep others categorical
X_numeric = X.apply(pd.to_numeric, errors="coerce")
numeric_cols = X_numeric.columns[X_numeric.notna().any()].tolist()

# columns that are not reliably numeric are treated as categorical
categorical_cols = [c for c in X.columns if c not in numeric_cols]

# Build a clean working X where numeric cols are numeric
X_clean = X.copy()
for c in numeric_cols:
    X_clean[c] = pd.to_numeric(X_clean[c], errors="coerce")

print("\nDataset shape:", X_clean.shape)
print("Numeric cols:", len(numeric_cols), "Categorical cols:", len(categorical_cols))
print("Classes:", np.unique(y))

# ------------------------------------------------------------
# 4) Train-test split
# ------------------------------------------------------------
# Stratify only if every class has at least 2 samples in this split
# (This dataset is small, so stratify may fail in rare cases; handle safely.)
try:
    X_tr, X_te, y_tr, y_te = train_test_split(
        X_clean, y, test_size=0.25, random_state=42, stratify=y
    )
except Exception:
    X_tr, X_te, y_tr, y_te = train_test_split(
        X_clean, y, test_size=0.25, random_state=42
    )

# ------------------------------------------------------------
# 5) Preprocessing pipelines
# ------------------------------------------------------------
# For Naive Bayes + ANN (scaling helps)
num_scaled = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# For Decision Tree (no scaling needed)
num_noscale = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocess_scaled = ColumnTransformer(
    transformers=[
        ("num", num_scaled, numeric_cols),
        ("cat", cat_pipe, categorical_cols)
    ],
    remainder="drop"
)

preprocess_tree = ColumnTransformer(
    transformers=[
        ("num", num_noscale, numeric_cols),
        ("cat", cat_pipe, categorical_cols)
    ],
    remainder="drop"
)

# ------------------------------------------------------------
# 6) Model A — Bayesian Classification (GaussianNB)
# ------------------------------------------------------------
nb_model = Pipeline(steps=[
    ("prep", preprocess_scaled),
    ("clf", GaussianNB())
])

nb_model.fit(X_tr, y_tr)
nb_pred = nb_model.predict(X_te)

print("\n==============================")
print("MODEL: Gaussian Naive Bayes")
print("==============================")
print("Accuracy:", accuracy_score(y_te, nb_pred))
print("Confusion Matrix:\n", confusion_matrix(y_te, nb_pred))
print("Classification Report:\n", classification_report(y_te, nb_pred, zero_division=0))

# ------------------------------------------------------------
# 7) Model B — Decision Tree (Entropy)
# ------------------------------------------------------------
dt_model = Pipeline(steps=[
    ("prep", preprocess_tree),
    ("clf", DecisionTreeClassifier(
        criterion="entropy",
        max_depth=10,         # control overfitting
        min_samples_leaf=2,
        random_state=42
    ))
])

dt_model.fit(X_tr, y_tr)
dt_pred = dt_model.predict(X_te)

print("\n==============================")
print("MODEL: Decision Tree (Entropy)")
print("==============================")
print("Accuracy:", accuracy_score(y_te, dt_pred))
print("Confusion Matrix:\n", confusion_matrix(y_te, dt_pred))
print("Classification Report:\n", classification_report(y_te, dt_pred, zero_division=0))

# ------------------------------------------------------------
# 8) Model C — ANN (MLPClassifier)
# ------------------------------------------------------------
ann_model = Pipeline(steps=[
    ("prep", preprocess_scaled),
    ("clf", MLPClassifier(
        hidden_layer_sizes=(64, 32),
        activation="relu",
        max_iter=800,
        random_state=42
    ))
])

ann_model.fit(X_tr, y_tr)
ann_pred = ann_model.predict(X_te)

print("\n==============================")
print("MODEL: ANN (MLP)")
print("==============================")
print("Accuracy:", accuracy_score(y_te, ann_pred))
print("Confusion Matrix:\n", confusion_matrix(y_te, ann_pred))
print("Classification Report:\n", classification_report(y_te, ann_pred, zero_division=0))


Loaded: OpenML lung-cancer

Dataset shape: (32, 56)
Numeric cols: 56 Categorical cols: 0
Classes: ['1' '2' '3']

MODEL: Gaussian Naive Bayes
Accuracy: 0.625
Confusion Matrix:
 [[0 1 1]
 [1 2 0]
 [0 0 3]]
Classification Report:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00         2
           2       0.67      0.67      0.67         3
           3       0.75      1.00      0.86         3

    accuracy                           0.62         8
   macro avg       0.47      0.56      0.51         8
weighted avg       0.53      0.62      0.57         8


MODEL: Decision Tree (Entropy)
Accuracy: 0.75
Confusion Matrix:
 [[1 0 1]
 [1 2 0]
 [0 0 3]]
Classification Report:
               precision    recall  f1-score   support

           1       0.50      0.50      0.50         2
           2       1.00      0.67      0.80         3
           3       0.75      1.00      0.86         3

    accuracy                           0.75         8
 