In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, recall_score,
    f1_score, matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Colab-only setup: mount Google Drive at /content/drive so that files stored
# under /content/drive/MyDrive (such as the dataset CSV) can be read by path.
from google.colab import drive
drive.mount('/content/drive')



# ========== CHANGE THESE TWO LINES ==========
CSV_PATH = "/content/drive/MyDrive/dataSet/ml/data.csv"
TARGET_COL = "diagnosis"   # e.g., "diagnosis" or "target" (change if needed)
# ===========================================

df = pd.read_csv(CSV_PATH)

# Drop export-artifact columns such as "Unnamed: 32".
df = df.loc[:, ~df.columns.str.contains(r"^Unnamed", regex=True)]

print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

# Fail early, with the list of real columns, if the configured target is absent.
if TARGET_COL not in df.columns:
    raise KeyError(
        f"TARGET_COL={TARGET_COL!r} not found in {CSV_PATH}; "
        f"available columns: {df.columns.tolist()}"
    )

# Split features/target.
# TARGET_COL is taken from the config lines above and is intentionally NOT
# reassigned here, so editing the config actually changes the target used.
X = df.drop(columns=[TARGET_COL]).copy()

# Drop ID if present (an identifier column is not a feature).
if "id" in X.columns:
    X = X.drop(columns=["id"])

# Encode the target labels as integers 0..n_classes-1.
y_raw = df[TARGET_COL].astype(str)

le = LabelEncoder()
y = le.fit_transform(y_raw)

# Cast to built-in str/int so the mapping prints plainly, e.g. {'B': 0, 'M': 1},
# instead of showing NumPy scalar reprs such as np.int64(0).
target_mapping = {
    str(label): int(code)
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
print("Target mapping:", target_mapping)

# Train/test split (stratify keeps class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Class balance (train):", np.bincount(y_train))
print("Class balance (test):", np.bincount(y_test))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Shape: (569, 32)
Columns: ['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst']
Target mapping: {'B': np.int64(0), 'M': np.int64(1)}
Train shape: (455, 30) Test shape: (114, 30)
Class balance (train): [285 170]
Class balance (test): [72 42]


# Model 1: Logistic Regression

In [5]:
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, recall_score,
    f1_score, matthews_corrcoef
)

# Numeric preprocessing: median-impute missing values, then standardize.
preprocess = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Preprocessing + logistic regression, fitted on the training split.
# (Pipeline.fit returns the pipeline itself, so it can be chained.)
lr = Pipeline(
    steps=[
        ("prep", preprocess),
        ("model", LogisticRegression(solver="lbfgs", max_iter=5000)),
    ]
).fit(X_train, y_train)

# Hard labels for the threshold-based metrics; column 1 of predict_proba is
# the probability of class 1 (Malignant) and feeds the AUC.
y_pred = lr.predict(X_test)
y_proba = lr.predict_proba(X_test)[:, 1]

# Test-set metrics, kept in a dict so the comparison table can reuse them.
metrics_lr = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "AUC": roc_auc_score(y_test, y_proba),
    **{
        label: scorer(y_test, y_pred, zero_division=0)
        for label, scorer in (
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1", f1_score),
        )
    },
    "MCC": matthews_corrcoef(y_test, y_pred),
}

print("\n=== Model 1: Logistic Regression Metrics ===")
print("\n".join(f"{label:10s}: {score:.6f}" for label, score in metrics_lr.items()))



=== Model 1: Logistic Regression Metrics ===
Accuracy  : 0.964912
AUC       : 0.996032
Precision : 0.975000
Recall    : 0.928571
F1        : 0.951220
MCC       : 0.924518


# Model 2: Decision Tree Classifier

In [6]:
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, recall_score,
    f1_score, matthews_corrcoef
)

# Scaling is not needed for tree models, so preprocessing is limited to
# median imputation of missing values (if any).
prep_tree = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Baseline tree: default hyperparameters (tune later only if needed) with a
# fixed seed, fitted on the training split.
dt = Pipeline(
    steps=[
        ("prep", prep_tree),
        ("model", DecisionTreeClassifier(random_state=42)),
    ]
).fit(X_train, y_train)

# Hard labels for the threshold-based metrics; the tree also exposes
# predict_proba, whose class-1 column is used for the AUC.
y_pred = dt.predict(X_test)
y_proba = dt.predict_proba(X_test)[:, 1]

# Test-set metrics, kept in a dict so the comparison table can reuse them.
metrics_dt = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "AUC": roc_auc_score(y_test, y_proba),
    **{
        label: scorer(y_test, y_pred, zero_division=0)
        for label, scorer in (
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1", f1_score),
        )
    },
    "MCC": matthews_corrcoef(y_test, y_pred),
}

print("\n=== Model 2: Decision Tree Metrics ===")
print("\n".join(f"{label:10s}: {score:.6f}" for label, score in metrics_dt.items()))



=== Model 2: Decision Tree Metrics ===
Accuracy  : 0.929825
AUC       : 0.924603
Precision : 0.904762
Recall    : 0.904762
F1        : 0.904762
MCC       : 0.849206


# Model 3: K-Nearest Neighbors (KNN)

In [7]:
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, recall_score,
    f1_score, matthews_corrcoef
)

# KNN baseline: median imputation and standardization ahead of the classifier.
# Settings: 5 neighbours (the default baseline), distance-weighted votes, and
# the Minkowski metric with p=2, i.e. Euclidean distance.
knn = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        (
            "model",
            KNeighborsClassifier(
                n_neighbors=5,
                weights="distance",
                metric="minkowski",
                p=2,
            ),
        ),
    ]
).fit(X_train, y_train)

# Hard labels for the threshold-based metrics; KNN supports predict_proba,
# whose class-1 column is used for the AUC.
y_pred = knn.predict(X_test)
y_proba = knn.predict_proba(X_test)[:, 1]

# Test-set metrics, kept in a dict so the comparison table can reuse them.
metrics_knn = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "AUC": roc_auc_score(y_test, y_proba),
    **{
        label: scorer(y_test, y_pred, zero_division=0)
        for label, scorer in (
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1", f1_score),
        )
    },
    "MCC": matthews_corrcoef(y_test, y_pred),
}

print("\n=== Model 3: KNN Metrics ===")
print("\n".join(f"{label:10s}: {score:.6f}" for label, score in metrics_knn.items()))



=== Model 3: KNN Metrics ===
Accuracy  : 0.956140
AUC       : 0.982474
Precision : 0.974359
Recall    : 0.904762
F1        : 0.938272
MCC       : 0.905824


# Model 4: Naive Bayes (GaussianNB)

In [8]:
import numpy as np

from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, recall_score,
    f1_score, matthews_corrcoef
)

# GaussianNB does not need scaling (scaling would be harmless, though), so the
# pipeline is kept simple: median imputation followed by the classifier,
# fitted on the training split.
gnb = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("model", GaussianNB()),
    ]
).fit(X_train, y_train)

# Hard labels for the threshold-based metrics; GaussianNB has predict_proba,
# whose class-1 column is used for the AUC.
y_pred = gnb.predict(X_test)
y_proba = gnb.predict_proba(X_test)[:, 1]

# Test-set metrics, kept in a dict so the comparison table can reuse them.
metrics_gnb = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "AUC": roc_auc_score(y_test, y_proba),
    **{
        label: scorer(y_test, y_pred, zero_division=0)
        for label, scorer in (
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1", f1_score),
        )
    },
    "MCC": matthews_corrcoef(y_test, y_pred),
}

print("\n=== Model 4: Gaussian Naive Bayes Metrics ===")
print("\n".join(f"{label:10s}: {score:.6f}" for label, score in metrics_gnb.items()))



=== Model 4: Gaussian Naive Bayes Metrics ===
Accuracy  : 0.938596
AUC       : 0.993386
Precision : 1.000000
Recall    : 0.833333
F1        : 0.909091
MCC       : 0.871489


# Model 5: Random Forest (Ensemble)

In [9]:
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, recall_score,
    f1_score, matthews_corrcoef
)

# Random forest baseline: median imputation feeding a 500-tree forest with a
# fixed seed. class_weight="balanced" is kept because it helps if the classes
# are imbalanced; n_jobs=-1 requests parallel training.
rf = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        (
            "model",
            RandomForestClassifier(
                n_estimators=500,
                class_weight="balanced",
                random_state=42,
                n_jobs=-1,
            ),
        ),
    ]
).fit(X_train, y_train)

# Hard labels for the threshold-based metrics; the forest has predict_proba,
# whose class-1 column is used for the AUC.
y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:, 1]

# Test-set metrics, kept in a dict so the comparison table can reuse them.
metrics_rf = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "AUC": roc_auc_score(y_test, y_proba),
    **{
        label: scorer(y_test, y_pred, zero_division=0)
        for label, scorer in (
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1", f1_score),
        )
    },
    "MCC": matthews_corrcoef(y_test, y_pred),
}

print("\n=== Model 5: Random Forest Metrics ===")
print("\n".join(f"{label:10s}: {score:.6f}" for label, score in metrics_rf.items()))



=== Model 5: Random Forest Metrics ===
Accuracy  : 0.973684
AUC       : 0.996693
Precision : 1.000000
Recall    : 0.928571
F1        : 0.962963
MCC       : 0.944155


# Model 6: XGBoost Classifier (Ensemble)

In [10]:
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, recall_score,
    f1_score, matthews_corrcoef
)

from xgboost import XGBClassifier

# Gradient-boosted trees: median imputation feeding an XGBoost binary
# classifier (600 rounds, learning rate 0.05, depth 4, 85% row and column
# subsampling, L2 penalty 1.0), seeded for repeatability and fitted on the
# training split.
xgb = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        (
            "model",
            XGBClassifier(
                objective="binary:logistic",
                eval_metric="auc",
                n_estimators=600,
                learning_rate=0.05,
                max_depth=4,
                subsample=0.85,
                colsample_bytree=0.85,
                reg_lambda=1.0,
                random_state=42,
                n_jobs=-1,
            ),
        ),
    ]
).fit(X_train, y_train)

# Hard labels for the threshold-based metrics; the class-1 column of
# predict_proba is used for the AUC.
y_pred = xgb.predict(X_test)
y_proba = xgb.predict_proba(X_test)[:, 1]

# Test-set metrics, kept in a dict so the comparison table can reuse them.
metrics_xgb = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "AUC": roc_auc_score(y_test, y_proba),
    **{
        label: scorer(y_test, y_pred, zero_division=0)
        for label, scorer in (
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1", f1_score),
        )
    },
    "MCC": matthews_corrcoef(y_test, y_pred),
}

print("\n=== Model 6: XGBoost Metrics ===")
print("\n".join(f"{label:10s}: {score:.6f}" for label, score in metrics_xgb.items()))



=== Model 6: XGBoost Metrics ===
Accuracy  : 0.964912
AUC       : 0.992725
Precision : 1.000000
Recall    : 0.904762
F1        : 0.950000
MCC       : 0.925820


# Final comparison table

In [11]:
import pandas as pd

# Combine the per-model metric dicts (metrics_lr, metrics_dt, metrics_knn,
# metrics_gnb, metrics_rf, metrics_xgb) into one table: the dict-of-dicts
# constructor puts models in columns, so transpose to get one row per model
# and one column per metric.
summary = (
    pd.DataFrame({
        "Logistic Regression": metrics_lr,
        "Decision Tree": metrics_dt,
        "KNN": metrics_knn,
        "Gaussian NB": metrics_gnb,
        "Random Forest": metrics_rf,
        "XGBoost": metrics_xgb,
    })
    .T
    .round(6)              # nice rounding for display and export
    .rename_axis("Model")  # name the index so the CSV header is not blank
)

# Save to CSV for submission; the named index becomes the "Model" column
# instead of an "Unnamed: 0" column when the file is read back.
summary.to_csv("ml_assignment2_model_metrics_summary.csv", index=True)

# Only the last expression of a cell is rendered, so display the table here.
summary

Unnamed: 0,Accuracy,AUC,Precision,Recall,F1,MCC
Logistic Regression,0.964912,0.996032,0.975,0.928571,0.95122,0.924518
Decision Tree,0.929825,0.924603,0.904762,0.904762,0.904762,0.849206
KNN,0.95614,0.982474,0.974359,0.904762,0.938272,0.905824
Gaussian NB,0.938596,0.993386,1.0,0.833333,0.909091,0.871489
Random Forest,0.973684,0.996693,1.0,0.928571,0.962963,0.944155
XGBoost,0.964912,0.992725,1.0,0.904762,0.95,0.92582
