In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, recall_score,
    f1_score, matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from google.colab import drive
# Attach Google Drive to the Colab runtime so the CSV path below can be read.
drive.mount("/content/drive")


# ---------------- Settings: edit these two values ----------------
CSV_PATH = "/content/drive/MyDrive/dataSet/ml/data.csv"
TARGET_COL = "diagnosis"   # label column, e.g. "diagnosis" or "target"
# -----------------------------------------------------------------

df = pd.read_csv(CSV_PATH)

# Discard every column whose name begins with "Unnamed" (e.g. "Unnamed: 32");
# all other columns are kept unchanged.
is_unnamed_col = df.columns.str.contains(r"^Unnamed", regex=True)
df = df.loc[:, ~is_unnamed_col]

# Quick look at what was loaded.
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

# Split features/target.
# TARGET_COL comes from the configuration lines at the top of this cell. It is
# deliberately NOT re-assigned here, so editing it there actually takes effect.

# Fail early on missing labels: astype(str) below would otherwise turn NaN into
# the literal string "nan", which LabelEncoder would treat as an extra class.
n_missing_labels = int(df[TARGET_COL].isna().sum())
if n_missing_labels:
    raise ValueError(
        f"Target column {TARGET_COL!r} has {n_missing_labels} missing value(s); "
        "drop or impute them before encoding."
    )

# Features = every column except the target; the "id" column is dropped too
# when present. errors="ignore" applies only to the optional "id" column, so a
# wrong TARGET_COL still raises a KeyError.
X = df.drop(columns=[TARGET_COL]).drop(columns=["id"], errors="ignore")

# Encode the string class labels as integer codes.
# (LabelEncoder is already imported at the top of this cell.)
y_raw = df[TARGET_COL].astype(str)
le = LabelEncoder()
y = le.fit_transform(y_raw)

# Cast the codes to plain int so the mapping prints as e.g. {'B': 0, 'M': 1}
# instead of showing NumPy scalar reprs.
target_mapping = {
    label: int(code)
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
print("Target mapping:", target_mapping)

# Train/test split: hold out 20% of the rows for testing. Stratifying on y
# keeps the class proportions approximately equal in both splits, and the fixed
# random_state makes the split reproducible.
# (train_test_split and np are already imported at the top of this cell, so
# they are not re-imported here.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Sanity check: together the two splits must account for every row of X.
assert len(X_train) + len(X_test) == len(X), "train/test split lost or duplicated rows"

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
# np.bincount gives the number of samples per integer class code.
print("Class balance (train):", np.bincount(y_train))
print("Class balance (test):", np.bincount(y_test))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Shape: (569, 32)
Columns: ['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst']
Target mapping: {'B': np.int64(0), 'M': np.int64(1)}
Train shape: (455, 30) Test shape: (114, 30)
Class balance (train): [285 170]
Class balance (test): [72 42]


# Model 5: Random Forest (Ensemble)

In [9]:
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, recall_score,
    f1_score, matthews_corrcoef
)

# Two-stage pipeline: fill missing feature values with the per-column median,
# then classify with a 500-tree random forest. The seed is fixed for
# repeatable results, n_jobs=-1 uses all available cores, and
# class_weight="balanced" re-weights classes to offset imbalance.
rf = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    (
        "model",
        RandomForestClassifier(
            class_weight="balanced",
            n_estimators=500,
            n_jobs=-1,
            random_state=42,
        ),
    ),
])

# Fit on the training split, then predict hard labels for the test split
# (fit() returns the fitted pipeline, so the two calls can be chained).
y_pred = rf.fit(X_train, y_train).predict(X_test)

# Probability of the class in column 1 of predict_proba; used as the score for AUC.
y_proba = rf.predict_proba(X_test)[:, 1]

# Collect the evaluation scores. AUC uses the probabilities; every other
# metric uses the hard predictions. zero_division=0 makes precision/recall/F1
# return 0 instead of warning when a denominator is zero.
metrics_rf = dict(
    Accuracy=accuracy_score(y_test, y_pred),
    AUC=roc_auc_score(y_test, y_proba),
    Precision=precision_score(y_test, y_pred, zero_division=0),
    Recall=recall_score(y_test, y_pred, zero_division=0),
    F1=f1_score(y_test, y_pred, zero_division=0),
    MCC=matthews_corrcoef(y_test, y_pred),
)

# Report one metric per line: name padded to 10 characters, value to 6 decimals.
print("\n=== Model 5: Random Forest Metrics ===")
print("\n".join(f"{name:10s}: {score:.6f}" for name, score in metrics_rf.items()))



=== Model 5: Random Forest Metrics ===
Accuracy  : 0.973684
AUC       : 0.996693
Precision : 1.000000
Recall    : 0.928571
F1        : 0.962963
MCC       : 0.944155
