## Imports & Setup

In [35]:
import re

import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    log_loss,
    confusion_matrix,
    classification_report,
    matthews_corrcoef,
    cohen_kappa_score
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Seed passed to the train/test split and to the estimators that accept one.
RANDOM_STATE: int = 42

# One metrics dict per fitted model; each model cell appends its own row.
all_results: list = list()


## Evaluation Function

In [36]:
def evaluate_model(y_true, y_pred, y_prob=None):
    """Score one set of predictions and return the metrics as a dict.

    Parameters
    ----------
    y_true : ground-truth labels.
    y_pred : predicted labels, compared against ``y_true``.
    y_prob : optional scores/probabilities handed to ``roc_auc_score``.

    Returns
    -------
    dict with keys "Accuracy", "Precision", "Recall", "F1" and "ROC_AUC".
    "ROC_AUC" is ``np.nan`` when ``y_prob`` is not supplied.
    """
    # Metrics that only need hard label predictions, in output order.
    label_metrics = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
    scores = {name: metric(y_true, y_pred) for name, metric in label_metrics}

    # ROC AUC needs continuous scores; without them the entry is a NaN placeholder.
    if y_prob is None:
        scores["ROC_AUC"] = np.nan
    else:
        scores["ROC_AUC"] = roc_auc_score(y_true, y_prob)
    return scores

## Dataset Loading & Intelligent Preprocessing

In [38]:
# Load the raw dataset (path is relative to the notebook's working directory).
df = pd.read_csv("brca.csv")

# Drop identifier columns. "id" has to appear as its own token ("id", "ID",
# "patient_id", "Sample ID", "patientId"); a plain substring test would also
# discard genuine features such as "width" or "humidity".
_ID_TOKEN = re.compile(
    r"(?:^|[^A-Za-z])(?i:id)(?:[^A-Za-z]|$)"  # id / ID delimited by non-letters
    r"|[a-z](?:ID|Id)(?:[^A-Za-z]|$)"         # camelCase suffix: patientId, PatientID
)
id_cols = [col for col in df.columns if _ID_TOKEN.search(str(col))]
df = df.drop(columns=id_cols)

# Heuristic: the first column with exactly two distinct non-null values is
# taken as the target.
target_col = next((col for col in df.columns if df[col].nunique() == 2), None)
if target_col is None:
    raise ValueError("No binary target column found!")

# Rows without a label cannot be used for training or scoring.
df = df.dropna(subset=[target_col]).copy()

# Always encode the target to 0/1, whatever its dtype. Precision/recall/F1 use
# label 1 as the positive class by default, so a numeric target coded e.g. 2/4
# would otherwise fail. LabelEncoder sorts the classes, so the class that sorts
# last becomes 1; `le.classes_` keeps the original labels.
le = LabelEncoder()
df[target_col] = le.fit_transform(df[target_col])

# Feature-target split.
# Assumes the remaining feature columns are numeric -- TODO confirm for brca.csv.
X = df.drop(columns=[target_col])
y = df[target_col]

# Split BEFORE imputing/scaling so no statistic is computed from test rows.
# Stratify to keep the class ratio identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Impute missing values with medians learned from the training rows only.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Fit the scaler on the training rows only, then apply it to both partitions.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix transformed with the train-fitted statistics; kept so any
# later cell that refers to `X_scaled` still finds it.
X_scaled = scaler.transform(X.fillna(train_medians))


KeyError: "['target'] not found in axis"

## Logistic Regression

In [28]:
# Logistic regression, allowing the solver up to 1000 iterations.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Column 1 of predict_proba (second entry of `model.classes_`) feeds ROC AUC.
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Record this model's row in the shared results list.
all_results.append(
    dict(Model="Logistic Regression", **evaluate_model(y_test, y_pred, y_prob))
)


## KNN

In [29]:
# k-nearest-neighbours classifier using the 5 closest training points.
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# KNeighborsClassifier provides predict_proba; pass column 1 so ROC_AUC is
# reported for this model instead of NaN.
y_prob = model.predict_proba(X_test)[:, 1]

all_results.append({"Model": "KNN",
                    **evaluate_model(y_test, y_pred, y_prob)})


## Decision Tree

In [30]:
# Single decision tree, depth-limited to 6 and seeded for reproducibility.
model = DecisionTreeClassifier(max_depth=6, random_state=RANDOM_STATE)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# DecisionTreeClassifier provides predict_proba; pass column 1 so ROC_AUC is
# reported for this model instead of NaN.
y_prob = model.predict_proba(X_test)[:, 1]

all_results.append({"Model": "Decision Tree",
                    **evaluate_model(y_test, y_pred, y_prob)})


## Random Forest

In [31]:
# Random forest of 200 trees, seeded with the shared RANDOM_STATE.
model = RandomForestClassifier(
    n_estimators=200,
    random_state=RANDOM_STATE,
).fit(X_train, y_train)

# Column 1 of predict_proba (second entry of `model.classes_`) feeds ROC AUC.
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Record this model's row in the shared results list.
all_results.append(
    dict(Model="Random Forest", **evaluate_model(y_test, y_pred, y_prob))
)


## Gradient Boosting

In [32]:
# Gradient boosting: 300 stages with learning rate 0.05. Seeded with the shared
# RANDOM_STATE, like the other tree-based cells, so reruns give the same model.
model = GradientBoostingClassifier(
    n_estimators=300, learning_rate=0.05, random_state=RANDOM_STATE
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# Column 1 of predict_proba (second entry of `model.classes_`) feeds ROC AUC.
y_prob = model.predict_proba(X_test)[:, 1]

all_results.append({"Model": "Gradient Boosting",
                    **evaluate_model(y_test, y_pred, y_prob)})


## SVM

In [33]:
# RBF-kernel SVM. probability=True makes SVC fit an extra internal,
# cross-validated probability model whose data shuffling is random; seeding it
# keeps predict_proba (and therefore ROC_AUC) reproducible across runs.
model = SVC(kernel='rbf', probability=True, random_state=RANDOM_STATE)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# Column 1 of predict_proba (second entry of `model.classes_`) feeds ROC AUC.
y_prob = model.predict_proba(X_test)[:, 1]

all_results.append({"Model": "SVM (RBF)",
                    **evaluate_model(y_test, y_pred, y_prob)})


## Naive Bayes

In [34]:
# Gaussian naive Bayes with default settings.
model = GaussianNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# GaussianNB provides predict_proba; pass column 1 so ROC_AUC is reported for
# this model instead of NaN.
y_prob = model.predict_proba(X_test)[:, 1]

all_results.append({"Model": "Naive Bayes",
                    **evaluate_model(y_test, y_pred, y_prob)})


# FINAL RESULTS