# 🎬 Supervised Learning (Binary Classification)

This notebook demonstrates a simple binary-classification example on the Kaggle Breast Cancer dataset, built with the **sklearn** library.

In [None]:
# ====================
# STEP 1: Import libs
# ====================
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import joblib
import numpy as np
import os

# ===========================================
# STEP 2: Read csv file and show top 5 rows
# ===========================================

csv_path = "./data.csv"
df = pd.read_csv(csv_path)
#print(df.head())


# ===========================================
# STEP 3: Data Preprocessing / Data Cleaning
# ===========================================

# Drop the ID column because it is just an identifier.
# It doesn't describe any physical or biological property of the tumor,
# so it carries no predictive information.
df = df.drop(columns='id', errors='ignore')

# Drop columns that hold no values at all. The Kaggle export of this dataset
# typically has a trailing comma on every row, which pandas reads as an
# all-NaN "Unnamed: 32" column. Left in place, it would be fed to the scaler
# and the classifier as a feature made entirely of NaN.
df = df.dropna(axis=1, how='all')

# Label Encoding / Target Encoding: convert 'diagnosis' to binary 0/1
# (M = malignant -> 0, B = benign -> 1).
diagnosis_codes = {'M': 0, 'B': 1}
encoded = df['diagnosis'].map(diagnosis_codes)

# Series.map yields NaN for any value missing from the dict; fail loudly here
# instead of passing NaN targets on to the split and the classifier.
if encoded.isna().any():
    unexpected = sorted(df.loc[encoded.isna(), 'diagnosis'].astype(str).unique())
    raise ValueError(
        f"Unexpected values in 'diagnosis' column (expected 'M' or 'B'): {unexpected}"
    )
df['diagnosis'] = encoded.astype(int)

# Feature matrix (every remaining column except the target) and target vector
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']


# ====================
# STEP 4: Split data
# =====================

# 80/20 split; stratify=y keeps the malignant/benign ratio the same in both
# parts, and random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ======================
# STEP 5: Scale features
# ======================

# Fit the scaler on the training data only, then apply the same transform to
# both parts, so no statistics from the test set leak into training.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# ======================
# STEP 6: Train model
# ======================

# Random forest with 200 trees; fixed seed for reproducible results
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train_s, y_train)


## Evaluate

In [None]:
# ======================
# STEP 7: Evaluate
# ======================

# Hard class predictions, plus the probability of the second class
# (label 1, i.e. benign under the encoding used for 'diagnosis').
y_pred = clf.predict(X_test_s)
y_proba = clf.predict_proba(X_test_s)[:, 1]

# Every report section is printed the same way: title, separator, metric.
# The metrics are wrapped in lambdas so each one is computed only when its
# section is reached, after its title has been printed.
separator = "***********************\n\n"
report_sections = (
    ("\nConfusion matrix:", lambda: confusion_matrix(y_test, y_pred)),
    (
        "\nClassification report:",
        lambda: classification_report(
            y_test, y_pred, target_names=['Malignant', 'Benign']
        ),
    ),
    ("\nROC AUC:", lambda: roc_auc_score(y_test, y_proba)),
)

for section_title, compute_metric in report_sections:
    print(section_title)
    print(separator)
    print(compute_metric())




## Three sample input/output *examples*



In [None]:
# ===========================================
# STEP 8: Three sample input/output examples
# =============================================

# Task: Predict whether a breast tumor is malignant (M) or benign (B) based on
# numeric features extracted from fine-needle aspirate (FNA) images.

print("Goal: Predict whether a tumor is malignant (M=0) or benign (B=1)")
print("*****************************************************************\n\n")

# Select up to 3 random test samples. The size is clamped because sampling
# without replacement raises if more rows are requested than the test set has.
n_examples = min(3, X_test.shape[0])
sample_idx = np.random.choice(X_test.shape[0], size=n_examples, replace=False)

# sample_idx holds positions, so it indexes both the DataFrame (via .iloc) and
# the scaled NumPy array consistently.
sample_features = X_test.iloc[sample_idx]
sample_true = y_test.iloc[sample_idx].values
sample_pred = clf.predict(X_test_s[sample_idx])
sample_prob = clf.predict_proba(X_test_s[sample_idx])[:, 1]

# Display examples (limit columns for readability).
# The Kaggle CSV names its columns "radius_mean", ..., while scikit-learn's
# bundled loader uses "mean radius", .... Use whichever names are present so
# the column selection does not raise KeyError; if neither naming scheme is
# found, fall back to the first five feature columns.
candidate_cols = [
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
    'mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness',
]
display_cols = [col for col in candidate_cols if col in sample_features.columns]
if not display_cols:
    display_cols = list(sample_features.columns[:5])

example_df = sample_features[display_cols].copy()
example_df['True_Label'] = sample_true
example_df['Predicted_Label'] = sample_pred
example_df['Prob_Benign'] = sample_prob.round(3)

print(example_df)
print("\n\nInterpretation:")
print("***********************\n")
print(" - True_Label: 0 = malignant, 1 = benign")
print(" - Predicted_Label: model's classification")
print(" - Prob_Benign: model's confidence in benign class")