In [22]:
import os, re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import classification_report, confusion_matrix

In [6]:
CSV_PATH = "/kaggle/input/breakhis/Folds.csv"
if not os.path.exists(CSV_PATH):
    CSV_PATH = "/kaggle/input/breakhis/Folds.csv"
IMG_ROOT = "/kaggle/input/breakhis/BreaKHis_v1"
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
EPOCHS = 10

In [7]:
meta = pd.read_csv(CSV_PATH)

In [9]:
def infer_label(p):
    s = str(p).lower()
    if "benign" in s or "sob_b" in s or "_b_" in s:
        return "benign"
    if "malignant" in s or "sob_m" in s or "_m_" in s:
        return "malignant"
    return np.nan

In [10]:
meta["label"] = meta["filename"].apply(infer_label)
meta = meta[meta["label"].isin(["benign", "malignant"])].reset_index(drop=True)
meta["filepath"] = meta["filename"].apply(lambda p: p if os.path.isabs(str(p)) else os.path.join(IMG_ROOT, str(p)))

In [11]:
subset, _ = train_test_split(meta, train_size=1200, stratify=meta["label"], random_state=42)
train_df, tmp_df = train_test_split(subset, train_size=800, stratify=subset["label"], random_state=42)
val_df, test_df = train_test_split(tmp_df, train_size=200, stratify=tmp_df["label"], random_state=42)

In [12]:
label_map = {"benign": 0, "malignant": 1}
train_paths = train_df["filepath"].to_numpy()
val_paths = val_df["filepath"].to_numpy()
test_paths = test_df["filepath"].to_numpy()
train_labels = train_df["label"].map(label_map).astype("int32").to_numpy()
val_labels = val_df["label"].map(label_map).astype("int32").to_numpy()
test_labels = test_df["label"].map(label_map).astype("int32").to_numpy()

In [13]:
def decode_and_resize(path, label):
    img = tf.io.read_file(path)
    img = tf.io.decode_image(img, channels=3, expand_animations=False)
    img = tf.image.resize(img, IMG_SIZE)
    return img, tf.cast(label, tf.float32)

In [14]:
def make_ds(paths, labels, training):
    ds = tf.data.Dataset.from_tensor_slices((paths, labels))
    if training:
        ds = ds.shuffle(len(paths), reshuffle_each_iteration=True)
    ds = ds.map(decode_and_resize, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    return ds

In [15]:
train_ds = make_ds(train_paths, train_labels, True)
val_ds = make_ds(val_paths, val_labels, False)
test_ds = make_ds(test_paths, test_labels, False)

In [16]:
inputs = keras.Input(shape=IMG_SIZE + (3,))
x = layers.Lambda(lambda t: keras.applications.vgg16.preprocess_input(t))(inputs)

base = keras.applications.VGG16(
    include_top=False,
    weights="imagenet",
    input_tensor=x,
    pooling="avg"
)
base.trainable = False

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [17]:
x = layers.Dropout(0.3)(base.output)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs=inputs, outputs=outputs)

In [18]:
model.compile(
    optimizer=keras.optimizers.Adam(1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy", keras.metrics.AUC(name="auc")]
)

In [19]:
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [20]:
loss, acc, auc = model.evaluate(test_ds, verbose=0)
print({"test_loss": float(loss), "test_acc": float(acc), "test_auc": float(auc)})

{'test_loss': 0.35507839918136597, 'test_acc': 0.8450000286102295, 'test_auc': 0.905630886554718}


In [21]:
probs = model.predict(test_ds, verbose=0).ravel()
preds = (probs >= 0.5).astype("int32")

In [23]:
print(classification_report(test_labels, preds, target_names=["benign", "malignant"]))
print(confusion_matrix(test_labels, preds))

              precision    recall  f1-score   support

      benign       0.83      0.63      0.72        63
   malignant       0.85      0.94      0.89       137

    accuracy                           0.84       200
   macro avg       0.84      0.79      0.81       200
weighted avg       0.84      0.84      0.84       200

[[ 40  23]
 [  8 129]]


In [24]:
model.save("breast-cancer-vgg16.h5")
print("Model saved")

Model saved
