<a href="https://colab.research.google.com/github/24phyr/24phyr/blob/main/notebooks/breast_cancer_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install/Upgrade libraries
!pip install -U tensorflow

# Core imports
import tensorflow as tf
import kagglehub
import numpy as np
import pandas as pd
import sys
import os

# ML preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Hardware check
print("="*50)
print("Hardware")
print("="*50)
print(f"TensorFlow: {tf.__version__}")
print(f"Python: {sys.version.split()[0]}")

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"\nGPU: {gpus[0]}")
else:
    print("\nGPU: None - using CPU")

Collecting tensorflow
  Downloading tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting tensorboard~=2.20.0 (from tensorflow)
  Downloading tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Downloading tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (620.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m620.7/620.7 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorboard-2.20.0-py3-none-any.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m117.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboard, tensorflow
  Attempting uninstall: tensorboard
    Found existing installation: tensorboard 2.19.0
    Uninstalling tensorboard-2.19.0:
      Successfully uninstalled tensorboard-2.19.0
  Attempting uninstall: tensorflow
    Found existing installation: tensorflow 2.19.0
    Uninstalling tensorflow-2

In [2]:
# Dataset download
path = kagglehub.dataset_download("uciml/breast-cancer-wisconsin-data")
print(f"Dataset path: {path}")

Using Colab cache for faster access to the 'breast-cancer-wisconsin-data' dataset.
Dataset path: /kaggle/input/breast-cancer-wisconsin-data


In [3]:
# Find CSV file in downloaded path
csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]
if not csv_files:
    raise FileNotFoundError("No CSV found in dataset path")

csv_path = os.path.join(path, csv_files[0])
print(f"Loading: {csv_path}")

# Load data
df = pd.read_csv(csv_path)
print(f"Shape: {df.shape}")
print(f"Columns: {len(df.columns)}")
print(f"Column names: {list(df.columns)}")

# Map diagnosis (M=0 malignant, B=1 benign)
df['diagnosis'] = df['diagnosis'].map({'M': 0, 'B': 1})

# Separate features and target
X = df.drop(['id', 'diagnosis'], axis=1).values
y = df['diagnosis'].values

print(f"\nFeature matrix: {X.shape}")
print(f"Target vector: {y.shape}")

# Class distribution
dist = pd.Series(y).value_counts().rename({0: 'Malignant', 1: 'Benign'})
print(f"\nClass distribution:\n{dist}")

# Preview data
print("\nFirst 5 rows:")
print(df.head())

Loading: /kaggle/input/breast-cancer-wisconsin-data/data.csv
Shape: (569, 33)
Columns: 33
Column names: ['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32']

Feature matrix: (569, 31)
Target vector: (569,)

Class distribution:
Benign       357
Malignant    212
Name: count, dtype: int64

First 5 rows:
         id  diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302          0        17.99         10.38          122.80     1001.0   
1    842517

In [4]:
# Clean dataset: Remove unused column
df_clean = df.drop(['id', 'diagnosis', 'Unnamed: 32'], axis=1)
print(f"Cleaned shape: {df_clean.shape}")  # Should be (569, 30)

# Separate features and target (cleaned)
X = df_clean.values
y = df['diagnosis'].values

print(f"\nFeature matrix: {X.shape}")
print(f"Target vector: {y.shape}")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTrain set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Scale features (critical for neural networks)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape for 1D CNN: add channel dimension
X_train_cnn = X_train_scaled.reshape(-1, 30, 1)
X_test_cnn = X_test_scaled.reshape(-1, 30, 1)

print(f"\nCNN training shape: {X_train_cnn.shape}")
print(f"CNN test shape: {X_test_cnn.shape}")
print("Reshaped: (samples, timesteps, channels)")

Cleaned shape: (569, 30)

Feature matrix: (569, 30)
Target vector: (569,)

Train set: (455, 30)
Test set: (114, 30)

CNN training shape: (455, 30, 1)
CNN test shape: (114, 30, 1)
Reshaped: (samples, timesteps, channels)


In [16]:
from tensorflow.keras import layers, models, optimizers

class BreastCancerCNN(models.Model):
    def __init__(self):
        super(BreastCancerCNN, self).__init__()

        # First block
        self.conv1 = layers.Conv1D(32, 3, activation='relu', padding='same')
        self.bn1 = layers.BatchNormalization()
        self.pool1 = layers.MaxPooling1D(2)

        # Second block
        self.conv2 = layers.Conv1D(64, 3, activation='relu', padding='same')
        self.bn2 = layers.BatchNormalization()
        self.pool2 = layers.MaxPooling1D(2)

        # Dense head
        self.flatten = layers.Flatten()
        self.fc1 = layers.Dense(32, activation='relu')
        self.dropout = layers.Dropout(0.3)
        self.output_layer = layers.Dense(1, activation='sigmoid')

    def call(self, inputs, training=False):
        x = self.pool1(self.bn1(self.conv1(inputs), training=training))
        x = self.pool2(self.bn2(self.conv2(x), training=training))
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.dropout(x, training=training)
        return self.output_layer(x)

# Create model instance
model = BreastCancerCNN()

# Build by calling it on a dummy batch
dummy_input = tf.zeros((1, 30, 1))  # (batch_size, timesteps, channels)
_ = model(dummy_input)  # Forward pass builds the graph

# Now compile and show summary
model.compile(
    optimizer=optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.Recall()]
)

model.summary()

In [17]:
# Training callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_recall', patience=10,
        restore_best_weights=True, mode='max'
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', factor=0.5, patience=5, verbose=1
    )
]

# Train
history = model.fit(
    X_train_cnn, y_train,
    validation_data=(X_test_cnn, y_test),
    epochs=50,
    batch_size=32,
    callbacks=callbacks,
    verbose=1
)

print("\nTraining complete")

Epoch 1/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 243ms/step - accuracy: 0.7185 - loss: 0.5437 - recall_9: 0.5979 - val_accuracy: 0.9298 - val_loss: 0.5417 - val_recall_9: 0.9444 - learning_rate: 0.0010
Epoch 2/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9739 - loss: 0.1148 - recall_9: 0.9933 - val_accuracy: 0.9211 - val_loss: 0.5004 - val_recall_9: 0.9306 - learning_rate: 0.0010
Epoch 3/50
[1m 1/15[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 19ms/step - accuracy: 0.9688 - loss: 0.0818 - recall_9: 0.9545

  current = self.get_monitor_value(logs)


[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9795 - loss: 0.0748 - recall_9: 0.9879 - val_accuracy: 0.9211 - val_loss: 0.4871 - val_recall_9: 0.9167 - learning_rate: 0.0010
Epoch 4/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9673 - loss: 0.0858 - recall_9: 0.9774 - val_accuracy: 0.9211 - val_loss: 0.4744 - val_recall_9: 0.9167 - learning_rate: 0.0010
Epoch 5/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9775 - loss: 0.0598 - recall_9: 0.9841 - val_accuracy: 0.9211 - val_loss: 0.4617 - val_recall_9: 0.9167 - learning_rate: 0.0010
Epoch 6/50
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9868 - loss: 0.0598 - recall_9: 0.9929 - val_accuracy: 0.9035 - val_loss: 0.4306 - val_recall_9: 0.8889 - learning_rate: 0.0010
Epoch 7/50
[1