In [None]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()

!kaggle competitions download -c histopathologic-cancer-detection

import zipfile
import os

# Archive to unpack and the folder that receives its contents.
zip_file_path = 'histopathologic-cancer-detection.zip'
extract_to_path = 'histopathologic-cancer-detection'

# Create the destination up front; exist_ok keeps the cell re-runnable.
os.makedirs(extract_to_path, exist_ok=True)

# Unpack every member of the archive; the context manager closes the handle.
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extract_to_path)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from skimage import io

# Load the training labels shipped with the extracted competition data.
train_labels = pd.read_csv('histopathologic-cancer-detection/train_labels.csv')

# Print summary statistics of the label table.
label_summary = train_labels.describe()
print(label_summary)

# Bar chart with one bar per distinct value of the `label` column.
class_axes = sns.countplot(data=train_labels, x='label')
class_axes.set_title('Class Distribution')
plt.show()


In [None]:
import os
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Build the training and validation image generators from the label table.
train_dir = 'histopathologic-cancer-detection/train'

# Re-read the CSV so this cell is idempotent: the in-place column edits below
# always start from the raw file, no matter how often the cell is re-run.
train_labels = pd.read_csv('histopathologic-cancer-detection/train_labels.csv')

# flow_from_dataframe with class_mode='binary' expects string class labels.
train_labels['label'] = train_labels['label'].astype(str)

# Make sure every id is a file name ending in ".tif" so it can be resolved
# relative to train_dir.
train_labels['id'] = train_labels['id'].apply(lambda x: x if x.endswith('.tif') else f"{x}.tif")

print(train_labels['id'].head())

# Fraction of the dataframe held out for validation. Both generators below
# must use the same value so the two subsets stay disjoint.
VALIDATION_SPLIT = 0.2

# Training pipeline: rescale pixels to [0, 1] and apply random augmentation.
datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=VALIDATION_SPLIT,
    horizontal_flip=True,
    vertical_flip=True,
    zoom_range=0.2
)

# Validation pipeline: rescale only. Random flips/zooms on the held-out data
# would make val_loss noisy and non-reproducible, which in turn disturbs any
# callback that monitors it.
validation_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=VALIDATION_SPLIT
)

train_generator = datagen.flow_from_dataframe(
    dataframe=train_labels,
    directory=train_dir,
    x_col='id',
    y_col='label',
    subset='training',
    batch_size=32,
    shuffle=True,
    class_mode='binary',
    target_size=(96, 96)
)

# shuffle=False keeps the batch order identical to `validation_generator.classes`,
# so predictions made on this generator line up with the true labels.
validation_generator = validation_datagen.flow_from_dataframe(
    dataframe=train_labels,
    directory=train_dir,
    x_col='id',
    y_col='label',
    subset='validation',
    batch_size=32,
    shuffle=False,
    class_mode='binary',
    target_size=(96, 96)
)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Small CNN for 96x96 RGB patches: three Conv -> MaxPool -> Dropout stages with
# a growing number of filters, followed by a dense head with a sigmoid output.
model = Sequential()

for stage, n_filters in enumerate((32, 64, 128)):
    # Only the very first layer declares the input shape.
    stage_kwargs = {'input_shape': (96, 96, 3)} if stage == 0 else {}
    model.add(Conv2D(n_filters, (3, 3), activation='relu', **stage_kwargs))
    model.add(MaxPooling2D((2, 2)))
    model.add(Dropout(0.2))

# Classification head: flatten, one hidden dense layer, heavier dropout, and a
# single sigmoid unit for the binary label.
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop when val_loss has not improved for 5 epochs and roll back to the best
# weights; in parallel keep only the best model on disk.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True)

history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=30,
    callbacks=[early_stopping, model_checkpoint],
)

# Training curves: loss on the left panel, accuracy on the right one.
curves = history.history
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

loss_ax.plot(curves['loss'], label='train_loss')
loss_ax.plot(curves['val_loss'], label='val_loss')
loss_ax.set_title('Loss')
loss_ax.legend()

acc_ax.plot(curves['accuracy'], label='train_acc')
acc_ax.plot(curves['val_accuracy'], label='val_acc')
acc_ax.set_title('Accuracy')
acc_ax.legend()

plt.show()


In [None]:
# Score the model on the validation generator. With the metrics it was compiled
# with, this yields the loss followed by the accuracy.
validation_scores = model.evaluate(validation_generator)
val_loss, val_acc = validation_scores

# One value per line: loss first, then accuracy.
print(val_loss, val_acc, sep='\n')

from sklearn.metrics import confusion_matrix, roc_curve, auc
import seaborn as sns

# Confusion matrix and ROC curve on the validation data.
#
# Labels and predictions are gathered batch by batch so that every score is
# paired with the label delivered in the very same batch. Reading
# `validation_generator.classes` instead is only correct when the generator
# does not shuffle; pairing per batch is correct either way.
label_batches = []
score_batches = []
for batch_index in range(len(validation_generator)):
    batch_images, batch_labels = validation_generator[batch_index]
    label_batches.append(batch_labels)
    score_batches.append(np.ravel(model.predict_on_batch(batch_images)))

y_true = np.concatenate(label_batches).astype(int)  # class indices
y_pred = np.concatenate(score_batches)              # sigmoid scores

# Hard decisions at the 0.5 threshold; rows are true classes, columns predicted.
cm = confusion_matrix(y_true, (y_pred > 0.5).astype(int))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix')
plt.show()

# The ROC curve uses the raw scores, not the thresholded decisions.
fpr, tpr, _ = roc_curve(y_true, y_pred)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()


In [None]:
from tensorflow.keras.models import load_model

# Predict on the test images with the checkpointed model and write submission.csv.
best_model = load_model('best_model.keras')

# Same pixel scaling as during training, no augmentation.
test_datagen = ImageDataGenerator(rescale=1./255)

test_dir = 'histopathologic-cancer-detection/test'

# flow_from_directory expects one sub-directory per class and finds no images
# when the files sit directly inside `directory` (assumed layout of the test
# folder, mirroring how the train folder is read — confirm). Listing the files
# ourselves and using flow_from_dataframe works for a flat folder and, thanks
# to os.walk, for nested ones as well.
test_files = pd.DataFrame({
    'id': sorted(
        os.path.relpath(os.path.join(root, fname), test_dir)
        for root, _, fnames in os.walk(test_dir)
        for fname in fnames
        if fname.lower().endswith('.tif')
    )
})
if test_files.empty:
    raise FileNotFoundError(f'No .tif images found under {test_dir!r}')

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_files,
    directory=test_dir,
    x_col='id',
    y_col=None,
    target_size=(96, 96),
    batch_size=32,
    class_mode=None,   # images only, no labels
    shuffle=False      # keep predictions aligned with test_generator.filenames
)

predictions = best_model.predict(test_generator, steps=len(test_generator))

# The submission id is the file name without directory and extension.
submission = pd.DataFrame({
    'id': [os.path.splitext(os.path.basename(fname))[0] for fname in test_generator.filenames],
    'label': predictions.ravel()
})

# Convert scores to hard 0/1 labels.
# NOTE(review): if the competition is scored on ROC AUC, submitting the raw
# scores instead of thresholded labels would likely score better — confirm.
submission['label'] = (submission['label'] > 0.5).astype(int)

submission.to_csv('submission.csv', index=False)
