<a href="https://www.kaggle.com/code/ahmedaboenaba/vgg-colorectal-cancer-classification?scriptVersionId=215787395" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Multi-tissue classification of colorectal cancer

**Table of Contents**                                                                      
Step1: Environment Setup                                                                          
Step2: Data Loading and Preparation                                                                      
Step3: Analyzing Class Distribution                                                          
Step4: Data Augmentation for Minority Classes                                                            
Step5: Splitting the Dataset                                                                                     
Step6: Preparing Data Generators                                                             
Step7: Building the Model                                                                                       
Step8: Compiling the Model with Class Weights                                                  
Step9: Implementing Learning Rate Scheduling and Early Stopping                                      
Step10: Training the Model                                                             
Step11: Evaluating the Model                                                                   
Step12: Conclusion and Next Steps                                                           



**Step 1: Environment Setup**

In [None]:
# Data manipulation and analysis
import os 
import numpy as np
import pandas as pd
from glob import glob

# Image processing
from PIL import Image

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

# Deep learning
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
#from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Conv2D, MaxPooling2D, Flatten, Dropout, BatchNormalization, MaxPool2D, Input
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input


# Callbacks
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Visualization
import matplotlib.pyplot as plt

**Step 2: Loading Data**


In [None]:
# Root folder of the dataset: it must contain one sub-directory per class,
# each holding the PNG images of that class.
data_dir = '/kaggle/input/No_Norm'

# Class names are the sub-directory names; sorting keeps the
# name -> label mapping stable from run to run.
classes = sorted(entry.name for entry in os.scandir(data_dir) if entry.is_dir())
print("Classes:", classes)


# Parallel lists: image_paths[i] belongs to class id image_labels[i]
image_paths = []
image_labels = []

# Map class names to numerical labels
class_to_label = {class_name: idx for idx, class_name in enumerate(classes)}
print("Class to label mapping:", class_to_label)

# Walk every class directory and record its PNG files
for class_name in classes:
    class_path = os.path.join(data_dir, class_name)
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order deterministic so that the seeded train/val/test split further
    # down yields the same partitions on every run.
    image_files = sorted(
        os.path.join(class_path, f)
        for f in os.listdir(class_path)
        if f.lower().endswith('.png')
    )
    image_paths.extend(image_files)
    image_labels.extend([class_to_label[class_name]] * len(image_files))

print(f"Total images found: {len(image_paths)}")

**3. Analyzing Class Distribution**

In [None]:
# One row per image: its path and numeric label
df = pd.DataFrame({'image_path': image_paths, 'label': image_labels})

# Inverse lookup (numeric label -> class name) used to add a readable column
label_to_class = dict(zip(class_to_label.values(), class_to_label.keys()))
df['class_name'] = df['label'].map(label_to_class)

# Number of images per class
class_counts = df['class_name'].value_counts()
print("Class counts before augmentation:\n", class_counts)

# Horizontal bar chart of the per-class counts
class_counts.plot(kind='barh')
plt.title('Class Distribution Before Augmentation')
plt.xlabel('Number of Images')
plt.ylabel('Class Name')
plt.xticks(rotation=45)
plt.show()

**4. Data Augmentation for Minority Classes**


In [None]:
# Augmentation settings for the training ImageDataGenerator.
#
# No `preprocessing_function` is set on purpose: the training generator
# rescales its batches to [0, 1] by dividing by 255, exactly like the
# validation/test arrays. Adding VGG's `preprocess_input` here (BGR channel
# swap + ImageNet mean subtraction) on top of that rescaling would feed the
# network training inputs from a different distribution than the ones it is
# validated and tested on.
data_gen_args = dict(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    vertical_flip=True,
    zoom_range=0.1,
    fill_mode='nearest',
)

In [None]:
# Class-balanced, augmented batch generator for training
def balanced_data_generator(df, batch_size, target_size, datagen):
    '''
    Endlessly yield augmented (images, one-hot labels) batches in which every
    class is equally represented.

    On each pass over the data, every class contributes as many samples as the
    largest class in ``df``; smaller classes are topped up by drawing their own
    rows again at random (with replacement). The combined sample list is
    shuffled and then cut into batches; the last batch of a pass may be
    smaller than ``batch_size``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an 'image_path' column (files readable by PIL) and a 'label'
        column (integer class ids).
    batch_size : int
        Maximum number of samples per yielded batch.
    target_size : tuple
        Size passed to ``PIL.Image.resize`` (i.e. (width, height)).
    datagen : ImageDataGenerator
        Its ``flow`` method applies the random augmentation to each batch.

    Yields
    ------
    (X, y)
        ``X`` is the augmented image batch divided by 255; ``y`` is the
        one-hot label batch with ``len(classes)`` columns (``classes`` is the
        module-level list of class names).

    Raises
    ------
    ValueError
        If ``df`` has no rows (raised when the first batch is requested).
    '''
    if len(df) == 0:
        raise ValueError("balanced_data_generator received an empty DataFrame")

    # Work on positional arrays so the result does not depend on df's index
    all_paths = df['image_path'].to_numpy()
    all_labels = df['label'].to_numpy()
    one_hot_width = len(classes)

    class_counts = df['label'].value_counts()
    max_count = class_counts.max()
    class_positions = {
        label: np.flatnonzero(all_labels == label) for label in class_counts.index
    }

    while True:
        epoch_parts = []

        # Oversample minority classes up to the size of the largest class.
        # The extra draws are concatenated (duplicates kept on purpose): a
        # set-style union would silently discard them and leave the classes
        # as imbalanced as before.
        for positions in class_positions.values():
            shortfall = max_count - len(positions)
            if shortfall > 0:
                extra = np.random.choice(positions, size=shortfall, replace=True)
                positions = np.concatenate([positions, extra])
            epoch_parts.append(positions)

        # Mix all classes together for this pass
        epoch_positions = np.random.permutation(np.concatenate(epoch_parts))

        for start in range(0, len(epoch_positions), batch_size):
            batch_positions = epoch_positions[start:start + batch_size]

            batch_images = []
            for path in all_paths[batch_positions]:
                # Context manager releases the file handle once pixels are read
                with Image.open(path) as img:
                    batch_images.append(
                        np.array(img.convert('RGB').resize(target_size))
                    )

            X_batch = np.array(batch_images)
            y_batch = to_categorical(
                all_labels[batch_positions], num_classes=one_hot_width
            )

            # Apply real-time augmentation; shuffle=False keeps images and
            # labels aligned with the order built above.
            augmented_iterator = datagen.flow(
                X_batch, y_batch, batch_size=batch_size, shuffle=False
            )
            X_batch_augmented, y_batch_augmented = next(augmented_iterator)

            # Rescale pixel values
            X_batch_augmented = X_batch_augmented / 255.0

            yield X_batch_augmented, y_batch_augmented

**Step 5: Split the Dataset**


In [None]:
# 80 % of the images go to training; the remaining 20 % are held out
train_df, temp_df = train_test_split(
    df, test_size=0.2, stratify=df['label'], random_state=42
)

# The held-out part is cut in half: validation and test
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42
)

# Report the size of each partition
for message in (
    f"Training set size: {len(train_df)}",
    f"Validation set size: {len(val_df)}",
    f"Test set size: {len(test_df)}",
):
    print(message)




**6. Preparing Data Generators**


In [None]:
# Shared settings for batching and image resizing
batch_size, target_size = 32, (150, 150)
num_classes = len(classes)

# Augmenting generator for training, built from the settings in data_gen_args
datagen = ImageDataGenerator(**data_gen_args)

# Non-augmenting generators that only apply VGG's preprocess_input
val_datagen, test_datagen = (
    ImageDataGenerator(preprocessing_function=preprocess_input) for _ in range(2)
)

6.1. Training Data Generator


In [None]:
# Endless stream of class-balanced, augmented training batches
train_generator = balanced_data_generator(
    train_df, batch_size=batch_size, target_size=target_size, datagen=datagen
)

# One epoch = every class brought up to the size of the largest class
largest_class_size = train_df['label'].value_counts().max()
steps_per_epoch = (largest_class_size * num_classes) // batch_size


6.2. Validation Data Generator

In [None]:
# Validation data preparation
def prepare_dataset(df):
    """Load every image listed in ``df`` into memory as a ready-to-use dataset.

    Each file in the 'image_path' column is opened, converted to RGB and
    resized to the module-level ``target_size``. Pixel values are divided by
    255 and the 'label' column is one-hot encoded over ``len(classes)``
    classes.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an 'image_path' column and a 'label' column.

    Returns
    -------
    (X, y)
        ``X`` is a float32 array of the rescaled images, ``y`` the one-hot
        label array.
    """
    images = []
    for path in df['image_path']:
        # Context manager releases the file handle once the pixels are read
        with Image.open(path) as img:
            images.append(np.array(img.convert('RGB').resize(target_size)))

    # Build the array as float32 up front: dividing a uint8 array by 255.0
    # would produce float64 and double the memory needed for the whole set.
    X = np.array(images, dtype=np.float32) / 255.0
    y = to_categorical(df['label'].tolist(), num_classes=len(classes))
    return X, y

# Load the validation split first, then the test split, fully into memory
(X_val, y_val), (X_test, y_test) = (
    prepare_dataset(split) for split in (val_df, test_df)
)

print(f"Validation set shape: {X_val.shape}, {y_val.shape}")
print(f"Test set shape: {X_test.shape}, {y_test.shape}")

**Step 7: Building and Configuring the VGG Model**

In [None]:
# VGG19-style network trained from scratch: five convolutional stages, each
# followed by 2x2 max-pooling, then two 4096-unit dense layers with dropout.

# (filters, number of stacked conv layers) for each convolutional stage
vgg_stages = [(64, 2), (128, 2), (256, 4), (512, 4), (512, 4)]

# PIL's resize takes (width, height) whereas Keras expects
# (height, width, channels), hence the swapped indices.
visible = Input(shape=(target_size[1], target_size[0], 3))

layer_in = visible
for filters, n_convs in vgg_stages:
    for _ in range(n_convs):
        # 3x3 kernels as in VGG: a 1x1 kernel only mixes channels at a single
        # pixel and cannot pick up any spatial structure in the image.
        layer_in = Conv2D(filters, 3, padding='same', activation='relu')(layer_in)
    layer_in = MaxPooling2D((2, 2), strides=(2, 2))(layer_in)

# Classifier head
layer_in = Flatten()(layer_in)
for _ in range(2):
    layer_in = Dense(4096, activation='relu')(layer_in)
    layer_in = Dropout(0.5)(layer_in)

# One output unit per class so the head always matches the one-hot labels
layer_in = Dense(num_classes, activation='softmax')(layer_in)
model = Model(inputs=visible, outputs=layer_in)

model.summary()


In [None]:
# Compile the model.
# The network ends in a softmax over mutually exclusive classes and the labels
# are one-hot encoded, so categorical cross-entropy is the matching loss.
# Binary cross-entropy would score every output unit as an independent yes/no
# problem, which is the wrong objective for single-label classification.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Halve the learning rate after 10 epochs without a better validation loss,
# never going below 1e-6
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-6)

# Stop after 15 epochs without a better validation loss and roll back to the
# best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)

callbacks = [early_stopping, reduce_lr]

Step 10: Training the Model


In [None]:
# Fit on the balanced generator; the in-memory validation arrays are scored
# after every epoch and drive the callbacks
fit_options = dict(
    steps_per_epoch=steps_per_epoch,
    epochs=30,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
)
history = model.fit(train_generator, **fit_options)

Step 11: Evaluating the Model


In [None]:
# Evaluate the model on the held-out test set.
# The test images are already loaded in memory as X_test / y_test, so they are
# passed directly; every test sample is scored, including the final partial
# batch.
test_loss, test_accuracy = model.evaluate(X_test, y_test, batch_size=batch_size)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Turn softmax outputs and one-hot targets into plain class ids
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_true_classes = y_test.argmax(axis=1)

print("Classification Report:")
print(classification_report(y_true_classes, y_pred_classes))

print("Confusion Matrix:")
print(confusion_matrix(y_true_classes, y_pred_classes))

# One figure per tracked quantity: (history key, figure title, y-axis label)
for metric_key, figure_title, y_label in (
    ('accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'Model loss', 'Loss'),
):
    # Training curve first, validation curve second (matches the legend order)
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])
    plt.title(figure_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()