In [1]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/Breast Cancer Project/new_master_dataset.csv")
df

Mounted at /content/drive


Unnamed: 0,mag,path,filename,class,slide_id,tumor_type
0,100,/content/drive/MyDrive/Breast Cancer Project/b...,SOB_B_A-14-22549CD-100-001.png,benign,22549CD,A
1,100,/content/drive/MyDrive/Breast Cancer Project/b...,SOB_B_A-14-22549CD-100-002.png,benign,22549CD,A
2,100,/content/drive/MyDrive/Breast Cancer Project/b...,SOB_B_A-14-22549CD-100-003.png,benign,22549CD,A
3,100,/content/drive/MyDrive/Breast Cancer Project/b...,SOB_B_A-14-22549CD-100-004.png,benign,22549CD,A
4,100,/content/drive/MyDrive/Breast Cancer Project/b...,SOB_B_A-14-22549CD-100-005.png,benign,22549CD,A
...,...,...,...,...,...,...
7904,400,/content/drive/MyDrive/Breast Cancer Project/b...,SOB_M_PC-14-15704-400-031.png,malignant,15704,PC
7905,400,/content/drive/MyDrive/Breast Cancer Project/b...,SOB_M_PC-14-15704-400-032.png,malignant,15704,PC
7906,400,/content/drive/MyDrive/Breast Cancer Project/b...,SOB_M_PC-14-15704-400-033.png,malignant,15704,PC
7907,400,/content/drive/MyDrive/Breast Cancer Project/b...,SOB_M_PC-14-15704-400-034.png,malignant,15704,PC


In [2]:
groupd_df = df.groupby("tumor_type")

# You can then perform various operations on the grouped data, such as getting the count of each tumor type
tumor_type_counts = groupd_df.size()
tumor_type_counts

tumor_type
A      444
DC    3451
F     1014
LC     626
MC     792
PC     560
PT     453
TA     569
dtype: int64

In [3]:
# df_train_100 = pd.read_csv("/content/drive/MyDrive/Breast Cancer Project/Mag100/train_df_100.csv")
# df_test_100 = pd.read_csv("/content/drive/MyDrive/Breast Cancer Project/Mag100/test_df_100.csv")
# df_val_100 = pd.read_csv("/content/drive/MyDrive/Breast Cancer Project/Mag100/val_df_100.csv")

In [4]:
!pip install timm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from PIL import Image
import os
from transformers import EfficientNetImageProcessor, EfficientNetForImageClassification
import timm

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Define constants
data_dirs = ["TA", "PT", "PC", "MC", "LC", "F", "DC", "A"]
data_root = "/content/drive/MyDrive/Breast Cancer Project/IW/200"  # Replace with the root directory of your data
train_split = 0.7

# Create a list to store the paths and labels of all images
all_data = []

# Populate the list with paths and labels
for label, folder in enumerate(data_dirs):
    folder_path = os.path.join(data_root, folder)
    image_files = os.listdir(folder_path)
    for image_file in image_files:
        image_path = os.path.join(folder_path, image_file)
        all_data.append((image_path, label))

# Split data into training and testing sets
train_data, test_data = train_test_split(all_data, train_size=train_split, shuffle=True, random_state=42)

# Define custom dataset class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path, label = self.data[idx]
        img = Image.open(img_path).convert('RGB')  # Open image and convert to RGB mode
        if self.transform:
            img = self.transform(img)
        label_tensor = torch.tensor(label, dtype=torch.long)  # Convert label to tensor
        return img, label_tensor

# Image preprocessing with augmentation for training
train_transform = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.RandomRotation(90),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ToTensor()
])

# Image preprocessing without augmentation for testing and validation
test_val_transform = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.ToTensor()
])

# Create custom datasets
train_dataset = CustomDataset(train_data, transform=train_transform)
test_dataset = CustomDataset(test_data, transform=test_val_transform)

# DataLoaders for batching and shuffling
batch_size = 10  # Define the batch size
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

import timm

model = timm.create_model("timm/cait_xs24_384.fb_dist_in1k", pretrained=True)

model.to(device)  # Move model to GPU

# Define optimizer and scheduler
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=3, verbose=True)

# Define loss function
criterion = nn.CrossEntropyLoss()

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)
    for images, labels in progress_bar:
        images, labels = images.to(device), labels.to(device)  # Move data to GPU
        optimizer.zero_grad()
        # Ensure the input tensor is passed correctly
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * labels.size(0)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        progress_bar.set_postfix({'Loss': train_loss / total, 'Accuracy': 100 * correct / total})

    train_loss = train_loss / len(train_loader.dataset)
    train_accuracy = 100 * correct / total

    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)  # Move data to GPU
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * labels.size(0)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_loss = val_loss / len(test_loader.dataset)
    val_accuracy = 100 * correct / total

    print(f'Epoch {epoch+1}/{num_epochs}, '
          f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')

    # Adjust learning rate
    scheduler.step(val_loss)

# Test the model
model.eval()
test_correct = 0
test_total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)  # Move data to GPU
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

test_accuracy = 100 * test_correct / test_total
print(f'Test Accuracy: {test_accuracy:.2f}%')

Collecting timm
  Downloading timm-0.9.16-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->timm)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->timm)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->timm)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m92.9 MB/s[0m eta 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/107M [00:00<?, ?B/s]



Epoch 1/10, Train Loss: 1.7761, Train Accuracy: 48.62%, Val Loss: 1.5259, Val Accuracy: 53.81%




Epoch 2/10, Train Loss: 1.2166, Train Accuracy: 60.04%, Val Loss: 1.0227, Val Accuracy: 62.09%




Epoch 3/10, Train Loss: 0.8143, Train Accuracy: 72.32%, Val Loss: 1.1412, Val Accuracy: 60.43%




Epoch 4/10, Train Loss: 0.6371, Train Accuracy: 76.79%, Val Loss: 0.5987, Val Accuracy: 77.48%




Epoch 5/10, Train Loss: 0.4688, Train Accuracy: 82.61%, Val Loss: 0.5084, Val Accuracy: 81.13%




Epoch 6/10, Train Loss: 0.4069, Train Accuracy: 84.60%, Val Loss: 0.5601, Val Accuracy: 82.12%




Epoch 7/10, Train Loss: 0.3261, Train Accuracy: 88.36%, Val Loss: 0.5158, Val Accuracy: 82.62%




Epoch 8/10, Train Loss: 0.2528, Train Accuracy: 90.70%, Val Loss: 0.4297, Val Accuracy: 85.26%




Epoch 9/10, Train Loss: 0.2827, Train Accuracy: 89.78%, Val Loss: 0.3757, Val Accuracy: 85.60%




Epoch 10/10, Train Loss: 0.1747, Train Accuracy: 93.61%, Val Loss: 0.5725, Val Accuracy: 82.78%
Test Accuracy: 82.78%


In [5]:
import numpy as np
from sklearn.metrics import classification_report

# Evaluate the model on the test set
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Convert predictions and true labels to numpy arrays
predictions = np.array(predictions)
true_labels = np.array(true_labels)

# Print classification report with three digits
print(classification_report(true_labels, predictions, target_names=data_dirs, digits=3))

              precision    recall  f1-score   support

          TA      0.872     0.944     0.907        36
          PT      1.000     0.097     0.176        31
          PC      1.000     0.608     0.756        51
          MC      0.850     0.785     0.816        65
          LC      0.811     0.625     0.706        48
           F      0.723     0.901     0.802        81
          DC      0.833     0.958     0.891       260
           A      0.853     0.906     0.879        32

    accuracy                          0.828       604
   macro avg      0.868     0.728     0.742       604
weighted avg      0.844     0.828     0.808       604



In [6]:
# Train the model
num_epochs = 20
for epoch in range(10, num_epochs):
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)
    for images, labels in progress_bar:
        images, labels = images.to(device), labels.to(device)  # Move data to GPU
        optimizer.zero_grad()
        # Ensure the input tensor is passed correctly
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * labels.size(0)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        progress_bar.set_postfix({'Loss': train_loss / total, 'Accuracy': 100 * correct / total})

    train_loss = train_loss / len(train_loader.dataset)
    train_accuracy = 100 * correct / total

    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)  # Move data to GPU
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * labels.size(0)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_loss = val_loss / len(test_loader.dataset)
    val_accuracy = 100 * correct / total

    print(f'Epoch {epoch+1}/{num_epochs}, '
          f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')

    # Adjust learning rate
    scheduler.step(val_loss)

# Test the model
model.eval()
test_correct = 0
test_total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)  # Move data to GPU
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

test_accuracy = 100 * test_correct / test_total
print(f'Test Accuracy: {test_accuracy:.2f}%')



Epoch 11/20, Train Loss: 0.1804, Train Accuracy: 93.04%, Val Loss: 0.4184, Val Accuracy: 84.44%




Epoch 12/20, Train Loss: 0.1507, Train Accuracy: 94.75%, Val Loss: 0.6577, Val Accuracy: 77.48%




Epoch 13/20, Train Loss: 0.2370, Train Accuracy: 91.06%, Val Loss: 0.6667, Val Accuracy: 78.31%




Epoch 14/20, Train Loss: 0.1011, Train Accuracy: 96.74%, Val Loss: 0.2847, Val Accuracy: 89.07%
Epoch 14/20, Train Loss: 0.1011, Train Accuracy: 96.74%, Val Loss: 0.2847, Val Accuracy: 89.07%




Epoch 15/20, Train Loss: 0.0526, Train Accuracy: 97.73%, Val Loss: 0.2827, Val Accuracy: 90.23%
Epoch 15/20, Train Loss: 0.0526, Train Accuracy: 97.73%, Val Loss: 0.2827, Val Accuracy: 90.23%




Epoch 16/20, Train Loss: 0.0432, Train Accuracy: 98.30%, Val Loss: 0.2669, Val Accuracy: 90.56%
Epoch 16/20, Train Loss: 0.0432, Train Accuracy: 98.30%, Val Loss: 0.2669, Val Accuracy: 90.56%




Epoch 17/20, Train Loss: 0.0362, Train Accuracy: 98.44%, Val Loss: 0.2702, Val Accuracy: 90.89%
Epoch 17/20, Train Loss: 0.0362, Train Accuracy: 98.44%, Val Loss: 0.2702, Val Accuracy: 90.89%




Epoch 18/20, Train Loss: 0.0376, Train Accuracy: 98.23%, Val Loss: 0.2615, Val Accuracy: 90.56%
Epoch 18/20, Train Loss: 0.0376, Train Accuracy: 98.23%, Val Loss: 0.2615, Val Accuracy: 90.56%




Epoch 19/20, Train Loss: 0.0330, Train Accuracy: 98.51%, Val Loss: 0.2946, Val Accuracy: 90.23%
Epoch 19/20, Train Loss: 0.0330, Train Accuracy: 98.51%, Val Loss: 0.2946, Val Accuracy: 90.23%




Epoch 20/20, Train Loss: 0.0312, Train Accuracy: 98.51%, Val Loss: 0.2590, Val Accuracy: 92.05%
Epoch 20/20, Train Loss: 0.0312, Train Accuracy: 98.51%, Val Loss: 0.2590, Val Accuracy: 92.05%
Test Accuracy: 92.05%
Test Accuracy: 92.05%


In [7]:
import numpy as np
from sklearn.metrics import classification_report

# Evaluate the model on the test set
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Convert predictions and true labels to numpy arrays
predictions = np.array(predictions)
true_labels = np.array(true_labels)

# Print classification report with three digits
print(classification_report(true_labels, predictions, target_names=data_dirs, digits=3))

              precision    recall  f1-score   support

          TA      0.944     0.944     0.944        36
          PT      1.000     0.774     0.873        31
          PC      1.000     0.922     0.959        51
          MC      0.983     0.892     0.935        65
          LC      0.711     0.667     0.688        48
           F      0.878     0.975     0.924        81
          DC      0.926     0.965     0.945       260
           A      0.969     0.969     0.969        32

    accuracy                          0.921       604
   macro avg      0.926     0.889     0.905       604
weighted avg      0.922     0.921     0.920       604

              precision    recall  f1-score   support

          TA      0.944     0.944     0.944        36
          PT      1.000     0.774     0.873        31
          PC      1.000     0.922     0.959        51
          MC      0.983     0.892     0.935        65
          LC      0.711     0.667     0.688        48
           F      0.878 

In [8]:
# Train the model
num_epochs = 30
for epoch in range(20, num_epochs):
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)
    for images, labels in progress_bar:
        images, labels = images.to(device), labels.to(device)  # Move data to GPU
        optimizer.zero_grad()
        # Ensure the input tensor is passed correctly
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * labels.size(0)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        progress_bar.set_postfix({'Loss': train_loss / total, 'Accuracy': 100 * correct / total})

    train_loss = train_loss / len(train_loader.dataset)
    train_accuracy = 100 * correct / total

    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)  # Move data to GPU
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * labels.size(0)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_loss = val_loss / len(test_loader.dataset)
    val_accuracy = 100 * correct / total

    print(f'Epoch {epoch+1}/{num_epochs}, '
          f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')

    # Adjust learning rate
    scheduler.step(val_loss)

# Test the model
model.eval()
test_correct = 0
test_total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)  # Move data to GPU
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

test_accuracy = 100 * test_correct / test_total
print(f'Test Accuracy: {test_accuracy:.2f}%')



Epoch 21/30, Train Loss: 0.0322, Train Accuracy: 98.23%, Val Loss: 0.2545, Val Accuracy: 91.72%




Epoch 22/30, Train Loss: 0.0290, Train Accuracy: 98.72%, Val Loss: 0.2813, Val Accuracy: 91.72%




Epoch 23/30, Train Loss: 0.0267, Train Accuracy: 98.65%, Val Loss: 0.2601, Val Accuracy: 91.72%




Epoch 24/30, Train Loss: 0.0272, Train Accuracy: 98.44%, Val Loss: 0.2734, Val Accuracy: 92.22%




Epoch 25/30, Train Loss: 0.0277, Train Accuracy: 98.44%, Val Loss: 0.2887, Val Accuracy: 91.39%




Epoch 26/30, Train Loss: 0.0256, Train Accuracy: 98.94%, Val Loss: 0.2847, Val Accuracy: 91.56%




Epoch 27/30, Train Loss: 0.0238, Train Accuracy: 98.79%, Val Loss: 0.2830, Val Accuracy: 91.56%




Epoch 28/30, Train Loss: 0.0232, Train Accuracy: 98.86%, Val Loss: 0.2822, Val Accuracy: 91.23%




Epoch 29/30, Train Loss: 0.0248, Train Accuracy: 98.58%, Val Loss: 0.2794, Val Accuracy: 91.06%




Epoch 30/30, Train Loss: 0.0213, Train Accuracy: 99.08%, Val Loss: 0.2795, Val Accuracy: 91.06%
Test Accuracy: 91.06%


In [9]:
import numpy as np
from sklearn.metrics import classification_report

# Evaluate the model on the test set
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Convert predictions and true labels to numpy arrays
predictions = np.array(predictions)
true_labels = np.array(true_labels)

# Print classification report with three digits
print(classification_report(true_labels, predictions, target_names=data_dirs, digits=3))

              precision    recall  f1-score   support

          TA      0.919     0.944     0.932        36
          PT      1.000     0.774     0.873        31
          PC      1.000     0.922     0.959        51
          MC      0.965     0.846     0.902        65
          LC      0.660     0.646     0.653        48
           F      0.878     0.975     0.924        81
          DC      0.919     0.958     0.938       260
           A      1.000     0.969     0.984        32

    accuracy                          0.911       604
   macro avg      0.918     0.879     0.895       604
weighted avg      0.913     0.911     0.910       604

