In [10]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/Breast Cancer Project/new_master_dataset.csv")
df

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,mag,path,filename,class,slide_id,tumor_type
0,100,/content/drive/MyDrive/Breast Cancer Project/b...,SOB_B_A-14-22549CD-100-001.png,benign,22549CD,A
1,100,/content/drive/MyDrive/Breast Cancer Project/b...,SOB_B_A-14-22549CD-100-002.png,benign,22549CD,A
2,100,/content/drive/MyDrive/Breast Cancer Project/b...,SOB_B_A-14-22549CD-100-003.png,benign,22549CD,A
3,100,/content/drive/MyDrive/Breast Cancer Project/b...,SOB_B_A-14-22549CD-100-004.png,benign,22549CD,A
4,100,/content/drive/MyDrive/Breast Cancer Project/b...,SOB_B_A-14-22549CD-100-005.png,benign,22549CD,A
...,...,...,...,...,...,...
7904,400,/content/drive/MyDrive/Breast Cancer Project/b...,SOB_M_PC-14-15704-400-031.png,malignant,15704,PC
7905,400,/content/drive/MyDrive/Breast Cancer Project/b...,SOB_M_PC-14-15704-400-032.png,malignant,15704,PC
7906,400,/content/drive/MyDrive/Breast Cancer Project/b...,SOB_M_PC-14-15704-400-033.png,malignant,15704,PC
7907,400,/content/drive/MyDrive/Breast Cancer Project/b...,SOB_M_PC-14-15704-400-034.png,malignant,15704,PC


In [11]:
groupd_df = df.groupby("tumor_type")

# You can then perform various operations on the grouped data, such as getting the count of each tumor type
tumor_type_counts = groupd_df.size()
tumor_type_counts

tumor_type
A      444
DC    3451
F     1014
LC     626
MC     792
PC     560
PT     453
TA     569
dtype: int64

In [12]:
# df_train_100 = pd.read_csv("/content/drive/MyDrive/Breast Cancer Project/Mag100/train_df_100.csv")
# df_test_100 = pd.read_csv("/content/drive/MyDrive/Breast Cancer Project/Mag100/test_df_100.csv")
# df_val_100 = pd.read_csv("/content/drive/MyDrive/Breast Cancer Project/Mag100/val_df_100.csv")

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
from transformers import AutoImageProcessor, AutoModelForImageClassification
import torchvision.transforms as transforms
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from PIL import Image
import os
import torch
from transformers import EfficientNetImageProcessor, EfficientNetForImageClassification

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

import os

# Define constants
train_root = "/content/drive/MyDrive/Breast Cancer Project/Akash Split/train"
test_root = "/content/drive/MyDrive/Breast Cancer Project/Akash Split/test"
classes = ["benign", "malignant"]

# Create a list to store the paths and labels of all images for training data
train_data = []

# Populate the list with paths and labels for training data
for label, class_name in enumerate(classes):
    folder_path = os.path.join(train_root, class_name)
    image_files = os.listdir(folder_path)
    for image_file in image_files:
        image_path = os.path.join(folder_path, image_file)
        train_data.append((image_path, label))

# Create a list to store the paths and labels of all images for test data
test_data = []

# Populate the list with paths and labels for test data
for label, class_name in enumerate(classes):
    folder_path = os.path.join(test_root, class_name)
    image_files = os.listdir(folder_path)
    for image_file in image_files:
        image_path = os.path.join(folder_path, image_file)
        test_data.append((image_path, label))

# Define custom dataset class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path, label = self.data[idx]
        img = Image.open(img_path).convert('RGB')  # Open image and convert to RGB mode
        if self.transform:
            img = self.transform(img)
        label_tensor = torch.tensor(label, dtype=torch.long)  # Convert label to tensor
        return img, label_tensor

# Image preprocessing with augmentation for training
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomRotation(90),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ToTensor()
])

# Image preprocessing without augmentation for testing and validation
test_val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

# Create custom datasets
train_dataset = CustomDataset(train_data, transform=train_transform)
test_dataset = CustomDataset(test_data, transform=test_val_transform)

# DataLoaders for batching and shuffling
batch_size = 60 # Define the batch size
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

from transformers import AutoImageProcessor, ConvNextForImageClassification

processor = AutoImageProcessor.from_pretrained("facebook/convnext-tiny-224")
model = AutoModelForImageClassification.from_pretrained("facebook/convnext-tiny-224")

# Move model to device
model.to(device)

# Define optimizer and scheduler
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=3, verbose=True)

Using device: cuda




In [14]:
# Define loss function
criterion = nn.CrossEntropyLoss()

# Train the model
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)
    for images, labels in progress_bar:
        images, labels = images.to(device), labels.to(device)  # Move data to GPU
        optimizer.zero_grad()
        # Ensure the input tensor is passed correctly
        outputs = model(images).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * labels.size(0)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        progress_bar.set_postfix({'Loss': train_loss / total, 'Accuracy': 100 * correct / total})

    train_loss = train_loss / len(train_loader.dataset)
    train_accuracy = 100 * correct / total

    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)  # Move data to GPU
            outputs = model(images).logits
            loss = criterion(outputs, labels)
            val_loss += loss.item() * labels.size(0)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_loss = val_loss / len(test_loader.dataset)
    val_accuracy = 100 * correct / total

    print(f'Epoch {epoch+1}/{num_epochs}, '
          f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')

    # Adjust learning rate
    scheduler.step(val_loss)

# Test the model
model.eval()
test_correct = 0
test_total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)  # Move data to GPU
        outputs = model(images).logits
        _, predicted = torch.max(outputs, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

test_accuracy = 100 * test_correct / test_total
print(f'Test Accuracy: {test_accuracy:.2f}%')



Epoch 1/20, Train Loss: 2.3841, Train Accuracy: 54.66%, Val Loss: 0.8293, Val Accuracy: 65.26%




Epoch 2/20, Train Loss: 0.4156, Train Accuracy: 82.96%, Val Loss: 0.5386, Val Accuracy: 75.84%




Epoch 3/20, Train Loss: 0.2180, Train Accuracy: 93.17%, Val Loss: 0.4047, Val Accuracy: 82.54%




Epoch 4/20, Train Loss: 0.1405, Train Accuracy: 94.23%, Val Loss: 0.3875, Val Accuracy: 85.36%




Epoch 5/20, Train Loss: 0.0973, Train Accuracy: 95.92%, Val Loss: 0.3940, Val Accuracy: 88.54%




Epoch 6/20, Train Loss: 0.0907, Train Accuracy: 96.81%, Val Loss: 0.3501, Val Accuracy: 86.60%




Epoch 7/20, Train Loss: 0.0416, Train Accuracy: 98.85%, Val Loss: 0.3853, Val Accuracy: 87.13%




Epoch 8/20, Train Loss: 0.0838, Train Accuracy: 97.43%, Val Loss: 0.3802, Val Accuracy: 88.54%




Epoch 9/20, Train Loss: 0.1226, Train Accuracy: 95.30%, Val Loss: 0.5506, Val Accuracy: 82.72%




Epoch 10/20, Train Loss: 0.0548, Train Accuracy: 98.31%, Val Loss: 0.3934, Val Accuracy: 86.60%




Epoch 11/20, Train Loss: 0.0291, Train Accuracy: 98.85%, Val Loss: 0.3616, Val Accuracy: 88.18%




Epoch 12/20, Train Loss: 0.0196, Train Accuracy: 99.65%, Val Loss: 0.3569, Val Accuracy: 88.36%




Epoch 13/20, Train Loss: 0.0193, Train Accuracy: 99.38%, Val Loss: 0.3549, Val Accuracy: 88.54%




Epoch 14/20, Train Loss: 0.0176, Train Accuracy: 99.65%, Val Loss: 0.3553, Val Accuracy: 89.24%




Epoch 15/20, Train Loss: 0.0162, Train Accuracy: 99.65%, Val Loss: 0.3539, Val Accuracy: 88.89%




Epoch 16/20, Train Loss: 0.0161, Train Accuracy: 99.47%, Val Loss: 0.3537, Val Accuracy: 88.89%




Epoch 17/20, Train Loss: 0.0158, Train Accuracy: 99.38%, Val Loss: 0.3541, Val Accuracy: 89.42%




Epoch 18/20, Train Loss: 0.0145, Train Accuracy: 99.82%, Val Loss: 0.3543, Val Accuracy: 89.42%




Epoch 19/20, Train Loss: 0.0131, Train Accuracy: 99.73%, Val Loss: 0.3544, Val Accuracy: 89.42%




Epoch 20/20, Train Loss: 0.0149, Train Accuracy: 99.65%, Val Loss: 0.3544, Val Accuracy: 89.42%
Test Accuracy: 89.42%


In [15]:
from sklearn.metrics import classification_report

# Test the model
model.eval()
test_correct = 0
test_total = 0
all_predicted = []
all_labels = []
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)  # Move data to GPU
        outputs = model(images).logits
        _, predicted = torch.max(outputs, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()
        all_predicted.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

test_accuracy = 100 * test_correct / test_total
print(f'Test Accuracy: {test_accuracy:.2f}%')

# Print classification report
print("Classification Report:")
report = classification_report(all_labels, all_predicted, digits=3)
print(report)

Test Accuracy: 89.42%
Classification Report:
              precision    recall  f1-score   support

           0      0.872     0.760     0.812       171
           1      0.902     0.952     0.926       396

    accuracy                          0.894       567
   macro avg      0.887     0.856     0.869       567
weighted avg      0.893     0.894     0.892       567



In [30]:
# Define a dictionary to store image paths grouped by patient IDs
test_data_by_patient = {}

# Iterate through the test data
for image_path, _ in test_data:
    # Extract the patient ID from the image filename
    patient_id = image_path.split("-")[2]
    # Check if the patient ID already exists in the dictionary
    if patient_id in test_data_by_patient:
        # Append the image path to the list associated with the patient ID
        test_data_by_patient[patient_id].append(image_path)
    else:
        # Create a new list with the image path for the patient ID
        test_data_by_patient[patient_id] = [image_path]

# Print the dictionary containing image paths grouped by patient IDs
for patient_id, images in test_data_by_patient.items():
    print(f"Patient ID: {patient_id}, Number of images: {len(images)}")
    print("Image paths:")
    for image_path in images:
        print(image_path)
    print()


Patient ID: 29960CD, Number of images: 13
Image paths:
/content/drive/MyDrive/Breast Cancer Project/Akash Split/test/benign/SOB_B_A-14-29960CD-100-011.png
/content/drive/MyDrive/Breast Cancer Project/Akash Split/test/benign/SOB_B_A-14-29960CD-100-004.png
/content/drive/MyDrive/Breast Cancer Project/Akash Split/test/benign/SOB_B_A-14-29960CD-100-001.png
/content/drive/MyDrive/Breast Cancer Project/Akash Split/test/benign/SOB_B_A-14-29960CD-100-013.png
/content/drive/MyDrive/Breast Cancer Project/Akash Split/test/benign/SOB_B_A-14-29960CD-100-009.png
/content/drive/MyDrive/Breast Cancer Project/Akash Split/test/benign/SOB_B_A-14-29960CD-100-003.png
/content/drive/MyDrive/Breast Cancer Project/Akash Split/test/benign/SOB_B_A-14-29960CD-100-010.png
/content/drive/MyDrive/Breast Cancer Project/Akash Split/test/benign/SOB_B_A-14-29960CD-100-005.png
/content/drive/MyDrive/Breast Cancer Project/Akash Split/test/benign/SOB_B_A-14-29960CD-100-008.png
/content/drive/MyDrive/Breast Cancer Project/

In [48]:
# Define a dictionary to store image paths grouped by patient IDs for the training data
train_data_by_patient = {}
# Define a set to store patient IDs for the training data
train_patient_ids = set()

# Iterate through the training data
for image_path, _ in train_data:
    # Extract the patient ID from the image filename
    patient_id = image_path.split("-")[2]
    # Add the patient ID to the set
    train_patient_ids.add(patient_id)
    # Check if the patient ID already exists in the dictionary
    if patient_id in train_data_by_patient:
        # Append the image path to the list associated with the patient ID
        train_data_by_patient[patient_id].append(image_path)
    else:
        # Create a new list with the image path for the patient ID
        train_data_by_patient[patient_id] = [image_path]

# Define a dictionary to store image paths grouped by patient IDs for the test data
test_data_by_patient = {}
# Define a set to store patient IDs for the test data
test_patient_ids = set()

# Iterate through the test data
for image_path, _ in test_data:
    # Extract the patient ID from the image filename
    patient_id = image_path.split("-")[2]
    # Add the patient ID to the set
    test_patient_ids.add(patient_id)
    # Check if the patient ID already exists in the dictionary
    if patient_id in test_data_by_patient:
        # Append the image path to the list associated with the patient ID
        test_data_by_patient[patient_id].append(image_path)
    else:
        # Create a new list with the image path for the patient ID
        test_data_by_patient[patient_id] = [image_path]

# Find common patient IDs between training and test datasets
common_patient_ids = train_patient_ids.intersection(test_patient_ids)

# Find unique patient IDs in the training dataset
unique_train_patient_ids = train_patient_ids.difference(test_patient_ids)

# Find unique patient IDs in the test dataset
unique_test_patient_ids = test_patient_ids.difference(train_patient_ids)

# Print results
print("Number of unique patient IDs in training data:", len(unique_train_patient_ids))
print("Number of unique patient IDs in test data:", len(unique_test_patient_ids))
print("Number of common patient IDs between training and test data:", len(common_patient_ids))

Number of unique patient IDs in training data: 42
Number of unique patient IDs in test data: 25
Number of common patient IDs between training and test data: 0


In [49]:
# Determine the maximum length of the sets to align the printing
max_length = max(len(train_patient_ids), len(test_patient_ids))

# Iterate over the sets simultaneously and print side by side
print("Train Patient IDs\t\tTest Patient IDs")
for train_id, test_id in zip(sorted(train_patient_ids), sorted(test_patient_ids)):
    print(f"{train_id}\t\t\t{test_id}")

# If one set is longer than the other, print the remaining elements
if len(train_patient_ids) > len(test_patient_ids):
    for train_id in sorted(train_patient_ids)[len(test_patient_ids):]:
        print(f"{train_id}\t\t\t")
elif len(test_patient_ids) > len(train_patient_ids):
    for test_id in sorted(test_patient_ids)[len(train_patient_ids):]:
        print(f"\t\t\t{test_id}")

Train Patient IDs		Test Patient IDs
11031			10926
11951			12312
12204			13993
12773			14134
13413			14926
13418DE			15275
14134E			15687B
15570			16184
15570C			16188
15696			16196
15704			16716
15792			17901
16184CD			18842D
16336			190EF
16448			19979
16456			20629
16601			20636
16875			21978AB
17614			22549AB
18650			29315EF
19440			29960AB
19854C			29960CD
19979C			4364
21998AB			5694
21998EF			8168
22549G			
23060AB			
23060CD			
23222AB			
25197			
2523			
2773			
2980			
2985			
3411F			
3909			
4372			
5287			
5695			
6241			
9133			
9461			


In [60]:
# Create a dictionary to store image paths grouped by patient ID
images_by_patient = {}

# Populate the dictionary with image paths grouped by patient ID
for label, class_name in enumerate(classes):
    folder_path = os.path.join(test_root, class_name)
    image_files = os.listdir(folder_path)
    for image_file in image_files:
        image_path = os.path.join(folder_path, image_file)
        # Extract patient ID from image file name
        patient_id = image_file.split("-")[2]
        if patient_id not in images_by_patient:
            images_by_patient[patient_id] = []  # Initialize list for patient ID if not already present
        images_by_patient[patient_id].append(image_path)  # Append image path to list for patient ID

# Print images grouped by patient ID
for patient_id, image_paths in images_by_patient.items():
    print("Patient ID:", patient_id)
    for image_path in image_paths:
        print("Image Path:", image_path)
    print()  # Print an empty line for better readability

Patient ID: 29960CD
Image Path: /content/drive/MyDrive/Breast Cancer Project/Akash Split/test/benign/SOB_B_A-14-29960CD-100-011.png
Image Path: /content/drive/MyDrive/Breast Cancer Project/Akash Split/test/benign/SOB_B_A-14-29960CD-100-004.png
Image Path: /content/drive/MyDrive/Breast Cancer Project/Akash Split/test/benign/SOB_B_A-14-29960CD-100-001.png
Image Path: /content/drive/MyDrive/Breast Cancer Project/Akash Split/test/benign/SOB_B_A-14-29960CD-100-013.png
Image Path: /content/drive/MyDrive/Breast Cancer Project/Akash Split/test/benign/SOB_B_A-14-29960CD-100-009.png
Image Path: /content/drive/MyDrive/Breast Cancer Project/Akash Split/test/benign/SOB_B_A-14-29960CD-100-003.png
Image Path: /content/drive/MyDrive/Breast Cancer Project/Akash Split/test/benign/SOB_B_A-14-29960CD-100-010.png
Image Path: /content/drive/MyDrive/Breast Cancer Project/Akash Split/test/benign/SOB_B_A-14-29960CD-100-005.png
Image Path: /content/drive/MyDrive/Breast Cancer Project/Akash Split/test/benign/SOB

In [65]:
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report

# Function to test images of a specific patient ID
def test_patient_images(images):
    # Create a list to store the paths and labels of the images for the patient
    patient_data = []
    for image_path in images:
        label = int(image_path.split("/")[-2] == "malignant")  # Extract label (0 for benign, 1 for malignant)
        patient_data.append((image_path, label))

    # Create a custom dataset for the patient's images
    patient_dataset = CustomDataset(patient_data, transform=test_val_transform)

    # DataLoader for the patient's images
    patient_loader = DataLoader(patient_dataset, batch_size=batch_size, shuffle=False)

    # Evaluate the model on the patient's images
    model.eval()
    all_predicted = []
    all_labels = []
    with torch.no_grad():
        for images, labels in patient_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images).logits
            _, predicted = torch.max(outputs, 1)
            all_predicted.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Return predictions and labels for the patient's images
    return all_labels, all_predicted

# List of patient IDs
patient_ids = [
    "10926", "12312", "13993", "14134", "14926", "15275", "15687B", "16184", "16188",
    "16196", "16716", "17901", "18842D", "190EF", "19979", "20629", "20636", "21978AB",
    "22549AB", "29315EF", "29960AB", "29960CD", "4364", "5694", "8168"
]

# Initialize lists to store all predictions and labels
all_predictions = []
all_labels = []

# Iterate through each patient ID
for patient_id in patient_ids:
    # Test images for the patient ID
    patient_images = images_by_patient.get(patient_id, [])
    labels, predictions = test_patient_images(patient_images)
    all_labels.extend(labels)
    all_predictions.extend(predictions)

# Calculate the average classification report
average_classification_report = classification_report(all_labels, all_predictions, digits=3)

# Print the average classification report
print("Average Classification Report Across All Patient IDs:")
print(average_classification_report)

Average Classification Report Across All Patient IDs:
              precision    recall  f1-score   support

           0      0.872     0.760     0.812       171
           1      0.902     0.952     0.926       396

    accuracy                          0.894       567
   macro avg      0.887     0.856     0.869       567
weighted avg      0.893     0.894     0.892       567

