In [None]:
import pandas as pd

In [26]:
import torch
from torch import nn
from torch.utils.data import DataLoader,Dataset,random_split
from torchvision import datasets
from torchvision.transforms import ToTensor

In [2]:
# Download the dataset
!kaggle datasets download -d fanconic/skin-cancer-malignant-vs-benign

Dataset URL: https://www.kaggle.com/datasets/fanconic/skin-cancer-malignant-vs-benign
License(s): unknown
Downloading skin-cancer-malignant-vs-benign.zip to /content
 99% 323M/325M [00:07<00:00, 20.4MB/s]
100% 325M/325M [00:07<00:00, 43.5MB/s]


In [3]:
# Unzip the dataset (if it's a zip file)
!unzip skin-cancer-malignant-vs-benign.zip -d skin-cancer-malignant-vs-benign

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: skin-cancer-malignant-vs-benign/data/train/benign/420.jpg  
  inflating: skin-cancer-malignant-vs-benign/data/train/benign/421.jpg  
  inflating: skin-cancer-malignant-vs-benign/data/train/benign/422.jpg  
  inflating: skin-cancer-malignant-vs-benign/data/train/benign/423.jpg  
  inflating: skin-cancer-malignant-vs-benign/data/train/benign/424.jpg  
  inflating: skin-cancer-malignant-vs-benign/data/train/benign/425.jpg  
  inflating: skin-cancer-malignant-vs-benign/data/train/benign/426.jpg  
  inflating: skin-cancer-malignant-vs-benign/data/train/benign/429.jpg  
  inflating: skin-cancer-malignant-vs-benign/data/train/benign/43.jpg  
  inflating: skin-cancer-malignant-vs-benign/data/train/benign/430.jpg  
  inflating: skin-cancer-malignant-vs-benign/data/train/benign/431.jpg  
  inflating: skin-cancer-malignant-vs-benign/data/train/benign/432.jpg  
  inflating: skin-cancer-malignant-vs-benign/data/train/beni

# Data Preprocesing
- In this step we can preprocess the data mean that we can remove the corrupt imagess

In [4]:
import shutil
shutil.rmtree('/content/skin-cancer-malignant-vs-benign/data')

In [14]:
import os
from PIL import Image

def remove_corrupt_images(image_dir):
    """
    Removes corrupt images from a directory.

    Args:
      image_dir: The path to the directory containing the images.
    """
    dir = os.listdir(image_dir)
    print(dir)

    # iamge dir
    for img_folder in dir:
      print(img_folder)
      for img in os.listdir(os.path.join(image_dir, img_folder)):
        try:
          # print(img)
          img_path = os.path.join(image_dir, img_folder, img)
          img = Image.open(img_path)
          img.verify()
          img.close()
        except Exception as e:
          print(f"Corrupt Image: {img}")
          # reemove the  image
          os.remove(img_path)
          return str(e)
      print("No Corrupt Image Found in the given folder")

In [16]:
# train dir
remove_corrupt_images("/content/skin-cancer-malignant-vs-benign/train")

['malignant', 'benign']
malignant
No Corrupt Image Found in the given folder
benign
No Corrupt Image Found in the given folder


In [17]:
# test dir
remove_corrupt_images("/content/skin-cancer-malignant-vs-benign/test")

['malignant', 'benign']
malignant
No Corrupt Image Found in the given folder
benign
No Corrupt Image Found in the given folder


In [21]:
import matplotlib.pyplot as plt
import random
import os

# Assuming your images are in the 'train' and 'test' directories within the dataset folder
data_dir = "/content/skin-cancer-malignant-vs-benign"
train_dir = os.path.join(data_dir, "train")
test_dir = os.path.join(data_dir, "test")


def plot_images(directory, num_images=5):
    image_files = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]

    if len(image_files) < num_images:
        print(f"Warning: Not enough images in {directory} to display {num_images}. Displaying {len(image_files)} images.")
        num_images = len(image_files)

    random_images = random.sample(image_files, num_images)

    plt.figure(figsize=(10, 5))
    for i, image_file in enumerate(random_images):
        img_path = os.path.join(directory, image_file)
        try:
            img = Image.open(img_path)
            plt.subplot(1, num_images, i + 1)
            plt.imshow(img)
            plt.axis('off')  # Hide axis
        except Exception as e:
            print(f"Error loading image {image_file}: {e}")

    plt.tight_layout()
    plt.show()


In [23]:
# Plot random images from the train directory
print("Random Images from Train Directory:")
plot_images(train_dir, num_images=random.randint(5, 10))

Random Images from Train Directory:


<Figure size 1000x500 with 0 Axes>

In [24]:
# Plot random images from the test directory (optional)
print("\nRandom Images from Test Directory:")
plot_images(test_dir, num_images=random.randint(5, 10))


Random Images from Test Directory:


<Figure size 1000x500 with 0 Axes>

In [30]:
from torchvision import transforms

In [33]:
# train data
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(20),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_data = datasets.ImageFolder(root="/content/skin-cancer-malignant-vs-benign/train",transform=transform)
train_data

Dataset ImageFolder
    Number of datapoints: 2637
    Root location: /content/skin-cancer-malignant-vs-benign/train
    StandardTransform
Transform: Compose(
               Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=True)
               RandomHorizontalFlip(p=0.5)
               RandomVerticalFlip(p=0.5)
               RandomRotation(degrees=[-20.0, 20.0], interpolation=nearest, expand=False, fill=0)
               ToTensor()
               Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
           )

In [34]:
test_data = datasets.ImageFolder(root="/content/skin-cancer-malignant-vs-benign/test",transform=transform)
test_data

Dataset ImageFolder
    Number of datapoints: 660
    Root location: /content/skin-cancer-malignant-vs-benign/test
    StandardTransform
Transform: Compose(
               Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=True)
               RandomHorizontalFlip(p=0.5)
               RandomVerticalFlip(p=0.5)
               RandomRotation(degrees=[-20.0, 20.0], interpolation=nearest, expand=False, fill=0)
               ToTensor()
               Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
           )

In [None]:
# Data Loader

In [41]:
# Train loader
train_loader = DataLoader(train_data,batch_size=32,shuffle=True)

# test loader
test_loader = DataLoader(test_data,batch_size=32,shuffle=True)

In [42]:
# view the batch
for batch in train_loader:
  print(batch[0].shape)
  print(batch[1].shape)
  break

torch.Size([32, 3, 224, 224])
torch.Size([32])


# Transformer Learning

In [35]:
import torchvision.models as models

# Load the pre-trained VGG16 model
vgg16 = models.vgg16(pretrained=True)

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100%|██████████| 528M/528M [00:04<00:00, 125MB/s]


In [36]:
vgg16

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [37]:
# update the last layers
vgg16.classifier[6] = nn.Linear(4096, 2)

In [38]:
vgg16

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [39]:
# Freeze all tje layers
for params in vgg16.parameters():
  params.requires_grad = False

# Unfreeze the updated layer
for params in vgg16.classifier[6].parameters():
  params.requires_grad = True

In [44]:
# set the loss and optimzir
from torch.optim import Adam
loss_fn = nn.CrossEntropyLoss()
optimizer = Adam(vgg16.parameters(),lr=0.001)
epochs = 1

In [53]:
# set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [54]:
# Save model checkpoint
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)

In [59]:
# Function to test the model
def test_model(vgg16, test_loader, device):
    vgg16.eval()  # Set model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():  # No gradient computation for testing
        for image, label in test_loader:
            image, label = image.to(device), label.to(device)
            output = vgg16(image)

            _, predicted = torch.max(output, 1)  # Get the class with highest probability
            total += label.size(0)
            correct += (predicted == label).sum().item()

    accuracy = 100 * correct / total
    return accuracy

In [60]:
# Function to train the model
def train_model(vgg16, train_loader, optimizer, loss_fn, epochs, device):
    for epoch in range(epochs):
        vgg16.train()  # Set model to training mode
        correct = 0
        total = 0

        for image, label in train_loader:
            image, label = image.to(device), label.to(device)

            # Forward pass
            output = vgg16(image)

            # Calculate the loss
            loss = loss_fn(output, label)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()

            # Update the weights
            optimizer.step()

            # Calculate the accuracy
            _, predicted = torch.max(output, 1)
            total += label.size(0)
            correct += (predicted == label).sum().item()

            # Print loss for each iteration
            print(f"Epoch: {epoch + 1}/{epochs} Loss: {loss.item()}")

        # Calculate and print accuracy after each epoch
        accuracy = 100 * correct / total
        print(f"Epoch: {epoch + 1}/{epochs} Training Accuracy: {accuracy:.2f}%")

        # Call test function inside training loop
        test_accuracy = test_model(vgg16, test_loader, device)
        print(f"Epoch: {epoch + 1}/{epochs} Test Accuracy: {test_accuracy:.2f}%")


        # save model checkpoint
        checkpoint = {
            "epoch": epoch + 1,
            "state_dict": vgg16.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        save_checkpoint(checkpoint)

In [57]:
vgg16.to(device)  # Move the model to the correct device

train_model(vgg16, train_loader, optimizer, loss_fn, epochs, device)

Epoch: 1/1 Loss: 0.7669618129730225
Epoch: 1/1 Training Accuracy: 53.12%
Epoch: 1/1 Test Accuracy: 59.38%
