In [1]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import os

In [2]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Root folder of the dataset. The code below reads its 'train' and 'val'
# sub-folders through torchvision's ImageFolder (one sub-directory per class).
data_dir = 'RockPaperScissorsDataset'

# Define data transformations
# 'train' uses random crop/flip augmentation; 'val' uses a fixed
# resize + centre crop. Both end with the same tensor conversion and
# per-channel normalisation (the widely used ImageNet mean/std values).
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

# Load datasets
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x]) for x in ['train', 'val']}

# Only the training split is shuffled. Shuffling the validation split has no
# effect on the summed loss/accuracy and just makes evaluation order random.
dataloaders = {
    x: DataLoader(image_datasets[x], batch_size=32, shuffle=(x == 'train'), num_workers=4)
    for x in ['train', 'val']
}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}

# Class names in the index order assigned by ImageFolder for the training split
class_names = image_datasets['train'].classes
class_names

['paper', 'rock', 'scissors']

First, let's print the model architecture to understand its structure:

Run the code below to see the structure of the model. Look for the final layer that produces the class scores, typically named something like `classifier`, `head`, or `fc`.

In [3]:
# Step 4: Define the Model
# Build the FasterViT network here; its classification layer is adapted to the
# custom number of classes in a later cell.

from fastervit import create_model

# Instantiate the 'faster_vit_0_224' variant with pretrained weights.
# NOTE(review): model_path presumably names the local checkpoint file used
# for the pretrained weights; confirm against the fastervit documentation.
model = create_model(
    'faster_vit_0_224',
    pretrained=True,
    model_path="tmp/faster_vit_0.pth.tar",
)

# Dump the module tree so the name of the final classification layer can be read off
print(model)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


FasterViT(
  (patch_embed): PatchEmbed(
    (proj): Identity()
    (conv_down): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(64, eps=0.0001, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (4): BatchNorm2d(64, eps=0.0001, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU()
    )
  )
  (levels): ModuleList(
    (0): FasterViTLayer(
      (blocks): ModuleList(
        (0): ConvBlock(
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act1): GELU(approximate='none')
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (norm2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_s


From the model architecture, it looks like the final classification layer is named `head`.

To modify this layer for your custom classification task, you should replace the head layer with a new Linear layer that has the appropriate number of output classes for your dataset.

In [4]:

# Replace the classification head with a freshly initialised linear layer that
# keeps the head's input width and outputs one score per dataset class.
num_ftrs = model.head.in_features
model.head = torch.nn.Linear(
    in_features=num_ftrs,
    out_features=len(class_names),
)

In [5]:
# Use the first CUDA device when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Relocate the model onto the selected device
model = model.to(device)


In [4]:
import torch.optim as optim
from torch.optim import lr_scheduler

# Cross-entropy loss computed from the model's raw class scores
criterion = torch.nn.CrossEntropyLoss()

# SGD with momentum over every parameter of the model
optimizer = optim.SGD(params=model.parameters(), lr=0.001, momentum=0.9)

# Multiply the learning rate by 0.1 after every 7 scheduler steps
exp_lr_scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=7, gamma=0.1)


In [9]:
import time
import copy

def train_model(model, criterion, optimizer, scheduler, num_epochs=5,
                data_loaders=None, sizes=None, target_device=None):
    """Train ``model`` and return it loaded with its best validation weights.

    Each epoch runs a 'train' phase (gradients enabled, optimizer stepped per
    batch) followed by a 'val' phase (gradients disabled). The scheduler is
    stepped once per epoch, after the training phase. The weights from the
    epoch with the highest validation accuracy are restored before returning;
    on ties the earliest such epoch wins.

    Args:
        model: The ``torch.nn.Module`` to train (modified in place).
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            loss tensor.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Learning-rate scheduler exposing ``step()``.
        num_epochs: Number of train/val epochs to run.
        data_loaders: Mapping with 'train' and 'val' keys, each an iterable of
            ``(inputs, labels)`` batches. Defaults to the notebook-level
            ``dataloaders``.
        sizes: Mapping with 'train' and 'val' keys giving the number of
            samples per phase (used to average loss and accuracy). Defaults
            to the notebook-level ``dataset_sizes``.
        target_device: Device the batches are moved to. Defaults to the
            notebook-level ``device``.

    Returns:
        The same ``model`` object, carrying the best validation weights.
    """
    # Fall back to the notebook globals so existing five-argument calls keep
    # working, while explicit arguments remove the hidden-state dependency.
    if data_loaders is None:
        data_loaders = dataloaders
    if sizes is None:
        sizes = dataset_sizes
    if target_device is None:
        target_device = device

    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            if is_training:
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0  # plain Python int, not a device tensor

            for inputs, labels in data_loaders[phase]:
                inputs = inputs.to(target_device)
                labels = labels.to(target_device)

                # Track gradients only while training
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # Backward + optimize only if in training phase
                    if is_training:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                # The criterion returns a per-batch value; weight it by the
                # batch size so dividing by the dataset size gives the mean.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += (preds == labels).sum().item()

            if is_training:
                scheduler.step()

            epoch_loss = running_loss / sizes[phase]
            epoch_acc = running_corrects / sizes[phase]

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # Snapshot the weights whenever validation accuracy improves
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:.4f}')

    # Load best model weights
    model.load_state_dict(best_model_wts)
    return model

# Run the training loop; train_model returns the model with the weights of
# its best validation epoch loaded.
model = train_model(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=exp_lr_scheduler,
    num_epochs=5,
)

# Persist only the learned parameters (the state dict), not the module object
torch.save(model.state_dict(), 'faster_vit_custom_model.pth')

Epoch 0/4
----------
train Loss: 0.8346 Acc: 0.7202
val Loss: 0.4109 Acc: 0.9946

Epoch 1/4
----------
train Loss: 0.4181 Acc: 0.8750
val Loss: 0.1203 Acc: 1.0000

Epoch 2/4
----------
train Loss: 0.2697 Acc: 0.9115
val Loss: 0.0723 Acc: 1.0000

Epoch 3/4
----------
train Loss: 0.2665 Acc: 0.9020
val Loss: 0.1885 Acc: 0.9328

Epoch 4/4
----------
train Loss: 0.2348 Acc: 0.9079
val Loss: 0.0690 Acc: 0.9839

Training complete in 1m 32s
Best val Acc: 1.0000


# Testing

In [6]:
import torch
from torchvision import transforms
from PIL import Image
from fastervit import create_model

# Define the number of classes in your custom dataset
num_classes = 3  # Replace with your actual number of classes

# Create the model architecture (no pretrained weights: the fine-tuned
# checkpoint loaded below supplies every parameter)
model = create_model('faster_vit_0_224', pretrained=False)

# Modify the final classification layer to match the number of classes in your custom dataset
model.head = torch.nn.Linear(model.head.in_features, num_classes)

# Move the model to GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Load the trained model weights.
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU machine can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load('faster_vit_custom_model.pth', map_location=device))
model.eval()  # Set the model to evaluation mode

# Define data transformations for the input image
# (resize, centre crop, tensor conversion and per-channel normalisation)
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Function to load and preprocess the image
def load_image(image_path):
    """Read an image file and return it as a one-image batch on ``device``.

    The file is opened with PIL, converted to RGB, run through the
    module-level ``preprocess`` pipeline, given a leading batch dimension and
    moved to the module-level ``device``.
    """
    rgb_image = Image.open(image_path).convert('RGB')
    batch = preprocess(rgb_image).unsqueeze(0)  # leading batch dimension
    return batch.to(device)

# Function to make predictions
def predict(image_path, model, class_names):
    """Return the name of the class the model scores highest for one image.

    Args:
        image_path: Path of the image file, passed to ``load_image``.
        model: Callable model producing one row of class scores per image.
        class_names: Sequence mapping a class index to its name.
    """
    batch = load_image(image_path)
    with torch.no_grad():  # inference only, no gradient tracking
        scores = model(batch)
        best_index = torch.max(scores, 1)[1]
    return class_names[best_index.item()]

# List of class names (ensure this matches your custom dataset's classes)
# The index order must match the class order the model was trained with.
class_names = ['paper', 'rock', 'scissors']  # Replace with your actual class names

# Example usage
# Forward slashes are accepted on Windows as well as on Linux/macOS, so the
# path is no longer tied to a single operating system.
image_path = 'RockPaperScissorsDataset/test/rock/rock2_png.rf.baa4a80a096a58d85ba7c79bd8cd0a74.jpg'
predicted_class = predict(image_path, model, class_names)
print(predicted_class)


rock


In [7]:
from PIL import Image, ImageDraw, ImageFont
# Function to make predictions and draw the label on the image
def predict_and_draw(image_path, model, class_names,
                     output_image_path="output_with_prediction.jpg"):
    """Classify an image, stamp the predicted label on it, then show and save it.

    Args:
        image_path: Path of the image file to classify.
        model: Callable model producing one row of class scores per image.
        class_names: Sequence mapping a class index to its name.
        output_image_path: Where the annotated image is written. Defaults to
            "output_with_prediction.jpg".

    The image is preprocessed with the module-level ``preprocess`` pipeline
    and moved to the module-level ``device``. The label is drawn on the
    original (un-resized) RGB image.
    """
    image = Image.open(image_path).convert('RGB')
    input_tensor = preprocess(image).unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(input_tensor)
        _, preds = torch.max(outputs, 1)
        predicted_class = class_names[preds.item()]

    # Draw the predicted class on the image
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()
    text = f'Predicted: {predicted_class}'

    # Position the text at the top left corner
    text_position = (10, 10)

    # textbbox returns the absolute (left, top, right, bottom) box of the text
    # as drawn at text_position. Filling that exact box keeps the background
    # aligned with the glyphs even when the font adds an offset from the
    # anchor point.
    text_bbox = draw.textbbox(text_position, text, font=font)
    draw.rectangle(text_bbox, fill="black")
    draw.text(text_position, text, fill="white", font=font)

    # Display the image
    image.show()

    # Save the image with the prediction text
    image.save(output_image_path)
    print(f"Saved output image with prediction: {output_image_path}")




# List of class names (ensure this matches your custom dataset's classes)
# The index order must match the class order the model was trained with.
class_names = ['paper', 'rock', 'scissors']  # Replace with your actual class names

# Example usage
# Forward slashes are accepted on Windows as well as on Linux/macOS, so the
# path is no longer tied to a single operating system.
image_path = 'RockPaperScissorsDataset/test/rock/rock2_png.rf.baa4a80a096a58d85ba7c79bd8cd0a74.jpg'
predict_and_draw(image_path, model, class_names)

Saved output image with prediction: output_with_prediction.jpg
