In [1]:
# pip install vit_pytorch facenet_pytorch

In [82]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, datasets
import torch.nn.functional as F
from torchvision import models
from collections import Counter
import numpy as np
from PIL import Image
import cv2
from tqdm import tqdm

In [83]:
print(torch.__version__)

2.3.1+cu121


In [98]:
# Define the transformation to normalize the data
transform = transforms.Compose([
    transforms.Resize((200,200)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Grayscale(1),
    transforms.Normalize((0.5), (0.5))  # Normalize
])

# Download and load the training and test sets
trainset = datasets.ImageFolder(
    root='abc/train',
    transform = transform
)

testset = datasets.ImageFolder(
    root='abc/test',
    transform = transform
)

In [99]:
len(trainset.classes)

49

In [100]:
# Create DataLoaders
trainloader = torch.utils.data.DataLoader(trainset, batch_size=5, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=5, shuffle=False)

In [101]:
def save_checkpoint(model, optimizer, epoch, loss, file_path='vit_checkpoint.pth'):
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch,
        'loss': loss
    }
    torch.save(checkpoint, file_path)
    print(f'Checkpoint saved at {file_path}')

In [102]:
def load_checkpoint(model, optimizer=None, file_path='vit_checkpoint.pth'):
    checkpoint = torch.load(file_path)
    print("here")
    model.load_state_dict(checkpoint['model_state_dict'])
    
    if optimizer:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    
    print(f'Checkpoint loaded from {file_path} (Epoch: {epoch}, Loss: {loss})')
    
    return model, optimizer, epoch, loss

In [103]:
def transfer_learning(model):
    # Step 1: Modify the classification layer for transfer learning
    model.linear_head = nn.Linear(in_features=model.linear_head.in_features, out_features=len(trainset.classes))  # Change 49 to the new number of classes
    
    # Step 2: Load the saved checkpoint
    checkpoint = torch.load('vit_checkpoint.pth')
    
    # Remove the `linear_head` weights from the checkpoint to avoid size mismatch
    del checkpoint['model_state_dict']['linear_head.weight']
    del checkpoint['model_state_dict']['linear_head.bias']
    
    # Load the remaining weights
    model.load_state_dict(checkpoint['model_state_dict'], strict=False)
    
    model.to(device)
    return model


In [104]:
#Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [105]:
from vit_pytorch import SimpleViT
def init_model():
    model = SimpleViT(
        image_size = 200,
        channels=1,
        patch_size = 20,
        num_classes = len(trainset.classes),
        dim = 128,
        depth = 6,
        heads = 16,
        mlp_dim = 2048
    )
    return model

In [106]:
#load the previous model if exist otherwise initialize the new model
try:
    model, optimizer, start_epoch, _ = load_checkpoint(model, optimizer, checkpoint_path)
except:
    model = init_model()
    model = model.to(device)
    start_epoch = 0
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

here
Checkpoint loaded from vit_checkpoint.pth (Epoch: 45, Loss: 0.1548540075576102)


In [107]:
# model = model.to(device)
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

In [108]:
images, labels = next(iter(trainloader))
images, labels = images.to(device), labels.to(device)
print("Input shape:", images.shape)

Input shape: torch.Size([5, 1, 200, 200])


In [109]:
patches = model.to_patch_embedding(images)
print("Patch embedding shape:", patches.shape)

Patch embedding shape: torch.Size([5, 100, 128])


In [110]:
try:
    # Apply transfer learning modifications
    model = transfer_learning(model)
    print(model)
    start_epoch += 1  # Start training from the next epoch
except FileNotFoundError:
    print("No checkpoint found, starting training from scratch.")

num_epochs = 20
for epoch in range(start_epoch, start_epoch + num_epochs):
    model.train()
    running_loss = 0.0
    correct_train = 0
    total_train = 0
    print("here")
    
    for images, labels in tqdm(trainloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()  # Zero the parameter gradients

        outputs = model(images)  # Forward pass
        loss = criterion(outputs, labels)  # Compute loss
        loss.backward()  # Backward pass
        optimizer.step()  # Optimize the model parameters

        running_loss += loss.item() * images.size(0)  # Update running loss

        # Calculate training accuracy
        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(trainset)
    epoch_train_accuracy = 100 * correct_train / total_train

    # Evaluate on test set
    model.eval()
    correct_test = 0
    total_test = 0
    with torch.no_grad():
        for images, labels in testloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total_test += labels.size(0)
            correct_test += (predicted == labels).sum().item()

    test_accuracy = 100 * correct_test / total_test

    # Print loss and accuracy
    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"Train Loss: {epoch_loss:.4f}, Train Accuracy: {epoch_train_accuracy:.2f}%")
    print(f"Test Accuracy: {test_accuracy:.2f}%")
    
    # Save checkpoint
    save_checkpoint(model, optimizer, epoch, epoch_loss, file_path=checkpoint_path)


SimpleViT(
  (to_patch_embedding): Sequential(
    (0): Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=20, p2=20)
    (1): LayerNorm((400,), eps=1e-05, elementwise_affine=True)
    (2): Linear(in_features=400, out_features=128, bias=True)
    (3): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  )
  (transformer): Transformer(
    (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (layers): ModuleList(
      (0-5): 6 x ModuleList(
        (0): Attention(
          (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (attend): Softmax(dim=-1)
          (to_qkv): Linear(in_features=128, out_features=3072, bias=False)
          (to_out): Linear(in_features=1024, out_features=128, bias=False)
        )
        (1): FeedForward(
          (net): Sequential(
            (0): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
            (1): Linear(in_features=128, out_features=2048, bias=True)
            (2): GELU(approximate='none')
    

Epoch 47/20: 100%|██████████████████████████| 1614/1614 [00:29<00:00, 55.36it/s]


KeyboardInterrupt: 

In [33]:
model, optimizer, epoch, loss = load_checkpoint(model, optimizer=None, file_path='vit_checkpoint.pth')


here
Checkpoint loaded from vit_checkpoint.pth (Epoch: 40, Loss: 0.03186562934699328)


In [None]:
model.eval()

In [114]:
import cv2
import torch
from torchvision import transforms
import numpy as np


# Capture video from webcam
cap = cv2.VideoCapture('vids/all_test.mp4')  # Change to 'video.mp4' for video file
from facenet_pytorch import MTCNN

mtcnn = MTCNN(keep_all=True, device='cuda' if torch.cuda.is_available() else 'cpu')

class_names = trainset.classes
# Define the transformations
transform = transforms.Compose([
    transforms.Resize((200, 200)),  # Resize images to 300x300
    transforms.ToTensor(),  # Convert image to tensor
    transforms.Grayscale(1),
    transforms.Normalize((0.5,), (0.5,))  # Normalize with mean and std for grayscale
])

while True:
    ret, frame = cap.read()
    print("here")
    if not ret:
        print("here")
        break
    # print("here")
    # Convert frame to RGB
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Detect faces
    boxes, _ = mtcnn.detect(rgb_frame)
    print(boxes)
    # print("hello")

    if boxes is not None:
        for box in boxes:
#             print("hello")
            x1, y1, x2, y2 = map(int, box)

            # Extract face ROI
            face = frame[y1:y2, x1:x2]
            if face.size == 0:
                continue

            try:
                # Apply transformations
                face_pil = Image.fromarray(face)  # Convert numpy array to PIL image
                face_transformed = transform(face_pil)  # Apply transformations
                face_transformed = face_transformed.unsqueeze(0)  # Add batch dimension
#                 print("hello")
            except Exception as e:
                print(f"Error during preprocessing: {e}")
#                 print("hello")
                continue

            # Move the tensor to the device (GPU/CPU)
            face_transformed = face_transformed.to(device)

            # Classify face
            with torch.no_grad():

                output = model(face_transformed)
                _, predicted = torch.max(output, 1)
                label = predicted.item()

            label_name = class_names[label]
            # Draw rectangle and label
            cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 2)
            cv2.putText(frame, str(label_name), (x1, y1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 0, 0), 2)

    # Display the resulting frame
    cv2.imshow('Video', frame)
    # print("hello")

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release capture and close windows
cap.release()
cv2.destroyAllWindows()


here
[[366.79730224609375 131.9984588623047 396.4997863769531
  171.57675170898438]
 [379.8697204589844 29.110618591308594 401.41351318359375
  60.22030258178711]
 [36.59773254394531 122.90636444091797 55.85639572143555
  154.9058074951172]
 [96.36554718017578 103.5871810913086 117.37486267089844
  130.42666625976562]
 [103.3097915649414 50.78644943237305 119.62472534179688
  75.47760772705078]]
here
[[366.7972717285156 131.9980010986328 396.5005187988281
  171.57801818847656]
 [379.8693542480469 29.109163284301758 401.41497802734375
  60.221405029296875]
 [36.639434814453125 122.85186767578125 55.9250602722168 154.887451171875]
 [96.36579895019531 103.58629608154297 117.37445068359375
  130.4260711669922]
 [103.30919647216797 50.786075592041016 119.62509155273438
  75.47859191894531]]
here
[[366.7969055175781 132.0001220703125 396.4921875 171.56292724609375]
 [36.642059326171875 122.8530044555664 55.92482376098633
  154.88674926757812]
 [380.78729248046875 29.79729652404785 400.926574