In [2]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision import models
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import random_split
import random
import os
#import Image
from PIL import Image

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [1]:
classes = ['3d','c0','c1','c10','c2','c3','c4','c5','c6','c7','c8','c9','e1','e2','ent','ex','exb','i','m','meet','mod','r','sew','sv','tech','wm1','wm2']

In [19]:
# Load a pre-trained ResNet50 model
model_resnet50 = models.resnet50(pretrained=True)

# Unfreeze some of the layers for fine-tuning
for name, child in model_resnet50.named_children():
    if name in ['layer3', 'layer4']:
        for param in child.parameters():
            param.requires_grad = True
    else:
        for param in child.parameters():
            param.requires_grad = False

# Modify the final layer for  len(dataset.classes) classes
num_ftrs = model_resnet50.fc.in_features
model_resnet50.fc = nn.Linear(num_ftrs,  len(classes))

model_resnet50 = model_resnet50.to(device)

# Define loss function and optimizer for ResNet50
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model_resnet50.parameters()), lr=0.001)
criterion = nn.CrossEntropyLoss()


In [15]:
# Load a pre-trained ResNet101 model
model_resnet101 = models.resnet101(pretrained=True)

# Modify the final layer for  len(dataset.classes) classes
num_ftrs = model_resnet101.fc.in_features
model_resnet101.fc = nn.Linear(num_ftrs,  len(classes))

model_resnet101 = model_resnet101.to(device)

# Define loss function and optimizer for ResNet101
optimizer = optim.Adam(model_resnet101.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()


In [13]:
# Load a pre-trained model (ResNet18 in this case) and modify it
model_resnet18 = models.resnet18(pretrained=True)
num_ftrs = model_resnet18.fc.in_features
model_resnet18.fc = nn.Linear(num_ftrs, len(classes))  # Adjusting for the number of classes
model = model_resnet18.to(device)

# Loss Function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [20]:
num_ftrs

2048

In [9]:
#load weights
model_resnet101.load_state_dict(torch.load('./models/FC_Res101_simple/epoch_10.pth'))

<All keys matched successfully>

In [10]:
from PIL import Image
import torchvision.transforms as transforms



# Function to preprocess the image
def preprocess_image(image_path):
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    image = Image.open(image_path)
    image = transform(image).unsqueeze(0)  # Add batch dimension
    return image

def predict_image(model, image_path, class_names):
    image = preprocess_image(image_path)
    image = image.to(device)

    model.eval()
    with torch.no_grad():
        outputs = model(image)
        _, predicted = torch.max(outputs, 1)
        predicted_class = class_names[predicted[0].item()]

    return predicted_class

class_names = classes

In [12]:
image = './frames/dataset/3d/.png'
preprocess_image(image)
p = predict_image(model_resnet101, image, class_names)
print(p)

3d


In [22]:
import torch
from torch import nn
from transformers import ViTForImageClassification, ViTFeatureExtractor
from torchvision.datasets import ImageFolder
from torchvision.transforms import Compose
from torch.utils.data import DataLoader, random_split

# Check if GPU is available
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

# Path to your dataset
data_dir = "./frames/dataset/"

# Load and preprocess the dataset
# Transformations
transform = transforms.Compose([
    transforms.Resize((256, 256)),  # Resize to a slightly larger size before cropping
    transforms.RandomCrop(224),     # Randomly crop to 224x224
    transforms.RandomHorizontalFlip(),  # Randomly flip the images
    transforms.RandomRotation(10),      # Randomly rotate the images
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.Lambda(lambda img: overlay_human_silhouette(img)),
    transforms.RandomPerspective(distortion_scale=0.2, p=0.5),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.9, 1.1)),
    transforms.ToTensor(),          # Convert images to PyTorch tensors
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize with ImageNet stats
    
])

# Load dataset
dataset = torchvision.datasets.ImageFolder(root=data_dir, transform=transform)
# List full path of silhouette images
silhouette_image_paths = [os.path.join("./silhoutte/", path) for path in os.listdir("./silhoutte/") if path.endswith('.png')]

# Load silhouette images
silhouette_images = [Image.open(path).convert("RGBA") for path in silhouette_image_paths]

# Define a function to overlay human silhouette
def overlay_human_silhouette(image):
    silhouette = random.choice(silhouette_images)
    silhouette = silhouette.resize((random.randint(50, 100), random.randint(100, 200)))

    x, y = random.randint(0, image.width - silhouette.width), random.randint(0, image.height - silhouette.height)
    image.paste(silhouette, (x, y), silhouette)
    return image


dataset = ImageFolder(root=data_dir, transform=transform)

# Split dataset into train, validation, and test sets
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

# Data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Load pre-trained Vision Transformer
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224-in21k', num_labels=27)
model.to(device)

# Optimizer and Loss Function
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

# Training function
def train(model, train_loader, optimizer, criterion):
    model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images).logits
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Validation function
def validate(model, val_loader):
    model.eval()
    total, correct = 0, 0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images).logits
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    train(model, train_loader, optimizer, criterion)
    val_accuracy = validate(model, val_loader)
    print(f'Epoch {epoch+1}/{num_epochs}, Validation Accuracy: {val_accuracy:.4f}')


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Validation Accuracy: 0.8711
Epoch 2/10, Validation Accuracy: 0.9389
Epoch 3/10, Validation Accuracy: 0.9523
Epoch 4/10, Validation Accuracy: 0.9637
Epoch 5/10, Validation Accuracy: 0.9663
