Necessary Imports

In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision import models
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import random_split
import random
import os
#import Image
from PIL import Image

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [6]:
dataset = torchvision.datasets.ImageFolder(root="./dataset/")
classes = dataset.classes

# Load model of interest

### Load a pre-trained ResNet50 model

In [19]:
# Load a pre-trained ResNet50 model
model_resnet50 = models.resnet50(pretrained=True)

# Unfreeze some of the layers for fine-tuning
for name, child in model_resnet50.named_children():
    if name in ['layer3', 'layer4']:
        for param in child.parameters():
            param.requires_grad = True
    else:
        for param in child.parameters():
            param.requires_grad = False

# Modify the final layer for  len(dataset.classes) classes
num_ftrs = model_resnet50.fc.in_features
model_resnet50.fc = nn.Linear(num_ftrs,  len(classes))

model_resnet50 = model_resnet50.to(device)

# Define loss function and optimizer for ResNet50
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model_resnet50.parameters()), lr=0.001)
criterion = nn.CrossEntropyLoss()


### Load a pre-trained ResNet101 model

In [15]:
# Load a pre-trained ResNet101 model
model_resnet101 = models.resnet101(pretrained=True)

# Modify the final layer for  len(dataset.classes) classes
num_ftrs = model_resnet101.fc.in_features
model_resnet101.fc = nn.Linear(num_ftrs,  len(classes))

model_resnet101 = model_resnet101.to(device)

# Define loss function and optimizer for ResNet101
optimizer = optim.Adam(model_resnet101.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()


### Load a pre-trained model ResNet18

In [7]:
# Load a pre-trained model (ResNet18 in this case) and modify it
model_resnet18 = models.resnet18(pretrained=True)
num_ftrs = model_resnet18.fc.in_features
model_resnet18.fc = nn.Linear(num_ftrs, len(classes))  # Adjusting for the number of classes
model = model_resnet18.to(device)

# Loss Function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)



## Load weights from training

In [9]:
#load weights
model_resnet18.load_state_dict(torch.load('./models/custom_simple/epoch_10.pth'))

<All keys matched successfully>

### Image prediction example

In [10]:
from PIL import Image
import torchvision.transforms as transforms



# Function to preprocess the image
def preprocess_image(image_path):
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    image = Image.open(image_path)
    image = transform(image).unsqueeze(0)  # Add batch dimension
    return image

def predict_image(model, image_path, class_names):
    image = preprocess_image(image_path)
    image = image.to(device)

    model.eval()
    with torch.no_grad():
        outputs = model(image)
        _, predicted = torch.max(outputs, 1)
        predicted_class = class_names[predicted[0].item()]

    return predicted_class

class_names = classes

In [12]:
# Test the model on a single image
# input image
image = './dataset/a1/source1_0.png'

preprocess_image(image)
p = predict_image(model_resnet18, image, class_names)
# print the prediction
print(p)

a1


In [13]:
class_names = dataset.classes
class_names

['a1', 'a2']

# Video Inference Example

In [15]:
#start reading a video IMG_0387.MOV
import cv2
import os

# Path to your video file
video_path = r"./data/source1/a1/IMG_0704.MOV"

# output video name
videoname = "try1"

# Directory to save the frames
if not os.path.exists('demo'):
    os.makedirs('demo')
import cv2
import torch
from torchvision import transforms
from PIL import Image
import numpy as np

# Function to preprocess image
def preprocess_image(frame):
    # Convert the NumPy array to a PIL Image
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # Define your preprocessing steps here
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    return preprocess(image).unsqueeze(0)

# Rest of your code remains the same

# Start capturing the feed
cap = cv2.VideoCapture(video_path)
fourcc = cv2.VideoWriter_fourcc(*'MP4V')
out = cv2.VideoWriter(f'./demo/{videoname}.mp4', fourcc, 60.0, (int(cap.get(3)), int(cap.get(4))))
frame_count = 0
while True:
    ret, frame = cap.read()
    if ret == False:
        break
    #preprocess the frame and predict the class
    image = preprocess_image(frame)
    image = image.to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(image)
        _, predicted = torch.max(outputs, 1)
        predicted_class = class_names[predicted[0].item()]
    #save the video with annotation of the class predicted
    cv2.putText(frame, predicted_class, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
    out.write(frame)
    
    

cap.release()
out.release()
cv2.destroyAllWindows()



OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'
