### Trying to train a model to identify dogsâ€™ behaviours via my dataset in assignment 1. (Considering of the data, I chose one class 'Dog_barking' as train and valid data.) 

Preprocess images: Resize them to a consistent size, and normalize their pixel values.

In [1]:
import os
import numpy as np
from PIL import Image


# define the size of the resized images
IMAGE_SIZE = (224, 224)

# define the path to the directory containing the images
dataset_path = "data/Dataset"

# loop over all the image files in the directory
for filename in os.listdir(dataset_path):
    # check if the file is an image
    if filename.endswith(".jpg") or filename.endswith(".png"):
        # construct the full file path
        file_path = os.path.join(dataset_path, filename)
        
        # open the image using Pillow
        img = Image.open(file_path)
        
        # resize the image to the desired size
        img = img.resize(IMAGE_SIZE)
        
        # convert the image to grayscale or RGB
        # depending on the desired input format of the model
        img = img.convert('RGB')
        
        # convert the image to a NumPy array and normalize its values
        img_array = np.asarray(img) / 255.0
        
        # convert the NumPy array back to an Image object
        img = Image.fromarray((img_array * 255.0).astype(np.uint8))
        
        # save the preprocessed image to disk
        img.save(file_path)

In [2]:
#print(os.getcwd())

Split dataset into training and validation sets: Use about 80% of images for training and 20% for validation

In [3]:
from sklearn.model_selection import train_test_split

# create lists to hold the image file paths and corresponding labels
image_paths = []
labels = []

# define the path to the directory containing the images
dataset_path = "data/Dataset/Dog_barking"

# loop over all the image files in the directory
for filename in os.listdir(dataset_path):
    # check if the file is an image
    if filename.endswith(".jpg") or filename.endswith(".png"):
        # construct the full file path
        file_path = os.path.join(dataset_path, filename)
        
        # add the file path to the list of image paths
        image_paths.append(file_path)
        
        # extract the label from the filename (assuming the filename contains the label)
        label = filename.split("_")[0]
        labels.append(label)
        
# convert the lists to NumPy arrays
image_paths = np.array(image_paths)
labels = np.array(labels)

# split the dataset into training and validation sets
train_image_paths, val_image_paths, train_labels, val_labels = train_test_split(
    image_paths, labels, test_size=0.2, random_state=42)

Use a pre-trained model as a feature extractor: Remove the last layer(s) of the pre-trained model and use the remaining layers to extract features from your barking dog images. Save these features as a new dataset.

In [4]:
import torch
import torch.nn as nn
import torchvision.models as models

# define the input shape of the images
input_shape = (3, 224, 224)

# create a model using a pre-trained convolutional base
model = models.mobilenet_v2(pretrained=True)

# freeze the convolutional layers
for param in model.parameters():
    param.requires_grad = False

# replace the last fully-connected layer with a new one for our binary classification task
num_ftrs = model.classifier[-1].in_features
model.classifier[-1] = nn.Linear(num_ftrs, 1)

# define the loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters())

# move the model to a CUDA-enabled GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)




MobileNetV2(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(96, eps=

Train a new classifier: Add a new fully-connected layer to the pre-trained model, and train a new classifier to classify barking dog images based on the extracted features. 

In [5]:
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim

# Define the input shape
input_shape = 224

# Define the transformations to apply to the data
transform = transforms.Compose([
    transforms.Resize((input_shape, input_shape)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load the data using the ImageFolder class
data_dir = 'data'
train_dataset = torchvision.datasets.ImageFolder(root=data_dir+'/dataset', transform=transform)

# Create a DataLoader to load the data in batches
batch_size = 32
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)

# Define the model architecture
model = torchvision.models.mobilenet_v2(pretrained=True)
num_features = model.classifier[1].in_features
model.classifier = nn.Linear(num_features, 2)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model for 10 epochs
num_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

for epoch in range(num_epochs):
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 10 == 9:  # print every 10 mini-batches
            print('[Epoch %d, Batch %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 10))
            running_loss = 0.0

print('Finished Training')


Finished Training


In [9]:
# Save the trained model
# it may need to save and delete and save for two times
torch.save(model.state_dict(), 'model.pth')

Evaluate the model: Evaluate the performance of your model on the validation set. If the accuracy is not satisfactory, you can try fine-tuning the model or using data augmentation techniques to increase the size of your dataset.

In [10]:
saved_state_dict = torch.load('model.pth')
print("Keys of saved state dictionary:")
print(saved_state_dict.keys())
print("\nKeys of current model state dictionary:")
print(model.state_dict().keys())

Keys of saved state dictionary:
odict_keys(['0.weight', '0.bias', '3.weight', '3.bias', '7.weight', '7.bias'])

Keys of current model state dictionary:
odict_keys(['0.weight', '0.bias', '3.weight', '3.bias', '7.weight', '7.bias'])


In [11]:
import torchvision.datasets as datasets

# define the device for training the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# load the dataset
data_dir = 'data/dataset'
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
test_dataset = datasets.ImageFolder(root=data_dir, transform=transform)

# define the model
model = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(128 * 56 * 56, 2)
)

# load the trained weights
model.load_state_dict(torch.load('model.pth'))

# set the model to evaluation mode
model.eval()

# create a data loader for the test set
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

# evaluate the model on the test set
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy on test set: {:.2%}'.format(correct / total))


Accuracy on test set: 82.86%


Fine-tune the model: Fine-tune the last few layers of the pre-trained model with my data to improve its performance.

**Sorry for the unfinished project, since the training is finished, but the accuracy is low. While when I want to fine-tune it, the error is similar to the last one i fixed, but it cost a lot time for me to figure out.**

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets, models, transforms

# Define transforms for data augmentation and normalization
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(size=224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Define the transforms for the validation set
val_transforms = transforms.Compose([
    transforms.Resize((input_shape, input_shape)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load the data using the ImageFolder class
data_dir = 'data'
train_dataset = datasets.ImageFolder(root=data_dir+'/dataset', transform=train_transforms)
val_dataset = datasets.ImageFolder(root=data_dir+'/val', transform=val_transforms)

# Create a DataLoader to load the data in batches
batch_size = 32
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Load a pre-trained ResNet18 model and replace the last fully connected layer
model = models.resnet18(pretrained=True)
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, len(train_dataset.classes))

# Set the optimizer and loss function
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
criterion = nn.CrossEntropyLoss()

# Train the model for a few epochs
num_epochs = 5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_dataset)
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, epoch_loss))

# Evaluate the model on the validation set
model.eval()
val_acc = 0.0
with torch.no_grad():
    for inputs, labels in val_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        _, preds = torch.max(outputs.data, 1)
        val_acc += torch.sum(preds == labels.data)
val_acc = val_acc / len(val_dataset)
print('Validation Accuracy: {:.4f}'.format(val_acc))


FileNotFoundError: Couldn't find any class folder in data/val.

Test the model: Test the model on a hold-out set of images that it hasn't seen before to evaluate its performance.

**Sorry for the unfinished project, since the training is finished,

In [33]:
# define the size of the resized images
IMAGE_SIZE = (512, 700)

# define the path to the directory containing the images
dataset_path = "data/Dataset"

# loop over all the image files in the directory
for filename in os.listdir(dataset_path):
    # check if the file is an image
    if filename.endswith(".jpg") or filename.endswith(".png"):
        # construct the full file path
        file_path = os.path.join(dataset_path, filename)
        
        # open the image using Pillow
        img = Image.open(file_path)
        
        # resize the image to the desired size
        img = img.resize(IMAGE_SIZE)
        
        # convert the image to grayscale or RGB
        # depending on the desired input format of the model
        img = img.convert('RGB')
        
        # convert the image to a NumPy array and normalize its values
        img_array = np.asarray(img) / 255.0
        
        # convert the NumPy array back to an Image object
        img = Image.fromarray((img_array * 255.0).astype(np.uint8))
        
        # save the preprocessed image to disk
        img.save(file_path)

In [34]:
import torch
from torchvision.models import resnet18

# create a ResNet18 model
model = resnet18()

# load the state dictionary from a saved file
state_dict = torch.load('model.pth')

# load the state dictionary into the model
model.load_state_dict(state_dict)

# set the model to evaluation mode
model.eval()

# load the saved model
model = torch.load('model.pth')

# set the model to evaluation mode
model.eval()

# define the test dataset and data loader
test_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
test_dataset = datasets.ImageFolder(root='data/test', transform=test_transforms)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)

# define the loss function
criterion = nn.CrossEntropyLoss()

# test the model
with torch.no_grad():
    running_loss = 0
    running_corrects = 0
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        _, preds = torch.max(outputs, 1)
        running_loss += loss.item() * inputs.size(0)
        running_corrects += torch.sum(preds == labels.data)

    test_loss = running_loss / len(test_dataset)
    test_acc = running_corrects.double() / len(test_dataset)

print('Test Loss: {:.4f} | Test Acc: {:.4f}'.format(test_loss, test_acc))


RuntimeError: Error(s) in loading state_dict for ResNet:
	Missing key(s) in state_dict: "conv1.weight", "bn1.weight", "bn1.bias", "bn1.running_mean", "bn1.running_var", "layer1.0.conv1.weight", "layer1.0.bn1.weight", "layer1.0.bn1.bias", "layer1.0.bn1.running_mean", "layer1.0.bn1.running_var", "layer1.0.conv2.weight", "layer1.0.bn2.weight", "layer1.0.bn2.bias", "layer1.0.bn2.running_mean", "layer1.0.bn2.running_var", "layer1.1.conv1.weight", "layer1.1.bn1.weight", "layer1.1.bn1.bias", "layer1.1.bn1.running_mean", "layer1.1.bn1.running_var", "layer1.1.conv2.weight", "layer1.1.bn2.weight", "layer1.1.bn2.bias", "layer1.1.bn2.running_mean", "layer1.1.bn2.running_var", "layer2.0.conv1.weight", "layer2.0.bn1.weight", "layer2.0.bn1.bias", "layer2.0.bn1.running_mean", "layer2.0.bn1.running_var", "layer2.0.conv2.weight", "layer2.0.bn2.weight", "layer2.0.bn2.bias", "layer2.0.bn2.running_mean", "layer2.0.bn2.running_var", "layer2.0.downsample.0.weight", "layer2.0.downsample.1.weight", "layer2.0.downsample.1.bias", "layer2.0.downsample.1.running_mean", "layer2.0.downsample.1.running_var", "layer2.1.conv1.weight", "layer2.1.bn1.weight", "layer2.1.bn1.bias", "layer2.1.bn1.running_mean", "layer2.1.bn1.running_var", "layer2.1.conv2.weight", "layer2.1.bn2.weight", "layer2.1.bn2.bias", "layer2.1.bn2.running_mean", "layer2.1.bn2.running_var", "layer3.0.conv1.weight", "layer3.0.bn1.weight", "layer3.0.bn1.bias", "layer3.0.bn1.running_mean", "layer3.0.bn1.running_var", "layer3.0.conv2.weight", "layer3.0.bn2.weight", "layer3.0.bn2.bias", "layer3.0.bn2.running_mean", "layer3.0.bn2.running_var", "layer3.0.downsample.0.weight", "layer3.0.downsample.1.weight", "layer3.0.downsample.1.bias", "layer3.0.downsample.1.running_mean", "layer3.0.downsample.1.running_var", "layer3.1.conv1.weight", "layer3.1.bn1.weight", "layer3.1.bn1.bias", "layer3.1.bn1.running_mean", "layer3.1.bn1.running_var", "layer3.1.conv2.weight", "layer3.1.bn2.weight", "layer3.1.bn2.bias", "layer3.1.bn2.running_mean", "layer3.1.bn2.running_var", "layer4.0.conv1.weight", "layer4.0.bn1.weight", "layer4.0.bn1.bias", "layer4.0.bn1.running_mean", "layer4.0.bn1.running_var", "layer4.0.conv2.weight", "layer4.0.bn2.weight", "layer4.0.bn2.bias", "layer4.0.bn2.running_mean", "layer4.0.bn2.running_var", "layer4.0.downsample.0.weight", "layer4.0.downsample.1.weight", "layer4.0.downsample.1.bias", "layer4.0.downsample.1.running_mean", "layer4.0.downsample.1.running_var", "layer4.1.conv1.weight", "layer4.1.bn1.weight", "layer4.1.bn1.bias", "layer4.1.bn1.running_mean", "layer4.1.bn1.running_var", "layer4.1.conv2.weight", "layer4.1.bn2.weight", "layer4.1.bn2.bias", "layer4.1.bn2.running_mean", "layer4.1.bn2.running_var", "fc.weight", "fc.bias". 
	Unexpected key(s) in state_dict: "0.weight", "0.bias", "3.weight", "3.bias", "7.weight", "7.bias". 