In [86]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

In [87]:
# Check if CUDA is available
print("CUDA available:", torch.cuda.is_available())

# Check the CUDA version used by PyTorch
print("PyTorch CUDA version:", torch.version.cuda)

# Check the installed GPU
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("No GPU detected")


CUDA available: True
PyTorch CUDA version: 12.6
GPU: NVIDIA GeForce RTX 4070


In [88]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing import image

def load_images_from_directory(directory, target_size=(256, 256)):
    images = []
    labels = []
    for label in os.listdir(directory):
        label_dir = os.path.join(directory, label)
        if os.path.isdir(label_dir):
            for img_name in os.listdir(label_dir):
                img_path = os.path.join(label_dir, img_name)
                img = image.load_img(img_path, target_size=target_size)
                img_array = image.img_to_array(img)
                images.append(img_array)
                labels.append(label)
    return images, labels

# Load all images and labels
all_images, all_labels = load_images_from_directory(r"C:\Users\antoi\Documents\Nell_Antoine_Project\DATA")

# Split into training and validation datasets
train_images, validation_images, train_labels, validation_labels = train_test_split(
    all_images, all_labels, test_size=0.2, random_state=42
)

In [89]:
train_images = np.array(train_images)
train_labels = np.array(train_labels)
validation_images = np.array(validation_images)
validation_labels = np.array(validation_labels)

print(train_images.shape)
print(train_labels.shape)
print(validation_images.shape)
print(validation_labels.shape)

(2272, 256, 256, 3)
(2272,)
(569, 256, 256, 3)
(569,)


In [26]:
print(train_images[0])

[[[  0.   0.   0.]
  [  0.   0.   0.]
  [  0.   0.   0.]
  ...
  [  0.   0.   0.]
  [  0.   0.   0.]
  [  0.   0.   0.]]

 [[  0.   0.   0.]
  [  0.   0.   0.]
  [  0.   0.   0.]
  ...
  [  0.   0.   0.]
  [  0.   0.   0.]
  [  0.   0.   0.]]

 [[  0.   0.   0.]
  [  0.   0.   0.]
  [  0.   0.   0.]
  ...
  [  0.   0.   0.]
  [  0.   0.   0.]
  [  0.   0.   0.]]

 ...

 [[ 95.  84.  92.]
  [ 96.  85.  91.]
  [120. 109. 113.]
  ...
  [ 99. 131.  82.]
  [109. 136.  93.]
  [109. 136.  95.]]

 [[149. 135. 134.]
  [150. 136. 136.]
  [150. 136. 135.]
  ...
  [ 96. 129.  76.]
  [104. 131.  86.]
  [105. 132.  89.]]

 [[175. 163. 147.]
  [174. 162. 150.]
  [173. 160. 154.]
  ...
  [ 91. 124.  71.]
  [ 98. 126.  78.]
  [104. 131.  86.]]]


In [90]:
print(train_labels[:5])


['Collared_Dove' 'Wren' 'Starling' 'Collared_Dove' 'Long_Tailed_Tit']


In [91]:
print(set(train_labels))
print(len(set(train_labels)))

{np.str_('Greenfinch'), np.str_('Song_Thrush'), np.str_('Coal_Tit'), np.str_('Long_Tailed_Tit'), np.str_('Collared_Dove'), np.str_('Blackbird'), np.str_('Bluetit'), np.str_('Starling'), np.str_('Robin'), np.str_('Chaffinch'), np.str_('House_Sparrow'), np.str_('Magpie'), np.str_('Dunnock'), np.str_('Wood_Pigeon'), np.str_('Carrion_Crow'), np.str_('Great_Tit'), np.str_('Feral_Pigeon'), np.str_('Wren'), np.str_('Goldfinch'), np.str_('Jackdaw')}
20


In [94]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimplifiedCNNModel(nn.Module):
    def __init__(self):
        super(SimplifiedCNNModel, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        #self.conv4 = nn.Conv2d(128, 256, kernel_size=3)  # New convolutional layer
        #self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)  # New pooling layer
        
        # Use a dummy input to calculate the size
        with torch.no_grad():
            dummy_input = torch.randn(1, 3, 256, 256)  # Batch size 1
            x = self.pool1(F.relu(self.conv1(dummy_input)))
            x = self.pool2(F.relu(self.conv2(x)))
            x = self.pool3(F.relu(self.conv3(x)))
            #x = self.pool4(F.relu(self.conv4(x)))  # Pass through the new layer
            self.flatten_size = x.numel()

        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(self.flatten_size, 256)
        self.dropout = nn.Dropout(p=0.5)  # Add dropout layer with 50% probability
        self.fc2 = nn.Linear(256, 20)  # Assuming 20 classes
        
        # Initialize weights
        self._initialize_weights()
        
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                nn.init.constant_(m.bias, 0)
        
    def forward(self, x):
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = self.pool3(F.relu(self.conv3(x)))
        #x = self.pool4(F.relu(self.conv4(x)))  # Pass through the new layer
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)  # Apply dropout
        x = self.fc2(x)  # No softmax needed
        return x

# Example of model instantiation
model = SimplifiedCNNModel()
print(model)

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

SimplifiedCNNModel(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
  (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=115200, out_features=256, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=256, out_features=20, bias=True)
)


SimplifiedCNNModel(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
  (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=115200, out_features=256, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=256, out_features=20, bias=True)
)

In [95]:
from torchsummary import summary
summary(model, input_size=(3, 256, 256))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 32, 254, 254]             896
         MaxPool2d-2         [-1, 32, 127, 127]               0
            Conv2d-3         [-1, 64, 125, 125]          18,496
         MaxPool2d-4           [-1, 64, 62, 62]               0
            Conv2d-5          [-1, 128, 60, 60]          73,856
         MaxPool2d-6          [-1, 128, 30, 30]               0
           Flatten-7               [-1, 115200]               0
            Linear-8                  [-1, 256]      29,491,456
           Dropout-9                  [-1, 256]               0
           Linear-10                   [-1, 20]           5,140
Total params: 29,589,844
Trainable params: 29,589,844
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.75
Forward/backward pass size (MB): 34.47
Params size (MB): 112.88
Es

In [96]:
import torch.optim as optim

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()  # For classification tasks
optimizer = optim.Adam(model.parameters(), lr=0.0007)  # Adam optimizer

In [97]:
from sklearn.preprocessing import LabelEncoder

# Encode the labels
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
validation_labels_encoded = label_encoder.transform(validation_labels)

In [74]:
print("Encoded labels:", train_labels_encoded[:5])
print("Encoded labels shape:", train_labels_encoded.shape)
print("Validation labels shape:", validation_labels_encoded.shape)
print("Number of classes:", len(set(train_labels_encoded)))

Encoded labels: [ 5 19 17  5 13]
Encoded labels shape: (2272,)
Validation labels shape: (569,)
Number of classes: 20


In [98]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Permute if in HWC format (i.e., (N, H, W, C))
if train_images.shape[-1] == 3:
    train_images = train_images.transpose(0, 3, 1, 2)  # (N, H, W, C) → (N, C, H, W)
    
# Normalize only when converting, avoid extra copies
train_images_tensor = torch.from_numpy(train_images).float().div(255)
train_labels_tensor = torch.from_numpy(train_labels_encoded).long()

train_dataset = TensorDataset(train_images_tensor, train_labels_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True)

if validation_images.shape[-1] == 3:
    validation_images = validation_images.transpose(0, 3, 1, 2)  # (N, H, W, C) → (N, C, H, W)

validation_images_tensor = torch.from_numpy(validation_images).float().div(255)
validation_labels_tensor = torch.from_numpy(validation_labels_encoded).long()

validation_dataset = TensorDataset(validation_images_tensor, validation_labels_tensor)
validation_dataloader = DataLoader(validation_dataset, batch_size=256, shuffle=False)


In [34]:
# Collect unique labels from the training dataloader
train_dataloader_labels = set()
for _, labels in train_dataloader:
    train_dataloader_labels.update(labels.numpy())

print("Unique labels in training dataloader:", sorted(train_dataloader_labels))

# Collect unique labels from the validation dataloader
validation_dataloader_labels = set()
for _, labels in validation_dataloader:
    validation_dataloader_labels.update(labels.numpy())

print("Unique labels in validation dataloader:", sorted(validation_dataloader_labels))

Unique labels in training dataloader: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19)]
Unique labels in validation dataloader: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19)]


In [39]:
inputs, labels = next(iter(train_dataloader))
inputs, labels = inputs.to(device), labels.to(device)

for step in range(200):
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    print(f"Step {step}, Loss: {loss.item()}")


KeyboardInterrupt: 

In [76]:
inputs, labels = next(iter(train_dataloader))
print("Input shape:", inputs.shape)          # should be (16, 3, 256, 256)
print("Input min/max:", inputs.min(), inputs.max())  # should be ~0–1
print("Label shape:", labels.shape)          # should be (16,)
print("Label dtype:", labels.dtype)          # should be torch.long
print("Unique labels:", torch.unique(labels))


Input shape: torch.Size([256, 3, 256, 256])
Input min/max: tensor(0.) tensor(1.)
Label shape: torch.Size([256])
Label dtype: torch.int64
Unique labels: tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19])


In [99]:
# Training loop
num_epochs = 25

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    
    running_loss = 0.0
    
    for inputs, labels in train_dataloader:
        # Move data to GPU
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        
        # Compute loss
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_dataloader)}")
    #with torch.no_grad(): #doesn't interfere with training
    #    print(model(torch.randn(1, 3, 100, 100).to(device)))



Epoch 1/25, Loss: 4.630975749757555
Epoch 2/25, Loss: 2.9525300661722818
Epoch 3/25, Loss: 2.7732063399420843
Epoch 4/25, Loss: 2.4554997285207114
Epoch 5/25, Loss: 2.1296990977393255
Epoch 6/25, Loss: 1.7675209045410156
Epoch 7/25, Loss: 1.4756604300604925
Epoch 8/25, Loss: 1.1667420996559992
Epoch 9/25, Loss: 0.8817438019646539
Epoch 10/25, Loss: 0.6986303726832072
Epoch 11/25, Loss: 0.5613496171103584
Epoch 12/25, Loss: 0.4042259057362874
Epoch 13/25, Loss: 0.31756026877297294
Epoch 14/25, Loss: 0.25861866606606376
Epoch 15/25, Loss: 0.192703268594212
Epoch 16/25, Loss: 0.15628764198886025
Epoch 17/25, Loss: 0.14456101258595785
Epoch 18/25, Loss: 0.12979380041360855
Epoch 19/25, Loss: 0.13377374162276587
Epoch 20/25, Loss: 0.10477032760779063
Epoch 21/25, Loss: 0.08961563102073139
Epoch 22/25, Loss: 0.08721682926019032
Epoch 23/25, Loss: 0.08702624175283644
Epoch 24/25, Loss: 0.0856590283413728
Epoch 25/25, Loss: 0.07160089910030365


In [13]:
print("Sample outputs:", outputs[0])


Sample outputs: tensor([ 0.1387,  0.0215, -0.1125, -0.0194,  0.0965,  0.0040,  0.1223, -0.4186,
         0.0743,  0.1042,  0.0977,  0.1231, -0.1692,  0.0762, -0.3756,  0.0763,
         0.1036, -0.0341,  0.0636,  0.0942], device='cuda:0',
       grad_fn=<SelectBackward0>)


In [100]:
# Evaluation
model.eval()

correct = 0
total = 0

with torch.no_grad():  # Disable gradients for evaluation
    for test_X, test_y in validation_dataloader:
        # Move data to GPU
        test_X, test_y = test_X.to(device), test_y.to(device)

        # Forward pass
        test_outputs = model(test_X)
        
        # Get predictions
        _, predicted = torch.max(test_outputs, 1)  # Get the class with highest probability
        
        # Update total and correct predictions
        correct += (predicted == test_y).sum().item()
        total += test_y.size(0)

# Compute final accuracy across all batches
accuracy = correct / total
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 47.63%
