# Data Preprocessing

In [1]:
# Mount the drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
# Import all packages
import cv2
import numpy as np
import os
import pandas as pd
import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [6]:
folder_paths = ['/content/drive/MyDrive/Data/Plankton/WHOI_unzipped_data/2006','/content/drive/MyDrive/Data/Plankton/WHOI_unzipped_data/2007','/content/drive/MyDrive/Data/Plankton/WHOI_unzipped_data/2013', '/content/drive/MyDrive/Data/Plankton/WHOI_unzipped_data/2014-VALIDATION YEAR']

all_images_data = []

print("--- Starting to Scan Image Folders ---")
for folder_path in folder_paths:
    print(f"--> Processing main folder: {folder_path}")

    for root, subdirs, files in os.walk(folder_path):
        if files:
            image_class = os.path.basename(root)

            for file_name in files:
                file_path = os.path.join(root, file_name)

                # Append the data to our list
                all_images_data.append({
                    'Image Class': image_class,
                    'Image Path': file_path
                })

print("\n...Initial file scan complete!")

# Create a Pandas DataFrame from the collected path data
path_df = pd.DataFrame(all_images_data)

--- Starting to Scan Image Folders ---
--> Processing main folder: /content/drive/MyDrive/Data/Plankton/WHOI_unzipped_data/2006
--> Processing main folder: /content/drive/MyDrive/Data/Plankton/WHOI_unzipped_data/2007
--> Processing main folder: /content/drive/MyDrive/Data/Plankton/WHOI_unzipped_data/2013
--> Processing main folder: /content/drive/MyDrive/Data/Plankton/WHOI_unzipped_data/2014-VALIDATION YEAR

...Initial file scan complete!


## Load Data

In [8]:
load_path = '/content/drive/MyDrive/Plankton/image_dataframe.pkl'
with open(load_path, 'rb') as f:
    loaded_img_dataframe = pickle.load(f)

In [10]:
loaded_img_dataframe, loaded_img_dataframe.shape, loaded_img_dataframe['Image_Matrix'][0].shape

(      Image_Class                                       Image_Matrix
 0             mix  [[204, 204, 204, 203, 203, 203, 203, 200, 202,...
 1             mix  [[200, 201, 203, 201, 199, 200, 202, 201, 199,...
 2             mix  [[221, 222, 223, 224, 223, 222, 221, 222, 222,...
 3             mix  [[192, 191, 188, 183, 182, 189, 185, 180, 185,...
 4             mix  [[212, 212, 212, 212, 213, 213, 213, 211, 212,...
 ...           ...                                                ...
 10995      dino30  [[194, 193, 194, 195, 195, 195, 192, 193, 194,...
 10996      dino30  [[204, 205, 207, 209, 209, 208, 207, 207, 206,...
 10997      dino30  [[179, 189, 187, 183, 191, 186, 186, 187, 194,...
 10998      dino30  [[215, 216, 217, 219, 218, 218, 217, 217, 217,...
 10999      dino30  [[205, 205, 205, 204, 204, 206, 206, 205, 205,...
 
 [11000 rows x 2 columns],
 (11000, 2),
 (45, 113))

In [13]:
TARGET_SIZE = (64, 64)
def resize_image_robust(image_matrix):
    """
    Safely resizes an image matrix.
    Returns the resized image or None if an error occurs.
    """
    try:
        # Convert to a numpy array, just in case it's a list
        img_array = np.array(image_matrix)
        # Check for empty or invalid array after conversion
        if img_array.size == 0:
            return None
        # Ensure the data type is uint8, which is required by cv2.resize
        if img_array.dtype != np.uint8:
            img_array = img_array.astype(np.uint8)
        # Perform the resize
        return cv2.resize(img_array, TARGET_SIZE, interpolation=cv2.INTER_AREA)
    except (TypeError, ValueError) as e:
        return None
# Apply the ROBUST resizing function
loaded_img_dataframe['Image_Matrix_64x64'] = loaded_img_dataframe['Image_Matrix'].apply(resize_image_robust)
print("Image resizing complete.")
# Post-processing: Handle failed resizes
failed_count = loaded_img_dataframe['Image_Matrix_64x64'].isnull().sum()
print(f"\nNumber of images that failed to resize: {failed_count}")
# Remove the rows where resizing failed before proceeding
cleaned_dataframe = loaded_img_dataframe.dropna(subset=['Image_Matrix_64x64']).copy()
print(f"DataFrame shape after removing failed images: {cleaned_dataframe.shape}")

# Encode the 'Image_Class' column on the cleaned data
label_encoder = LabelEncoder()
cleaned_dataframe['Image_Class_Encoded'] = label_encoder.fit_transform(cleaned_dataframe['Image_Class'])
print("Label encoding complete.")

# Display the updated DataFrame
print("\nCleaned DataFrame head with new columns:")
print(cleaned_dataframe[['Image_Class', 'Image_Class_Encoded', 'Image_Matrix_64x64']].head())

Image resizing complete.

Number of images that failed to resize: 239
DataFrame shape after removing failed images: (10761, 3)
Label encoding complete.

Cleaned DataFrame head with new columns:
  Image_Class  Image_Class_Encoded  \
0         mix                   20   
1         mix                   20   
2         mix                   20   
3         mix                   20   
4         mix                   20   

                                  Image_Matrix_64x64  
0  [[204, 204, 203, 203, 201, 204, 204, 202, 204,...  
1  [[200, 203, 199, 201, 200, 199, 200, 199, 200,...  
2  [[221, 223, 223, 222, 222, 222, 222, 221, 220,...  
3  [[192, 189, 182, 189, 181, 187, 189, 185, 188,...  
4  [[212, 212, 213, 213, 212, 213, 214, 213, 213,...  


In [14]:
cleaned_dataframe.columns, cleaned_dataframe['Image_Matrix_64x64'][1000], cleaned_dataframe['Image_Class_Encoded'][1000]

(Index(['Image_Class', 'Image_Matrix', 'Image_Matrix_64x64',
        'Image_Class_Encoded'],
       dtype='object'),
 array([[209, 208, 209, ..., 203, 203, 203],
        [209, 208, 210, ..., 204, 204, 202],
        [209, 208, 209, ..., 204, 204, 203],
        ...,
        [207, 208, 208, ..., 203, 203, 202],
        [207, 206, 207, ..., 203, 204, 203],
        [208, 208, 209, ..., 204, 204, 203]], dtype=uint8),
 np.int64(7))

## CNN Model

In [17]:
# Convert data to tensors
X = torch.tensor(np.array(cleaned_dataframe['Image_Matrix_64x64'].tolist()), dtype=torch.float32)
y = torch.tensor(cleaned_dataframe['Image_Class_Encoded'].values, dtype=torch.long)
# Reshape X for CNN input: [batch_size, channels, height, width]
X = X.view(-1, 1, 64, 64) # Gray scale: channels = 1
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X shape: torch.Size([10761, 1, 64, 64])
y shape: torch.Size([10761])


In [18]:
# Split the model into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape:  {X_test.shape}")
# Create separate dataloaders
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

Training data shape: torch.Size([8608, 1, 64, 64])
Testing data shape:  torch.Size([2153, 1, 64, 64])


In [None]:
# Define the CNN model
class CNN(nn.Module):
  """
       Parameters:
           * in_channels: Number of channels in the input image (for grayscale images, 1)
           * num_classes: Number of classes to predict.
       """
  def __init__(self, in_channels, num_classes):
      super(CNN, self).__init__()
# Convolutional layers
      self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=32, kernel_size=3, padding=1)
      self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
      self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
      self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
      self.dropout = nn.Dropout(0.3)
      self.relu = nn.ReLU()

# Fully connected layers
      self.fc1 = nn.Linear(128 * 8 * 8, 512)
      self.fc2 = nn.Linear(512, num_classes)
  def forward(self, x):
    """
      Parameters:
      x: Input Tensor
    """
    x = self.pool(F.relu(self.conv1(x)))
    x = self.pool(F.relu(self.conv2(x)))
    x = self.pool(F.relu(self.conv3(x)))
    x = x.reshape(x.shape[0], -1)  # Flatten the tensor
    x = self.relu(self.fc1(x))
    x = self.dropout(x)
    x = self.fc2(x)
    return x
device = "cuda" if torch.cuda.is_available() else "cpu"
dataset = TensorDataset(X, y)
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)
model = CNN(in_channels=1, num_classes=22).to(device)

# Loss function for multi-class classification
criterion = nn.CrossEntropyLoss()

# Adam optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0005)

# Training loops
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass and optimize
        loss.backward() # Comput gradients
        optimizer.step() # update paramers

        # Track accuracy
        _, predicted = torch.max(outputs.data, 1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()
        total_loss += loss.item()

    # Print statistics
    avg_loss = total_loss / len(train_loader)
    accuracy = 100 * correct / total
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%')

# Evaluate on the test dataset
model.eval()
test_correct = 0
test_total = 0
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        outputs = model(batch_X)
        _, predicted = torch.max(outputs.data, 1)
        test_total += batch_y.size(0)
        test_correct += (predicted == batch_y).sum().item()
test_accuracy = 100 * test_correct / test_total
print(f"Epoch [{epoch+1}/{num_epochs}] | "
      f"Train Loss: {avg_loss:.4f} | "
      f"Train Acc: {accuracy:.2f}% | "
      f"Test Acc: {test_accuracy:.2f}%")

Epoch [1/100], Loss: 3.3032, Accuracy: 22.12%
Epoch [2/100], Loss: 1.6399, Accuracy: 46.79%
Epoch [3/100], Loss: 1.3280, Accuracy: 57.65%
Epoch [4/100], Loss: 1.1047, Accuracy: 64.50%
Epoch [5/100], Loss: 0.9572, Accuracy: 69.19%
Epoch [6/100], Loss: 0.8654, Accuracy: 71.93%
Epoch [7/100], Loss: 0.7771, Accuracy: 74.41%
Epoch [8/100], Loss: 0.6864, Accuracy: 77.24%
Epoch [9/100], Loss: 0.6105, Accuracy: 79.01%
Epoch [10/100], Loss: 0.5443, Accuracy: 81.70%
Epoch [11/100], Loss: 0.4890, Accuracy: 83.84%
Epoch [12/100], Loss: 0.4241, Accuracy: 86.09%
Epoch [13/100], Loss: 0.3908, Accuracy: 86.87%
Epoch [14/100], Loss: 0.3303, Accuracy: 88.63%
Epoch [15/100], Loss: 0.2846, Accuracy: 90.64%
Epoch [16/100], Loss: 0.2666, Accuracy: 90.96%
Epoch [17/100], Loss: 0.2271, Accuracy: 92.48%
Epoch [18/100], Loss: 0.2075, Accuracy: 93.06%
Epoch [19/100], Loss: 0.1751, Accuracy: 93.99%
Epoch [20/100], Loss: 0.1739, Accuracy: 94.15%
Epoch [21/100], Loss: 0.1481, Accuracy: 94.93%
Epoch [22/100], Loss: 