# Cogito Computer Vision Course

Welcome to the Cogito Computer Vision Course!

You can run the code either locally or in a Google Colab notebook: https://colab.research.google.com/github/CogitoNTNU/course-computer-vision/blob/main/course.ipynb

In this course you will learn about the fundamentals of computer vision, including image processing, feature extraction, and object classification.
An additional resource for learning about convolutional neural networks (CNNs) is the interactive [CNN Explainer](https://poloclub.github.io/cnn-explainer/).

In [None]:
! pip install torch torchvision matplotlib

In [None]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch
import matplotlib.pyplot as plt


# Preprocessing applied to every sample: resize the image to 128x128 pixels,
# then convert it to a tensor.
preprocessing_steps = [
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
]
transform = transforms.Compose(preprocessing_steps)

# Training split of Fashion MNIST, downloaded into ./data if it is not there yet.
dataset = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=transform,
)


# Fashion MNIST
A dataset of fashion images, to practice computer vision. The dataset contains 60,000 training images and 10,000 test images of clothing items, such as shirts, shoes, and bags.

In [None]:
# Human-readable label names; a sample's integer label is used as an index into this list.
class_names = dataset.classes
print("The class names are: ", class_names)

Let's look at a picture from the dataset.

In [None]:
# Take the first (image, label) sample and move the channel axis last,
# which is the layout used for plotting below.
first_image, first_label = dataset[0]
img = first_image.permute(1, 2, 0).numpy()
print(f"This is a {class_names[first_label]}")
print(f"The shape of the image is {img.shape}")
plt.imshow(img)

# Max Pooling
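We can also apply max pooling to the image. Max pooling is a downsampling technique that reduces the spatial dimensions of the image, while retaining the most important features. Here, we use a kernel size of 2 and a stride of 4: each output pixel is the maximum of a 2x2 window, and the window moves 4 pixels at a time, so the output has a quarter of the input's width and height.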

In [None]:
# Give the first image a leading batch axis before pooling.
img = dataset[0][0][None]
pool = torch.nn.MaxPool2d(kernel_size=2, stride=4)
pooled_img = pool(img)
# Drop the batch axis again and put channels last for plotting.
plt.imshow(pooled_img[0].permute(1, 2, 0).numpy())


This can be done multiple times. You can see how the image gets smaller and smaller.

In [None]:
# Run the already-pooled image through the same pooling layer a second time.
dobbel_pooled = pool(pooled_img)
twice_pooled_hwc = dobbel_pooled[0].permute(1, 2, 0)
plt.imshow(twice_pooled_hwc.numpy())

# CNN Filters
Let's look at the filters in a convolutional neural network. Here is an example picture of a cat:

In [None]:
import requests
from io import BytesIO

url = "https://cdn.britannica.com/70/234870-050-D4D024BB/Orange-colored-cat-yawns-displaying-teeth.jpg"

# Download the image. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() turns an HTTP error (e.g. 404) into a
# clear exception instead of handing an error page to the image decoder.
response = requests.get(url, timeout=30)
response.raise_for_status()
image = plt.imread(BytesIO(response.content), format="jpg")

plt.imshow(image)
plt.axis("off")
plt.show()

If we apply a filter to the image, we can see how it highlights certain features. For example, a filter that detects edges will highlight the edges in the image.

In [None]:
# 3x3 edge-detection kernel with leading (out_channels=1, in_channels=1) axes.
# Named edge_filter rather than `filter` so the Python builtin is not shadowed.
edge_filter = torch.tensor([[[[-1, -1, -1],
                              [-1,  8, -1],
                              [-1, -1, -1]]]], dtype=torch.float32)
conv_layer = torch.nn.Conv2d(in_channels=3, out_channels=1, kernel_size=3, padding=1, bias=False)
relu_layer = torch.nn.ReLU()
# Use the same kernel for each of the 3 input channels.
conv_layer.weight = torch.nn.Parameter(edge_filter.repeat(1, 3, 1, 1))
# Channels-last image array -> batched, channels-first float tensor.
image_tensor = torch.tensor(image).permute(2, 0, 1).unsqueeze(0).float()

Without ReLU:

In [None]:
# Apply only the convolution (no activation) and plot the single output channel.
filtered_image = conv_layer(image_tensor)
conv_only_hwc = filtered_image.detach()[0].permute(1, 2, 0)
plt.imshow(conv_only_hwc.numpy())

With ReLU:

In [None]:
# Convolution followed by ReLU.
filtered_image = relu_layer(conv_layer(image_tensor))
relu_hwc = filtered_image[0].detach().permute(1, 2, 0)
plt.imshow(relu_hwc.numpy())

# Task 1 - Experiment with Filters
We can see how another filter might affect the image. Try to change the filter values and see how it affects the image.

Here are some examples of filters:
![image.png](https://miro.medium.com/v2/resize:fit:1400/1*UaO9cemImbhwMVQOoUTPLQ.png)


In [None]:
# Custom 3x3 kernel - edit these values to try out other filters.
# Named custom_filter rather than `filter` so the Python builtin is not shadowed.
custom_filter = torch.tensor([[[[ 0, 1, 0],
                                [-1, 1, 1],
                                [ 0, 0, 1]]]], dtype=torch.float32)
# Use the same kernel for each of the 3 input channels.
conv_layer.weight = torch.nn.Parameter(custom_filter.repeat(1, 3, 1, 1))
filtered_image = conv_layer(image_tensor)
filtered_image = relu_layer(filtered_image)
plt.imshow(filtered_image.squeeze(0).permute(1, 2, 0).detach().numpy())

# Task 2 - Explore Data Augmentation

Add, remove, or comment out a transformation in the cell below and run it to see what it does. You can experiment with the parameter values too, if you like. (The factor parameters should be greater than 0 and, generally, less than 1.) Run the cell again if you'd like to get a new random augmentation of the same image.



In [None]:
from torchvision.transforms import RandomVerticalFlip, RandomRotation, ColorJitter, GaussianBlur, RandomResizedCrop

In [None]:
# Try to change some of the parameters below and see how the image changes. Also try to add/remove transforms.
augmentations = [
    RandomVerticalFlip(p=0.5),
    RandomRotation(degrees=30),
    RandomResizedCrop(size=128, scale=(0.8, 1.0), ratio=(0.75, 1.33)),
    ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    GaussianBlur(kernel_size=5, sigma=(0.1, 2.0)),
]
# Resize first, apply the augmentations in the order listed, then convert to a tensor.
transform = transforms.Compose(
    [transforms.Resize((128, 128)), *augmentations, transforms.ToTensor()]
)

# Reload the training split so that every sample goes through the new pipeline.
dataset = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=transform,
)
image, label = dataset[0]
# Channels last for plotting.
plt.imshow(image.permute(1, 2, 0).numpy())

# Create a model


In [None]:
from torch.nn import Conv2d, MaxPool2d, Flatten, Linear, ReLU, Sequential
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device: ", device)  # CUDA when a GPU is available, otherwise CPU

class_names = dataset.classes
# Create a model.
# Input: 1-channel 128x128 images. Each 2x2 max pool halves the width and height
# (128 -> 64 -> 32), so Flatten yields 32 channels * 32 * 32 features.
# A ReLU follows every convolution and the first linear layer: without a
# non-linearity between them, the two Linear layers would collapse into a
# single linear map and the extra layer would add no expressive power.
model = Sequential(
    Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1),
    ReLU(),
    MaxPool2d(kernel_size=2, stride=2),
    Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),
    ReLU(),
    MaxPool2d(kernel_size=2, stride=2),
    Flatten(),
    Linear(in_features=32*32*32, out_features=128),
    ReLU(),
    Linear(in_features=128, out_features=10)
).to(device)
# Test the model with the first image
img = dataset[0][0].unsqueeze(0).to(device)  # Add batch dimension
# Inference only, so there is no need to track gradients.
with torch.no_grad():
    output = model(img)
predicted_class = torch.argmax(output, dim=1).item()
print(f"The model predicts this image as a {class_names[predicted_class]}")

This model is not trained, so the output will be random. 

# Training a model
Below is an example of how to train a model on the Fashion MNIST dataset.

In [None]:
# Evaluation should be deterministic, so the test split only gets the resize and
# tensor conversion. `transform` may contain random augmentations at this point
# (Task 2 redefines it); those belong on the training split only.
test_transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])

# Use root='data' like the earlier cells so the dataset is not downloaded a second time.
training_data = datasets.FashionMNIST(root='data', train=True, download=True, transform=transform)
test_data = datasets.FashionMNIST(root='data', train=False, download=True, transform=test_transform)

# Shuffle the training batches; keep the test order fixed.
train_loader = DataLoader(training_data, batch_size=128, shuffle=True)
test_loader = DataLoader(test_data, batch_size=128, shuffle=False)

loss_function = torch.nn.CrossEntropyLoss()
# The optimizer updates the parameters of the `model` that exists when this cell runs.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [None]:
# Train for num_epochs passes over the training data and report the mean
# batch loss after each pass.
num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    batch_losses = []
    for i, batch in enumerate(train_loader):
        # Progress message every 10th batch.
        if not i % 10:
            print(f"Training on batch number: {i}")
        images = batch[0].to(device)
        labels = batch[1].to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        outputs = model(images)
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
    total_loss = sum(batch_losses)
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")



We can then evaluate the model on the test data.

In [None]:
# Count correct predictions over the whole test loader; no gradients are needed.
model.eval()
total = 0
correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # The index of the largest output is the predicted class.
        predicted = outputs.max(dim=1).indices
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
accuracy = 100 * correct / total
print(f'Accuracy of the model on the test images: {accuracy:.2f}%')

# Task 3 - Design your own model
Now you can try to create your own model and train it on the Fashion MNIST dataset!

In [None]:
# Template for your own network: add layers inside Sequential(...).
# For reference, the example model earlier in this notebook takes 1-channel
# 128x128 input and ends in a Linear layer with 10 outputs.
model = Sequential(
    # Design your own model
).to(device)

# Preprocessing for your model: images are resized to 128x128 and converted to
# tensors; any augmentations go between those two steps.
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    # Add your data augmentations here
    transforms.ToTensor(),
])

# Hyperparameters for your training run.
# You may change these values
batch_size = 32
learning_rate = 0.001
epochs = 5


# Training

In [None]:
# Build the training pipeline for the Task 3 model from the settings chosen above.
# The loader and optimizer created for the earlier example model must not be
# reused: that optimizer only updates the old model's parameters, so the new
# model would never learn, and batch_size / learning_rate / transform would be ignored.
training_data = datasets.FashionMNIST(root='data', train=True, download=True, transform=transform)
train_loader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    # Report progress against `epochs`, the loop bound used here (not `num_epochs`).
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")

# Testing

In [None]:
# Measure accuracy of the current model on the test loader.
total = 0
correct = 0
model.eval()
with torch.no_grad():  # evaluation only: no gradient tracking needed
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        # Predicted class = position of the largest output per sample.
        _, predicted = outputs.max(dim=1)
        matches = predicted == labels
        total += labels.size(0)
        correct += matches.sum().item()
accuracy = 100 * correct / total
print(f'Accuracy of the model on the test images: {accuracy:.2f}%')