In [1]:
# import libraries
import numpy as np
import pandas as pd
import datasets 
from datasets import Features, Image, load_dataset, load_dataset_builder, config, concatenate_datasets
# import matplotlib.pyplot as plt
# import os
# import shutil
# import pathlib
# import PIL
# from PIL import Image

import torch
import torchvision
from torchvision.transforms import v2, ToTensor, Lambda
from torchvision.io import read_image
from torch.utils.data import Dataset, DataLoader, Subset

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
# get data (from https://huggingface.co/datasets/YakupAkdin/instrument-images/tree/main)
dataset = load_dataset("YakupAkdin/instrument-images", split='train').with_format("torch")

Resolving data files:   0%|          | 0/1000 [00:00<?, ?it/s]

In [4]:
dataset_RGB = dataset.filter(lambda x: x['image'].dim() == 3)
len(dataset_RGB)

989

In [5]:
# the RGB images are [H,W,3] and PyTorch needs them as [3,H,W]
# including [:3, :, :] to adjust any RGBA images to RGB
print(dataset_RGB[0]['image'].shape)

# permute the tensors
def permute_images(batch):
    # Apply the permute operation to each tensor in the batch
    batch['image'] = [img.permute(2, 0, 1)[:3, :, :] for img in batch['image']]
    return batch

dataset_RGB = dataset_RGB.map(permute_images, batched=True, batch_size=100)

dataset_RGB[0]['image'].shape

torch.Size([720, 1280, 3])


torch.Size([3, 720, 1280])

In [6]:
dataset_greyscale = dataset.filter(lambda x: x['image'].dim() == 2)
len(dataset_greyscale)

11

In [7]:
# some of the images are greyscale, so we convert to RGB as well
print(dataset_greyscale[0]['image'].shape)

def grayscale_to_rgb(item):
    item['image'] = item['image'].unsqueeze(0).repeat(3, 1, 1)
    return item

dataset_greyscale = dataset_greyscale.map(grayscale_to_rgb)
print(dataset_greyscale[0]['image'].shape)

torch.Size([849, 900])
torch.Size([3, 849, 900])


In [8]:
# concatenate both reformatted RGB datasets
dataset = concatenate_datasets([dataset_RGB, dataset_greyscale])

In [9]:
# confirming 0-255 scaling of image tensors
sample_image = dataset[0]['image']

print("Sample values:", sample_image.flatten()[:10])
print("Max value:", sample_image.max().item())
print("Min value:", sample_image.min().item())

Sample values: tensor([40, 39, 41, 41, 41, 44, 48, 53, 54, 52])
Max value: 255
Min value: 0


In [10]:
# rescale tensors from 0-255 to 0-1
class ScaleTensor(object):
    def __call__(self, tensor):
        return tensor / 255.

# compose transformation function -> this is a more standard transform (according to pytorch documentation) to use later. Fitting my transform to the CIFAR tutorial model for now
# basic_transform = v2.Compose([
#     ScaleTensor(),
#     v2.RandomResizedCrop(size=(224, 224), antialias=True),
#     v2.RandomHorizontalFlip(p=0.5),
#     v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
# ])

basic_transform = v2.Compose([
    ScaleTensor(),
    v2.Resize(32),
    v2.CenterCrop(32),
    v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# application function for transforms
def apply_transform(example):
    # Apply the transformation to the 'image' field
    example['image'] = [basic_transform(img) for img in example['image']]
    return example

# apply transforms
dataset = dataset.map(apply_transform, batched=True, batch_size=100)

In [11]:
# checking the first image tensor
sample_image = dataset[0]['image']
print("Sample values:", sample_image.flatten()[:10])
print("Max value:", sample_image.max().item())
print("Min value:", sample_image.min().item())

Sample values: tensor([-0.7207, -0.7665, -0.5314, -0.7465, -0.6850, -0.7010, -0.4317, -0.4633,
        -0.1801, -0.5389])
Max value: 0.8591049909591675
Min value: -0.8577016592025757


In [12]:
# split test/train
dataset = dataset.train_test_split(test_size=0.2, seed=1, stratify_by_column="label")

### Resources

https://huggingface.co/docs/datasets/use_with_pytorch
https://huggingface.co/docs/datasets/v2.14.5/en/image_load

**Process images:**
- image processing: https://huggingface.co/docs/datasets/image_process#map
- transforms methods: https://pytorch.org/vision/stable/transforms.html#transforms
- HF general processing: https://huggingface.co/docs/datasets/process
- transforms v2 reference: https://pytorch.org/vision/stable/auto_examples/transforms/plot_transforms_getting_started.html#sphx-glr-auto-examples-transforms-plot-transforms-getting-started-py

**Test/Train split**
- use this: https://huggingface.co/docs/datasets/v2.15.0/en/package_reference/main_classes#datasets.Dataset.train_test_split

**Follow rest of tutorial**
- https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html

**Building a Model Basics**
- https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html

### Dataloader Setup

In [35]:
class HF_to_PyTorch(Dataset):
    def __init__(self, huggingface_dataset):
        self.huggingface_dataset = huggingface_dataset

    def __len__(self):
        return len(self.huggingface_dataset)

    def __getitem__(self, idx):
        # Assuming each item is a dictionary with 'image' and 'label' keys
        item = self.huggingface_dataset[idx]
        image = item['image']
        label = item['label']
        return image, label

train_dataset = HF_to_PyTorch(dataset['train'])

In [36]:
trainloader = DataLoader(train_dataset, batch_size=80, shuffle=True, num_workers=2)

In [27]:
# trainloader = DataLoader(dataset['train'], batch_size=64, shuffle=True)
# testloader = DataLoader(dataset['test'], batch_size=4, shuffle=True)

In [37]:
# start with basic model from the tutorial

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

net = Net()

In [38]:
# define loss function: Classification Cross-Entropy loss and SGD with momentum.

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [39]:
for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        # if i % 2000 == 1999:    # print every 2000 mini-batches
        print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
        running_loss = 0.0

print('Finished Training')

[1,     1] loss: 0.001
[1,     2] loss: 0.001
[1,     3] loss: 0.001
[1,     4] loss: 0.001
[1,     5] loss: 0.001
[1,     6] loss: 0.001
[1,     7] loss: 0.001
[1,     8] loss: 0.001
[1,     9] loss: 0.001
[1,    10] loss: 0.001
[2,     1] loss: 0.001
[2,     2] loss: 0.001
[2,     3] loss: 0.001
[2,     4] loss: 0.001
[2,     5] loss: 0.001
[2,     6] loss: 0.001
[2,     7] loss: 0.001
[2,     8] loss: 0.001
[2,     9] loss: 0.001
[2,    10] loss: 0.001
Finished Training
