In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import EMNIST
import timm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torchvision import transforms

In [2]:
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("crawford/emnist")

# print("Path to dataset files:", path)

In [3]:
path = 'C:\\Users\\scanu\\.cache\\kagglehub\\datasets\\crawford\\emnist\\versions\\3'

dataset_file_train = "/emnist-letters-train.csv"
dataset_file_test = "/emnist-letters-test.csv"

train = pd.read_csv(path + dataset_file_train, delimiter=',')
test = pd.read_csv(path + dataset_file_test, delimiter=',')

In [4]:
y_train = np.array(train.iloc[:,0].values)
x_train = np.array(train.iloc[:,1:].values)

y_test = np.array(test.iloc[:,0].values)
x_test = np.array(test.iloc[:,1:].values)

In [5]:
n_data = len(x_train)
height = 28
resizer = transforms.Resize((224,224),interpolation=transforms.InterpolationMode.BICUBIC)

print(x_train.reshape(n_data,height,height).shape)
rgb_batch = np.repeat(x_train.reshape(n_data,height,height), 3, axis = 0).reshape(n_data,3,height,height)[0:256]
print(rgb_batch.shape)

train_dataloader = DataLoader( [[rgb_batch[i], y_train[i]] for i in range(len(rgb_batch))], batch_size=32, shuffle=False)
# test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

(88799, 28, 28)
(256, 3, 28, 28)


In [6]:
# Modello Transformer
class TextRecognitionModel(nn.Module):
    def __init__(self, num_classes=27):  # 26 lettere + background
        super(TextRecognitionModel, self).__init__()
        self.backbone = timm.create_model("vit_base_patch16_224", pretrained=True)
        self.backbone.head = nn.Identity()  # Rimuoviamo la testa di classificazione
        
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=768, nhead=2), num_layers=1
        )
        self.fc = nn.Linear(768, num_classes)
        
    def forward(self, x):
        features = self.backbone(x)
        features = features.unsqueeze(0)  # Adatta la dimensione per il decoder
        decoded = self.decoder(features, features)
        output = self.fc(decoded.squeeze(0))
        return output

In [7]:
# Istanziamento del modello
model = TextRecognitionModel(num_classes=27)

# Definizione della funzione di perdita e ottimizzatore
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Addestramento del modello
epochs = 5
for epoch in range(epochs):
    for images, labels in train_dataloader:
        images = resizer(images).float()
#         images = images.unsqueeze(0)
        optimizer.zero_grad()
        outputs = model(images)
        _, pred = torch.max(outputs,1)
        acc = (pred == labels).sum()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        print(f"This Batch, Loss: {loss.item():.4f}")
        print(f"This Batch, Acc: {loss.item():.4f}")
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

print("Addestramento completato!")

This Batch, Loss: 3.5333
This Batch, Loss: 8.5697
This Batch, Loss: 5.4542
This Batch, Loss: 8.4695
This Batch, Loss: 8.9610
This Batch, Loss: 8.4569
This Batch, Loss: 6.4070
This Batch, Loss: 7.5368
Epoch [1/5], Loss: 7.5368
This Batch, Loss: 10.5832
This Batch, Loss: 8.6373
This Batch, Loss: 11.1935
This Batch, Loss: 9.3550
This Batch, Loss: 9.9524
This Batch, Loss: 6.7562
This Batch, Loss: 4.7860
This Batch, Loss: 6.3031
Epoch [2/5], Loss: 6.3031
This Batch, Loss: 5.9480
This Batch, Loss: 4.8725
This Batch, Loss: 6.4826
This Batch, Loss: 4.9270
This Batch, Loss: 3.9919
This Batch, Loss: 5.4005
This Batch, Loss: 5.0326
This Batch, Loss: 5.3961
Epoch [3/5], Loss: 5.3961
This Batch, Loss: 4.4977
This Batch, Loss: 4.4513
This Batch, Loss: 4.9279
This Batch, Loss: 4.3687
This Batch, Loss: 4.5385
This Batch, Loss: 4.3100
This Batch, Loss: 4.0384
This Batch, Loss: 3.8962
Epoch [4/5], Loss: 3.8962
This Batch, Loss: 4.1503
This Batch, Loss: 3.8197
This Batch, Loss: 3.5459
This Batch, Loss: 3

In [8]:
a = model(images)

In [9]:
_, pred = torch.max(a,1)

In [10]:
for i in range(len(pred)):
    print(pred[i], labels[i])

tensor(3) tensor(9)
tensor(9) tensor(8)
tensor(9) tensor(12)
tensor(9) tensor(8)
tensor(3) tensor(17)
tensor(9) tensor(19)
tensor(9) tensor(3)
tensor(9) tensor(19)
tensor(9) tensor(4)
tensor(9) tensor(21)
tensor(9) tensor(5)
tensor(10) tensor(9)
tensor(9) tensor(23)
tensor(9) tensor(10)
tensor(9) tensor(22)
tensor(10) tensor(17)
tensor(3) tensor(19)
tensor(10) tensor(18)
tensor(9) tensor(1)
tensor(9) tensor(20)
tensor(10) tensor(4)
tensor(9) tensor(6)
tensor(3) tensor(4)
tensor(3) tensor(22)
tensor(9) tensor(15)
tensor(9) tensor(14)
tensor(10) tensor(14)
tensor(10) tensor(14)
tensor(9) tensor(25)
tensor(9) tensor(7)
tensor(9) tensor(19)
tensor(9) tensor(17)


## Attempt with Cuda

In [None]:
availability = torch.cuda.is_available()
device = torch.device("cuda" if availability else "cpu")
print("Device:", device)

In [None]:
# Istanziamento del modello
model = TextRecognitionModel(num_classes=27).to(device)

In [None]:
# Definizione della funzione di perdita e ottimizzatore
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Addestramento del modello
epochs = 1
for epoch in range(epochs):
    for images, labels in train_dataloader:
        images = resizer(images).float().to(device)
        labels = labels.to(device)
#         images = images.unsqueeze(0)
        optimizer.zero_grad()
        outputs = model(images).to(device)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        print(f"This Batch, Loss: {loss.item():.4f}")
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}", end = "\n")

print("Addestramento completato!")