In [1]:
import pandas as pd
import os
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

import torch
import torch.nn as nn
import torch.optim as optim
import wandb

import matplotlib.pyplot as plt
import cv2
import numpy as np 

import torch.nn.init as init
import torch.nn.functional as F

In [2]:
device = 'cuda'
# Load the CSVs
train_data = pd.read_csv("train_data2.csv")
test_data = pd.read_csv("test_data2.csv")
val_data = pd.read_csv("val_data2.csv")

In [3]:
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torch

class CustomDataset(Dataset):
    def __init__(self, data_frame, transform=None):
        self.data_frame = data_frame
        self.transform = transform
        self.voc = '0123456789abcdefghijklmnopqrstuvwxyz'
        self.num_classes = len(self.voc) + 1  # +1 para el carácter en blanco de CTC

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, idx):
        img_name = self.data_frame.iloc[idx, 0]
        try:
            image = Image.open(img_name).convert('RGB')
        except FileNotFoundError:
            print(f"File not found: {img_name}")
            return self.__getitem__((idx + 1) % len(self))  # Intentar con la siguiente imagen
        
        text = self.data_frame.iloc[idx, 1].lower()
        label = self.text_to_label(text)
        
        if self.transform:
            image = self.transform(image)

        return image, label, len(label)

    def text_to_label(self, text):
        return [self.voc.find(char) for char in text]

# Definir transformaciones
transform = transforms.Compose([
    transforms.Resize((32, 128)),  # Cambiar tamaño de la imagen
    transforms.Grayscale(num_output_channels=1),  # Convertir a escala de grises
    transforms.ToTensor(),  # Convertir a tensor
    transforms.Normalize((0.5,), (0.5,))  # Normalizar con media y desviación estándar 0.5
])


# Crear dataset y dataloader
train_dataset = CustomDataset(train_data, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, collate_fn=lambda x: x)

val_dataset = CustomDataset(val_data, transform=transform)
val_loader = DataLoader(val_dataset, batch_size=512, shuffle=True, collate_fn=lambda x: x)

test_dataset = CustomDataset(test_data, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=True, collate_fn=lambda x: x)


In [4]:
def custom_collate_fn(batch):
    images, labels, label_lengths = zip(*batch)

    images = torch.stack(images, 0)
    labels = [torch.tensor(label) for label in labels]
    label_lengths = torch.tensor(label_lengths)

    return images, labels, label_lengths


In [5]:
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=512, shuffle=True, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=True, collate_fn=custom_collate_fn)

In [6]:
class CRNN(nn.Module):
    def __init__(self, num_classes):
        super(CRNN, self).__init__()
        self.cnn = nn.Sequential(
            # Entrada: [batch_size, 1, 32, 128]
            nn.Conv2d(1, 32, kernel_size=3, padding=1),  # Salida: [batch_size, 32, 32, 128]
            nn.SELU(),
            nn.MaxPool2d(2, 2),  # Salida: [batch_size, 32, 16, 64]
            
            nn.Conv2d(32, 64, kernel_size=3, padding=1),  # Salida: [batch_size, 64, 16, 64]
            nn.SELU(),
            nn.MaxPool2d(2, 2),  # Salida: [batch_size, 64, 8, 32]
            
            nn.Conv2d(64, 128, kernel_size=3, padding=1),  # Salida: [batch_size, 128, 8, 32]
            nn.SELU(),
            nn.MaxPool2d((2, 1)),  # Salida: [batch_size, 128, 4, 32]

            nn.Conv2d(128, 256, kernel_size=3, padding=1),  # Salida: [batch_size, 256, 4, 32]
            nn.SELU(),
            nn.BatchNorm2d(256),
            nn.MaxPool2d((2, 1)),  # Salida: [batch_size, 256, 2, 32]

            nn.Conv2d(256, 512, kernel_size=3, padding=1),  # Salida: [batch_size, 512, 2, 32]
            nn.SELU(),
            nn.BatchNorm2d(512),
            nn.MaxPool2d((2, 1)),  # Salida: [batch_size, 512, 1, 32]
        )
        
        self.rnn = nn.LSTM(512, 128, bidirectional=True, batch_first=True, num_layers=2)  # Entrada: [batch_size, 31, 512], Salida: [batch_size, 31, 256]
        self.fc = nn.Linear(256, num_classes)  # Entrada: [batch_size, 31, 256], Salida: [batch_size, 31, num_classes]
        
        self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.kaiming_normal_(m.weight, nonlinearity='selu')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.kaiming_normal_(m.weight, nonlinearity='selu')
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.LSTM):
                for name, param in m.named_parameters():
                    if 'weight' in name:
                        init.kaiming_normal_(param, nonlinearity='selu')
                    elif 'bias' in name:
                        init.constant_(param, 0)

    def forward(self, x):
        # x inicial: [batch_size, 1, 32, 128]
        x = self.cnn(x)
        # Salida: [batch_size, 512, 1, 31]
        x = x.squeeze(2)  # Remover la dimensión 2 que es 1, resultado: [batch_size, 512, 31]
        x = x.permute(0, 2, 1)  # Reorganizar a [batch_size, 31, 512]
        
        x, _ = self.rnn(x)
        # Salida esperada: [batch_size, 31, 256]
        
        x = self.fc(x)
        # Salida esperada: [batch_size, 31, num_classes]
        
        return x 

In [17]:
import torch
import torch.nn.functional as F

def ctc_decode(log_probs, voc, blank_index=0):
    """
    Decodificar las probabilidades de salida de la RNN para obtener la secuencia de texto.

    log_probs: Salida logarítmica del modelo RNN después de aplicar softmax (T, N, C)
    voc: El vocabulario usado para el mapeo de caracteres
    blank_index: Índice usado para el carácter en blanco en CTC
    """
    # Obtener los índices con la mayor probabilidad en cada timestep
    max_probs = torch.argmax(log_probs, dim=-1)
    
    decoded_batch = []
    for sequence in max_probs.permute(1, 0):  # Cambiar a (N, T)
        decoded_sequence = []
        previous_char = None
        for index in sequence:
            if index != blank_index:  # Ignorar el carácter en blanco
                char = voc[index] if index < len(voc) else ''
                if char != previous_char:  # Eliminar duplicados consecutivos
                    decoded_sequence.append(char)
                previous_char = char
        decoded_sequence = ''.join(decoded_sequence)  # Unir los caracteres
        decoded_batch.append(decoded_sequence)
    return decoded_batch

# Asegúrate de que la configuración del vocabulario sea correcta
train_dataset.voc = '0123456789abcdefghijklmnopqrstuvwxyz'

# Definir el número de clases incluyendo el índice en blanco
num_classes = len(train_dataset.voc) + 1
model = CRNN(num_classes).to(device)
model.load_state_dict(torch.load("model_epoch_24.pth"))

model.eval()
pred_total = []
true_total = []

with torch.no_grad():
    for images, labels, label_lengths in val_loader:
        images = images.to(device)
        labels = [label.to(device) for label in labels]
        label_lengths = label_lengths.to(device)

        logits = model(images)
        
        
        log_probs = F.log_softmax(logits, dim=2).permute(1, 0, 2)
        input_lengths = torch.full((logits.size(0),), logits.size(1), dtype=torch.long).to(device)
        labels_concat = torch.cat(labels).to(device)
        
        pred_texts = ctc_decode(log_probs, train_dataset.voc)
        true_texts = ["".join([train_dataset.voc[char] for char in label]) for label in labels]
        pred_total.extend(pred_texts)
        true_total.extend(true_texts)

In [18]:
import Levenshtein as lev
from sklearn.metrics import accuracy_score


# Calcular accuracy por palabras
word_acc = accuracy_score(true_total, pred_total)

# Calcular accuracy por caracteres
correct_chars = sum([1 for gt_word, pred_word in zip(true_total, pred_total) for gt_char, pred_char in zip(gt_word, pred_word) if gt_char == pred_char])
total_chars = sum([len(gt_word) for gt_word in true_total])
char_acc = correct_chars / total_chars

# Calcular distancia de Levenshtein promedio
lev_distances = [lev.distance(gt_word, pred_word) for gt_word, pred_word in zip(true_total, pred_total)]
avg_lev_distance = sum(lev_distances) / len(lev_distances)

# Imprimir resultados
print(f'Validation Word Accuracy: {word_acc:.2f}')
print(f'Validation Character Accuracy: {char_acc:.2f}')
print(f'Validation Average Levenshtein Distance: {avg_lev_distance:.2f}')


Validation Word Accuracy: 0.54
Validation Character Accuracy: 0.76
Validation Average Levenshtein Distance: 0.94


### Interpretation of Metrics

* Word Accuracy

This metric represents the proportion of predicted words that exactly match the ground truth words. A higher value indicates that more words are predicted correctly.

* Character Accuracy

This metric measures the percentage of characters in the predicted words that match the characters in the ground truth words. It shows how accurate the model is at the character level, even if the entire word is not correct.

* Average Levenshtein Distance

This metric indicates the average number of edits (insertions, deletions, or substitutions) needed to transform a predicted word into the corresponding ground truth word. A lower value means the predicted words are more similar to the ground truth words.

In [19]:
import torch
import torch.nn.functional as F

# Asegúrate de que la configuración del vocabulario sea correcta
train_dataset.voc = '0123456789abcdefghijklmnopqrstuvwxyz'

# Definir el número de clases incluyendo el índice en blanco

model.eval()
pred_total = []
true_total = []

with torch.no_grad():
    for images, labels, label_lengths in test_loader:
        images = images.to(device)
        labels = [label.to(device) for label in labels]
        label_lengths = label_lengths.to(device)

        logits = model(images)
        
        
        log_probs = F.log_softmax(logits, dim=2).permute(1, 0, 2)
        input_lengths = torch.full((logits.size(0),), logits.size(1), dtype=torch.long).to(device)
        labels_concat = torch.cat(labels).to(device)
        
        pred_texts = ctc_decode(log_probs, train_dataset.voc)
        true_texts = ["".join([train_dataset.voc[char] for char in label]) for label in labels]
        pred_total.extend(pred_texts)
        true_total.extend(true_texts)

In [20]:
# Calcular accuracy por palabras
word_acc = accuracy_score(true_total, pred_total)

# Calcular accuracy por caracteres
correct_chars = sum([1 for gt_word, pred_word in zip(true_total, pred_total) for gt_char, pred_char in zip(gt_word, pred_word) if gt_char == pred_char])
total_chars = sum([len(gt_word) for gt_word in true_total])
char_acc = correct_chars / total_chars

# Calcular distancia de Levenshtein promedio
lev_distances = [lev.distance(gt_word, pred_word) for gt_word, pred_word in zip(true_total, pred_total)]
avg_lev_distance = sum(lev_distances) / len(lev_distances)

# Imprimir resultados
print(f'Test Word Accuracy: {word_acc:.2f}')
print(f'Test Character Accuracy: {char_acc:.2f}')
print(f'Test Average Levenshtein Distance: {avg_lev_distance:.2f}')

Test Word Accuracy: 0.54
Test Character Accuracy: 0.76
Test Average Levenshtein Distance: 0.97
