# Task 2 and 3 - Generation and Bonus
- This is the CRNN Model for the PreCog Task

## Dependencies

imports everything

In [150]:
import os
from PIL import Image, ImageDraw, ImageFont
import random
from random_word import RandomWords
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms

In [151]:
# 1000 random words for dataset, changed for this to make diverse words for character predictions from 100 in task 1 to 1000 in task 2
# will also reduce the number of images per word from 100 to 10, to prevent overfitting (checked by running the code for both the values, loss was higher for the 1000 word model, indicating less of an overfit)

def generate_random_words(num_words):
    r = RandomWords()
    words = []
    while len(words) < num_words:
        word = r.get_random_word()
        if word and 5 <= len(word) <= 9:
            words.append(word)

    return words

words = generate_random_words(1000)
print(words)

['bumped', 'encinder', 'kuvaszok', 'butyrin', 'tabour', 'smirtle', 'discount', 'cosset', 'woolfells', 'exorcises', 'handsale', 'sighs', 'bookward', 'underhum', 'belesprit', 'filarious', 'curbline', 'hyporight', 'confed', 'shifts', 'tictoc', 'gnapweed', 'unplow', 'coercive', 'portsale', 'pogromist', 'deathworm', 'selenates', 'tuskar', 'button', 'surmising', 'stampeder', 'rancors', 'subungual', 'sixfolds', 'astre', 'meropic', 'senocular', 'runfish', 'vulpinism', 'unfusible', 'eyeserver', 'tamandu', 'cobus', 'upsweeps', 'praus', 'lester', 'ataractic', 'bowmaking', 'caraho', 'emissory', 'wambais', 'reaccuses', 'disna', 'atreptic', 'baggers', 'probits', 'shoved', 'precludes', 'gimmicks', 'boleros', 'gnatty', 'navarch', 'hamital', 'florida', 'sacatons', 'pieforts', 'tecali', 'uncrated', 'cirrosely', 'reddish', 'blowbys', 'mapach', 'remicate', 'typhosis', 'galravage', 'pitmen', 'tokays', 'astound', 'practicum', 'busks', 'mesotroch', 'baloneys', 'rashing', 'bugeyed', 'leegte', 'gutser', 'roset

In [None]:
# same as task 0

def add_noise(draw, width, height):
    for _ in range(random.randint(4000, 5000)):
        x, y = random.randint(0, width - 1), random.randint(0, height - 1)
        color = tuple(random.randint(0, 255) for _ in range(3))
        draw.point((x, y), fill=color)

FONTS = [
    "Fonts/akira.ttf",
    "Fonts/dejavu-sans-bold.ttf",
    "Fonts/Roboto-Italic.ttf",
    "Fonts/Roboto.ttf",
    "Fonts/Harabara.ttf",
    "Fonts/Designer.ttf",
    "Fonts/OpenSans-Italic.ttf",
    "Fonts/OpenSans-Regular.ttf"
]

def generate_hard_set():
    for i in range(0,10):
        for word in words:
            width, height = 248, 80
            # generate image with white only
            img = Image.new("RGB", (width, height), color="white")
            draw = ImageDraw.Draw(img)
            
            # generate fonts with random fonts and colors
            font = ImageFont.truetype(random.choice(FONTS), size = 30)
            text_color = tuple(random.randint(0, 255) for _ in range(3))
            
            # generate fonts with random capitalization
            word_variation = ''.join(random.choice([c.upper(), c.lower()]) for c in word)
            text_bbox = font.getbbox(word)
            text_width = text_bbox[2] - text_bbox[0]
            text_height = text_bbox[3] - text_bbox[1]

            position = ((width - text_width) // 2, (height - text_height) // 2)

            draw.text(position, word_variation, fill=text_color, font=font)

            # Add noise
            add_noise(draw, width, height)

            img.save(os.path.join("content/dataset/hard", f"{word}_{i}.png"))

generate_hard_set()

def generate_bonus_set():
    for i in range(0, 10):
        for word in words:
            width, height = 248, 80
            bg_color = random.randint(0, 1)
            if bg_color == 0 :
                bg = "green"
            else :
                bg = "red"

            img = Image.new("RGB", (width, height), color=bg)
            draw = ImageDraw.Draw(img)

            font = ImageFont.truetype(random.choice(FONTS), size=30)
            text_color = "black"

            word_variation = ''.join(random.choice([c.upper(), c.lower()]) for c in word)
            text_bbox = font.getbbox(word)
            text_width = text_bbox[2] - text_bbox[0]
            text_height = text_bbox[3] - text_bbox[1]

            position = ((width - text_width) // 2, (height - text_height) // 2)

            if bg_color == 1:
                final_word = word_variation [::-1]
            else :
                final_word = word_variation

            draw.text(position, final_word, fill=text_color, font=font)

            # Add noise
            add_noise(draw, width, height)

            img.save(os.path.join("content/dataset/bonus", f"{word}_{i}.png"))

generate_bonus_set()
            

In [152]:
# generating a single hard image for testing

def generate_hard_image(word):  
    width, height = 248, 80
    # generate image with white only
    img = Image.new("RGB", (width, height), color="white")
    draw = ImageDraw.Draw(img)
    
    # generate fonts with random fonts and colors
    font = ImageFont.truetype(random.choice(FONTS), size = 30)
    text_color = tuple(random.randint(0, 255) for _ in range(3))
    
    # generate fonts with random capitalization
    word_variation = ''.join(random.choice([c.upper(), c.lower()]) for c in word)
    text_bbox = font.getbbox(word)
    text_width = text_bbox[2] - text_bbox[0]
    text_height = text_bbox[3] - text_bbox[1]

    position = ((width - text_width) // 2, (height - text_height) // 2)

    draw.text(position, word_variation, fill=text_color, font=font)

    # Add noise
    add_noise(draw, width, height)

    img.save(f"{word}.png")

generate_hard_image(word = "arihant")

In [153]:
# generating a single bonus word for testing

def generate_bonus_image(word):  
    width, height = 248, 80
    bg_color = random.randint(0, 1)
    if bg_color == 0 :
        bg = "green"
    else :
        bg = "red"

    img = Image.new("RGB", (width, height), color=bg)
    draw = ImageDraw.Draw(img)

    font = ImageFont.truetype(random.choice(FONTS), size=30)
    text_color = "black"

    word_variation = ''.join(random.choice([c.upper(), c.lower()]) for c in word)
    text_bbox = font.getbbox(word)
    text_width = text_bbox[2] - text_bbox[0]
    text_height = text_bbox[3] - text_bbox[1]

    position = ((width - text_width) // 2, (height - text_height) // 2)

    if bg_color == 1:
        final_word = word_variation [::-1]
    else :
        final_word = word_variation

    draw.text(position, final_word, fill=text_color, font=font)

    add_noise(draw, width, height)

    img.save(f"{word}.png")

generate_bonus_image(word = "arihant")

In [154]:
# generating a single easy word for testing

def generate_image(word):  
    font_path = "Fonts/OpenSans-Regular.ttf"  
    width, height = 248, 80
    color = 255
    image = Image.new("L", (width, height), color)
    draw = ImageDraw.Draw(image)
    try:
        font = ImageFont.truetype(font_path, 36)
    except OSError as e:
        print(f"Error loading font: {e}")
        return
    text = word.title()
    text_bbox = font.getbbox(word)
    text_width = text_bbox[2] - text_bbox[0]
    text_height = text_bbox[3] - text_bbox[1]
    x = random.randint((width - text_width)//8, 7*((width - text_width)//8))
    y = random.randint((height - text_height)//8, 7*((height - text_height)//8))
    position = (x, y)
    text_color = 0
    draw.text(position, text, font=font, fill=text_color)
    image.save(f"{word}.png")

generate_image(word = "arihant")

## Dataset and Collate function

In [155]:
def collate_fn(batch):
    images, labels, lengths = zip(*batch)
    
    images = torch.stack(images)

    # Find max label length in batch for padding
    max_len = max(lengths)
    padded_labels = torch.full((len(labels), max_len), fill_value=0, dtype=torch.long)

    for i, label in enumerate(labels):
        padded_labels[i, :len(label)] = label

    lengths = torch.tensor(lengths, dtype=torch.long)

    return images, padded_labels, lengths

class WordImageDataset(Dataset):
    def __init__(self, image_dir, transform=None):
        self.image_dir = image_dir
        self.transform = transform
        self.images = []
        self.labels = []
        self.label_to_idx = {'<blank>': 0}
        self.idx_to_label = {0: '<blank>'}
        
        chars = set()
        for filename in os.listdir(image_dir):
            if filename.endswith('.png'):
                word = filename.split('_')[0]
                chars.update(word)
        
        for idx, char in enumerate(sorted(chars), start=1):
            self.label_to_idx[char] = idx
            self.idx_to_label[idx] = char
        
        for filename in os.listdir(image_dir):
            if filename.endswith('.png'):
                word = filename.split('_')[0]
                char_indices = [self.label_to_idx[c] for c in word]
                self.images.append(os.path.join(image_dir, filename))
                self.labels.append(torch.tensor(char_indices, dtype=torch.long))

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image_path = self.images[idx]
        image = Image.open(image_path).convert('L')
        
        if self.transform:
            image = self.transform(image)
            
        label = self.labels[idx]
        length = len(label)
        
        return image, label, length

## Architecture for CRNN Model

In [156]:
class CRNN(nn.Module):
    def __init__(self, num_classes, hidden_size=256):
        super(CRNN, self).__init__()
        
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d((2, 1)),
            
            nn.Conv2d(256, hidden_size, kernel_size=3, padding=1),
            nn.BatchNorm2d(hidden_size),
            nn.ReLU()
        )
        
        self.rnn = nn.LSTM(hidden_size, hidden_size, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        conv = self.cnn(x)
        # Reshape for RNN
        b, c, h, w = conv.size()
        conv = conv.view(b, c, -1)  # Merging h for 3D
        conv = conv.permute(0, 2, 1)

        # RNN and final prediction
        rnn, _ = self.rnn(conv)
        output = self.fc(rnn)

        return output

## Training

In [157]:
def train_ocr_model(model, train_loader, criterion, optimizer, num_epochs=20, device='cuda'):
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        for batch_idx, (images, targets, target_lengths) in enumerate(train_loader):
            images = images.to(device)
            targets = targets.to(device)
            target_lengths = target_lengths.to(device)
            
            optimizer.zero_grad()
            outputs = model(images)

            batch_size = outputs.size(0)
            input_lengths = torch.full((batch_size,), outputs.size(1), dtype=torch.long).to(device)

            # Computing CTC loss
            loss = criterion(outputs.log_softmax(2).permute(1, 0, 2), 
                             targets, input_lengths, target_lengths)
            
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            
            if batch_idx % 10 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx}/{len(train_loader)}], Loss: {loss.item():.4f}')
        
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch [{epoch+1}/{num_epochs}], Average Loss: {avg_loss:.4f}')

    # Saving model to use it
    torch.save(model.state_dict(), "crnn_model.pth")
    print("Model saved successfully.")

## Decoder for Prediction

In [158]:
def decode_predictions(outputs, idx_to_label):
    _, predictions = outputs.max(2)
    predictions = predictions.cpu().numpy()
    
    decoded_words = []
    for pred in predictions:
        word = []
        prev_idx = -1
        
        for idx in pred:
            if idx != 0 and idx != prev_idx: 
# Improved accuracy when i ran it multiple times by removing spaces and all (didnt formally test it but found good enough proof over a bunch of examples)
                word.append(idx_to_label[idx])
            prev_idx = idx
            
        decoded_words.append(''.join(word))
    
    return decoded_words

## Testing function

In [159]:
def preprocess_image(image_path):
    transform = transforms.Compose([
        transforms.Resize((32, 128)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5])
    ])
    
    image = Image.open(image_path).convert('L') # can change for RGB but for my tests it was actually better when i forcefully converted to greyscale
    image = transform(image).unsqueeze(0)
    
    return image

def predict_image(model, image_path, idx_to_label, device='cuda'):
    model.to(device)
    model.eval()
    
    image = preprocess_image(image_path).to(device)

    with torch.no_grad():
        output = model(image)

    predicted_text = decode_predictions(output, idx_to_label)
    
    return predicted_text[0]

## Main code

In [160]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

transform = transforms.Compose([
    transforms.Resize((32, 128)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

dataset = WordImageDataset('content/dataset/easy', transform=transform)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

model = CRNN(num_classes=len(dataset.label_to_idx))
criterion = nn.CTCLoss(zero_infinity=True)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# train
train_ocr_model(model, train_loader, criterion, optimizer, device=device)

# load for test
model.load_state_dict(torch.load("crnn_model.pth"))


Epoch [1/20], Batch [0/623], Loss: 48.7920
Epoch [1/20], Batch [10/623], Loss: 5.3036
Epoch [1/20], Batch [20/623], Loss: 3.2787
Epoch [1/20], Batch [30/623], Loss: 3.2456
Epoch [1/20], Batch [40/623], Loss: 3.2499
Epoch [1/20], Batch [50/623], Loss: 3.2310
Epoch [1/20], Batch [60/623], Loss: 3.2166
Epoch [1/20], Batch [70/623], Loss: 3.1425
Epoch [1/20], Batch [80/623], Loss: 3.1011
Epoch [1/20], Batch [90/623], Loss: 3.1990
Epoch [1/20], Batch [100/623], Loss: 3.1897
Epoch [1/20], Batch [110/623], Loss: 3.2132
Epoch [1/20], Batch [120/623], Loss: 3.1245
Epoch [1/20], Batch [130/623], Loss: 3.1723
Epoch [1/20], Batch [140/623], Loss: 3.0769
Epoch [1/20], Batch [150/623], Loss: 3.0377
Epoch [1/20], Batch [160/623], Loss: 3.0797
Epoch [1/20], Batch [170/623], Loss: 3.1376
Epoch [1/20], Batch [180/623], Loss: 3.0886
Epoch [1/20], Batch [190/623], Loss: 3.0754
Epoch [1/20], Batch [200/623], Loss: 3.0622
Epoch [1/20], Batch [210/623], Loss: 3.1038
Epoch [1/20], Batch [220/623], Loss: 3.090

<All keys matched successfully>

## Testing for easy dataset

In [161]:
generate_image(word="testing")
test_image_path= "testing.png"
predicted_text = predict_image(model, test_image_path, dataset.idx_to_label, device)
print("Predicted Text:", predicted_text)

Predicted Text: testing


In [162]:
import shutil

if os.path.exists("dataset/easy"):
    shutil.rmtree("dataset/easy")

if not os.path.exists("dataset"):
    os.makedirs("dataset")
if not os.path.exists("dataset/easy"):
    os.makedirs("dataset/easy")

def generate_random_images(words, image_path="dataset/easy"):
    # Generate and save an image for each word
    for word in words:
        # Assuming generate_image is a function that generates an image for the word
        generate_image(word)
        
        # Use the word as the filename (ensure it is not too long)
        image_file_path = os.path.join(image_path, f"{word}.png")
        
        # Save the image
        Image.open(f"{word}.png").save(image_file_path)

def test_model_accuracy(model, words, device, image_path="images/"):
    correct_predictions = 0

    # Generate random images for each word
    generate_random_images(words, image_path)

    for word in words:
        # Use the model to predict the word from the image
        test_image_path = os.path.join(image_path, f"{word}.png")
        predicted_text = predict_image(model, test_image_path, dataset.idx_to_label, device)

        # Check if the predicted text matches the word
        if predicted_text.lower() == word.lower():
            correct_predictions += 1

    accuracy = correct_predictions / len(words) * 100
    return accuracy

# Generate random words
words = generate_random_words(1000)

# Test the model accuracy
accuracy = test_model_accuracy(model, words, device)
print(f"Model Accuracy: {accuracy:.2f}%")


Model Accuracy: 99.60%


## Train and Testing for hard dataset

- rewrote the same code here for convenience, added RBG but doesn't matter beacuse of previosuly explained reasons.

In [163]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import os
import numpy as np

def collate_fn(batch):
    images, labels, lengths = zip(*batch)
    images = torch.stack(images)
    max_len = max(lengths)
    padded_labels = torch.full((len(labels), max_len), fill_value=0, dtype=torch.long)
    for i, label in enumerate(labels):
        padded_labels[i, :len(label)] = label
    lengths = torch.tensor(lengths, dtype=torch.long)
    return images, padded_labels, lengths

class WordImageDataset(Dataset):
    def __init__(self, image_dir, transform=None):
        self.image_dir = image_dir
        self.transform = transform
        self.images = []
        self.labels = []
        self.label_to_idx = {'<blank>': 0}
        self.idx_to_label = {0: '<blank>'}
        chars = set()
        for filename in os.listdir(image_dir):
            if filename.endswith('.png'):
                word = filename.split('_')[0]
                chars.update(word)
        for idx, char in enumerate(sorted(chars), start=1):
            self.label_to_idx[char] = idx
            self.idx_to_label[idx] = char
        for filename in os.listdir(image_dir):
            if filename.endswith('.png'):
                word = filename.split('_')[0]
                char_indices = [self.label_to_idx[c] for c in word]
                self.images.append(os.path.join(image_dir, filename))
                self.labels.append(torch.tensor(char_indices, dtype=torch.long))
    def __len__(self):
        return len(self.images)
    def __getitem__(self, idx):
        image_path = self.images[idx]
        image = Image.open(image_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        label = self.labels[idx]
        length = len(label)
        return image, label, length

class CRNN(nn.Module):
    def __init__(self, num_classes, hidden_size=256):
        super(CRNN, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d((2, 1)),
            nn.Conv2d(256, hidden_size, kernel_size=3, padding=1),
            nn.BatchNorm2d(hidden_size),
            nn.ReLU()
        )
        self.rnn = nn.LSTM(hidden_size, hidden_size, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)
    def forward(self, x):
        conv = self.cnn(x)
        b, c, h, w = conv.size()
        conv = conv.view(b, c, -1)
        conv = conv.permute(0, 2, 1)
        rnn, _ = self.rnn(conv)
        output = self.fc(rnn)
        return output

def train_ocr_model(model, train_loader, criterion, optimizer, num_epochs=20, device='cuda'):
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch_idx, (images, targets, target_lengths) in enumerate(train_loader):
            images = images.to(device)
            targets = targets.to(device)
            target_lengths = target_lengths.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            batch_size = outputs.size(0)
            input_lengths = torch.full((batch_size,), outputs.size(1), dtype=torch.long).to(device)
            loss = criterion(outputs.log_softmax(2).permute(1, 0, 2), targets, input_lengths, target_lengths)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            if batch_idx % 10 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx}/{len(train_loader)}], Loss: {loss.item():.4f}')
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch [{epoch+1}/{num_epochs}], Average Loss: {avg_loss:.4f}')
    torch.save(model.state_dict(), "crnn_model.pth")
    print("Model saved successfully.")

def decode_predictions(outputs, idx_to_label):
    _, predictions = outputs.max(2)
    predictions = predictions.cpu().numpy()
    decoded_words = []
    for pred in predictions:
        word = []
        prev_idx = -1
        for idx in pred:
            if idx != 0 and idx != prev_idx:
                word.append(idx_to_label[idx])
            prev_idx = idx
        decoded_words.append(''.join(word))
    return decoded_words

def preprocess_image(image_path):
    transform = transforms.Compose([
        transforms.Resize((32, 128)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0)
    return image

def predict_image(model, image_path, idx_to_label, device='cuda'):
    model.to(device)
    model.eval()
    image = preprocess_image(image_path).to(device)
    with torch.no_grad():
        output = model(image)
    predicted_text = decode_predictions(output, idx_to_label)
    return predicted_text[0]

if __name__ == "__main__":
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    transform = transforms.Compose([
        transforms.Resize((32, 128)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])
    dataset = WordImageDataset('content/dataset/hard', transform=transform)
    train_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
    model = CRNN(num_classes=len(dataset.label_to_idx))
    criterion = nn.CTCLoss(zero_infinity=True)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    train_ocr_model(model, train_loader, criterion, optimizer, device=device)
    model.load_state_dict(torch.load("crnn_model.pth"))

Epoch [1/20], Batch [0/311], Loss: 53.0280
Epoch [1/20], Batch [10/311], Loss: 5.1527
Epoch [1/20], Batch [20/311], Loss: 3.4116
Epoch [1/20], Batch [30/311], Loss: 3.2314
Epoch [1/20], Batch [40/311], Loss: 3.2566
Epoch [1/20], Batch [50/311], Loss: 3.2195
Epoch [1/20], Batch [60/311], Loss: 3.1520
Epoch [1/20], Batch [70/311], Loss: 3.1574
Epoch [1/20], Batch [80/311], Loss: 3.1222
Epoch [1/20], Batch [90/311], Loss: 3.1806
Epoch [1/20], Batch [100/311], Loss: 3.2106
Epoch [1/20], Batch [110/311], Loss: 3.1607
Epoch [1/20], Batch [120/311], Loss: 3.1689
Epoch [1/20], Batch [130/311], Loss: 3.1424
Epoch [1/20], Batch [140/311], Loss: 3.1862
Epoch [1/20], Batch [150/311], Loss: 3.1909
Epoch [1/20], Batch [160/311], Loss: 3.0745
Epoch [1/20], Batch [170/311], Loss: 3.2101
Epoch [1/20], Batch [180/311], Loss: 3.1638
Epoch [1/20], Batch [190/311], Loss: 3.1880
Epoch [1/20], Batch [200/311], Loss: 3.0706
Epoch [1/20], Batch [210/311], Loss: 3.1426
Epoch [1/20], Batch [220/311], Loss: 3.153

In [164]:
## Actual testing for hard
generate_hard_image(word="testing")
test_image_path= "testing.png"
predicted_text = predict_image(model, test_image_path, dataset.idx_to_label, device)
print("Predicted Text:", predicted_text)

Predicted Text: restng


In [165]:
import shutil

if os.path.exists("dataset/hard"):
    shutil.rmtree("dataset/hard")

if not os.path.exists("dataset"):
    os.makedirs("dataset")
if not os.path.exists("dataset/hard"):
    os.makedirs("dataset/hard")

def generate_random_images(words, image_path="dataset/hard"):
    # Generate and save an image for each word
    for word in words:
        # Assuming generate_image is a function that generates an image for the word
        generate_image(word)
        
        # Use the word as the filename (ensure it is not too long)
        image_file_path = os.path.join(image_path, f"{word}.png")
        
        # Save the image
        Image.open(f"{word}.png").save(image_file_path)

def test_model_accuracy(model, words, device, image_path="images/"):
    correct_predictions = 0

    # Generate random images for each word
    generate_random_images(words, image_path)

    for word in words:
        # Use the model to predict the word from the image
        test_image_path = os.path.join(image_path, f"{word}.png")
        predicted_text = predict_image(model, test_image_path, dataset.idx_to_label, device)

        # Check if the predicted text matches the word
        if predicted_text.lower() == word.lower():
            correct_predictions += 1

    accuracy = correct_predictions / len(words) * 100
    return accuracy

# Generate random words
words = generate_random_words(1000)

# Test the model accuracy
accuracy = test_model_accuracy(model, words, device)
print(f"Model Accuracy: {accuracy:.2f}%")

Model Accuracy: 24.40%


## Testing for Bonus dataset

- rewrote the same code here for convenience

In [None]:
def collate_fn(batch):
    images, labels, lengths = zip(*batch)
    images = torch.stack(images)
    max_len = max(lengths)
    padded_labels = torch.full((len(labels), max_len), fill_value=0, dtype=torch.long)
    for i, label in enumerate(labels):
        padded_labels[i, :len(label)] = label
    lengths = torch.tensor(lengths, dtype=torch.long)
    return images, padded_labels, lengths

class WordImageDataset(Dataset):
    def __init__(self, image_dir, transform=None):
        self.image_dir = image_dir
        self.transform = transform
        self.images = []
        self.labels = []
        self.label_to_idx = {'<blank>': 0}
        self.idx_to_label = {0: '<blank>'}
        chars = set()
        for filename in os.listdir(image_dir):
            if filename.endswith('.png'):
                word = filename.split('_')[0]
                chars.update(word)
        for idx, char in enumerate(sorted(chars), start=1):
            self.label_to_idx[char] = idx
            self.idx_to_label[idx] = char
        for filename in os.listdir(image_dir):
            if filename.endswith('.png'):
                word = filename.split('_')[0]
                char_indices = [self.label_to_idx[c] for c in word]
                self.images.append(os.path.join(image_dir, filename))
                self.labels.append(torch.tensor(char_indices, dtype=torch.long))
    def __len__(self):
        return len(self.images)
    def __getitem__(self, idx):
        image_path = self.images[idx]
        image = Image.open(image_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        label = self.labels[idx]
        length = len(label)
        return image, label, length

class CRNN(nn.Module):
    def __init__(self, num_classes, hidden_size=256):
        super(CRNN, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d((2, 1)),
            nn.Conv2d(256, hidden_size, kernel_size=3, padding=1),
            nn.BatchNorm2d(hidden_size),
            nn.ReLU()
        )
        self.rnn = nn.LSTM(hidden_size, hidden_size, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)
    def forward(self, x):
        conv = self.cnn(x)
        b, c, h, w = conv.size()
        conv = conv.view(b, c, -1)
        conv = conv.permute(0, 2, 1)
        rnn, _ = self.rnn(conv)
        output = self.fc(rnn)
        return output

def train_ocr_model(model, train_loader, criterion, optimizer, num_epochs=20, device='cuda'):
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch_idx, (images, targets, target_lengths) in enumerate(train_loader):
            images = images.to(device)
            targets = targets.to(device)
            target_lengths = target_lengths.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            batch_size = outputs.size(0)
            input_lengths = torch.full((batch_size,), outputs.size(1), dtype=torch.long).to(device)
            loss = criterion(outputs.log_softmax(2).permute(1, 0, 2), targets, input_lengths, target_lengths)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            if batch_idx % 10 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx}/{len(train_loader)}], Loss: {loss.item():.4f}')
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch [{epoch+1}/{num_epochs}], Average Loss: {avg_loss:.4f}')
    torch.save(model.state_dict(), "crnn_model.pth")
    print("Model saved successfully.")

def decode_predictions(outputs, idx_to_label):
    _, predictions = outputs.max(2)
    predictions = predictions.cpu().numpy()
    decoded_words = []
    for pred in predictions:
        word = []
        prev_idx = -1
        for idx in pred:
            if idx != 0 and idx != prev_idx:
                word.append(idx_to_label[idx])
            prev_idx = idx
        decoded_words.append(''.join(word))
    return decoded_words

def preprocess_image(image_path):
    transform = transforms.Compose([
        transforms.Resize((32, 128)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0)
    return image

def predict_image(model, image_path, idx_to_label, device='cuda'):
    model.to(device)
    model.eval()
    image = preprocess_image(image_path).to(device)
    with torch.no_grad():
        output = model(image)
    predicted_text = decode_predictions(output, idx_to_label)
    return predicted_text[0]

if __name__ == "__main__":
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    transform = transforms.Compose([
        transforms.Resize((32, 128)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])
    dataset = WordImageDataset('content/dataset/bonus', transform=transform)
    train_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
    model = CRNN(num_classes=len(dataset.label_to_idx))
    criterion = nn.CTCLoss(zero_infinity=True)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    train_ocr_model(model, train_loader, criterion, optimizer, device=device)
    model.load_state_dict(torch.load("crnn_model.pth"))

Epoch [1/20], Batch [0/311], Loss: 47.9432
Epoch [1/20], Batch [10/311], Loss: 5.2826
Epoch [1/20], Batch [20/311], Loss: 3.3528
Epoch [1/20], Batch [30/311], Loss: 3.1932
Epoch [1/20], Batch [40/311], Loss: 3.2909
Epoch [1/20], Batch [50/311], Loss: 3.2842
Epoch [1/20], Batch [60/311], Loss: 3.2847
Epoch [1/20], Batch [70/311], Loss: 3.2310
Epoch [1/20], Batch [80/311], Loss: 3.2638
Epoch [1/20], Batch [90/311], Loss: 3.2020
Epoch [1/20], Batch [100/311], Loss: 3.2001
Epoch [1/20], Batch [110/311], Loss: 3.1556
Epoch [1/20], Batch [120/311], Loss: 3.2204
Epoch [1/20], Batch [130/311], Loss: 3.1527
Epoch [1/20], Batch [140/311], Loss: 3.1933
Epoch [1/20], Batch [150/311], Loss: 3.1952
Epoch [1/20], Batch [160/311], Loss: 3.1147
Epoch [1/20], Batch [170/311], Loss: 3.1610
Epoch [1/20], Batch [180/311], Loss: 3.1804
Epoch [1/20], Batch [190/311], Loss: 3.1771
Epoch [1/20], Batch [200/311], Loss: 3.1334
Epoch [1/20], Batch [210/311], Loss: 3.0811
Epoch [1/20], Batch [220/311], Loss: 3.122

In [None]:
# Actual testing for bonus dataset, randomly got an extra i, showing some limitations of the model

generate_bonus_image(word="testing")
test_image_path= "testing.png"
predicted_text = predict_image(model, test_image_path, dataset.idx_to_label, device)
print("Predicted Text:", predicted_text)

Predicted Text: tesiting


## Test function for bonus dataset

In [None]:
def generate_random_images(words, image_path="images/"):
    # Generate and save an image for each word
    for word in words:
        # Assuming generate_image is a function that generates an image for the word
        generate_bonus_image(word)
        
        # Use the word as the filename (ensure it is not too long)
        image_file_path = os.path.join(image_path, f"{word}.png")
        
        # Save the image
        Image.open(f"{word}.png").save(image_file_path)

def test_model_accuracy(model, words, device, image_path="images/"):
    correct_predictions = 0

    # Generate random images for each word
    generate_random_images(words, image_path)

    for word in words:
        # Use the model to predict the word from the image
        test_image_path = os.path.join(image_path, f"{word}.png")
        predicted_text = predict_image(model, test_image_path, dataset.idx_to_label, device)

        # Check if the predicted text matches the word
        if predicted_text.lower() == word.lower():
            correct_predictions += 1

    accuracy = correct_predictions / len(words) * 100
    return accuracy

# Generate random words
words = generate_random_words(1000)

# Test the model accuracy
accuracy = test_model_accuracy(model, words, device)
print(f"Model Accuracy: {accuracy:.2f}%")


Model Accuracy: 44.00%
