shape of image:

```
SCREEN_WIDTH = 450
SCREEN_HEIGHT = 200
```


In [15]:
# Canvas size in pixels for the rendered text images (width x height).
SCREEN_WIDTH, SCREEN_HEIGHT = 450, 200

`class torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros', device=None, dtype=None)`

input: (N, C_in, H_in, W_in)

output: (N, C_out, H_out, W_out)

$$H_{out} = \left\lfloor \frac{H_{in} + 2 \times padding[0] - dilation[0] \times (kernel\_size[0] - 1) - 1}{stride[0]} + 1 \right\rfloor$$

In [16]:
import torch
from torch import nn

class CNN(nn.Module):
    """One conv -> ReLU -> 2x2 max-pool stage followed by a linear layer.

    The linear layer maps the flattened feature map to ``vocab_size`` scores.
    Its input width is measured from the conv/pool stack at construction
    time, so non-default ``out_channels`` / ``kernel_size`` / ``stride`` /
    ``padding`` values no longer break ``forward``.

    Args:
        out_channels: number of feature maps produced by ``conv1``.
        kernel_size: kernel size passed to ``nn.Conv2d``.
        stride: stride passed to ``nn.Conv2d``.
        padding: padding passed to ``nn.Conv2d``.
        vocab_size: number of output scores produced by ``fc1``.
        normalization: accepted for interface compatibility; currently unused.
        input_height: height of the images the model will be fed.
        input_width: width of the images the model will be fed.
            The 200 x 450 defaults reproduce the previous fixed size of
            135000 input features under the default conv settings.
        verbose: when True (default, the previous behaviour) ``forward``
            prints the tensor shape at each stage.
    """

    def __init__(self, out_channels=6, kernel_size=1, stride=1, padding=0, vocab_size=5000, normalization=False,
                 input_height=200, input_width=450, verbose=True):
        super().__init__()
        self.verbose = verbose
        self.conv1 = nn.Conv2d(
            in_channels=3, out_channels=out_channels,
            kernel_size=kernel_size, stride=stride, padding=padding
        )
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Push one blank image through conv + pool to learn how many values
        # reach the linear layer, instead of hard-coding 135000. ReLU is
        # skipped because it does not change the shape.
        with torch.no_grad():
            probe = torch.zeros(1, 3, input_height, input_width)
            flat_features = self.maxpool1(self.conv1(probe)).numel()
        self.fc1 = nn.Linear(flat_features, vocab_size)

    def forward(self, x):
        """Return ``fc1`` scores of shape (batch, vocab_size) for image batch ``x``."""
        if self.verbose:
            print("original shape:", x.shape)

        x = self.conv1(x)
        x = self.relu1(x)
        x = self.maxpool1(x)
        if self.verbose:
            print("after C shape:", x.shape)
        # Flatten everything except the batch dimension for the linear layer.
        x = x.view(x.size(0), -1)
        if self.verbose:
            print("after view:", x.shape)
        x = self.fc1(x)
        return x


  from .autonotebook import tqdm as notebook_tqdm


render image

In [17]:
# !pip install pygame

In [19]:
import pygame

# Initialise pygame before any display or font object is created.
pygame.init()

# Off-white-board canvas: every rendered text image is drawn on this surface.
SCREEN_WIDTH, SCREEN_HEIGHT = 450, 200
screen = pygame.display.set_mode((SCREEN_WIDTH, SCREEN_HEIGHT))

# Font used for rendering, loaded from the TTF file at point size 70.
font_noto_sans_regular = pygame.font.Font(
    "../converttext/noto-sans.regular.ttf",
    70,
)

# pygame.quit()
def to_image(input, font, noise=False):
  """Render `input` as black text on a white canvas and save it as a JPEG.

  Draws onto the module-level `screen` surface, refreshes the display, and
  writes the surface to ``./temp_image/``, creating the directory if needed.

  Args:
    input: text to render; also used verbatim as the file-name stem.
      (The name shadows the builtin but is kept so existing calls still work.)
    font: pygame font object used to render the text.
    noise: only selects the file-name suffix (``_noto_noised.jpg`` when
      truthy, ``_notoSans.jpg`` otherwise); no noise is added to the image
      inside this function.

  Returns:
    The path of the saved image file.
  """
  import os  # local import so this cell does not depend on another cell's imports

  screen.fill((255, 255, 255))
  # Draw the text in black, anti-aliased, at a fixed offset on the canvas.
  img = font.render(input, True, (0, 0, 0))
  screen.blit(img, (30, 60))
  # Drain pending window events. The events themselves are ignored; the old
  # loop only set an unused local on QUIT.
  pygame.event.get()
  pygame.display.flip()

  # Save the screen as an image.
  suffix = "_noto_noised.jpg" if noise else "_notoSans.jpg"
  out_dir = "./temp_image/"
  os.makedirs(out_dir, exist_ok=True)  # saving fails if the folder is missing
  filename = out_dir + input + suffix
  pygame.image.save(screen, filename)
  print("Screen saved as ", filename)
  return filename

# Render the sample text and keep the saved file's path for later cells.
# NOTE(review): `input` is not assigned in any cell visible here. Unless an
# earlier cell binds it to a string, this passes Python's builtin `input`
# function. The recorded output suggests it held "d1fferenœ" -- confirm and
# define it explicitly.
image_path = to_image(input, font=font_noto_sans_regular, noise=False)


Screen saved as  ./temp_image/d1fferenœ_notoSans.jpg


Read Image

Tokenizer

In [20]:
# from tokenizers import ByteLevelBPETokenizer
# tokenizer = ByteLevelBPETokenizer()
# paths = ["../dataset/ted_dev_en-de.raw.en.txt"]
# tokenizer.train(files=paths, vocab_size=52_000, min_frequency=1, special_tokens=[])

In [21]:
# tokenizer.token_to_id("so")

In [22]:
# tokenizer.get_vocab_size()

In [23]:
# tokenizer.get_vocab()

create dict:

word -> id

id -> word

word -> count

In [24]:
# from collections import defaultdict
# from nltk.tokenize import word_tokenize

# id_to_word_list = []                    # list: id   -> word
# word_num_dict   = defaultdict(int)      # dict: word -> num(word)
# word_to_id_dict = defaultdict(int)      # dict: word -> id
# id_to_word_dict = defaultdict(str)      # dict: id   -> word
# with open("../dataset/ted_dev_en-de.raw.en.txt", 'r', encoding='utf-8') as f:
#     for line in f:
#         if line != '\n':
#             words = line.strip()
#             tokens = word_tokenize(words)
#             for token in tokens:
#                 if token.isalpha():
#                     if token not in id_to_word_list:
#                         id_to_word_list.append(token)
#                     word_num_dict[token] +=1

# for id, word in enumerate(id_to_word_list):
#     word_to_id_dict[word] = id
#     id_to_word_dict[id]   = word

In [25]:
# print("length of word_num_dict:", len(word_num_dict.keys()))
# print("length of id_to_word_dict:", len(id_to_word_dict.keys()))
# print("length of word_id_dict:", len(word_to_id_dict.keys()))
# print(f"index: [5], word in id_to_word_dict: [{id_to_word_dict[5]}], id in word_id_dict: [{word_to_id_dict[id_to_word_dict[5]]}]")


In [26]:
# # save dicts

# import json

# with open('word_num_dict.json', 'w') as fp:
#     json.dump(word_num_dict, fp)
# with open('word_to_id_dict.json', 'w') as fp:
#     json.dump(word_to_id_dict, fp)
# with open('id_to_word_dict.json', 'w') as fp:
#     json.dump(id_to_word_dict, fp)

In [27]:
import json 

# Reload the three vocabulary dictionaries that were saved as JSON.
with open('word_num_dict.json', 'r', encoding='utf-8') as fp:
    word_num_dict_test = json.load(fp)
with open('word_to_id_dict.json', 'r', encoding='utf-8') as fp:
    word_to_id_dict_test = json.load(fp)
with open('id_to_word_dict.json', 'r', encoding='utf-8') as fp:
    id_to_word_dict_test = json.load(fp)

# Sanity check: all three dictionaries should report the same size.
print("test load: length of word_num_dict:", len(word_num_dict_test))
# Previously this line measured word_to_id_dict_test by mistake.
print("test load: length of id_to_word_dict:", len(id_to_word_dict_test))
print("test load: length of word_id_dict:", len(word_to_id_dict_test))
# JSON object keys are always strings, so integer ids must be looked up as str.
print('note index needs to use str(index)')
print(f"test load: index: [5], word in id_to_word_dict: [{id_to_word_dict_test[str(5)]}], id in word_id_dict: [{word_to_id_dict_test[id_to_word_dict_test['5']]}]")


test load: length of word_num_dict: 4572
test load: length of id_to_word_dict: 4572
test load: length of word_id_dict: 4572
note index needs to use str(index)
test load: index: [5], word in id_to_word_dict: [defines], id in word_id_dict: [5]


Train

In [30]:
import torch.optim as optim
from PIL import Image

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One output score per vocabulary word. Defined here, from the dictionary
# loaded above, so this cell no longer relies on a VOCAB_SIZE assignment that
# only appears in a later cell (NameError on a fresh kernel).
VOCAB_SIZE = len(word_to_id_dict_test)

model = CNN(out_channels=6, kernel_size=1, stride=1, padding=0, vocab_size=VOCAB_SIZE, normalization=False).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [31]:
import torch.optim as optim
from PIL import Image

from torchvision import transforms
# Preprocessing pipeline: only converts the PIL image into a tensor.
transform_norm = transforms.Compose([transforms.ToTensor()])

# image_path = "../converttext/imgdataset/5AMPLETyponoised.jpg"
img = Image.open(image_path)
# Convert to a tensor and prepend a batch dimension of size 1.
# dog = normalize(dog)
dog = transform_norm(img).unsqueeze(0)

# Single forward pass on the model's device as a shape/sanity check.
output = model(dog.to(device))
print(output)

original shape: torch.Size([1, 3, 200, 450])
after C shape: torch.Size([1, 6, 100, 225])
after view: torch.Size([1, 135000])
tensor([[-0.1080, -0.1653, -0.4257,  ...,  0.3773, -0.0302, -0.2515]],
       device='cuda:0', grad_fn=<AddmmBackward0>)


In [None]:
# Number of model output classes; equals the 4572 entries reported when the
# vocabulary dictionaries were loaded.
# NOTE(review): this assignment sits after the cell that builds the model with
# vocab_size=VOCAB_SIZE -- consider moving it to a configuration cell near the top.
VOCAB_SIZE = 4572