# Model construction

### 1. Initialization

In [125]:
# Initial surface width/height and font size for word rendering.
# NOTE: all three names are reassigned in section 3 (130 / 25 / 15) before
# any image is rendered, so the values below are not the ones used for training.
SCREEN_WIDTH    = 50
SCREEN_HEIGHT   = 20
FONT_SIZE       = 10 

### 1.1 Size of convolution layer
`class torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros', device=None, dtype=None)`

input: (N, C_in, H_in, W_in)

output: (N, C_out, H_out, W_out)

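$$H_{out} = \left\lfloor \frac{H_{in} + 2 \times padding[0] - dilation[0] \times (kernel\_size[0] - 1) - 1}{stride[0]} + 1 \right\rfloor$$

$W_{out}$ is computed the same way from $W_{in}$, using index 1 of each parameter.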

### 2. Build ConvNet

In [126]:
import torch
from torch import nn

class CNN(nn.Module):
    """Single-block convolutional classifier for one-channel images.

    Pipeline: ``Conv2d -> [BatchNorm2d] -> ReLU -> MaxPool2d(2, 2) -> flatten -> Linear``.

    Args:
        out_channels: number of feature maps produced by the convolution.
        kernel_size: convolution kernel size (forwarded to ``nn.Conv2d``).
        stride: convolution stride (forwarded to ``nn.Conv2d``).
        padding: convolution padding (forwarded to ``nn.Conv2d``).
        vocab_size: number of output classes, i.e. the size of the returned logits.
        linear_size: number of flattened features entering the linear layer
            (``out_channels * pooled_height * pooled_width``); it has to match
            the size of the images passed to ``forward``.
        normalization: when True, a ``BatchNorm2d`` layer is inserted between
            the convolution and the ReLU. When False (default) the layer is an
            identity, so the network computes exactly conv -> ReLU -> pool -> linear.
    """

    def __init__(self, out_channels=6, kernel_size=1, stride=1, padding=0, vocab_size=5000, linear_size=5000, normalization=False):
        super().__init__()
        self.conv1 = nn.Conv2d(
            in_channels=1, out_channels=out_channels,
            kernel_size=kernel_size, stride=stride, padding=padding
        )
        # nn.Identity has no parameters, so the state-dict keys of a model built
        # with normalization=False are the same as without this attribute.
        self.norm1 = nn.BatchNorm2d(out_channels) if normalization else nn.Identity()
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(linear_size, vocab_size)

    def forward(self, x):
        """Return unnormalised class scores of shape ``(batch, vocab_size)``.

        Args:
            x: input batch of shape ``(batch, 1, height, width)``.
        """
        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu1(x)
        x = self.maxpool1(x)

        # Collapse (channels, height, width) into one feature axis per sample.
        x = x.flatten(start_dim=1)
        return self.fc1(x)


render image

In [127]:
# !pip install pygame

In [128]:
pwd

'c:\\Users\\hanya\\OneDrive\\Informatik\\LMU\\Informatik-2023SS\\CS\\CV\\group_project\\cvdl_ss23\\model'

### 3. Convert input string to img

In [129]:
import pygame

# SCREEN_WIDTH    = 100
# SCREEN_HEIGHT   = 20
# FONT_SIZE       = 10 

# Active rendering configuration: size of the off-screen surface created in
# to_image() and the size passed to the font constructor below.
SCREEN_WIDTH    = 130
SCREEN_HEIGHT   = 25
FONT_SIZE       = 15 
# 4680
# NOTE(review): 4680 is presumably the flattened feature count for
# out_channels=6 with this image size (6 * 12 * 65) -- confirm.

# Initialise the pygame modules before creating the font object.
pygame.init()
# Font file is resolved relative to the notebook's working directory.
font_noto_sans_regular = pygame.font.Font("../converttext/noto-sans.regular.ttf", FONT_SIZE)


def to_image(text:str, font, id:int=None, noise=False):
  """Render ``text`` in black on a white surface and save it as a PNG file.

  The surface has the module-level size ``(SCREEN_WIDTH, SCREEN_HEIGHT)`` and
  the text is drawn at offset ``(2, 0)``.
  NOTE(review): text wider than the surface is presumably clipped by the
  blit -- confirm that the longest vocabulary word fits.

  Args:
    text: string to render (converted with ``str``).
    font: a ``pygame.font.Font`` used for rendering.
    id: value embedded in the file name; ``None`` yields ``word_None_...``.
    noise: selects the ``_noised`` file-name suffix; the rendering itself
      is the same for both values.

  Returns:
    Path of the written PNG file.
  """
  import os  # local import keeps the function usable without touching the import cells

  output_dir = "./temp_image"

  # Off-screen surface: nothing is displayed, so no window and no event
  # polling are needed.
  surface = pygame.Surface((SCREEN_WIDTH, SCREEN_HEIGHT))
  surface.fill((255, 255, 255))
  rendered_text = font.render(str(text), True, (0, 0, 0))
  surface.blit(rendered_text, (2, 0))

  if noise == False:
    filename = f"{output_dir}/word_{str(id)}_{str(text)}_notoSans.png"
  else:
    filename = f"{output_dir}/word_{str(id)}_{str(text)}_notoSans_noised.png"

  # pygame.image.save does not create missing directories.
  os.makedirs(output_dir, exist_ok=True)
  pygame.image.save(surface, filename)
  return filename

# Smoke test: render one sample string (it contains the non-ASCII character
# 'α' and digit-for-letter substitutions) and keep the path of the written PNG.
image_path = to_image(text="1nd1st1nguishαble", font=font_noto_sans_regular, id=5, noise=False)


Read Image

Tokenizer (discarded)

In [130]:
# from tokenizers import ByteLevelBPETokenizer
# tokenizer = ByteLevelBPETokenizer()
# paths = ["../dataset/ted_dev_en-de.raw.en.txt"]
# tokenizer.train(files=paths, vocab_size=52_000, min_frequency=1, special_tokens=[])

In [131]:
# tokenizer.token_to_id("so")

In [132]:
# tokenizer.get_vocab_size()

In [133]:
# tokenizer.get_vocab()

create dict:

word -> id

id -> word

word -> count

In [134]:
# from collections import defaultdict
# from nltk.tokenize import word_tokenize

# id_to_word_list = []                    # list: id   -> word
# word_num_dict   = defaultdict(int)      # dict: word -> num(word)
# word_to_id_dict = defaultdict(int)      # dict: word -> id
# id_to_word_dict = defaultdict(str)      # dict: id   -> word
# with open("../dataset/ted_dev_en-de.raw.en.txt", 'r', encoding='utf-8') as f:
#     for line in f:
#         if line != '\n':
#             words = line.strip()
#             tokens = word_tokenize(words)
#             for token in tokens:
#                 if token.isalpha():
#                     if token not in id_to_word_list:
#                         id_to_word_list.append(token)
#                     word_num_dict[token] +=1

# for id, word in enumerate(id_to_word_list):
#     word_to_id_dict[word] = id
#     id_to_word_dict[id]   = word

In [135]:
# print("length of word_num_dict:", len(word_num_dict.keys()))
# print("length of id_to_word_dict:", len(id_to_word_dict.keys()))
# print("length of word_id_dict:", len(word_to_id_dict.keys()))
# print(f"index: [5], word in id_to_word_dict: [{id_to_word_dict[5]}], id in word_id_dict: [{word_to_id_dict[id_to_word_dict[5]]}]")


In [136]:
# # save dicts

# import json

# with open('word_num_dict.json', 'w') as fp:
#     json.dump(word_num_dict, fp)
# with open('word_to_id_dict.json', 'w') as fp:
#     json.dump(word_to_id_dict, fp)
# with open('id_to_word_dict.json', 'w') as fp:
#     json.dump(id_to_word_dict, fp)

### 3.1 Get dataset and its length

In [137]:
import json 


def _read_json(path):
    """Return the parsed content of the JSON file at ``path``."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Vocabulary tables written by the (now commented-out) dictionary-building cells.
# JSON object keys are strings, so ids must be looked up as str(id).
word_num_dict_test   = _read_json('word_num_dict.json')
word_to_id_dict_test = _read_json('word_to_id_dict.json')
id_to_word_dict_test = _read_json('id_to_word_dict.json')

# Sanity check: the three tables should have the same number of entries.
print("test load: length of word_num_dict:", len(word_num_dict_test))
print("test load: length of word_to_id_dict_test:", len(word_to_id_dict_test))
print("test load: length of id_to_word_dict_test:", len(id_to_word_dict_test))
print('note index needs to use str(index)')
print(f"test load: index: [5], word in id_to_word_dict: [{id_to_word_dict_test[str(5)]}], id in word_id_dict: [{word_to_id_dict_test[id_to_word_dict_test['5']]}]")

# Number of distinct words; used as the size of the model's output layer.
VOCAB_SIZE = len(word_num_dict_test)


test load: length of word_num_dict: 4572
test load: length of id_to_word_dict: 4572
test load: length of word_id_dict: 4572
note index needs to use str(index)
test load: index: [5], word in id_to_word_dict: [defines], id in word_id_dict: [5]


### 3.2 Get max length of input words

In [138]:
# lst = list(word_num_dict_test.keys())
# m = 0
# for word in lst:
#     if len(word) > m:
#         m = len(word)
#         print(word)
# print(m)

One
things
defines
stewardship
preservation
unfortunately
counterintuitive
indistinguishable
17


### 4. Initialize cuda and model input

In [153]:
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Architecture hyper-parameters.
OUT_CHANNELS = 16
# A 3x3 convolution with stride 1 and padding 1 keeps height and width; the
# 2x2 max-pool with stride 2 then floor-halves both. The flattened feature
# count therefore follows from the rendered image size instead of being a
# hand-computed constant (16 * 12 * 65 = 12480 for 130x25 images).
LINEAR_SIZE = OUT_CHANNELS * (SCREEN_HEIGHT // 2) * (SCREEN_WIDTH // 2)

model = CNN(out_channels=OUT_CHANNELS, kernel_size=3, stride=1, padding=1, vocab_size=VOCAB_SIZE, normalization=False, linear_size=LINEAR_SIZE).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [118]:
# from PIL import Image
# from torchvision import transforms

# transform_norm = transforms.Compose([
#     transforms.ToTensor(),
# ])

# # image_path = "../converttext/imgdataset/5AMPLETyponoised.jpg"
# img = Image.open(image_path).convert('L')
# dog = transform_norm(img)
# # dog = normalize(dog)
# dog = torch.unsqueeze(dog, 0)
# output = model(dog.to(device))
# print(output)
# pred = torch.nn.Softmax(output)
# print(pred)


Dataset item: {word: str, image: tensor, id: one-hot tensor of length VOCAB_SIZE}

see: https://pytorch.org/tutorials/beginner/data_loading_tutorial.html

In [119]:
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

# Conversion applied to every rendered PIL image before it is fed to the model.
# NOTE(review): per the torchvision docs, ToTensor yields a float tensor of
# shape (C, H, W) scaled to [0, 1] -- confirm for the installed version.
transform_norm = transforms.Compose([
    transforms.ToTensor(),
])
class WordImageIDDataset(Dataset):
    """Map-style dataset that renders vocabulary words to images on demand.

    ``word_to_id_list`` is a sequence of ``(word, token_id)`` pairs. Every
    access renders the word to a PNG through ``to_image`` (so a file is
    written as a side effect), reloads it as a grayscale image and pairs it
    with a one-hot label of length ``VOCAB_SIZE``.
    """

    def __init__(self, word_to_id_list, font, noise=False):
        self.font = font
        self.noise = noise
        self.word_to_id_list = word_to_id_list

    def __len__(self):
        return len(self.word_to_id_list)

    def __getitem__(self, index):
        """Return ``{'word', 'image', 'id'}`` for position ``index``.

        ``index`` is the position in ``word_to_id_list``, not the token id;
        it is only used to pick the pair and to name the rendered file.
        """
        entry = self.word_to_id_list[index]
        output_word = entry[0]

        # Render the word to disk, then load it back as a single-channel image.
        rendered_path = to_image(
            text=output_word,
            font=self.font,
            id=index,
            noise=self.noise,
        )
        grayscale = Image.open(rendered_path).convert('L')
        image_tensor = transform_norm(grayscale)

        # One-hot encode the token id.
        token_id = entry[1]
        one_hot = torch.zeros(VOCAB_SIZE)
        one_hot[token_id] = 1

        return {
            'word': output_word,
            'image': image_tensor,
            'id': one_hot,
        }

In [120]:
# type(list(word_to_id_dict_test.items()))
# print(list(word_to_id_dict_test.items()))

try pipeline without dataset

In [121]:
# from torchvision import transforms

# transform_norm = transforms.Compose([
#     transforms.ToTensor(),
# ])

# for word, id in word_to_id_dict_test.items():
    
#     # id to target
#     target = torch.zeros(1, VOCAB_SIZE)
#     target[0][id] = 1
    
#     # word to image
#     image_path = to_image(word, font=font_noto_sans_regular, id=id, noise=False)
#     img = Image.open(image_path).convert('L')
#     img = transform_norm(img)
#     img = torch.unsqueeze(img, 0)
#     output = model(img.to(device))
#     # print(output)
#     softmax = torch.nn.Softmax()
#     pred = softmax(output)
#     print(pred) 
#     loss = criterion(pred, target.to(device))
#     print("Loss:", loss.item())
#     break


### 5. Train the model
try pipeline with dataset

In [156]:
from torchvision import transforms


# Every (word, id) pair of the vocabulary becomes one training sample.
dataset = WordImageIDDataset(word_to_id_list = list(word_to_id_dict_test.items()),
                            font=font_noto_sans_regular,
                            noise=False)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)
num_epochs = 10

# Make the training mode explicit; it matters as soon as the model contains
# mode-dependent layers (e.g. batch normalisation).
model.train()

for epoch in range(num_epochs):
    print(f'epoch: {epoch}')
    epoch_loss = []
    for data in dataloader:
        imgs = data['image'].to(device)
        ids = data['id'].to(device)     # one-hot float targets, one row per sample

        optimizer.zero_grad()
        # The criterion receives the raw model outputs; no softmax is applied here.
        outputs = model(imgs)
        loss = criterion(outputs, ids)
        loss.backward()
        optimizer.step()

        epoch_loss.append(loss.item())
    # Mean batch loss of the epoch.
    print(sum(epoch_loss)/len(epoch_loss))

epoch: 0
8.226514782938924
epoch: 1
7.60472496572908
epoch: 2
5.519001911570142
epoch: 3
1.9904476673036189
epoch: 4
0.7794529113110963
epoch: 5
0.3541651229289445
epoch: 6
0.18261963604473985
epoch: 7
0.10275357683886077
epoch: 8
0.06643135929838397
epoch: 9
0.04610278003383428


In [157]:
# Persist the trained model object to disk.
# NOTE(review): saving the whole module (rather than model.state_dict())
# presumably requires the CNN class to be importable under the same name when
# the file is loaded -- consider saving the state dict instead.
torch.save(model, "./CNN_130_250_15_16_3_1_1.pth")

### 6. Test with just one word

In [173]:
word = "definitely"
true_id = word_to_id_dict_test[word]   # KeyError here means the word is not in the vocabulary

# Render the word again instead of relying on a PNG left behind by the
# training loop under an index-dependent file name.
word_image_path = to_image(word, font=font_noto_sans_regular)

img = Image.open(word_image_path).convert('L')
img = transform_norm(img)           # (1, SCREEN_HEIGHT, SCREEN_WIDTH): one grayscale channel
img = torch.unsqueeze(img, 0)       # (1, 1, SCREEN_HEIGHT, SCREEN_WIDTH): add the batch axis

# Inference only: switch to eval mode and skip gradient tracking.
model.eval()
with torch.no_grad():
    output = model(img.to(device))
pred = torch.argmax(output)
print(f"original word:\t{word}")
print("prediction:\t", id_to_word_dict_test[str(pred.item())])

original word:	definitely
prediction:	 definitely


In [172]:
word = "definitely"
true_id = word_to_id_dict_test[word]   # KeyError here means the word is not in the vocabulary
word_noise = "def1n1te1y"              # every 'i' and 'l' replaced by '1'

print(f"original word:\t{word}")
print(f"noised word:\t{word_noise}")

img_path = to_image(word_noise, font=font_noto_sans_regular)
img = Image.open(img_path).convert('L')
img = transform_norm(img)           # (1, SCREEN_HEIGHT, SCREEN_WIDTH): one grayscale channel
img = torch.unsqueeze(img, 0)       # (1, 1, SCREEN_HEIGHT, SCREEN_WIDTH): add the batch axis

# Inference only: switch to eval mode and skip gradient tracking.
model.eval()
with torch.no_grad():
    output = model(img.to(device))
pred = torch.argmax(output)

print("prediction:\t", id_to_word_dict_test[str(pred.item())])

original word:	definitely
noised word:	def1n1te1y
prediction:	 deficiencies


In [171]:
word = "definitely"
true_id = word_to_id_dict_test[word]   # KeyError here means the word is not in the vocabulary
word_noise = "def1nitely"              # first 'i' replaced by '1'

print(f"original word:\t{word}")
print(f"noised word:\t{word_noise}")

img_path = to_image(word_noise, font=font_noto_sans_regular)
img = Image.open(img_path).convert('L')
img = transform_norm(img)           # (1, SCREEN_HEIGHT, SCREEN_WIDTH): one grayscale channel
img = torch.unsqueeze(img, 0)       # (1, 1, SCREEN_HEIGHT, SCREEN_WIDTH): add the batch axis

# Inference only: switch to eval mode and skip gradient tracking.
model.eval()
with torch.no_grad():
    output = model(img.to(device))
pred = torch.argmax(output)

print("prediction:\t", id_to_word_dict_test[str(pred.item())])

original word:	definitely
noised word:	def1nitely
prediction:	 described


In [170]:
word = "definitely"
true_id = word_to_id_dict_test[word]   # KeyError here means the word is not in the vocabulary
word_noise = "definite1y"              # 'l' replaced by '1'

print(f"original word:\t{word}")
print(f"noised word:\t{word_noise}")
img_path = to_image(word_noise, font=font_noto_sans_regular)
img = Image.open(img_path).convert('L')
img = transform_norm(img)           # (1, SCREEN_HEIGHT, SCREEN_WIDTH): one grayscale channel
img = torch.unsqueeze(img, 0)       # (1, 1, SCREEN_HEIGHT, SCREEN_WIDTH): add the batch axis

# Inference only: switch to eval mode and skip gradient tracking.
model.eval()
with torch.no_grad():
    output = model(img.to(device))
pred = torch.argmax(output)

print("prediction:\t", id_to_word_dict_test[str(pred.item())])

original word:	definitely
noised word:	definite1y
prediction:	 definitely
