In [2]:
import numpy as np
import tensorflow as tf
import cv2
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Dataset root. The trailing slash matters: paths below are built by plain
# string concatenation.
datadir = "./data/"
# Text file parsed by loadWords().
wordsFile = "%swords.txt" % datadir

In [4]:
def loadWords(path=None):
    """Parse the words index file into a DataFrame.

    Lines starting with '#' are treated as comments and blank lines are
    ignored. Every other line is split on whitespace; fields 0, 8 and 2
    become the columns 'filename', 'word' and 'greylvl'. All values are
    kept as strings.

    Args:
        path: file to read. Defaults to the notebook-level `wordsFile`.

    Returns:
        pandas.DataFrame with columns ['filename', 'word', 'greylvl'].

    Raises:
        IndexError: if a data line has fewer than nine fields.
    """
    columns = ['filename', 'word', 'greylvl']
    if path is None:
        path = wordsFile

    rowsList = []
    with open(path, 'r') as words:
        for line in words:
            data = line.split()

            # Skip comments and blank/whitespace-only lines (the latter
            # would otherwise fail on data[0]).
            if not data or line.startswith("#"):
                continue

            rowsList.append((data[0], data[8], data[2]))

    return pd.DataFrame(rowsList, columns=columns)

In [5]:
# Build the word table once, then show its first rows followed by its shape.
df = loadWords()
print(df.head(), df.shape, sep="\n")

         filename  word greylvl
0  a01-000u-00-00     A     154
1  a01-000u-00-01  MOVE     154
2  a01-000u-00-02    to     154
3  a01-000u-00-03  stop     154
4  a01-000u-00-04   Mr.     154
(115320, 3)


In [6]:
def preprocessImg(filename, targetSize=(128, 32)):
    """Load a word image and fit it onto a fixed-size white canvas.

    The image is read as greyscale from
    ``<datadir>words/<a>/<a>-<b>/<filename>.png`` where ``a`` and ``b`` are
    the first two dash-separated parts of `filename`. It is scaled with its
    aspect ratio preserved so that it fits inside the target size, then
    copied into the top-left corner of a canvas filled with 255.

    Args:
        filename: sample id such as 'a01-000u-00-00' (without extension).
        targetSize: (width, height) of the returned canvas in pixels.

    Returns:
        float64 numpy array of shape (height, width); (32, 128) by default.

    Raises:
        IOError: if the image file is missing or cannot be decoded.
    """
    # Read and load
    parts = filename.split("-")
    path = "/".join([datadir + "words", parts[0], "-".join(parts[:2]), filename + '.png'])
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread does not raise on failure; it returns None instead.
        raise IOError("Cannot read image: %s" % path)

    # Resize, keeping the aspect ratio
    (targetW, targetH) = targetSize
    (imgH, imgW) = img.shape
    f = min(targetW / imgW, targetH / imgH)

    # Clamp each side to [1, target]: plain truncation yields a zero-sized
    # side for very elongated images, which cv2.resize rejects.
    newW = min(targetW, max(1, int(imgW * f)))
    newH = min(targetH, max(1, int(imgH * f)))
    newImg = cv2.resize(img, (newW, newH))

    # Fill to NN pattern: white background, image in the top-left corner
    pattern = np.full((targetH, targetW), 255.0)
    pattern[0:newH, 0:newW] = newImg

    return pattern

In [7]:
class BatchGenerator:
    """Serves consecutive fixed-size batches of preprocessed word images.

    The rows of the word table are split by position: the first 95% are
    served unless mode is 'valid', in which case the remaining 5% are
    served. Only full batches are produced; a trailing remainder smaller
    than batchSize is never returned.
    """

    def __init__(self, batchSize=512, mode='train', data=None):
        """
        Args:
            batchSize: number of images per batch.
            mode: 'valid' selects the last 5% of the rows; any other value
                selects the first 95%.
            data: DataFrame with a 'filename' column. Defaults to the
                notebook-level `df`.
        """
        self.data = df if data is None else data
        self.batchSize = batchSize

        rowCount = self.data.shape[0]
        splitAt = int(rowCount * 0.95)
        if mode == 'valid':
            # 5% for validation
            self.start, self.stop = splitAt, rowCount
        else:
            self.start, self.stop = 0, splitAt

    def hasNext(self):
        """Return True while another full batch fits before `stop`."""
        # `<=` so that a batch ending exactly at `stop` is still served.
        return self.start + self.batchSize <= self.stop

    def getNext(self):
        """Load, preprocess and stack the next batch of images.

        Returns:
            numpy array with one preprocessed image per row of the batch,
            stacked along axis 0.

        Raises:
            ValueError: if no full batch is left.
        """
        if not self.hasNext():
            raise ValueError('No batch to get')

        start = self.start
        stop = start + self.batchSize
        # Positional, half-open slice: independent of the index labels.
        names = self.data['filename'].iloc[start:stop]
        imgs = [preprocessImg(name) for name in names]
        batch = np.stack(imgs, axis=0)
        print("return batch %d:%d" % (start, stop))
        self.start = stop
        return batch

In [None]:
from time import sleep

# Walk through every training batch, printing the shape of each stacked
# array and pausing two seconds between batches.
bgen = BatchGenerator()
while bgen.hasNext():
    batch = bgen.getNext()
    print(batch.shape)
    sleep(2)
    

return batch 0:512
(512, 32, 128)
return batch 512:1024
(512, 32, 128)
return batch 1024:1536
(512, 32, 128)
return batch 1536:2048
(512, 32, 128)
return batch 2048:2560
(512, 32, 128)
return batch 2560:3072
(512, 32, 128)
return batch 3072:3584
(512, 32, 128)
return batch 3584:4096
(512, 32, 128)


In [9]:
# List the files of the batch reported as 3584:4096. Positional slicing is
# half-open, so exactly those 512 rows are printed; the label slice
# df.loc[3584:4096] is inclusive and also returned row 4096, which belongs
# to the next batch.
for row in df['filename'].iloc[3584:4096]:
    print(row)

a01-096u-02-04
a01-096u-02-05
a01-096u-02-06
a01-096u-02-07
a01-096u-03-00
a01-096u-03-01
a01-096u-03-02
a01-096u-03-03
a01-096u-03-04
a01-096u-03-05
a01-096u-03-06
a01-096u-03-07
a01-096u-03-08
a01-096u-03-09
a01-096u-04-00
a01-096u-04-01
a01-096u-04-02
a01-096u-04-03
a01-096u-04-04
a01-096u-04-05
a01-096u-04-06
a01-096u-04-07
a01-096u-05-00
a01-096u-05-01
a01-096u-05-02
a01-096u-05-03
a01-096u-05-04
a01-096u-05-05
a01-096u-05-06
a01-096u-05-07
a01-096u-05-08
a01-096u-05-09
a01-096u-06-00
a01-096u-06-01
a01-096u-06-02
a01-096u-06-03
a01-096u-06-04
a01-096u-06-05
a01-096u-06-06
a01-096u-07-00
a01-096u-07-01
a01-096u-07-02
a01-096u-07-03
a01-096u-07-04
a01-096u-07-05
a01-096u-07-06
a01-096u-08-00
a01-096u-08-01
a01-096u-08-02
a01-096u-08-03
a01-096u-08-04
a01-096u-08-05
a01-096u-08-06
a01-096u-09-00
a01-096u-09-01
a01-096u-09-02
a01-096u-09-03
a01-096u-09-04
a01-096u-09-05
a01-096u-09-06
a01-096u-09-07
a01-096u-10-00
a01-096u-10-01
a01-102-00-00
a01-102-00-01
a01-102-00-02
a01-102-00-03