In [1]:
import glob
import os
import lmdb
import cv2
import numpy as np
import codecs

In [2]:
imagePath = '/home/gu/media/crnn_data/train_data/default/*.jpg'
labelPath = '/home/gu/media/crnn_data/train_data/default/tmp_labels.txt'
outputPath = '/home/gu/media/crnn_data/lmdb/mytrain/'

In [3]:
with open(labelPath,'r') as file:
    for _,line in enumerate(file):
        textName,label = line.strip()[:8],line.strip()[9:]
        with open('./output/train_data/default/%s.txt'%textName,'w') as textfile:
            textfile.write(label)

In [47]:
imagePath = '/home/gu/media/crnn_data/test_data/default/*.jpg'
labelPath = '/home/gu/media/crnn_data/test_data/default/tmp_labels.txt'
outputPath = '/home/gu/media/crnn_data/lmdb/myval/'

In [37]:
with open(labelPath,'r') as file:
    for _,line in enumerate(file):
        textName,label = line.strip()[:8],line.strip()[9:]
        with open('./output/test_data/default/%s.txt'%textName,'w') as textfile:
            textfile.write(label)

In [38]:
imagePathList = glob.glob(imagePath)

In [39]:
len(imagePathList)

1000

In [40]:
def read_text(path):
    with open(path) as f:
        text = f.read()
    text = text.strip()
    return text

In [41]:
imgLabelLists = []
for p in imagePathList:
        try:
            imgLabelLists.append((p,read_text(p.replace('.jpg','.txt'))))
        except:
            continue

In [42]:
imgLabelLists

[('./output/test_data/default/00000349.jpg', '立顿醇萃茶选清润饮250gX24'),
 ('./output/test_data/default/00000097.jpg', '多芬日常损伤理护SP12/发膜精华CD12*120'),
 ('./output/test_data/default/00000939.jpg', '旁氏滋润倍护洁面摩丝促销装2x6x165ml'),
 ('./output/test_data/default/00000379.jpg', '多芬舒活水润沐浴乳200mlX12'),
 ('./output/test_data/default/00000651.jpg', '中华双重皓白柠檬薄荷味 (2011) 54X75g'),
 ('./output/test_data/default/00000617.jpg', '奥妙全自动洁彩升级装洗衣液3kgx4'),
 ('./output/test_data/default/00000731.jpg', '力士灵动丰盈洗发乳6X750ml'),
 ('./output/test_data/default/00000885.jpg', '立顿绝品醇奶茶日式抹茶S1 6x20x21g'),
 ('./output/test_data/default/00000872.jpg', '多芬日常损伤理护(去屑)洗发乳12X400毫升'),
 ('./output/test_data/default/00000122.jpg', '立顿醇萃茶选港式奶茶样品50gX100'),
 ('./output/test_data/default/00000163.jpg', '麗仕水嫩護膚沐浴乳 12X750ml'),
 ('./output/test_data/default/00000798.jpg', '中华皓清沁醒柠檬味 (2011) 54X130g'),
 ('./output/test_data/default/00000906.jpg', '夏士莲浴露(水润+沁爽)6X(1L+650ml)'),
 ('./output/test_data/default/00000287.jpg', '力士水润丝滑发膜120X30ml'),
 ('./output/tes

In [43]:
imgLabelList = sorted(imgLabelLists,key = lambda x:len(x[1]))

In [44]:
imgPaths = [ p[0] for p in imgLabelList]
txtLists = [ p[1] for p in imgLabelList]

In [45]:
def checkImageIsValid(imageBin):
    if imageBin is None:
        return False
    imageBuf = np.fromstring(imageBin, dtype=np.uint8)
    img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return False
    imgH, imgW = img.shape[0], img.shape[1]
    if imgH * imgW == 0:
        return False
    return True
 
def writeCache(env, cache):
    with env.begin(write=True) as txn:
        for k, v in cache.items():
            txn.put(k.encode(), v)

In [46]:
def checkImageIsValid(imageBin):
    if imageBin is None:
        return False
    imageBuf = np.fromstring(imageBin, dtype=np.uint8)
    img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return False
    imgH, imgW = img.shape[0], img.shape[1]
    if imgH * imgW == 0:
        return False
    return True


def writeCache(env, cache):
    with env.begin(write=True) as txn:
        for k, v in cache.items():
            txn.put(k.encode(), v)


def createDataset(outputPath, imagePathList, labelList, lexiconList=None, checkValid=True):
    """
    Create LMDB dataset for CRNN training.

    ARGS:
        outputPath    : LMDB output path
        imagePathList : list of image path
        labelList     : list of corresponding groundtruth texts
        lexiconList   : (optional) list of lexicon lists
        checkValid    : if true, check the validity of every image
    """
    # print (len(imagePathList) , len(labelList))
    assert (len(imagePathList) == len(labelList))
    nSamples = len(imagePathList)
    print('...................')
    env = lmdb.open(outputPath, map_size=1099511627776)

    cache = {}
    cnt = 1
    for i in range(nSamples):
        imagePath = imagePathList[i]
        label = labelList[i]
        if not os.path.exists(imagePath):
            print('%s does not exist' % imagePath)
            continue
        with open(imagePath, 'rb') as f:
            imageBin = f.read()
        if checkValid:
            if not checkImageIsValid(imageBin):
                print('%s is not a valid image' % imagePath)
                continue

        imageKey = 'image-%09d' % cnt
        labelKey = 'label-%09d' % cnt
        cache[imageKey] = imageBin
        cache[labelKey] = label.encode()
        if lexiconList:
            lexiconKey = 'lexicon-%09d' % cnt
            cache[lexiconKey] = ' '.join(lexiconList[i]).encode()
        if cnt % 1000 == 0:
            writeCache(env, cache)
            cache = {}
            print('Written %d / %d' % (cnt, nSamples))
        cnt += 1
    nSamples = cnt - 1
    cache['num-samples'] = str(nSamples).encode()
    writeCache(env, cache)
    print('Created dataset with %d samples' % nSamples)


def read_text(path):
    with open(path) as f:
        text = f.read()
    text = text.strip()

    return text

In [48]:
createDataset(outputPath, imgPaths, txtLists, lexiconList=None, checkValid=True)

...................
Written 1000 / 1000
Created dataset with 1000 samples


  after removing the cwd from sys.path.
