## Formulas 数据集预处理

### 一些 import 和设置

In [1]:
import os
import shutil
import random

datasets_no_test 位置设置

In [2]:
# Root folder of the source dataset; the intermediate dev/ and train/ folders
# are created beneath it. Kept with a trailing slash because sub-paths are
# appended with `+`.
datasets_no_test_path = './datasets_no_test/'
# Root folder of the data laid out for image-to-latex (trailing slash as above).
image_to_latex_datasets_path = './image-to-latex/data/'

创建一些文件夹

In [3]:
def mkdir(path):
    """Create the directory ``path``, including any missing parent folders.

    Calling it for a directory that already exists is a no-op.

    :param path: directory path to create
    :raises FileExistsError: if ``path`` exists but is not a directory
    """
    # exist_ok=True creates the folder without a separate existence check,
    # so there is no window between "check" and "create".
    os.makedirs(path, exist_ok=True)

In [4]:
# Folders that receive the tokenized / filtered data of each labelled split.
for stage_dir in (
    "dev/images_after_filter",
    "dev/labels_after_filter",
    "dev/labels_after_tokenize",
    "train/images_after_filter",
    "train/labels_after_filter",
    "train/labels_after_tokenize",
):
    mkdir(datasets_no_test_path + stage_dir)

# image-to-latex dataset folders
mkdir(image_to_latex_datasets_path)
for image_subdir in (
    "formula_images/images_train",
    "formula_images/images_val",
    "formula_images/images_test",
):
    mkdir(image_to_latex_datasets_path + image_subdir)


### Tokenization，根据词表进行分词，并根据词表初步过滤数据

依据 no_chinese.py

指明路径

In [5]:
# Raw label folders (input) and the folders receiving the tokenized labels
# (output). The two lists are paired position by position: dev first, then train.
input_label_dirs = ['./datasets_no_test/dev/labels/', './datasets_no_test/train/labels/']
output_label_dirs = ['./datasets_no_test/dev/labels_after_tokenize/', './datasets_no_test/train/labels_after_tokenize/']

读取字符表

In [6]:
# Load the vocabulary: every whitespace-separated entry of vocab.txt is a token.
with open('./data_preprocess/vocab.txt', 'r', encoding='utf-8') as vocab_file:
    vocab = vocab_file.read().split()

# Length of the longest vocabulary token (0 when the vocabulary is empty).
max_token_len = max(map(len, vocab), default=0)

搜索函数

In [7]:
def FMM_func(user_dict, sentence):
    """Segment ``sentence`` with forward maximum matching (FMM).

    Starting at the left, the longest prefix found in ``user_dict`` becomes the
    next token. When no dictionary entry matches, the single character at the
    current position is emitted as a token, so the input is always consumed.

    :param user_dict: iterable of dictionary words (strings)
    :param sentence: text to segment
    :return: list of tokens whose concatenation equals ``sentence``
    """
    # A set gives O(1) membership tests; the dictionary is probed for every
    # candidate slice, so scanning a list here dominates the running time.
    words = set(user_dict)
    # Longest dictionary word; at least 1 so that an empty dictionary (or one
    # holding only empty strings) still advances one character at a time.
    max_len = max(max((len(word) for word in words), default=1), 1)

    total = len(sentence)
    token_list = []
    start = 0
    while start < total:
        end = min(start + max_len, total)
        # Shrink the window until it is a known word or a single character.
        while end - start > 1 and sentence[start:end] not in words:
            end -= 1
        token_list.append(sentence[start:end])
        start = end
    return token_list

处理函数

In [8]:
# Tokens that are not in the vocabulary (e.g. Chinese characters), collected
# across all tokenize() calls.
chinese_token_list = []

def tokenize(input_label_dir, output_label_dir):
    """Tokenize every label file of a folder and keep the fully in-vocabulary ones.

    Each file is segmented with ``FMM_func`` against the module-level ``vocab``;
    space tokens are dropped and the remaining tokens are joined with single
    spaces. A file is written to ``output_label_dir`` (same file name) only when
    all of its tokens are in the vocabulary. Otherwise the file is skipped and
    its unknown tokens are appended to ``chinese_token_list``.

    :param input_label_dir: folder with the raw label files
    :param output_label_dir: existing folder that receives the tokenized labels
    """
    # Build the lookup set once; testing against the vocab list for every token
    # would rescan the whole vocabulary each time.
    known_tokens = set(vocab)

    for label_name in os.listdir(input_label_dir):
        with open(os.path.join(input_label_dir, label_name), 'r', encoding='utf-8') as label_file:
            content = label_file.read()

        # Spaces only separate tokens; they are not tokens themselves.
        token_list = [token for token in FMM_func(vocab, content) if token != ' ']

        unknown_tokens = [token for token in token_list if token not in known_tokens]
        if unknown_tokens:
            # Labels with out-of-vocabulary tokens are dropped; remember why.
            chinese_token_list.extend(unknown_tokens)
            continue

        with open(os.path.join(output_label_dir, label_name), 'w', encoding='utf-8') as output_file_handle:
            output_file_handle.write(' '.join(token_list))

运行处理

In [9]:
# Tokenize each split, pairing every input folder with its output folder.
for tokenize_dirs in zip(input_label_dirs, output_label_dirs):
    tokenize(*tokenize_dirs)

保存出现的中文

In [10]:
# De-duplicate the collected tokens and sort them: set iteration order is not
# stable between runs, sorting makes the saved file reproducible.
chinese_token_list = sorted(set(chinese_token_list))
with open('./data_preprocess/chinese_token.txt', 'w', encoding='utf-8') as f:
    # One token per line.
    f.writelines(chinese_token + '\n' for chinese_token in chinese_token_list)

### 过滤多行和内容为error mathpix的标签

依据 data_filter.py

指明路径

In [11]:
# Rebinds the two lists for the filter stage: tokenized labels are the input,
# the *_after_filter folders the output (paired position by position: dev, train).
input_label_dirs = ['./datasets_no_test/dev/labels_after_tokenize/', './datasets_no_test/train/labels_after_tokenize/']
output_label_dirs = ['./datasets_no_test/dev/labels_after_filter/', './datasets_no_test/train/labels_after_filter/']

过滤函数

In [12]:
def filter(input_label_dir, output_label_dir):
    """Copy the usable label files from ``input_label_dir`` to ``output_label_dir``.

    A label file is dropped when it spans more than one line or when its text
    contains the tokenized marker ``'e r r o r m a t h p i x'``. All other files
    are copied unchanged under the same file name.

    NOTE: this name shadows the built-in ``filter``; it is kept because the
    rest of the notebook calls the function by this name.

    :param input_label_dir: folder with the tokenized label files
    :param output_label_dir: existing folder that receives the accepted labels
    """
    for label_name in os.listdir(input_label_dir):
        label_file_name = os.path.join(input_label_dir, label_name)
        # Read the file once and derive both checks from the same text.
        with open(label_file_name, 'r', encoding='utf-8') as label_file:
            content = label_file.read()

        # More than one line <=> a newline somewhere other than the last character.
        is_multiline = '\n' in content[:-1]
        if is_multiline or 'e r r o r m a t h p i x' in content:
            # Multi-line and erroneous labels are discarded.
            continue

        # The label passed both checks.
        shutil.copy(label_file_name, os.path.join(output_label_dir, label_name))

运行处理

In [13]:
# Filter each split, pairing every input folder with its output folder.
for filter_dirs in zip(input_label_dirs, output_label_dirs):
    filter(*filter_dirs)

### 对齐过滤后的数据

依据 extract_image_according_to_label_list.py

指明路径

In [14]:
# Filtered labels, the original images, and the folders receiving the images
# that still have a label. The three lists are paired position by position
# (dev first, then train).
label_dirs = ['./datasets_no_test/dev/labels_after_filter/', './datasets_no_test/train/labels_after_filter/']
image_dirs = ['./datasets_no_test/dev/images/', './datasets_no_test/train/images/']
output_dirs = ['./datasets_no_test/dev/images_after_filter/', './datasets_no_test/train/images_after_filter/']

处理函数

In [15]:
def extract(label_dir, image_dir, output_dir):
    """Copy every image that has a label with the same base name.

    Base names are the file names without their extension, so ``1.txt`` in
    ``label_dir`` selects ``1.png`` in ``image_dir``.

    :param label_dir: folder with the label files that survived filtering
    :param image_dir: folder with all candidate images
    :param output_dir: existing folder that receives the selected images
    """
    # A set keeps the per-image lookup O(1); probing a list for every image
    # is quadratic in the size of the dataset.
    label_stems = {os.path.splitext(label_name)[0] for label_name in os.listdir(label_dir)}

    for image_name in os.listdir(image_dir):
        if os.path.splitext(image_name)[0] in label_stems:
            shutil.copy(os.path.join(image_dir, image_name), os.path.join(output_dir, image_name))

运行处理

In [16]:
# Select the matching images of each split (dev, then train).
for extract_dirs in zip(label_dirs, image_dirs, output_dirs):
    extract(*extract_dirs)

### 关于 image-to-latex 的专属标签处理

In [17]:
# Input folders: filtered labels and images of the labelled splits, plus the
# test images (the paths end with '/' because names are appended with `+`).
validate_label_dir = './datasets_no_test/dev/labels_after_filter/'
train_label_dir = './datasets_no_test/train/labels_after_filter/'
validate_image_dir = './datasets_no_test/dev/images_after_filter/'
train_image_dir = './datasets_no_test/train/images_after_filter/'
test_image_dir = './datasets_no_test/test/images/'

# Output locations inside the image-to-latex data folder.
output_dir = './image-to-latex/data/'
output_file = './image-to-latex/data/im2latex_formulas.norm.lst'
image_output_dir = './image-to-latex/data/formula_images/'

# File names found in each input folder.
validate_label_name_list, train_label_name_list = (
    os.listdir(folder) for folder in (validate_label_dir, train_label_dir)
)
validate_image_name_list, train_image_name_list, test_image_name_list = (
    os.listdir(folder)
    for folder in (validate_image_dir, train_image_dir, test_image_dir)
)

# Shuffle the label order in place (validation list first, then training list).
random.shuffle(validate_label_name_list)
random.shuffle(train_label_name_list)

In [18]:
def _export_labelled_split(label_names, label_dir, image_dir, image_names,
                           lst_path, image_dest_dir, formulas_file, index):
    """Export one labelled split (train or dev) in the image-to-latex layout.

    For every label that has a matching ``.png`` image: the image is copied to
    ``image_dest_dir`` under a numeric name, the formula is appended to
    ``formulas_file`` and ``<numeric name> <formula line index>`` is recorded
    in the split's ``.lst`` file.

    :param label_names: label file names, in export order
    :param label_dir: folder holding the label files (ends with '/')
    :param image_dir: folder holding the images (ends with '/')
    :param image_names: names of the images available in ``image_dir``
    :param lst_path: path of the ``.lst`` file to write
    :param image_dest_dir: folder receiving the renamed images (ends with '/')
    :param formulas_file: open, writable file receiving one formula per line
    :param index: line index of the next formula in ``formulas_file``
    :return: line index following the last formula written
    """
    # Set lookup is O(1); probing the name list for every label is quadratic.
    available_images = set(image_names)
    with open(lst_path, 'w', encoding='utf-8') as lst_file:
        # The numbering counts every label, so labels without an image leave
        # gaps in the numeric image names.
        for i, label_name in enumerate(label_names, start=1):
            print(index, end='\r')
            image_name = label_name[:-4] + '.png'
            if image_name not in available_images:
                continue
            numbered_name = str(i) + '.png'
            lst_file.write(numbered_name + ' ' + str(index) + '\n')
            shutil.copy(image_dir + image_name, image_dest_dir + numbered_name)
            with open(label_dir + label_name, 'r', encoding='utf-8') as label_file:
                # Drop trailing newlines so each formula occupies exactly one
                # line; an extra blank line would shift every later index.
                formulas_file.write(label_file.read().rstrip('\n') + '\n')
            index += 1
    return index


with open(output_file, 'w', encoding='utf-8') as f0:
    # Line index of the next formula written to the shared formulas file.
    index = 0

    # train
    index = _export_labelled_split(
        train_label_name_list, train_label_dir, train_image_dir,
        train_image_name_list, output_dir + 'im2latex_train_filter.lst',
        image_output_dir + 'images_train/', f0, index)

    # dev
    index = _export_labelled_split(
        validate_label_name_list, validate_label_dir, validate_image_dir,
        validate_image_name_list, output_dir + 'im2latex_validate_filter.lst',
        image_output_dir + 'images_val/', f0, index)

    # test: there are no label files, so every image gets the placeholder formula "1"
    with open(output_dir + 'im2latex_test_filter.lst', 'w', encoding='utf-8') as f1:
        for i, image_name in enumerate(test_image_name_list, start=1):
            print(index, end='\r')
            f1.write(str(i) + '.png' + ' ' + str(index) + '\n')
            shutil.copy(test_image_dir + image_name, image_output_dir + 'images_test/' + str(i) + '.png')
            f0.write("1" + '\n')
            index += 1
    

90418

### LaTeX_OCR_PRO - tensorflow 数据处理

In [19]:
# input path
# Filtered labels and images of the dev / train splits plus the test images;
# these names are re-declared here with the same values as in the
# image-to-latex section.
validate_label_dir = './datasets_no_test/dev/labels_after_filter/'
train_label_dir = './datasets_no_test/train/labels_after_filter/'

validate_image_dir = './datasets_no_test/dev/images_after_filter/'
train_image_dir = './datasets_no_test/train/images_after_filter/'
test_image_dir = './datasets_no_test/test/images/'

In [20]:
# output path
# Everything for this section is written below ./latex_ocr/data/hand,
# with one sub-folder per split.
path = os.path.join(os.curdir, 'latex_ocr', 'data', 'hand')
dir_train_output = os.path.join(path, 'train')
dir_val_output = os.path.join(path, 'val')
dir_test_output = os.path.join(path, 'test')

def copy(src, dst):
    """Copy every entry of the folder ``src`` into the folder ``dst``.

    ``dst`` (including missing parents) is created when it does not exist.
    Entries are copied with ``shutil.copy`` and keep their names.

    :param src: source dir
    :param dst: destination dir
    """
    # exist_ok=True creates the folder without a separate existence check,
    # so there is no window between "check" and "create".
    os.makedirs(dst, exist_ok=True)

    for file_name in os.listdir(src):
        shutil.copy(os.path.join(src, file_name), os.path.join(dst, file_name))

# Copy images into '<split>/images' and labels into '<split>/formulas';
# the test split only has images.
for source_dir, split_output_dir, sub_folder in (
    (train_image_dir, dir_train_output, 'images'),
    (train_label_dir, dir_train_output, 'formulas'),
    (validate_image_dir, dir_val_output, 'images'),
    (validate_label_dir, dir_val_output, 'formulas'),
    (test_image_dir, dir_test_output, 'images'),
):
    copy(source_dir, os.path.join(split_output_dir, sub_folder))


# build match.txt
def build_match(dir, withANS = True):
    """Write ``match.txt`` into ``dir`` with one line per entry of ``dir/images``.

    Each line has the form ``<file name> <file name minus its last 4 characters>``,
    e.g. ``12.png 12``.

    :param dir: split folder that contains an ``images`` sub-folder
    :param withANS: not used by this function
    """
    image_names = os.listdir(os.path.join(dir, 'images'))
    match_path = os.path.join(dir, 'match.txt')
    with open(match_path, 'w') as match_file:
        match_file.writelines(f'{name} {name[:-4]}\n' for name in image_names)

build_match(dir_train_output)
build_match(dir_val_output)
# There are no test formulas, so write a placeholder formulas.txt.
with open(os.path.join(dir_test_output, 'formulas.txt'), 'w') as f:
    f.write('0 \n')

# copy vocab, one token per line; written as UTF-8 because that is the
# encoding the vocabulary was read with
with open(os.path.join(os.curdir, 'latex_ocr', 'data', 'vocab.txt'), 'w', encoding='utf-8') as f:
    f.writelines(f'{token}\n' for token in vocab)

# read test_ids.txt and build test/match.txt: every test image is mapped to
# formula index 0
with open(os.path.join(os.curdir, 'datasets_no_test', 'test_ids.txt'), 'r') as f:
    ids = f.read().split()
with open(os.path.join(path, 'test', 'match.txt'), 'w') as fl:
    # The generator keeps its loop variable local, so no module-level `id`
    # is created that would shadow the built-in id().
    fl.writelines(f'{test_id}.png 0\n' for test_id in ids)
