In [24]:
import pandas as pd
import os
import json

import shutil
import random
from PIL import Image

In [25]:
def make_caption(json_path,img_path,caption_path):
    """Write ``caption.txt`` pairing image names with their LaTeX labels.

    Every entry in ``json_path`` whose name without its extension equals the
    name without extension of an entry in ``img_path`` is loaded as JSON, and
    its ``latex_styled`` value is written to ``<caption_path>/caption.txt`` as
    one tab-separated ``<name> <latex>`` line. Entries are processed in sorted
    name order so the output file is deterministic.

    Args:
        json_path: Directory holding one JSON file per sample.
        img_path: Directory holding the images that define which samples
            are kept.
        caption_path: Directory that receives ``caption.txt``; created when
            it does not exist.

    Raises:
        FileNotFoundError: If ``json_path`` or ``img_path`` does not exist.
        KeyError: If a matching JSON file has no ``latex_styled`` key.
    """
    # Fail early on missing input directories.
    if not os.path.exists(json_path):
        raise FileNotFoundError(f"JSON path {json_path} does not exist.")

    if not os.path.exists(img_path):
        raise FileNotFoundError(f"Image path {img_path} does not exist.")

    # Use the same stem rule (splitext) for images and JSON files so names
    # containing extra dots still match; a set gives O(1) membership tests.
    image_stems = {os.path.splitext(name)[0] for name in os.listdir(img_path)}

    captions = []
    for json_name in sorted(os.listdir(json_path)):
        # File name without its extension.
        stem = os.path.splitext(json_name)[0]
        # Only parse JSON files that actually have a matching image.
        if stem not in image_stems:
            continue
        with open(os.path.join(json_path, json_name), 'r', encoding='utf-8') as file:
            data = json.load(file)
        captions.append((stem, data['latex_styled']))

    print("Loading data count:",len(captions))

    if caption_path:
        os.makedirs(caption_path, exist_ok=True)
    # Write with the same encoding used for reading so non-ASCII LaTeX does
    # not depend on the platform default encoding.
    with open(os.path.join(caption_path,'caption.txt'), 'w', encoding='utf-8') as file:
        for stem, latex in captions:
            file.write(f"{stem}\t{latex}\n")


In [26]:
# make_caption("../data/source/json","../data/source/png","../data/source/")

In [27]:
def split_train_test(source_dir, train_dir, test_dir, valid_dir,test_size=0.2, seed=7):
    """Shuffle the files in ``source_dir`` and export them as grayscale BMPs.

    The shuffled file list is cut at ``int(len(files) * test_size)``: files
    before the cut go to ``test_dir``, files after it go to ``train_dir``, and
    the first third of the test files is additionally written to
    ``valid_dir``. Each image is shrunk to roughly at most 160000 pixels
    (aspect ratio preserved), converted to mode ``L`` and saved as
    ``<stem>.bmp``. Counts and a file-name checksum per split are printed.

    NOTE(review): the validation files are a subset of the test files, so the
    two sets overlap. This is kept unchanged because the hard-coded reference
    sums below match this exact split.
    NOTE(review): ``os.listdir`` returns names in arbitrary order, so the same
    seed may give a different split on another machine or filesystem.

    Args:
        source_dir: Directory with the source images.
        train_dir: Output directory for the training images.
        test_dir: Output directory for the test images.
        valid_dir: Output directory for the validation images.
        test_size: Fraction of the files assigned to the test split.
        seed: Seed for the global ``random`` module; ``None`` leaves the
            generator state untouched.

    Raises:
        ValueError: If a file name does not start with an integer followed by
            a dot (required by the checksum at the end).
    """
    if seed is not None:
        random.seed(seed)

    # Collect all regular files in the source directory.
    files = [f for f in os.listdir(source_dir) if os.path.isfile(os.path.join(source_dir, f))]

    # Randomize the order.
    random.shuffle(files)

    # Number of files in the test split.
    test_count = int(len(files) * test_size)

    # Assign files to the validation, test and training splits.
    validation_files = files[:test_count // 3]
    test_files = files[:test_count]
    train_files = files[test_count:]

    # Create the destination directories.
    os.makedirs(valid_dir, exist_ok=True)
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    max_pixels = 160000

    def convert_and_save(source_file, dest_file):
        """Downscale if larger than ``max_pixels``, then save as grayscale BMP."""
        with Image.open(source_file) as img:
            width, height = img.size
            if width * height > max_pixels:
                scaling_factor = (max_pixels / (width * height)) ** 0.5
                # Clamp to 1 so extreme aspect ratios cannot yield a zero-sized
                # image (in that degenerate case the result may exceed the cap).
                new_width = max(1, int(width * scaling_factor))
                new_height = max(1, int(height * scaling_factor))
                # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the
                # same filter under its current name.
                img = img.resize((new_width, new_height), Image.LANCZOS)
            gray_img = img.convert('L')
            gray_img.save(dest_file, format='BMP')

    # Convert every file into the directory of its split. Only the extension
    # is swapped, so the saved name always ends in .bmp.
    for split_files, dest_dir in (
        (train_files, train_dir),
        (test_files, test_dir),
        (validation_files, valid_dir),
    ):
        for f in split_files:
            source_file = os.path.join(source_dir, f)
            dest_file = os.path.join(dest_dir, os.path.splitext(f)[0] + '.bmp')
            convert_and_save(source_file, dest_file)

    print(f'Total files: {len(files)}')
    print(f'Training files: {len(train_files)}')
    print(f'Testing files: {len(test_files)}')
    print(f'Validation files: {len(validation_files)}')

    # Check that the split is the same as before by summing the integer file
    # names of each split. The reference sums are presumably from an earlier
    # run on the author's dataset -- TODO confirm.
    last_train_sum ,last_test_sum,last_valid_sum = 739164792,182438981,60031392
    train_sum = sum(int(f.split(".")[0]) for f in train_files)
    test_sum = sum(int(f.split(".")[0]) for f in test_files)
    valid_sum = sum(int(f.split(".")[0]) for f in validation_files)
    print(f'Training sum: {train_sum}')
    print(f'Testing sum: {test_sum}')
    print(f'Validation sum: {valid_sum}')
    print(f'is same? {train_sum == last_train_sum and test_sum == last_test_sum and last_valid_sum == valid_sum}')


In [28]:
# Split the source PNGs into train / test / validation image folders.
split_train_test(
    source_dir="../data/source/png",
    train_dir="../data/train/img",
    test_dir="../data/test/img",
    valid_dir="../data/validation/img",
)



Total files: 15000
Training files: 12000
Testing files: 3000
Validation files: 1000
Training sum: 739164792
Testing sum: 182438981
Validation sum: 60031392
is same? True


In [29]:
# Build caption.txt for each split from the shared JSON annotations.
for split_name in ("train", "test", "validation"):
    make_caption(
        "../data/source/json",
        f"../data/{split_name}/img",
        f"../data/{split_name}/",
    )


Loading data count: 12000
Loading data count: 3000
Loading data count: 1000


: 