In [25]:
import json

def CompareResult(annotation_file, prediction_file, output_file):
    """Write (and print) every prediction that disagrees with its ground truth.

    Args:
        annotation_file: Path to a JSON-lines file. Each record must provide
            the keys ``filename`` and ``text``.
        prediction_file: Path to a tab-separated file with one
            ``path<TAB>predicted_text<TAB>confidence`` row per line.
        output_file: Path of the mismatch report; it is overwritten.

    Records are matched on the last ``/``-separated component of the path, and
    texts are compared case-insensitively. Predictions whose file name has no
    annotation are ignored.

    Raises:
        ValueError: If a confidence field cannot be parsed as a float.
    """
    # Load the ground-truth labels, keyed by the bare file name.
    true_labels = {}
    with open(annotation_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            data = json.loads(line)
            filename = data['filename'].split('/')[-1]
            true_labels[filename] = data['text'].lower()

    # Stream the predictions and report each mismatch as it is found.
    with open(prediction_file, 'r', encoding='utf-8') as f, \
            open(output_file, 'w', encoding='utf-8') as output_f:
        for line in f:
            parts = line.strip().split('\t')
            # ''.split('\t') yields [''], never an empty list, so blank or
            # truncated rows have to be detected through the field count.
            if len(parts) < 3:
                continue
            filename = parts[0].split('/')[-1]
            predicted_text = parts[1].lower()
            confidence = float(parts[2])

            expected = true_labels.get(filename)
            if expected is None or expected == predicted_text:
                continue

            diff_text = f"Filename: {filename}\nTrue Label:\t{expected}\nPredicted:\t{predicted_text}, Confidence: {confidence}\n\n"
            print(diff_text)
            output_f.write(diff_text)

## cloformer_cppd_base

In [None]:
# Benchmarks whose cloformer_cppd_base predictions are checked against the labels.
datasets = ['IC13', 'SVT', 'IIIT5K', 'IC15', 'SVTP', 'CUTE80']

for dataset in datasets:
    # The diff report is written next to the prediction file it describes.
    output_file = f'output/rec/rec_cloformer_cppd_base/diff_cloformer_cppd_base_{dataset}.txt'
    prediction_file = f'output/rec/rec_cloformer_cppd_base/rec_cloformer_cppd_base_{dataset}.txt'
    annotation_file = f'train_data/common_benchmarks/{dataset}/annotation.jsonl'

    CompareResult(
        annotation_file=annotation_file,
        prediction_file=prediction_file,
        output_file=output_file,
    )
    

## svtr_base

In [None]:
# Benchmarks whose svtr_base predictions are checked against the labels.
datasets = ['IC13', 'SVT', 'IIIT5K', 'IC15', 'SVTP', 'CUTE80']

for dataset in datasets:
    # The diff report is written next to the prediction file it describes.
    output_file = f'output/rec/rec_svtr_base_none_ctc_en_train/diff_predicts_svtr_base_{dataset}.txt'
    prediction_file = f'output/rec/rec_svtr_base_none_ctc_en_train/predicts_svtr_base_{dataset}.txt'
    annotation_file = f'train_data/common_benchmarks/{dataset}/annotation.jsonl'

    CompareResult(
        annotation_file=annotation_file,
        prediction_file=prediction_file,
        output_file=output_file,
    )
    

## r45_abinet

In [None]:
# Benchmarks whose r45_abinet predictions are checked against the labels.
datasets = ['IC13', 'SVT', 'IIIT5K', 'IC15', 'SVTP', 'CUTE80']

for dataset in datasets:
    # The diff report is written next to the prediction file it describes.
    output_file = f'output/rec/rec_r45_abinet_train/diff_predicts_r45_abinet_{dataset}.txt'
    prediction_file = f'output/rec/rec_r45_abinet_train/predicts_r45_abinet_{dataset}.txt'
    annotation_file = f'train_data/common_benchmarks/{dataset}/annotation.jsonl'

    CompareResult(
        annotation_file=annotation_file,
        prediction_file=prediction_file,
        output_file=output_file,
    )
    

## r45_visionlan

In [None]:
# Benchmarks whose r45_visionlan predictions are checked against the labels.
datasets = ['IC13', 'SVT', 'IIIT5K', 'IC15', 'SVTP', 'CUTE80']

for dataset in datasets:
    # The diff report is written next to the prediction file it describes.
    output_file = f'output/rec/rec_r45_visionlan_train/diff_predicts_r45_visionlan_{dataset}.txt'
    prediction_file = f'output/rec/rec_r45_visionlan_train/predicts_r45_visionlan_{dataset}.txt'
    annotation_file = f'train_data/common_benchmarks/{dataset}/annotation.jsonl'

    CompareResult(
        annotation_file=annotation_file,
        prediction_file=prediction_file,
        output_file=output_file,
    )
    

## vit_parseq

In [None]:
# Benchmarks whose vit_parseq predictions are checked against the labels.
datasets = ['IC13', 'SVT', 'IIIT5K', 'IC15', 'SVTP', 'CUTE80']

for dataset in datasets:
    # The diff report is written next to the prediction file it describes.
    output_file = f'output/rec/rec_vit_parseq_train/diff_predicts_vit_parseq_{dataset}.txt'
    prediction_file = f'output/rec/rec_vit_parseq_train/predicts_vit_parseq_{dataset}.txt'
    annotation_file = f'train_data/common_benchmarks/{dataset}/annotation.jsonl'

    CompareResult(
        annotation_file=annotation_file,
        prediction_file=prediction_file,
        output_file=output_file,
    )
    

## Diff on the six common benchmarks

In [54]:
import json


def diff_all(dataset):
    """Report images the first method reads correctly but another method misses.

    For the given common benchmark, the predictions of four methods
    (``le_cp``, ``svtr``, ``abinet``, ``vislan``) are compared with the
    ground truth. Every image for which ``le_cp`` matches the label while at
    least one of the other methods does not is printed and appended to
    ``output/rec/comparison_results_{dataset}.txt`` (overwritten on each call).

    Args:
        dataset: Benchmark name, interpolated into the annotation, prediction
            and output paths (all relative to the working directory).

    Images are matched on the lower-cased last path component and texts are
    compared case-insensitively. A method with no prediction for an image is
    treated as having predicted the empty string.
    """
    # Ground-truth annotation file (JSON lines with 'filename' and 'text').
    annotation_file = f'train_data/common_benchmarks/{dataset}/annotation.jsonl'

    # One prediction file per method, in the same order as `method_list`.
    prediction_files = [
        f'output/rec/rec_cloformer_cppd_base/rec_cloformer_cppd_base_{dataset}.txt',
        f'output/rec/rec_svtr_base_none_ctc_en_train/predicts_svtr_base_{dataset}.txt',
        f'output/rec/rec_r45_abinet_train/predicts_r45_abinet_{dataset}.txt',
        f'output/rec/rec_r45_visionlan_train/predicts_r45_visionlan_{dataset}.txt'
    ]

    method_list = ['le_cp', 'svtr', 'abinet', 'vislan']

    # Destination of the report.
    output_file = f'output/rec/comparison_results_{dataset}.txt'

    def image_key(path):
        """Return the lower-cased file name used to match records."""
        return path.split('/')[-1].lower()

    # Load the ground-truth labels.
    true_labels = {}
    with open(annotation_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            data = json.loads(line)
            true_labels[image_key(data['filename'])] = data['text'].lower()

    def load_predictions(file_path):
        """Read one tab-separated prediction file into {image key: text}."""
        predictions = {}
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split('\t')
                # ''.split('\t') yields [''], never an empty list, so blank
                # or truncated rows are detected through the field count.
                if len(parts) < 2:
                    continue
                predictions[image_key(parts[0])] = parts[1].lower()
        return predictions

    # Predictions of every method, in `method_list` order.
    all_predictions = [load_predictions(pf) for pf in prediction_files]

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for filename, true_text in true_labels.items():
            predictions = [preds.get(filename, "") for preds in all_predictions]

            # Only images the first method gets right are of interest here.
            if predictions[0] != true_text:
                continue
            incorrect_methods = [
                name
                for name, pred in zip(method_list[1:], predictions[1:])
                if pred != true_text
            ]
            if not incorrect_methods:
                continue

            # Build the entry once so the console and the file stay identical.
            report_lines = [f"Filename: {filename}", f"True Label: \t{true_text}"]
            report_lines.extend(
                f"Method {name}: \t{pred}"
                for name, pred in zip(method_list, predictions)
            )
            report_lines.append(f"Incorrect Methods: {', '.join(incorrect_methods)}")
            report = '\n'.join(report_lines) + '\n\n'
            print(report, end='')
            out_f.write(report)



def diff_all_wrong(dataset):
    """Report images that none of the four methods reads correctly.

    For the given common benchmark, the predictions of ``le_cp``, ``svtr``,
    ``abinet`` and ``vislan`` are compared with the ground truth. Every image
    on which all four disagree with the label is printed and appended to
    ``output/rec/all_wrong_comparison_results_{dataset}.txt`` (overwritten on
    each call).

    Args:
        dataset: Benchmark name, interpolated into the annotation, prediction
            and output paths (all relative to the working directory).

    Images are matched on the lower-cased last path component and texts are
    compared case-insensitively. A method with no prediction for an image is
    treated as having predicted the empty string.
    """
    # Ground-truth annotation file (JSON lines with 'filename' and 'text').
    annotation_file = f'train_data/common_benchmarks/{dataset}/annotation.jsonl'

    # One prediction file per method, in the same order as `method_list`.
    prediction_files = [
        f'output/rec/rec_cloformer_cppd_base/rec_cloformer_cppd_base_{dataset}.txt',
        f'output/rec/rec_svtr_base_none_ctc_en_train/predicts_svtr_base_{dataset}.txt',
        f'output/rec/rec_r45_abinet_train/predicts_r45_abinet_{dataset}.txt',
        f'output/rec/rec_r45_visionlan_train/predicts_r45_visionlan_{dataset}.txt'
    ]

    method_list = ['le_cp', 'svtr', 'abinet', 'vislan']

    # Destination of the report.
    output_file = f'output/rec/all_wrong_comparison_results_{dataset}.txt'

    def image_key(path):
        """Return the lower-cased file name used to match records."""
        return path.split('/')[-1].lower()

    # Load the ground-truth labels.
    true_labels = {}
    with open(annotation_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            data = json.loads(line)
            true_labels[image_key(data['filename'])] = data['text'].lower()

    def load_predictions(file_path):
        """Read one tab-separated prediction file into {image key: text}."""
        predictions = {}
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split('\t')
                # ''.split('\t') yields [''], never an empty list, so blank
                # or truncated rows are detected through the field count.
                if len(parts) < 2:
                    continue
                predictions[image_key(parts[0])] = parts[1].lower()
        return predictions

    # Predictions of every method, in `method_list` order.
    all_predictions = [load_predictions(pf) for pf in prediction_files]

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for filename, true_text in true_labels.items():
            predictions = [preds.get(filename, "") for preds in all_predictions]

            # Skip the image as soon as any method matches the label.
            if any(pred == true_text for pred in predictions):
                continue

            # Build the entry once so the console and the file stay identical.
            report_lines = [f"Filename: {filename}", f"True Label: \t{true_text}"]
            report_lines.extend(
                f"Method {name}: \t{pred}"
                for name, pred in zip(method_list, predictions)
            )
            # Each entry ends with two empty lines, as in the existing reports.
            report = '\n'.join(report_lines) + '\n\n\n'
            print(report, end='')
            out_f.write(report)
                
                

def diff_special_case(dataset):
    """Report images the first method misses but another method reads correctly.

    For the given common benchmark, the predictions of ``le_cp``, ``svtr``,
    ``abinet`` and ``vislan`` are compared with the ground truth. Every image
    on which ``le_cp`` disagrees with the label while at least one other
    method matches it is printed and appended to
    ``output/rec/special_case_results_{dataset}.txt`` (overwritten on each
    call).

    Args:
        dataset: Benchmark name, interpolated into the annotation, prediction
            and output paths (all relative to the working directory).

    Images are matched on the lower-cased last path component and texts are
    compared case-insensitively. A method with no prediction for an image is
    treated as having predicted the empty string.
    """
    # Ground-truth annotation file (JSON lines with 'filename' and 'text').
    annotation_file = f'train_data/common_benchmarks/{dataset}/annotation.jsonl'

    # One prediction file per method, in the same order as `method_list`.
    prediction_files = [
        f'output/rec/rec_cloformer_cppd_base/rec_cloformer_cppd_base_{dataset}.txt',
        f'output/rec/rec_svtr_base_none_ctc_en_train/predicts_svtr_base_{dataset}.txt',
        f'output/rec/rec_r45_abinet_train/predicts_r45_abinet_{dataset}.txt',
        f'output/rec/rec_r45_visionlan_train/predicts_r45_visionlan_{dataset}.txt'
    ]

    method_list = ['le_cp', 'svtr', 'abinet', 'vislan']

    # Destination of the report.
    output_file = f'output/rec/special_case_results_{dataset}.txt'

    def image_key(path):
        """Return the lower-cased file name used to match records."""
        return path.split('/')[-1].lower()

    # Load the ground-truth labels.
    true_labels = {}
    with open(annotation_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            data = json.loads(line)
            true_labels[image_key(data['filename'])] = data['text'].lower()

    def load_predictions(file_path):
        """Read one tab-separated prediction file into {image key: text}."""
        predictions = {}
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split('\t')
                # ''.split('\t') yields [''], never an empty list, so blank
                # or truncated rows are detected through the field count.
                if len(parts) < 2:
                    continue
                predictions[image_key(parts[0])] = parts[1].lower()
        return predictions

    # Predictions of every method, in `method_list` order.
    all_predictions = [load_predictions(pf) for pf in prediction_files]

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for filename, true_text in true_labels.items():
            predictions = [preds.get(filename, "") for preds in all_predictions]

            # Only images the first method gets wrong are of interest here.
            if predictions[0] == true_text:
                continue
            successful_methods = [
                name
                for name, pred in zip(method_list[1:], predictions[1:])
                if pred == true_text
            ]
            if not successful_methods:
                continue

            # Build the entry once so the console and the file stay identical.
            report_lines = [f"Filename: {filename}", f"True Label: \t{true_text}"]
            report_lines.extend(
                f"Method {name}: \t{pred}"
                for name, pred in zip(method_list, predictions)
            )
            report_lines.append(f"Successful Methods: {', '.join(successful_methods)}")
            report = '\n'.join(report_lines) + '\n\n'
            print(report, end='')
            out_f.write(report)



In [None]:
# Run diff_all once per common benchmark and print a status line after each.
# NOTE(review): diff_all is defined twice in this notebook (the second
# definition reads Union14M paths). This loop uses whichever definition was
# executed last in the kernel, so confirm the intended one is active.
datasets=['IC13', 'SVT', 'IIIT5K', 'IC15', 'SVTP', 'CUTE80']

for dataset in datasets:
    diff_all(dataset)
    print('compare compelet!')

In [None]:
# Run diff_all_wrong once per common benchmark and print a status line after each.
datasets=['IC13', 'SVT', 'IIIT5K', 'IC15', 'SVTP', 'CUTE80']

for dataset in datasets:
    diff_all_wrong(dataset)
    print('compare compelet!')

In [None]:
# Run diff_special_case once per common benchmark and print a status line after each.
datasets=['IC13', 'SVT', 'IIIT5K', 'IC15', 'SVTP', 'CUTE80']

for dataset in datasets:
    diff_special_case(dataset)
    print('compare compelet!')

## Diff our method on the U14M benchmark

In [56]:
import json


def diff_all(dataset, annotation_root='/root/autodl-tmp/Union14M-L/Union14M-Benchmarks'):
    """Report images the ``base`` model reads correctly but a smaller one misses.

    For the given Union14M benchmark subset, the predictions of the ``base``,
    ``small`` and ``tiny`` cloformer_cppd models are compared with the ground
    truth. Every image for which ``base`` matches the label while ``small``
    or ``tiny`` does not is printed and appended to
    ``output/rec/comparison_results_{dataset}.txt`` (overwritten on each call).

    NOTE(review): this redefines ``diff_all`` from an earlier cell of the
    notebook; whichever cell ran last wins.

    Args:
        dataset: Benchmark subset name, interpolated into the annotation,
            prediction and output paths.
        annotation_root: Directory that contains one
            ``{dataset}/annotation.jsonl`` per subset. The default keeps the
            original machine-specific location; pass another directory to run
            elsewhere.

    Images are matched on the lower-cased last path component and texts are
    compared case-insensitively. A model with no prediction for an image is
    treated as having predicted the empty string.
    """
    # Ground-truth annotation file (JSON lines with 'filename' and 'text').
    annotation_file = f'{annotation_root}/{dataset}/annotation.jsonl'

    # One prediction file per model size, in the same order as `method_list`.
    prediction_files = [
        f'output/rec/rec_cloformer_cppd_base/predicts_cloformer_cppd_base_{dataset}.txt',
        f'output/rec/rec_cloformer_cppd_small/predicts_cloformer_cppd_small_{dataset}.txt',
        f'output/rec/rec_cloformer_cppd_tiny/predicts_cloformer_cppd_tiny_{dataset}.txt',
    ]

    method_list = ['base', 'small', 'tiny']

    # Destination of the report.
    output_file = f'output/rec/comparison_results_{dataset}.txt'

    def image_key(path):
        """Return the lower-cased file name used to match records."""
        return path.split('/')[-1].lower()

    # Load the ground-truth labels.
    true_labels = {}
    with open(annotation_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            data = json.loads(line)
            true_labels[image_key(data['filename'])] = data['text'].lower()

    def load_predictions(file_path):
        """Read one tab-separated prediction file into {image key: text}."""
        predictions = {}
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split('\t')
                # ''.split('\t') yields [''], never an empty list, so blank
                # or truncated rows are detected through the field count.
                if len(parts) < 2:
                    continue
                predictions[image_key(parts[0])] = parts[1].lower()
        return predictions

    # Predictions of every model, in `method_list` order.
    all_predictions = [load_predictions(pf) for pf in prediction_files]

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for filename, true_text in true_labels.items():
            predictions = [preds.get(filename, "") for preds in all_predictions]

            # Only images the first model gets right are of interest here.
            if predictions[0] != true_text:
                continue
            incorrect_methods = [
                name
                for name, pred in zip(method_list[1:], predictions[1:])
                if pred != true_text
            ]
            if not incorrect_methods:
                continue

            # Build the entry once so the console and the file stay identical.
            report_lines = [f"Filename: {filename}", f"True Label: \t{true_text}"]
            report_lines.extend(
                f"Method {name}: \t{pred}"
                for name, pred in zip(method_list, predictions)
            )
            report_lines.append(f"Incorrect Methods: {', '.join(incorrect_methods)}")
            report = '\n'.join(report_lines) + '\n\n'
            print(report, end='')
            out_f.write(report)



def diff_all_wrong(dataset):
    """Report images that none of the four methods reads correctly.

    NOTE(review): this cell sits in the Union14M section but repeats the
    ``diff_all_wrong`` defined earlier in the notebook and still reads the
    common-benchmark paths and the four-method list. Confirm whether it is
    meant to target Union14M.

    The predictions of ``le_cp``, ``svtr``, ``abinet`` and ``vislan`` are
    compared with the ground truth. Every image on which all four disagree
    with the label is printed and appended to
    ``output/rec/all_wrong_comparison_results_{dataset}.txt`` (overwritten on
    each call).

    Args:
        dataset: Benchmark name, interpolated into the annotation, prediction
            and output paths (all relative to the working directory).

    Images are matched on the lower-cased last path component and texts are
    compared case-insensitively. A method with no prediction for an image is
    treated as having predicted the empty string.
    """
    # Ground-truth annotation file (JSON lines with 'filename' and 'text').
    annotation_file = f'train_data/common_benchmarks/{dataset}/annotation.jsonl'

    # One prediction file per method, in the same order as `method_list`.
    prediction_files = [
        f'output/rec/rec_cloformer_cppd_base/rec_cloformer_cppd_base_{dataset}.txt',
        f'output/rec/rec_svtr_base_none_ctc_en_train/predicts_svtr_base_{dataset}.txt',
        f'output/rec/rec_r45_abinet_train/predicts_r45_abinet_{dataset}.txt',
        f'output/rec/rec_r45_visionlan_train/predicts_r45_visionlan_{dataset}.txt'
    ]

    method_list = ['le_cp', 'svtr', 'abinet', 'vislan']

    # Destination of the report.
    output_file = f'output/rec/all_wrong_comparison_results_{dataset}.txt'

    def image_key(path):
        """Return the lower-cased file name used to match records."""
        return path.split('/')[-1].lower()

    # Load the ground-truth labels.
    true_labels = {}
    with open(annotation_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            data = json.loads(line)
            true_labels[image_key(data['filename'])] = data['text'].lower()

    def load_predictions(file_path):
        """Read one tab-separated prediction file into {image key: text}."""
        predictions = {}
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split('\t')
                # ''.split('\t') yields [''], never an empty list, so blank
                # or truncated rows are detected through the field count.
                if len(parts) < 2:
                    continue
                predictions[image_key(parts[0])] = parts[1].lower()
        return predictions

    # Predictions of every method, in `method_list` order.
    all_predictions = [load_predictions(pf) for pf in prediction_files]

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for filename, true_text in true_labels.items():
            predictions = [preds.get(filename, "") for preds in all_predictions]

            # Skip the image as soon as any method matches the label.
            if any(pred == true_text for pred in predictions):
                continue

            # Build the entry once so the console and the file stay identical.
            report_lines = [f"Filename: {filename}", f"True Label: \t{true_text}"]
            report_lines.extend(
                f"Method {name}: \t{pred}"
                for name, pred in zip(method_list, predictions)
            )
            # Each entry ends with two empty lines, as in the existing reports.
            report = '\n'.join(report_lines) + '\n\n\n'
            print(report, end='')
            out_f.write(report)
                
                

def diff_special_case(dataset):
    """Report images the first method misses but another method reads correctly.

    NOTE(review): this cell sits in the Union14M section but repeats the
    ``diff_special_case`` defined earlier in the notebook and still reads the
    common-benchmark paths and the four-method list. Confirm whether it is
    meant to target Union14M.

    The predictions of ``le_cp``, ``svtr``, ``abinet`` and ``vislan`` are
    compared with the ground truth. Every image on which ``le_cp`` disagrees
    with the label while at least one other method matches it is printed and
    appended to ``output/rec/special_case_results_{dataset}.txt`` (overwritten
    on each call).

    Args:
        dataset: Benchmark name, interpolated into the annotation, prediction
            and output paths (all relative to the working directory).

    Images are matched on the lower-cased last path component and texts are
    compared case-insensitively. A method with no prediction for an image is
    treated as having predicted the empty string.
    """
    # Ground-truth annotation file (JSON lines with 'filename' and 'text').
    annotation_file = f'train_data/common_benchmarks/{dataset}/annotation.jsonl'

    # One prediction file per method, in the same order as `method_list`.
    prediction_files = [
        f'output/rec/rec_cloformer_cppd_base/rec_cloformer_cppd_base_{dataset}.txt',
        f'output/rec/rec_svtr_base_none_ctc_en_train/predicts_svtr_base_{dataset}.txt',
        f'output/rec/rec_r45_abinet_train/predicts_r45_abinet_{dataset}.txt',
        f'output/rec/rec_r45_visionlan_train/predicts_r45_visionlan_{dataset}.txt'
    ]

    method_list = ['le_cp', 'svtr', 'abinet', 'vislan']

    # Destination of the report.
    output_file = f'output/rec/special_case_results_{dataset}.txt'

    def image_key(path):
        """Return the lower-cased file name used to match records."""
        return path.split('/')[-1].lower()

    # Load the ground-truth labels.
    true_labels = {}
    with open(annotation_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            data = json.loads(line)
            true_labels[image_key(data['filename'])] = data['text'].lower()

    def load_predictions(file_path):
        """Read one tab-separated prediction file into {image key: text}."""
        predictions = {}
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split('\t')
                # ''.split('\t') yields [''], never an empty list, so blank
                # or truncated rows are detected through the field count.
                if len(parts) < 2:
                    continue
                predictions[image_key(parts[0])] = parts[1].lower()
        return predictions

    # Predictions of every method, in `method_list` order.
    all_predictions = [load_predictions(pf) for pf in prediction_files]

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for filename, true_text in true_labels.items():
            predictions = [preds.get(filename, "") for preds in all_predictions]

            # Only images the first method gets wrong are of interest here.
            if predictions[0] == true_text:
                continue
            successful_methods = [
                name
                for name, pred in zip(method_list[1:], predictions[1:])
                if pred == true_text
            ]
            if not successful_methods:
                continue

            # Build the entry once so the console and the file stay identical.
            report_lines = [f"Filename: {filename}", f"True Label: \t{true_text}"]
            report_lines.extend(
                f"Method {name}: \t{pred}"
                for name, pred in zip(method_list, predictions)
            )
            report_lines.append(f"Successful Methods: {', '.join(successful_methods)}")
            report = '\n'.join(report_lines) + '\n\n'
            print(report, end='')
            out_f.write(report)



In [None]:
# Run diff_all once per listed benchmark subset and print a status line after each.
# NOTE(review): diff_all is defined twice in this notebook; this loop uses
# whichever definition was executed last in the kernel. In file order that is
# the variant reading the Union14M annotation path, so confirm before running.
datasets=['artistic', 'contextless', 'curve', 'multi_oriented', 'multi_words', 'salient']

for dataset in datasets:
    diff_all(dataset)
    print('compare compelet!')

In [None]:
# Run diff_all on the single 'general' subset and print a status line.
# NOTE(review): as with the other driver cells, this uses whichever diff_all
# definition was executed last in the kernel.
datasets=['general']

for dataset in datasets:
    diff_all(dataset)
    print('compare compelet!')