In [1]:
import subprocess
from matplotlib import pyplot as plt
import os
%matplotlib inline
import pytesseract
from PIL import Image

import jellyfish

In [2]:
# Image whose OCR output is used as the reference text (see original_text below).
TEXT_IMAGE = "ideal_version.jpg"
# Default inputs for the one-off similarity checks; presumably a raw photo and
# two processed variants of it — TODO confirm.
# NOTE: the batch loop further down rebinds these three names on every iteration.
PHOTO_IMAGE = "image0.jpg"
PROCESSED_IMAGE = "image1.jpg"
PROCESSED_IMAGE2 = "output.png"

In [3]:
def binarize(input_file_path, output_file_path="binarized_image.jpg"):
    """Binarize an image with the external ``./binarizewolfjolion`` executable.

    The executable is resolved relative to the current working directory and
    is invoked with the input path and the output path as its two arguments.

    Args:
        input_file_path: Path of the image to binarize.
        output_file_path: Path the tool should write its result to.

    Returns:
        The PIL image opened from ``output_file_path``.

    Raises:
        subprocess.CalledProcessError: If the tool exits with a non-zero status.
        FileNotFoundError: If the executable cannot be found.
    """
    # check=True: previously a failed run was ignored, so a stale output file
    # left over from an earlier call could be opened and returned silently.
    subprocess.run(
        ['./binarizewolfjolion', input_file_path, output_file_path],
        check=True,
    )
    return Image.open(output_file_path)

In [4]:
def image_to_text(image_path, binarization_needed=True):
    """Run OCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image to recognize.
        binarization_needed: When true, the image is passed through
            ``binarize`` first; otherwise it is opened as is.

    Returns:
        The string produced by ``pytesseract.image_to_string``.

    Side effects:
        Prints a header line followed by the first 50 characters of the
        result, with a trailing "..." when the text is longer than that.
    """
    # Pick the loader once; both take the path and hand back a PIL image.
    load_image = binarize if binarization_needed else Image.open
    recognized = pytesseract.image_to_string(load_image(image_path))

    preview_length = 50
    if len(recognized) > preview_length:
        terminator = "...\n"
    else:
        terminator = "\n"

    print("Recognized text:")
    print(recognized[:preview_length], end=terminator)

    return recognized

Можно применять разные меры схожести двух текстов:

https://stackabuse.com/levenshtein-distance-and-text-similarity-in-python/

Сначала остановились на сходстве Джаро—Винклера; позднее перешли на нормированное редакторское расстояние: 1 - (редакторское расстояние / максимум из длин оригинала и распознанного текста).

In [5]:
def get_texts_similarity(original_text, image_path, binarization_needed=False, append=True, log_path="log.txt"):
    """Score how closely the OCR output of an image matches a reference text.

    The score is ``1 - levenshtein_distance / max(len(original), len(ocr))``,
    so it lies in [0, 1] and equals 1.0 for identical strings.

    Args:
        original_text: Reference text to compare against.
        image_path: Path of the image to run OCR on.
        binarization_needed: Forwarded to ``image_to_text``.
        append: When true, the score is appended to the log file; otherwise
            the log file is overwritten.
        log_path: Path of the log file that receives the score.

    Returns:
        The similarity score as a float.

    Side effects:
        Prints the lengths of both texts and writes the score to ``log_path``.
    """
    text = image_to_text(image_path, binarization_needed=binarization_needed)

    # Jaro-Winkler similarity was used here earlier; the metric is now the
    # edit distance normalized by the longer of the two texts.
    longest = max(len(original_text), len(text))
    if longest == 0:
        # Both texts are empty: they are identical, and dividing by the
        # zero length would raise ZeroDivisionError.
        similarity = 1.0
    else:
        similarity = 1 - jellyfish.levenshtein_distance(original_text, text) / longest
    print(len(original_text), len(text))

    mode = "a" if append else "w"
    with open(log_path, mode) as file:
        print(similarity, file=file)

    return similarity

In [6]:
# Reference text: the OCR output for TEXT_IMAGE, read without binarization.
# The similarity scores computed below are measured against this string.
original_text = image_to_text(TEXT_IMAGE, binarization_needed=False)

Recognized text:
DANDELION WINE

It was a quiet morning, the town c...


In [7]:
# get_texts_similarity(original_text, PHOTO_IMAGE, append=False)

In [8]:
# get_texts_similarity(original_text, PROCESSED_IMAGE)

In [9]:
# get_texts_similarity(original_text, PROCESSED_IMAGE2)

In [10]:
# Score each example directory: OCR three images from it, compare every result
# with original_text, and grade the example by how the three scores rank.
for i in range(24):
    print(f"\n{i}")

    dir_path = f"./for_metrics/example{i}/"
    PHOTO_IMAGE = dir_path + "0.jpg"
    PROCESSED_IMAGE = dir_path + "7.jpg"
    PROCESSED_IMAGE2 = dir_path + "out1.png"

    # Directories without "7.jpg" are skipped after printing their index.
    if os.path.isfile(PROCESSED_IMAGE):
        log_file = dir_path + "log.txt"

        # The first call overwrites the per-example log; the other two append.
        s1 = get_texts_similarity(original_text, PHOTO_IMAGE, append=False, log_path=log_file)
        s2 = get_texts_similarity(original_text, PROCESSED_IMAGE, log_path=log_file)
        s3 = get_texts_similarity(original_text, PROCESSED_IMAGE2, log_path=log_file)

        print(s1, s2, s3, sep="\n")

        # s1 < s2 < s3 implies s3 > s1, so testing the failure case first
        # selects the same branch as the original if/elif/else ordering.
        if not s3 > s1:
            print(f"{i} is AWFUL EXAMPLE")
        elif s1 < s2 < s3:
            print(f"{i} is GOOD EXAMPLE")
        else:
            print(f"{i} is NOT BAD EXAMPLE")


0
Recognized text:
overed over with darknes mer gathered in the
the b...
2817 1617
Recognized text:
DANDELION WINE

It was a quiet morning, the town c...
2817 2816
Recognized text:
DANDELION WINE

It was a quiet morning, the town c...
2817 2816
0.2818601348952786
0.9960951366702165
0.9982250621228257
0 is GOOD EXAMPLE

1
Recognized text:
DANDELION WINE

It was a quiet morning, the town c...
2817 2829
Recognized text:
DANDELION WINE

It was a quiet morning, the town c...
2817 2824
Recognized text:
DANDELION WINE

It was a quiet morning, the town c...
2817 2822
0.9597030752916225
0.9939801699716714
0.9957476966690291
1 is GOOD EXAMPLE

2
Recognized text:
DANDELION WINE

  
 
 

It was a quiet morning, th...
2817 2532
Recognized text:
hands jump ever
and rivers. He would freeze,
chick...
2817 989
Recognized text:
DANDELION WINE

It was a quiet morning, the town c...
2817 2817
0.7841675541356052
0.3454029108981186
0.9992900248491303
2 is NOT BAD EXAMPLE

3
Recognized text:
DANDEUON WINE

