In [1]:
import utils

import os
import re
import cv2
import numpy as np
import matplotlib.pyplot as plt
import json
import pandas as pd
from PIL import Image
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import requests
from fuzzywuzzy import process, fuzz



This Script assumes that a .pdf file was already segmented to a folder "base_path". It also assumes there are the labels to the segments in an excel sheet. The segments are colored RGB images, to allow ColorChecks and QuotationmarkChecks

In [2]:
base_path='C:\\Uni\\1M. Semester\\DocDig\\gen_img\\scan_1972_CdB_3_20231125160810-Pipeline'

Read In the Excel Sheet, discard the first two and the last segment of each page (-0.png, -1.png, -last.png)

In [3]:
label_path = '125160810Gelabelt.xlsx'
file_path = os.path.join(base_path, label_path)

# Read all sheets at once
excel_data = pd.read_excel(file_path, sheet_name=None, header=None, dtype=str)

num_sheets_labeled = len(excel_data)

# Initialize a list to store labels for each sheet
label_list = []

# Iterate through each sheet's data
for sheet_name, df in excel_data.items():
    # Extract the first 50 lines from the second column (column 'B')
    labels = df.iloc[:50, 1]  # Get the first 50 rows of column B
    
    # Replace NaN values (empty cells) with empty strings
    labels = labels.fillna('"')

    # Append the labels as a list for this sheet, preserving all values
    label_list.append(labels.values)
    
print(label_list)


[array(['Mehlschwalbe', '"', '"', '"', '"', '"', '"', '"', '"', '"', '"',
       'Tannenmeise', '"', '"', 'Erlenzeisig', '"', '"', '"', 'Blaumeise',
       '"', 'Mehlschwalbe', '"', 'Tannenmeise', '"', '"', '"', '"', '"',
       '"', '"', '"', 'Blaumeise', 'Tannenmeise', 'Blaumeise', '"',
       'Tannenmeise', '"', '"', 'Schafstelze', 'Mehlschwalbe',
       'Tannenmeise', '"', '"', '"', '"', '"', '"', '"', '"', '"'],
      dtype=object), array(['Tannenmeise', '"', '"', '"', '"', '"', '"', '"', '"', '"', '"',
       '"', '"', '"', '"', '"', '"', '"', '"', '"', '"', '"', '"', '"',
       'Mehlschwalbe', 'Tannenmeise', '"', 'Blaumeise', 'Schafstelze',
       'Blaumeise', 'Tannenmeise', '"', 'Blaumeise', 'Tannenmeise', '"',
       '"', '"', '"', '"', 'Blaumeise', 'Tannenmeise', '"', '"', '"', '"',
       '"', '"', '"', '"', '"'], dtype=object), array(['Tannenmeise', '"', '"', '"', '"', 'Mehlschwalbe', 'Blaumeise',
       '"', 'Tannenmeise', '"', 'Blaumeise', 'Tannenmeise', '"',
       'Bla

In [4]:
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-large-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-large-handwritten')

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.46.2"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 1024,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decod

In [5]:
# extract classes
import json

# Load the JSON file
with open("./class_indices_extended.json", "r", encoding="utf-8") as file:
    class_indices = json.load(file)

# Extract the class names (ignoring the index)
class_names = list(class_indices.keys())

# Display the class names
print("Class Names:", class_names)

Class Names: ['"', 'Alpenmeise', 'Baumpieper', 'Birkenzeisig', 'Blaumeise', 'Braunkelchen', 'Distelfink', 'Erlenzeisig', 'Fitis', 'Gartenrotschwanz', 'Gartenrötel', 'Gelbspöter', 'Girlitz', 'Grauschnäpper', 'Hausrotschwanz', 'Hausrötel', 'Hänfling', 'Heckenbraunelle', 'Klappergrasmücke', 'Rotkelchen', 'Tannenmeise', 'Trauerschnäpper', 'Wintergoldhähnchen', 'Zaungrasmücke', 'Zilpzalp', 'Zitroneler', 'Zitronenfink', 'Zitronenzeisig', 'Mehlschwalbe', 'Rauchschwalbe', 'Schafstelze', 'Teichrohrsänger', 'Uferschwalbe', 'Gobemouche noir', 'Linotte mélodieuse', 'Mésange bleu', 'Mésange noir', 'Pipit des arbres', 'Pouillot véloce', 'Rougegorge', 'Rougequeue à front blanc', 'Traquet tarier', 'Bergeronnette grise', 'Accenteur mouchet', 'Tarin', 'Serin']


In [13]:
# Compatible with pipeline

# Array to store the OCR outputs
ocr_outputs = []
threshold = 0
num_sheet = 0

for page in range(num_sheets_labeled): # for all pages
    num_sheet = page + 1
        
    for line in range(50):
        filename = "page_" + str(page) + "_column_0_row_" + str(line+1) + ".jpg"
        
        label = label_list[page][line]
        
        # Construct the full file path
        file_path = os.path.join(base_path, "cells-cleaned", filename)

        # Check if the file is an image (add more extensions if needed)
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
            try:
                print(file_path)
                # Load and process the image
                #image = Image.open(file_path).convert("RGB")
                image_original = cv2.imread(file_path)
                image_original = cv2.bitwise_not(image_original)
                #cv2.imshow("test",image_original)
                #cv2.waitKey(0)
                #cv2.destroyAllWindows()
                
                
                

                # Check colors of original image (remove black if text is written in blue)
                #tmp1 = utils.check_text_color(image_original)
                #if tmp1['predominant_color'] == 'blue':
                #    image = utils.remove_black_pixels(image_original)
                #else:
                #    image = image_original
                    
                # Check quotationmarks (no further processing needed, if quotationmark detected)
                tmp2 = utils.detectQuotationMarks(image_original)
                
                # The network needs a PIL image
                image = cv2.cvtColor(image_original, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
                image = Image.fromarray(image_original)  # Convert NumPy array to PIL Image
                
                
                if tmp2['Quotationmark'] == False:
                
                    pixel_values = processor(images=image, return_tensors="pt").pixel_values

                    # Generate text from the OCR model
                    generated_ids = model.generate(pixel_values)#, output_scores=True, return_dict_in_generate=True)
                    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

                    # Fuzzy match the recognized text to the bird species list
                    best_match, score = process.extractOne(generated_text, class_names)

                    # Post-process to filter only a-z and A-Z characters in the recognized text
                    filtered_text = re.sub(r'[^a-zA-ZäöüÄÖÜß"]', '', utils.replace_numbers_with_letters(generated_text))

                    # Fuzzy match the processed recognized text to the bird species list
                    best_match_filtered, score_filtered = process.extractOne(filtered_text, class_names)
                
                    # Fuzzy match the processed recognized text to the bird species with initial letter bonus
                    best_match_customFuzzy, score_custom_fuzzy = utils.custom_fuzzy_match(filtered_text, class_names)

                    # Append the result to the output array
                    ocr_outputs.append({"page_number": page+1,
                                    "line_number": line+1,
                                    "image": image,
                                    "generated_ids": generated_ids,
                                    "true_text": label,
                                    #"detected_color": tmp1['predominant_color'],
                                    "detected_quoMark": tmp2['Quotationmark'],
                                    "recognized_text": generated_text,
                                    "best_match": best_match if score >= threshold else None,
                                    "score": score,
                                    "filtered_text": filtered_text,
                                    "best_match_filtered": best_match_filtered,
                                    "score_filtered": score_filtered,
                                    "best_match_customFuzzy": best_match_customFuzzy,
                                    "score_customFuzzy": score_custom_fuzzy})
                
                else:
                    # Append the result to the output array
                    ocr_outputs.append({"page_number": page+1,
                                    "line_number": line+1,
                                    "image": image,
                                    "generated_ids": "",
                                    "true_text": label,
                                    #"detected_color": tmp1['predominant_color'],
                                    "detected_quoMark": tmp2['Quotationmark'],
                                    "recognized_text": "\"",
                                    "best_match": "\"",
                                    "score": 100,
                                    "filtered_text": "\"",
                                    "best_match_filtered": "\"",
                                    "score_filtered": 100,
                                    "best_match_customFuzzy": "\"",
                                    "score_customFuzzy": 100})
                                    
            
            
            except Exception as e:
                print(f"Error processing file {filename}: {e}")
                


C:\Uni\1M. Semester\DocDig\gen_img\scan_1972_CdB_3_20231125160810-Pipeline\cells-cleaned\page_0_column_0_row_1.jpg




C:\Uni\1M. Semester\DocDig\gen_img\scan_1972_CdB_3_20231125160810-Pipeline\cells-cleaned\page_0_column_0_row_2.jpg
C:\Uni\1M. Semester\DocDig\gen_img\scan_1972_CdB_3_20231125160810-Pipeline\cells-cleaned\page_0_column_0_row_3.jpg
C:\Uni\1M. Semester\DocDig\gen_img\scan_1972_CdB_3_20231125160810-Pipeline\cells-cleaned\page_0_column_0_row_4.jpg
C:\Uni\1M. Semester\DocDig\gen_img\scan_1972_CdB_3_20231125160810-Pipeline\cells-cleaned\page_0_column_0_row_5.jpg
C:\Uni\1M. Semester\DocDig\gen_img\scan_1972_CdB_3_20231125160810-Pipeline\cells-cleaned\page_0_column_0_row_6.jpg
C:\Uni\1M. Semester\DocDig\gen_img\scan_1972_CdB_3_20231125160810-Pipeline\cells-cleaned\page_0_column_0_row_7.jpg
C:\Uni\1M. Semester\DocDig\gen_img\scan_1972_CdB_3_20231125160810-Pipeline\cells-cleaned\page_0_column_0_row_8.jpg
C:\Uni\1M. Semester\DocDig\gen_img\scan_1972_CdB_3_20231125160810-Pipeline\cells-cleaned\page_0_column_0_row_9.jpg
C:\Uni\1M. Semester\DocDig\gen_img\scan_1972_CdB_3_20231125160810-Pipeline\cells

In [14]:
print(ocr_outputs[1])

{'page_number': 1, 'line_number': 2, 'image': <PIL.Image.Image image mode=RGB size=383x57 at 0x205E9DE8F70>, 'generated_ids': '', 'true_text': '"', 'detected_quoMark': True, 'recognized_text': '"', 'best_match': '"', 'score': 100, 'filtered_text': '"', 'best_match_filtered': '"', 'score_filtered': 100, 'best_match_customFuzzy': '"', 'score_customFuzzy': 100}


In [10]:
'''# Original Version

# Define the path to the folder containing your images
image_folder = "NOPRE-COLOR-col-1"  # Replace with the actual folder path
# Array to store the OCR outputs
ocr_outputs = []
threshold = 0
num_sheet = 0

for page in range(num_sheets_labeled): # for all pages
    num_sheet = page + 1
        
    for line in range(50):
        filename = str(page) + "-" + str(line+2) + ".png"
        
        label = label_list[page][line]
        
        # Construct the full file path
        file_path = os.path.join(base_path, "cells", image_folder, filename)

        # Check if the file is an image (add more extensions if needed)
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
            try:
                print(file_path)
                # Load and process the image
                #image = Image.open(file_path).convert("RGB")
                image_original = cv2.imread(file_path)
                
                # Check colors of original image (remove black if text is written in blue)
                tmp1 = utils.check_text_color(image_original)
                if tmp1['predominant_color'] == 'blue':
                    image = utils.remove_black_pixels(image_original)
                else:
                    image = image_original
                    
                # Check quotationmarks (no further processing needed, if quotationmark detected)
                tmp2 = utils.detectQuotationMarks(image_original)
                
                # The network needs a PIL image
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
                image = Image.fromarray(image)  # Convert NumPy array to PIL Image
                
                if tmp2['Quotationmark'] == False:
                
                    pixel_values = processor(images=image, return_tensors="pt").pixel_values

                    # Generate text from the OCR model
                    generated_ids = model.generate(pixel_values)#, output_scores=True, return_dict_in_generate=True)
                    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

                    # Fuzzy match the recognized text to the bird species list
                    best_match, score = process.extractOne(generated_text, class_names)

                    # Post-process to filter only a-z and A-Z characters in the recognized text
                    filtered_text = re.sub(r'[^a-zA-ZäöüÄÖÜß"]', '', utils.replace_numbers_with_letters(generated_text))

                    # Fuzzy match the processed recognized text to the bird species list
                    best_match_filtered, score_filtered = process.extractOne(filtered_text, class_names)
                
                    # Fuzzy match the processed recognized text to the bird species with initial letter bonus
                    best_match_customFuzzy, score_custom_fuzzy = utils.custom_fuzzy_match(filtered_text, class_names)

                    # Append the result to the output array
                    ocr_outputs.append({"page_number": page+1,
                                    "line_number": line+1,
                                    "image": image,
                                    "generated_ids": generated_ids,
                                    "true_text": label,
                                    "detected_color": tmp1['predominant_color'],
                                    "detected_quoMark": tmp2['Quotationmark'],
                                    "recognized_text": generated_text,
                                    "best_match": best_match if score >= threshold else None,
                                    "score": score,
                                    "filtered_text": filtered_text,
                                    "best_match_filtered": best_match_filtered,
                                    "score_filtered": score_filtered,
                                    "best_match_customFuzzy": best_match_customFuzzy,
                                    "score_customFuzzy": score_custom_fuzzy})
                
                else:
                    # Append the result to the output array
                    ocr_outputs.append({"page_number": page+1,
                                    "line_number": line+1,
                                    "image": image,
                                    "generated_ids": "",
                                    "true_text": label,
                                    "detected_color": tmp1['predominant_color'],
                                    "detected_quoMark": tmp2['Quotationmark'],
                                    "recognized_text": "\"",
                                    "best_match": "\"",
                                    "score": 100,
                                    "filtered_text": "\"",
                                    "best_match_filtered": "\"",
                                    "score_filtered": 100,
                                    "best_match_customFuzzy": "\"",
                                    "score_customFuzzy": 100})

            except Exception as e:
                print(f"Error processing file {filename}: {e}")
                '''

C:\Uni\1M. Semester\DocDig\gen_img\scan_1972_CdB_3_20231125160810-Pipeline\cells\NOPRE-COLOR-col-1\0-2.png
Error processing file 0-2.png: OpenCV(4.10.0) D:\a\opencv-python\opencv-python\opencv\modules\imgproc\src\color.cpp:196: error: (-215:Assertion failed) !_src.empty() in function 'cv::cvtColor'

C:\Uni\1M. Semester\DocDig\gen_img\scan_1972_CdB_3_20231125160810-Pipeline\cells\NOPRE-COLOR-col-1\0-3.png
Error processing file 0-3.png: OpenCV(4.10.0) D:\a\opencv-python\opencv-python\opencv\modules\imgproc\src\color.cpp:196: error: (-215:Assertion failed) !_src.empty() in function 'cv::cvtColor'

C:\Uni\1M. Semester\DocDig\gen_img\scan_1972_CdB_3_20231125160810-Pipeline\cells\NOPRE-COLOR-col-1\0-4.png
Error processing file 0-4.png: OpenCV(4.10.0) D:\a\opencv-python\opencv-python\opencv\modules\imgproc\src\color.cpp:196: error: (-215:Assertion failed) !_src.empty() in function 'cv::cvtColor'

C:\Uni\1M. Semester\DocDig\gen_img\scan_1972_CdB_3_20231125160810-Pipeline\cells\NOPRE-COLOR-col-

In [15]:
# Write output to excel sheet

# Check Sheet numbers
if (num_sheet != num_sheets_labeled):
    print(f"Something is Wrong! Labeled Sheets: {num_sheets_labeled} but sheets in the data: {num_sheet}")
    
# Extract Ringnumber from data (Here we use an exapmle)
ringnumber = 100101
    
# Initialize an Excel workbook with sheets, starting from 100100, and save it as 'output.xlsx'
utils.initialize_excel(num_pages=num_sheet, start_number=ringnumber, output_path="C:/Uni/1M. Semester/DocDig/gen_img/scan_1972_CdB_3_20231125160810-Pipeline/125160810TESTOUTPUT.xlsx")

# Write the output to the initialized excel Sheet
utils.write_ocr_outputs_to_excel(ocr_outputs=ocr_outputs, excel_path="C:/Uni/1M. Semester/DocDig/gen_img/scan_1972_CdB_3_20231125160810-Pipeline/125160810TESTOUTPUT.xlsx")

# Add colors to the excel for better visualization
utils.add_color_to_scores(score_column=3, threshold=70, excel_path="C:/Uni/1M. Semester/DocDig/gen_img/scan_1972_CdB_3_20231125160810-Pipeline/125160810TESTOUTPUT.xlsx")

Excel workbook initialized and saved to C:/Uni/1M. Semester/DocDig/gen_img/scan_1972_CdB_3_20231125160810-Pipeline/125160810TESTOUTPUT.xlsx
OCR outputs written to C:/Uni/1M. Semester/DocDig/gen_img/scan_1972_CdB_3_20231125160810-Pipeline/125160810TESTOUTPUT.xlsx
Colors added to scores in C:/Uni/1M. Semester/DocDig/gen_img/scan_1972_CdB_3_20231125160810-Pipeline/125160810TESTOUTPUT.xlsx


In [16]:
correct = []
wrong = []
# Display each image with its recognized text and best fuzzy match
#fig, axes = plt.subplots(nrows=len(ocr_outputs), ncols=1, figsize=(10, 5 * len(ocr_outputs)))
#fig.suptitle("OCR Outputs and Fuzzy Matches", fontsize=16)

for idx, item in enumerate(ocr_outputs):
    #ax = axes[idx] if len(ocr_outputs) > 1 else axes  # Handles single image case
    #ax.imshow(item["image"])
    #ax.axis('off')

    # Display fuzzy match and weighted fuzzy match below each image
    recognized_text = item["recognized_text"]
    true_text = item["true_text"]
    #detected_color = item["detected_color"]
    detected_quoMark = item["detected_quoMark"]
    best_match = item["best_match"]
    score = item["score"]
    filtered_text = item["filtered_text"]
    best_match_filtered = item["best_match_filtered"]
    score_filtered = item["score_filtered"]
    best_match_customFuzzy = item["best_match_customFuzzy"]
    score_custom_fuzzy = item["score_customFuzzy"]
    
    if true_text == best_match_customFuzzy:
        correct.append(score_filtered)
    else:
        wrong.append(score_filtered)
        print(f" sheet {idx / 50}, line {idx % 50 + 1}")
        print(true_text)
    
    text = f"True Text: '{true_text}'\n"
    #text += f"Detected Color: '{detected_color}'\n"
    text += f"Detected QuoMark: '{detected_quoMark}'\n"
    text += f"\nRecognized Text: '{recognized_text}'\n"
    text += f"Filtered Recognized Text: '{filtered_text}'\n"
    if best_match:
        text += f"Best Match for recognized text: '{best_match}' with Score: {score}\n"
    if best_match_filtered:
        text += f"Best Match for filtered text: '{best_match_filtered}' with Score: {score_filtered}\n"
    if best_match_customFuzzy:
        text += f"Best Match for filtered text with weight: '{best_match_customFuzzy}' with Score: {score_custom_fuzzy}\n"
    else:
        text += f"No confident match found (Score: {score})"

    #ax.set_title(text, fontsize=12)

#plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust layout to fit title
#plt.show()

print("Corrects: ", len(correct), " Wrongs: ", len(wrong), " in Percentages: ", len(correct) / (len(correct) + len(wrong)))

 sheet 0.78, line 40
Mehlschwalbe
 sheet 2.2, line 11
Blaumeise
 sheet 2.4, line 21
Tannenmeise
 sheet 4.0, line 1
Schafstelze
 sheet 4.24, line 13
Rotkehlchen
 sheet 4.26, line 14
Schafstelze
 sheet 4.32, line 17
Trauerschnäpper
 sheet 4.36, line 19
Gartenrötel
 sheet 4.38, line 20
Grauschnäpper
 sheet 4.4, line 21
Trauerschnäpper
 sheet 4.44, line 23
Rotkehlchen
 sheet 4.46, line 24
Grauschnäpper
 sheet 4.5, line 26
Rotkehlchen
 sheet 4.52, line 27
Gartenrötel
 sheet 4.54, line 28
Rotkehlchen
 sheet 4.56, line 29
Trauerschnäpper
 sheet 4.58, line 30
Gelbspötter
 sheet 4.6, line 31
Rotkehlchen
 sheet 4.64, line 33
Trauerschnäpper
 sheet 4.66, line 34
Rotkehlchen
 sheet 4.68, line 35
Trauerschnäpper
 sheet 4.7, line 36
Rotkehlchen
 sheet 4.76, line 39
Trauerschnäpper
 sheet 4.78, line 40
Rotkehlchen
 sheet 4.8, line 41
Gelbspötter
 sheet 4.82, line 42
Gartenrötel
 sheet 4.84, line 43
Trauerschnäpper
 sheet 4.86, line 44
Rotkehlchen
 sheet 4.88, line 45
Gartenrötel
 sheet 4.9, line 46
B