In [1]:
import boto3
import matplotlib.pyplot as plt
import pandas as pd
import preprocess as pp
import eval as ev

**Model Construction**

In [2]:
# AWS service clients: S3 for listing objects, Textract for text detection.
s3 = boto3.client('s3')
textractmodule = boto3.client('textract')

# Bucket to scan; swap in your own bucket name.
s3_bucket_name = 'textract-intern'

# Key prefixes to list inside the bucket; swap in the folders you want to process.
folders = [
    'Weird_invoices',
]

def extract_text_from_image(bucket, document):
    """Return the text of every LINE block Textract detects in an S3 document.

    Args:
        bucket: Name of the S3 bucket that holds the document.
        document: Object key of the document inside ``bucket``.

    Returns:
        A list with the ``Text`` value of each block whose ``BlockType`` is
        ``"LINE"``, in the order the blocks appear in the response.
    """
    s3_location = {'Bucket': bucket, 'Name': document}
    textract_result = textractmodule.detect_document_text(
        Document={'S3Object': s3_location}
    )
    # Keep only LINE blocks; other block types in the response are skipped.
    return [
        block["Text"]
        for block in textract_result["Blocks"]
        if block["BlockType"] == "LINE"
    ]

# Collect the text lines of every PNG found under each configured prefix.
all_lines = []

# list_objects_v2 returns at most 1,000 keys per call, so walk every page
# instead of reading only the first response.
object_pages = s3.get_paginator('list_objects_v2')

for folder_name in folders:
    # Prefix is a plain string-prefix match on the key, so 'Weird_invoices'
    # also selects keys under 'Weird_invoices_images/'.
    for page in object_pages.paginate(Bucket=s3_bucket_name, Prefix=folder_name):
        # A page has no 'Contents' entry when nothing matches the prefix.
        for obj in page.get('Contents', []):
            document_name = obj['Key']
            if document_name.endswith('.png'):
                print(f'Processing {document_name}...')
                lines = extract_text_from_image(s3_bucket_name, document_name)
                all_lines.extend(lines)

Processing Weird_invoices_images/Barely-visible-numérisation_page_1.png...
Processing Weird_invoices_images/Comment-after-item-Facture_48573835_page_1.png...
Processing Weird_invoices_images/Logo-in-the-middle-Facture-N-24000397_page_1.png...
Processing Weird_invoices_images/Logo-in-the-middle-SIGNED_fact_amb_01044700_page_1.png...
Processing Weird_invoices_images/Logo-in-the-middle-SIGNED_fact_amb_01044700_page_2.png...
Processing Weird_invoices_images/Logo-in-the-middle-SIGNED_fact_amb_01044700_page_3.png...
Processing Weird_invoices_images/Mixed-IMG_3307_page_1.png...
Processing Weird_invoices_images/Not-straight-Adobe_Scan_17_juil_2024_page_1.png...
Processing Weird_invoices_images/Scanned-bad-font-0690_001_page_1.png...
Processing Weird_invoices_images/Scanned-bad-font-0690_001_page_2.png...
Processing Weird_invoices_images/Text-at-the-edge-3RDAFACTURW0000678049202407139081320705_page_1.png...
Processing Weird_invoices_images/Text-at-the-edge-3RDAFACTURW000067804920240713908132070

**Preprocess extracted text**

In [8]:
# Normalize the extracted lines with the project's `preprocess` module.
# NOTE(review): all_lines is rebound from its own previous value, so re-running
# this cell normalizes already-normalized lines — confirm normalize_lines is
# idempotent, or re-run the extraction cell first.
all_lines = pp.normalize_lines(all_lines)

**Read product database**

In [2]:
# Workbook holding the product database; point this at your own file.
file_path = 'Data/real_images/Products.xlsm'
# Worksheet inside the workbook that contains the database.
sheet_name = 'Base_de_Donnees'

# Load only spreadsheet column F, skipping the first 3 rows and reading at
# most 30659 data rows.
data = pd.read_excel(
    file_path,
    sheet_name=sheet_name,
    usecols='F',
    skiprows=3,
    nrows=30659,
)

# Values of the 'Designation' column as a plain Python list.
liste_produits = data.loc[:, 'Designation'].tolist()

**Preprocess database**

In [3]:
# Run every product designation through the project's text normalizer.
liste_produits = list(map(pp.normalize_text, liste_produits))

**Model Evaluation**

In [None]:
# Compare the extracted lines with the product list; the helper returns three counts.
count_exact_matches, count_01, count_02 = ev.count_matches(all_lines, liste_produits)

# Report each count next to its label.
for label, count in (
    ('Exact matches:', count_exact_matches),
    ('Normalized Levenshtein distance ≤ 0.1:', count_01),
    ('Normalized Levenshtein distance ≤ 0.2:', count_02),
):
    print(label, count)

In [None]:
# Bar chart of how many products fall in each normalized-Levenshtein bucket.
categories = ['0', ']0 ; 0.1]', ']0.1 ; 0.2]', '>0.2']
total_products = 1600  # Replace with the actual total number of products

# NOTE(review): this subtraction assumes count_01 and count_02 are disjoint
# buckets (as the category labels suggest) rather than cumulative "≤" counts —
# confirm against ev.count_matches.
count_sup02 = total_products - (count_exact_matches + count_01 + count_02)
if count_sup02 < 0:
    # A negative remainder means total_products was not updated; fail loudly
    # instead of drawing a negative bar.
    raise ValueError(
        f'total_products ({total_products}) is smaller than the number of '
        f'matched products ({total_products - count_sup02}); update total_products.'
    )
values = [count_exact_matches, count_01, count_02, count_sup02]

# Share of each bucket, in percent of total_products.
proportions = [v / total_products * 100 for v in values]

fig, ax = plt.subplots()

# One colour per bar: with only three colours matplotlib cycles the list and
# paints the '>0.2' bar the same colour as the exact-match bar.
bar_colors = ['skyblue', 'lightgreen', 'salmon', 'lightgray']
bars = ax.bar(categories, values, color=bar_colors)

# Write each bucket's percentage just above its bar.
for bar, proportion in zip(bars, proportions):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2.0, height, f'{proportion:.1f}%',
            ha='center', va='bottom')

ax.set_title('OCR Performances Distribution')
ax.set_xlabel('Normalized Levenshtein Distance')
ax.set_ylabel('Number of Products')

plt.show()