# Web to text corpus full pipeline

This notebook processes publications published in the [Premsa Aranesa repository](https://ddd.uab.cat/collection/honsaran) to obtain a sentence segmented text corpus. 

The processes involved are:

1. Parse the pages of publication record URL (e.g. https://ddd.uab.cat/record/218748) list specified in a text file. 
2. Download all associated PDFs
3. Extract each page in the PDF as an image
4. Perform optical character recognition (OCR) using Tesseract
5. Correct ortographic and formatting errors using Claude LLM (needs API key but can be skipped also)
6. Extract sentences from each document, save them for each publication and merge them into a single corpus file

## Download PDFs and create download_summary.JSON

In [1]:
import os
import requests
from bs4 import BeautifulSoup
import json

In [83]:
def extract_pdf_links(url, doc_limit=None):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve the page: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    pdf_links = []

    for link in soup.find_all('a', href=True):
        href = link['href']
        if '.pdf' in href:
            pdf_links.append(href)

    pdf_links = [requests.compat.urljoin(url, link) for link in pdf_links]
    
    if doc_limit:
        pdf_links = pdf_links[0:doc_limit]
        print("Processing only", doc_limit, "documents!")

    return pdf_links

def get_publication_name(soup):
    meta_tag = soup.find('meta', {'name': 'dc.title'})
    if meta_tag and 'content' in meta_tag.attrs:
        return meta_tag['content']
    return "Unknown_Publication"

def extract_id_from_url(url):
    parts = url.split('/')
    if len(parts) > 5:
        return parts[5]
    return "Unknown_ID"

def download_pdfs(pdf_links, directory, max_retries=3, wait_time=2):
    if not os.path.exists(directory):
        os.makedirs(directory)

    downloaded_files = []
    no_already_downloaded = 0
    no_new_download = 0
    no_failed_download = 0
        
    for link in pdf_links:
        file_name = os.path.join(directory, os.path.basename(link))
        if os.path.exists(file_name):
#             print(f"Already downloaded: {file_name}")
            downloaded_files.append(file_name)
            no_already_downloaded += 1
            continue

        success = False
        for attempt in range(max_retries):
            try:
                response = requests.get(link)
                if response.status_code == 200:
                    with open(file_name, 'wb') as f:
                        f.write(response.content)
                    downloaded_files.append(file_name)
#                     print(f"Downloaded: {file_name}")
                    success = True
                    no_new_downloaded += 1
                    break
                else:
                    print(f"Failed to download: {link} (status code: {response.status_code})")
            except requests.RequestException as e:
                print(f"Attempt {attempt + 1} failed with error: {e}")
            
            if not success:
                print(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)

        if not success:
            print(f"Failed to download: {link} after {max_retries} attempts")
            no_failed_download += 1
            
    print(f"{no_new_download} new, {no_already_downloaded} already downloaded")
    if no_failed_download:
        print(f"{no_failed_download} failed to download")

    return downloaded_files

def process_repository_links(file_path, doc_limit=None):
    results = []

    with open(file_path, 'r') as file:
        urls = [line.strip() for line in file if line.strip()]

    for url in urls:
        response = requests.get(url)
        if response.status_code != 200:
            print(f"Failed to retrieve the page: {response.status_code} for URL: {url}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        publication_name = get_publication_name(soup)
        print(publication_name)
        
        pdf_links = extract_pdf_links(url, doc_limit)
        print(len(pdf_links), "documents")
        
        publication_id = extract_id_from_url(pdf_links[0])
        
        if pdf_links:
            directory = os.path.join("docs", publication_id)
            pdf_directory = os.path.join(directory, "pdf")
            downloaded_files = download_pdfs(pdf_links, pdf_directory)
            results.append({
                "name": publication_name,
                "id": publication_id,
                "url": url,
                "directory": directory,
                "no_docs": len(downloaded_files),
                "docs" : sorted(downloaded_files)
            })
        else:
            print(f"No PDF links found for URL: {url}")

    # Write results to JSON file with proper encoding for non-ASCII characters
    with open(os.path.join("docs", "download_summary.json"), 'w', encoding='utf-8') as json_file:
        json.dump(results, json_file, ensure_ascii=False, indent=4)

    print(f"Download summary written to {os.path.join('docs', 'download_summary.json')}")
    
    return results

### Download PDFs from publication URLS

In [84]:
file_path = "uab_repo_urls_3.txt"
download_summary = process_repository_links(file_path, doc_limit=None)

Vielha de toti : era revista dera gent de Vielha e Mijaran
2 documents
0 new, 2 already downloaded
Aran un país
1 documents
0 new, 1 already downloaded
Aué : suplement setmanau deth diari Avui
165 documents
0 new, 165 already downloaded
Download summary written to docs/download_summary.json


#### Run the cell below if reading download_summary.JSON from disk (skipping the cell above)

In [19]:
with open('docs/download_summary_3.json', 'r') as f:
    download_summary = json.load(f)

# Now download_summary is available for use
print(len(download_summary), "records")

3 records


## Image extraction

In [36]:
import concurrent.futures
import time
from pdf2image import convert_from_path
from PIL import Image
import io

In [37]:
# Function to convert PDF to images
def pdf_to_images(pdf_path):
    try:
        images = convert_from_path(pdf_path)
        return images
    except Exception as e:
        print(f"Error converting {pdf_path} to images: {e}")
        return []

# Function to save images to files with resolution check
def save_image(image, image_path):
    try:
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        file_size = buffer.tell()

        with open(image_path, 'wb') as f:
            f.write(buffer.getvalue())
    except Exception as e:
        print(f"Error saving image to {image_path}: {e}")

def extract_images_from_documents(download_summary, overwrite_imgs=False):
    total_no_images = 0
    total_saved_images = 0
    
    for pub in download_summary:
        print(pub['name'])

        for doc_path in pub['docs']:
            doc_name = doc_path.split('/')[-1]
            print(doc_name, end=" - ")

            imgs_dir = os.path.join(pub['directory'], 'img', doc_name)

            os.makedirs(imgs_dir, exist_ok=True)

            # Check if the image directory is empty
            ls_imgs_dir = [i for i in os.listdir(imgs_dir) if not i==".DS_Store" ]
            if not overwrite_imgs and ls_imgs_dir:
                print("✔️")
                total_no_images += len(ls_imgs_dir)
                continue
        
            # Extract images from pdf
            images = pdf_to_images(doc_path)
            print("✅", len(images), "images", end=" ")
            
            # Save images in parallel
            def save_image_task(args):
                image, image_path = args
                save_image(image, image_path)

            with concurrent.futures.ThreadPoolExecutor() as executor:
                future_to_image = {executor.submit(save_image_task, (image, os.path.join(imgs_dir, f"page_{i + 1}.png"))): i for i, image in enumerate(images)}
                for future in concurrent.futures.as_completed(future_to_image):
                    total_saved_images += 1
                    total_no_images += 1
                    
            print("saved")

        print()
            
    print(total_no_images, "images in total from", len(download_summary), "documents")
    print(total_saved_images, "new saved")
        

In [38]:
print("=======Converting each pdf into images=======")

# Start time
start_time = time.time()

# Convert each pdf into images
extract_images_from_documents(download_summary, overwrite_imgs=False) ## overwrite_imgs True for debugging

# End time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time} seconds")

Aué : suplement setmanau deth diari Avui
aueavui_a2004m1d31.pdf - ✅ 4 images saved
aueavui_a2004m2d14.pdf - ✅ 4 images saved
aueavui_a2004m2d21.pdf - ✅ 4 images saved
aueavui_a2004m2d28.pdf - ✅ 4 images saved
aueavui_a2004m2d7.pdf - ✅ 4 images saved

20 images in total from 1 documents
20 new saved
Elapsed time: 15.190072059631348 seconds


### Tesseract OCR + post-editing with Anthropic API

Reminder: Place the anthropic API key into `.env` file as in

```
ANTHROPIC_API_KEY=your_api_key_here
```

Correction step can be skipped with flag `skip_correction` when calling `do_ocr_and_fix`

In [89]:
import pytesseract
import re

#Setup language for Tesseract
LANG = "oci"

# Function to perform OCR on an image
def ocr_image_to_text(image_path):
    try:
        image = Image.open(image_path)
        text = pytesseract.image_to_string(image, lang=LANG)
        return text
    except Exception as e:
        print(f"⚠️Error during OCR for {image_path}⚠️: {e}")
        return ""

#### Note: Correction step can be skipped. Don't execute cell below if so

In [102]:
import anthropic
import io
import re
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get the API key from environment variable
api_key = os.getenv('ANTHROPIC_API_KEY')

if api_key is None:
    raise ValueError("No API key found. Please set the ANTHROPIC_API_KEY environment variable.")

anthropic_client = anthropic.Anthropic()

instructions_detailed = (
    "You are tasked with fixing OCR output text in Aranese, respecting Aranese orthography, merging lines where necessary, "
    "and producing a proper output. This task is crucial for preserving the integrity and readability of the material and "
    "create a large text corpus for this language.\n\n"

    "Follow these steps to process the text:\n\n"

    "0. Only output the corrected text, without any introductory or additional phrases.\n\n"

    "1. Analyze and correct Occitan Aranese characters and diacritical marks:\n"
    "   - Pay close attention to special characters and diacritical marks (e.g., à, è, ò, ï, ü).\n"
    "   - Ensure accurate spelling according to Occitan Aranese conventions.\n\n"

    "2. Maintain sentence integrity:\n"
    "   - Examine each line to determine if it's a complete sentence or part of a continuing sentence.\n"
    "   - Join continuing sentences even if they span multiple lines visually.\n"
    "   - Look for grammatical clues such as:\n"
    "     a. Lack of final punctuation at the end of a line\n"
    "     b. Lines beginning with lowercase letters (unless it's a language-specific exception)\n"
    "     c. Incomplete grammatical structures that continue on the next line\n\n"

    "3. Address justified paragraphs and hyphens:\n"
    "   - Remove hyphens within words that are divided across lines in justified text.\n"
    "   - Rejoin hyphenated words, placing the complete word at the end of the first line.\n"
    "   - Ensure that removing hyphens doesn't create unintended new words.\n\n"

    "4. Preserve paragraph structure:\n"
    "   - Maintain paragraph breaks as they appear in the original text.\n"
    "   - Represent each new paragraph as a new line in the output.\n\n"

    "5. Handle titles and headings:\n"
    "   - Preserve titles and headings as separate lines in the output.\n"
    "   - Do not join titles or headings with the following paragraph text.\n\n"

    "6. Perform a final verification:\n"
    "   - Review the processed text to ensure:\n"
    "     a. All sentences are intact and properly joined.\n"
    "     b. Paragraphs and titles are correctly separated.\n"
    "     c. No artifacts from visual formatting remain.\n"
    "   - Double-check that the Occitan Aranese orthography is correctly preserved throughout."
)

# Function to fix OCR text with the API
def fix_ocr_text_with_api(ocr_text):
    try:
        message = anthropic_client.messages.create(
            model="claude-3-5-sonnet-20240620",
            max_tokens=4096,
            temperature=0,
            system=instructions_detailed,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": ocr_text
                        }
                    ]
                }
            ]
        )
        return message.content[0].text
    except Exception as e:
        print(f"⚠️Error during API call⚠️: {e}")
        return ""


In [103]:
# Function to perform OCR and correction for a single image
def process_image(args, skip_correction=False):
    img_path, ocr_path, fix_path = args
    ocr_status = fix_status = "✔️"

    try:
        # Perform OCR if the OCR text file does not exist
        if not os.path.isfile(ocr_path):
            OCR_text = ocr_image_to_text(img_path)
            if OCR_text:
                with open(ocr_path, 'w') as f:
                    f.write(OCR_text)
                ocr_status = "✅"
            else:
                ocr_status = "❌"
        else:
            with open(ocr_path, 'r') as f:
                OCR_text = f.read()

        # Now send to Anthropic API to fix or do manual fixing
        if OCR_text and not os.path.isfile(fix_path):
            if skip_correction:
                fixed_text = OCR_text
            else:
                fixed_text = fix_ocr_text_with_api(OCR_text)
            
            if fixed_text:
                # Write fixed OCR text to file
                with open(fix_path, 'w') as f:
                    f.write(fixed_text)
                fix_status = "✅"
            else:
                fix_status = "❌"
                
    except Exception as e:
        print(f"Error processing {img_path}: {e}")

    return img_path, ocr_status, fix_status

# Process to perform OCR and correct
def do_ocr_and_fix(download_summary, skip_correction=False):
    for pub in download_summary:
        print(pub['name'])

        imgs_dir = os.path.join(pub['directory'], 'img')

        ocr_dir = os.path.join(pub['directory'], 'ocr')
        os.makedirs(ocr_dir, exist_ok=True)

        fix_dir = os.path.join(pub['directory'], 'fix')
        os.makedirs(fix_dir, exist_ok=True)
        
        # Do a count of all images to process
        no_images_processed = 0
        pub_img_count = sum([len([img_name for img_name in os.listdir(os.path.join(imgs_dir, doc_name)) if img_name.endswith(('.png'))]) for doc_name in os.listdir(imgs_dir) if os.path.isdir(os.path.join(imgs_dir, doc_name))])
        print(pub_img_count, "pages to process in publication\n")
        
        # Iterate over each document in the publication
        for doc_name in sorted(os.listdir(imgs_dir)):
            doc_img_dir = os.path.join(imgs_dir, doc_name)
            doc_ocr_dir = os.path.join(ocr_dir, doc_name)
            doc_fix_dir = os.path.join(fix_dir, doc_name)

            # Skip non-directory files
            if not os.path.isdir(doc_img_dir):
                continue

            print("-", doc_name)

            # Ensure the ocr and fix subdirectory for the document exists
            os.makedirs(doc_ocr_dir, exist_ok=True)
            os.makedirs(doc_fix_dir, exist_ok=True)

            # List all images in the document's image directory
            image_tasks = []
            for img_name in sorted(os.listdir(doc_img_dir)):
                img_path = os.path.join(doc_img_dir, img_name)
                ocr_path = os.path.join(doc_ocr_dir, os.path.splitext(img_name)[0] + '.txt')
                fix_path = os.path.join(doc_fix_dir, os.path.splitext(img_name)[0] + '.txt')

                if os.path.isfile(img_path) and img_path.endswith(('.png', '.jpg', '.jpeg')):
                    image_tasks.append((img_path, ocr_path, fix_path))

            # Process images in parallel
            with concurrent.futures.ThreadPoolExecutor() as executor:
                futures = {executor.submit(process_image, task, skip_correction): task for task in image_tasks}
                for future in concurrent.futures.as_completed(futures):
                    img_path, ocr_status, fix_status = future.result()
                    img_name = os.path.basename(img_path)
                    no_images_processed += 1
                    print(f"-- {img_name} OCR{ocr_status} FIX{fix_status}")

            print(f"- {no_images_processed}/{pub_img_count} processed\n", )

In [104]:
print("===========Performing OCR and correction===========")

# Start time
start_time = time.time()

# Perform OCR and correction
do_ocr_and_fix(download_summary, skip_correction=False)

# End time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time} seconds")

Vielha de toti : era revista dera gent de Vielha e Mijaran
15 pages to process in publication

- vietot_a2017n1.pdf
-- page_11.png OCR✔️ FIX✔️
-- page_12.png OCR✔️ FIX✔️
-- page_2.png OCR✔️ FIX✔️
-- page_6.png OCR✔️ FIX✔️
-- page_13.png OCR✔️ FIX✔️
-- page_8.png OCR✔️ FIX✔️
- 6/15 processed

- vietot_a2018n3.pdf
-- page_12.png OCR✔️ FIX✔️
-- page_10.png OCR✔️ FIX✔️
-- page_11.png OCR✔️ FIX✔️
-- page_5.png OCR✔️ FIX✔️
-- page_6.png OCR✔️ FIX✔️
-- page_7.png OCR✔️ FIX✔️
-- page_8.png OCR✔️ FIX✔️
-- page_16.png OCR✔️ FIX✔️
-- page_9.png OCR✔️ FIX✔️
- 15/15 processed

Aran un país
8 pages to process in publication

- arapai_a2019n1.pdf
-- page_2.png OCR✔️ FIX✔️
-- page_3.png OCR✔️ FIX✔️
-- page_4.png OCR✔️ FIX✔️
-- page_1.png OCR✔️ FIX✔️
-- page_6.png OCR✔️ FIX✔️
-- page_5.png OCR✔️ FIX✔️
-- page_7.png OCR✔️ FIX✔️
-- page_8.png OCR✔️ FIX✔️
- 8/8 processed

Aué : suplement setmanau deth diari Avui
891 pages to process in publication

- aueavui_a1998m10d10.pdf
-- page_1.png OCR✔️ FIX✔️
-- pa

-- page_2.png OCR✔️ FIX✔️
-- page_1.png OCR✔️ FIX✔️
-- page_3.png OCR✔️ FIX✔️
-- page_4.png OCR✔️ FIX✔️
- 482/891 processed

- aueavui_a2002m10d19.pdf
-- page_1.png OCR✔️ FIX✔️
-- page_3.png OCR✔️ FIX✔️
-- page_2.png OCR✔️ FIX✔️
-- page_4.png OCR✔️ FIX✔️
- 486/891 processed

- aueavui_a2002m10d26.pdf
-- page_1.png OCR✔️ FIX✔️
-- page_2.png OCR✔️ FIX✔️
-- page_4.png OCR✔️ FIX✔️
-- page_3.png OCR✔️ FIX✔️
- 490/891 processed

- aueavui_a2002m10d5.pdf
-- page_2.png OCR✔️ FIX✔️
-- page_1.png OCR✔️ FIX✔️
-- page_3.png OCR✔️ FIX✔️
-- page_4.png OCR✔️ FIX✔️
- 494/891 processed

- aueavui_a2002m11d16.pdf
-- page_1.png OCR✔️ FIX✔️
-- page_2.png OCR✔️ FIX✔️
-- page_4.png OCR✔️ FIX✔️
-- page_3.png OCR✔️ FIX✔️
- 498/891 processed

- aueavui_a2002m11d2.pdf
-- page_1.png OCR✔️ FIX✔️
-- page_2.png OCR✔️ FIX✔️
-- page_3.png OCR✔️ FIX✔️
-- page_4.png OCR✔️ FIX✔️
- 502/891 processed

- aueavui_a2002m11d23.pdf
-- page_1.png OCR✔️ FIX✔️
-- page_3.png OCR✔️ FIX✔️
-- page_4.png OCR✔️ FIX✔️
-- page_2.png OCR✔

KeyboardInterrupt: 

### Create sentence segmented corpus

In [105]:
import os  
import re
import sys
from nltk.tokenize import sent_tokenize

END_PUNCS = ["!", '.', '?', '."', '?"', '!"', "…", ":"]
DOC_FORMATS = [".txt"]

def ellipsis_split(text):
    text = re.sub('\.\.\.', '…', text)
    text = re.sub(' +', ' ', text)
    sents = []
    currsent = ""
    split_points = []
    sent_begin = 0
    for i, c in enumerate(text):
        if c == "…":
            if i+1 < len(text) and text[i+1].islower():
                continue
            elif i+2 < len(text) and text[i+2].islower():
                continue
            else:
                sents.append(text[sent_begin:i+1])
                sent_begin = i + 1
        if i == len(text) - 1:
            sents.append(text[sent_begin:i+1])
    return sents

def is_sent(text):
    return any(text.endswith(punc) for punc in END_PUNCS) and len(text) > 3 and text[0].isupper() and "__" not in text and "…………" not in text

def clean_sent(text):
    text = re.sub(' +', ' ', text)
    text = re.sub("^– ", '', text)
    text = re.sub("^- ", '', text)
    text = re.sub("^\(", '', text)
    text = re.sub("\)$", '', text)
    text = re.sub("^»[a-z]\) ", '', text)
    return text.strip()

def parse_sents(text):
    lines_in_text = [t for t in text.split("\n") if t]
    sents = []
    
    for line in lines_in_text:
        sent_candidates = sent_tokenize(line.strip())
        clean_sentences = [clean_sent(s) for s in sent_candidates]
        sents.extend(clean_sentences)
        
    return sents

def make_corpus(download_summary, general_corpus_path = 'docs/corpus-general.txt', sent_corpus_path = "docs/corpus-sentences.txt"):
    all_publication_lines = []
    complete_sents = []

    for pub in download_summary:
        print(pub['name'])

        fix_dir = os.path.join(pub['directory'], 'fix')
        corpus_file_name = f"{pub['id']}-corpus.txt"
        corpus_path = os.path.join(pub['directory'], corpus_file_name)

        all_sents = []

        for doc_name in os.listdir(fix_dir):
            doc_dir = os.path.join(fix_dir, doc_name)
            if not os.path.isdir(doc_dir):
                continue

            for file_name in os.listdir(doc_dir):
                if file_name == '.DS_Store':
                    continue

                file_path = os.path.join(doc_dir, file_name)
                if os.path.splitext(file_name)[1] in DOC_FORMATS:
                    with open(file_path, 'r', encoding='utf-8') as file:
                        text = file.read()

                    sents = parse_sents(text)
                    all_sents.extend(sents)
                    complete_sents.extend([s for s in sents if is_sent(s)])
                    

        # Write all sentences to the publication corpus file
        with open(corpus_path, 'w', encoding='utf-8') as f:
            for sent in all_sents:
                f.write(sent + '\n')

        # Add to the general corpus
        all_publication_lines.extend(all_sents)

        # Print the number of sentences and words for each publication
        word_count = sum(len(sent.split()) for sent in all_sents)
        print(f"{len(all_sents)} lines, {word_count} words")
        print(f"Corpus written to {corpus_path}")
        print("------------------------------------")

    # Write the general corpus
    with open(general_corpus_path, 'w', encoding='utf-8') as f:
        for sent in all_publication_lines:
            f.write(sent + '\n')

    # Print the number of sentences and words for the general corpus
    general_word_count = sum(len(sent.split()) for sent in all_publication_lines)
    print("====================================")
    print(f"General corpus written to {general_corpus_path}")
    print(f"{len(all_publication_lines)} lines, {general_word_count} words")
    
    # Write the sentence-ensured corpus
    with open(sent_corpus_path, 'w', encoding='utf-8') as f:
        for sent in complete_sents:
            f.write(sent + '\n')
            
    print(f"Sentence corpus written to {sent_corpus_path}")
    sentenced_word_count = sum(len(sent.split()) for sent in complete_sents)
    print(f"{len(complete_sents)} lines, {sentenced_word_count} words")
    

In [106]:
print("===========Generating corpora===========")

# Start time
start_time = time.time()

# Perform OCR and correction
make_corpus(download_summary)

# End time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time} seconds")

Vielha de toti : era revista dera gent de Vielha e Mijaran
265 lines, 6580 words
Corpus written to docs/vietot/vietot-corpus.txt
------------------------------------
Aran un país
115 lines, 2927 words
Corpus written to docs/arapai/arapai-corpus.txt
------------------------------------
Aué : suplement setmanau deth diari Avui
17630 lines, 209009 words
Corpus written to docs/aueavui/aueavui-corpus.txt
------------------------------------
General corpus written to docs/corpus-general.txt
18010 lines, 218516 words
Sentence corpus written to docs/corpus-sentences.txt
8230 lines, 171474 words
Elapsed time: 0.7521059513092041 seconds


# PIPELINE ENDS HERE

Rest of this notebook is for auxiliary tasks

## OCR comparisons

In [None]:
import jiwer
import difflib

In [None]:
# Function to calculate WER and print differences
def compare_texts(ocr_text, gold_text):
    # Manual transformation to lowercase and strip extra spaces
    ocr_text = ocr_text.lower().strip()
    gold_text = gold_text.lower().strip()

    # Calculate CER
    cer = jiwer.cer(gold_text, ocr_text)
    
    # Print WER
    print(f"Character Error Rate (CER): {cer:.2f}\n")
    
    # Print differences
    diff = difflib.ndiff(ocr_text.split(), gold_text.split())
    count = 0
    diffs_str = ""
    for line in diff:
        if line.startswith('- '):
            diffs_str += line[2:] + ' / '
        elif line.startswith('+ '):
            count += 1
            diffs_str += line[2:] + '\n'
            
    print(f"OCR / GOLD ({count}/{len(gold_text.split())}):")
    print(diffs_str)

In [None]:
with open('docs_play/page_3_haiku.txt', 'r') as f:
    postfixed_text = f.read()
    
with open('docs_play/page_3_sonnet.txt', 'r') as f:
    direct_text = f.read()
    
compare_texts(postfixed_text, direct_text)

## Sketches

In [None]:
img_path = 'docs/vietot/img/vietot_a2017n1.pdf/page_6.png'

base64_image = image_to_base64(img_path)

OCR_text = get_text_from_image_with_api(base64_image)

In [None]:
print(OCR_text)

In [None]:
import re

def fix_ocr_text_formatting(text):
    # Replace hyphenated line breaks with nothing (merge words split by hyphens)
    text = re.sub(r'-\n', '', text)
    
    # Preserve double line breaks and replace single line breaks within paragraphs with a space
#     text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    
#     # Replace multiple spaces with a single space
#     text = re.sub(r'\s+', ' ', text)
    
#     # Restore double line breaks as single line breaks
#     text = re.sub(r'\n\s*\n', '\n', text)
    
    # Strip leading and trailing whitespaces
    text = text.strip()
    
    return text

In [None]:
img_path = 'docs/vietot/img/vietot_a2017n1.pdf/page_6.png'

ocr_text = ocr_image_to_text(img_path)

In [None]:
print(fix_ocr_text_formatting(ocr_text))

In [None]:
with open('docs_play/page_6_nolang.txt', 'r') as f:
    postfixed_text = f.read()
    
with open('docs_play/page_6_fra.txt', 'r') as f:
    direct_text = f.read()
    
compare_texts(postfixed_text, direct_text)

In [None]:
compare_texts(postfixed_text, direct_text)