In [7]:
import pandas as pd

# Path (relative to this notebook) of the parquet file to explore; the same
# variable is later handed to load_and_clean_data(), which reads the file again.
parquet_file = "../data/gallica_presse_1_1.parquet"

# Full table kept in memory for interactive inspection (see data.head() below).
data = pd.read_parquet(parquet_file)

In [9]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp311-cp311-macosx_11_0_arm64.whl (982 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m982.4/982.4 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [2]:
# Preview the first rows to check the available columns, notably `text` and
# `corrected_text`, which the cleaning step further down consumes.
data.head()

Unnamed: 0,index_id,file_id,ocr,title,date,author,page_count,word_count,character_count,text,corrected_text
0,940,bpt6k6441992x,99,Journal officiel de la République française. D...,1902-01-20,,112,58622,356947,\nCHAMBRE DES DÉPUTÉS 71 législature. Session ...,CHAMBRE DES DÉPUTÉS 71e législature. Session o...
1,941,bpt6k2353312n,86,Journal de la Manche et de la Basse-Normandie ...,1913-03-01,,22,31657,188831,\nDixième Année.— N©960 \nDIX CENTIMES LE NUME...,Dixième Année.— 1960 \nDIX CENTIMES LE NUMÉRO ...
2,942,bpt6k605149p,99,Le Petit Parisien : journal quotidien du soir,1922-09-22,,22,27888,175974,\nLES DÉLIBÉRATIONS \nSORUlUESDID'lEir reprenn...,LES DÉLIBÉRATIONS \nSOURCUES DID'ELIRE reprenn...
3,943,bpt6k54904729,93,Bulletin du Photo-club de Constantine...,1894-12,Photo club de Constantine,34,2715,17059,\nDeuxième année \nDÉCEMBRE 1894 \nN° 16 \nFou...,Deuxième année\nDÉCEMBRE 1894\nN° 16\nPour les...
4,944,bpt6k611668p,99,L'Ouest-Éclair,1926-07-29,,40,51163,317488,\nL'UNION SACREE POUR LE TRAVAIL \nNotre colla...,L'UNION SACRÉE POUR LE TRAVAIL \nNotre collabo...


In [5]:
import numpy as np

# Per-model settings consumed by write_datafile():
#   magic       -> value stored in header slot 0
#   version     -> value stored in header slot 1
#   token_dtype -> numpy dtype used to serialize each token id
HEADERS_INFO = {
    "gpt-2": dict(magic=20240520, version=1, token_dtype=np.uint16),
    "llama-3": dict(magic=20240801, version=7, token_dtype=np.uint32),
}


In [6]:
def write_datafile(filename, toks, model_desc="gpt-2"):
    """
    Serialize a token sequence to a binary .bin file, for reading in C.

    File layout:
    - a header of 256 int32 values: slot 0 holds the model's magic number,
      slot 1 the format version, slot 2 the number of tokens, the rest stay 0
    - the tokens, each stored as uint16 (gpt-2) or uint32 (llama-3)

    Args:
        filename: path of the file to create (opened with "wb").
        toks: sequence of token ids.
        model_desc: "gpt-2" or "llama-3"; selects the HEADERS_INFO entry.
    """
    assert len(toks) < 2**31, "token count too large" # ~2.1B tokens
    assert model_desc in ["gpt-2", "llama-3"], f"unknown model descriptor {model_desc}"
    spec = HEADERS_INFO[model_desc]

    # header is always 256 int32 values; only the first three slots are filled
    header = np.zeros(256, dtype=np.int32)
    for slot, value in enumerate((spec["magic"], spec["version"], len(toks))):
        header[slot] = value

    # token payload, stored with the dtype the selected model expects
    payload = np.array(toks, dtype=spec["token_dtype"])

    # 256 * 4 header bytes come first, then the tokens
    num_bytes = (256 * 4) + (len(toks) * payload.itemsize)
    print(f"writing {len(toks):,} tokens to {filename} ({num_bytes:,} bytes) in the {model_desc} format")
    with open(filename, "wb") as f:
        for part in (header, payload):
            f.write(part.tobytes())

In [11]:
import os
import pandas as pd
import re
import tiktoken
from transformers import AutoTokenizer
import struct
from tqdm import tqdm

# Define constants
# DATA_DIR receives every artifact written by the functions below (the
# formatted text file and the tokenized .bin files). It is created up front;
# exist_ok=True keeps re-running this cell harmless.
DATA_DIR = "../data/processed_ocr_data"
os.makedirs(DATA_DIR, exist_ok=True)

# 1. Load and Clean OCR Data
def load_and_clean_data(parquet_file):
    """Load OCR/correction pairs, clean them and dump them to a text file.

    The parquet file must provide a ``text`` and a ``corrected_text`` column;
    rows missing either value are dropped. Both strings are cleaned with
    ``clean_text`` and every pair is written to
    ``DATA_DIR/ocr_text_data.txt`` in this layout::

        ### Text ###
        <cleaned text>
        (two blank lines)
        ### Correction ###
        <cleaned corrected text>
        (one blank line)

    Args:
        parquet_file: path of the parquet file to read.

    Returns:
        Path of the text file that was written.
    """
    print("Loading data...")
    df = pd.read_parquet(parquet_file)

    # Keep only complete pairs. A filtered copy is used instead of mutating
    # the frame in place, and the cleaned strings are kept in plain lists.
    pairs = df.dropna(subset=['text', 'corrected_text'])

    def clean_text(text):
        """Return ``text`` on a single line with control characters removed.

        Only non-whitespace C0/C1 control characters are deleted. Accented
        letters and every other non-ASCII character are preserved: the corpus
        is French and a correction frequently consists of restoring an accent
        (e.g. "SACREE" -> "SACRÉE"), so an ASCII-only filter would destroy the
        training target.
        """
        # Drop control characters first so they cannot leave doubled spaces.
        text = re.sub(r'[\x00-\x08\x0e-\x1b\x7f-\x84\x86-\x9f]', '', text)
        # Collapse every run of whitespace (spaces, tabs, newlines) to one space.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    print("Cleaning text data...")
    ocr_texts = [clean_text(text) for text in tqdm(pairs['text'], desc="Processing OCR Text")]
    corrected_texts = [clean_text(text) for text in tqdm(pairs['corrected_text'], desc="Processing Corrected Text")]

    # Combine text pairs
    print("Formatting data...")
    formatted = [
        f"### Text ###\n{text}\n\n\n### Correction ###\n{corr_text}\n"
        for text, corr_text in tqdm(zip(ocr_texts, corrected_texts), total=len(pairs), desc="Formatting Text")
    ]

    # Save formatted text to a file; the extra newline leaves one blank line
    # between consecutive pairs.
    text_filename = os.path.join(DATA_DIR, "ocr_text_data.txt")
    with open(text_filename, 'w', encoding='utf-8') as f:
        for line in tqdm(formatted, desc="Saving formatted text"):
            f.write(line + "\n")

    print(f"Formatted data saved to {text_filename}")
    return text_filename

# 2. Tokenization Function
def tokenize_ocr_data(text_filename, model_desc="gpt-2", val_token_count=32768):
    """Tokenize the formatted text file and write validation/training token files.

    The file is cut into documents, one per "### Text ### ... ### Correction ###"
    pair, and each document is emitted as the end-of-text token followed by its
    tokens. The separator token is therefore placed between pairs only, never
    between an OCR text and its correction. A file that contains no
    "### Text ###" marker is treated as a single document.

    The first ``val_token_count`` tokens of the resulting stream form the
    validation split and the remainder the training split; both are written
    with ``save_tokens`` to ``DATA_DIR`` as ``ocr_val_<model_desc>.bin`` and
    ``ocr_train_<model_desc>.bin``.

    Args:
        text_filename: path of the UTF-8 text file to tokenize.
        model_desc: "gpt-2" (tiktoken, 2 bytes per token) or "llama-3"
            (Hugging Face tokenizer, 4 bytes per token).
        val_token_count: size of the validation split, in tokens.

    Raises:
        ValueError: if ``model_desc`` is unknown or ``val_token_count`` is negative.
    """
    print(f"Tokenizing data using {model_desc} model...")

    if model_desc == "gpt-2":
        enc = tiktoken.get_encoding("gpt2")
        encode = enc.encode_ordinary
        eot = enc.eot_token  # public accessor for the '<|endoftext|>' id
        token_size = 2  # GPT-2 tokens use uint16 (2 bytes)
    elif model_desc == "llama-3":
        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B")
        encode = lambda s: tokenizer.encode(s, add_special_tokens=False)
        # First id produced for the empty string, i.e. whichever special token
        # the tokenizer adds by default.
        # NOTE(review): presumably the begin-of-text id for Llama 3 rather than
        # a true end-of-text token - confirm it is the intended separator.
        eot = tokenizer.encode('')[0]
        token_size = 4  # LLaMA tokens use uint32 (4 bytes)
    else:
        raise ValueError(f"Unknown model descriptor {model_desc}")

    if val_token_count < 0:
        raise ValueError(f"val_token_count must be >= 0, got {val_token_count}")

    # Read the text
    with open(text_filename, 'r', encoding='utf-8') as f:
        text = f.read()

    # Split on the blank line that precedes each "### Text ###" header. A plain
    # split on "\n\n" would also cut inside the "\n\n\n" that separates a text
    # from its correction and insert the separator token in the middle of a pair.
    sections = re.split(r"\n\n(?=### Text ###\n)", text)
    tokens = []

    print("Tokenizing text sections...")
    for i, s in tqdm(enumerate(sections), total=len(sections), desc="Tokenizing"):
        tokens.append(eot)
        # Give back the separator consumed by the split (the last section keeps
        # whatever trailing newlines the file ends with).
        spad = s + "\n\n" if i != len(sections) - 1 else s
        tokens.extend(encode(spad))

    # The first `val_token_count` tokens are held out for validation; everything
    # after that is used for training.
    val_tokens = tokens[:val_token_count]
    train_tokens = tokens[val_token_count:]

    # Print token counts
    print(f"Total Tokens: {len(tokens)}")
    print(f"Training Tokens: {len(train_tokens)}")
    print(f"Validation Tokens: {len(val_tokens)}")
    if not train_tokens:
        print(
            f"Warning: the corpus has only {len(tokens)} tokens, so the "
            f"validation split ({val_token_count} tokens) leaves no training data."
        )

    # Save tokenized data
    val_filename = os.path.join(DATA_DIR, f"ocr_val_{model_desc}.bin")
    train_filename = os.path.join(DATA_DIR, f"ocr_train_{model_desc}.bin")

    save_tokens(val_filename, val_tokens, token_size)
    save_tokens(train_filename, train_tokens, token_size)

    print(f"Tokenized data saved to {val_filename} and {train_filename}")

# 3. Save Tokenized Data
def save_tokens(filename, token_list, token_size):
    """Dump token ids to ``filename`` as fixed-width little-endian integers.

    No header is written: the file holds the tokens back to back, each one
    occupying ``token_size`` bytes.

    Args:
        filename: destination path, opened with 'wb'.
        token_list: iterable of Python ints (``int.to_bytes`` is called on each).
        token_size: number of bytes used to store one token.
    """
    print(f"Saving tokenized data to {filename}...")
    record_format = f"<{token_size}s"
    with open(filename, 'wb') as sink:
        progress = tqdm(token_list, desc=f"Writing to {filename}")
        sink.writelines(
            struct.pack(record_format, tok.to_bytes(token_size, byteorder="little"))
            for tok in progress
        )

# Run the cleaning step on the parquet file chosen in the first cell; the
# returned path is what the tokenization cell below consumes.
text_file = load_and_clean_data(parquet_file)



Loading data...
Cleaning text data...


Processing OCR Text: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2447/2447 [00:23<00:00, 103.57it/s]
Processing Corrected Text: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2447/2447 [00:20<00:00, 121.40it/s]


Formatting data...


Formatting Text: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2447/2447 [00:00<00:00, 5190.53it/s]
Saving formatted text: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2447/2447 [00:00<00:00, 3145.26it/s]

Formatted data saved to ../data/processed_ocr_data/ocr_text_data.txt





In [None]:

# Tokenize and split the data (choose 'gpt-2' or 'llama-3')
# `text_file` is the path returned by load_and_clean_data() in the previous
# cell, so that cell must have been run first.
tokenize_ocr_data(text_file, model_desc="gpt-2")