In [None]:
import re
import os
import json

In [None]:
def open_txt(file):
    """Return the full content of a text file as a single string.

    Args:
        file: Path of the text file to read.

    Returns:
        str: The whole file content.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 (the encoding used when the JSON is written)
    # instead of relying on the platform's locale default.
    with open(file, "r", encoding="utf-8") as ocr:
        return ocr.read()

In [None]:
def catalog_content(catalog):
    """Split the raw catalogue text into its numbered entries.

    An entry starts on a line beginning with digits and runs until the next
    line that starts with ``<digits>.`` or until the end of the text, so an
    entry may span several lines.

    Args:
        catalog: Text of the catalogue section.

    Returns:
        list[dict]: One dict per entry with the keys ``"number"``, ``"type"``
        (``"planche"``, ``"sketch"`` or ``"no_planche"``) and
        ``"description"``. The list is empty when no entry is found.
    """
    # Entry pattern. The terminator is a pure lookahead: an entry ends right
    # before the newline preceding the next "<digits>." or at the end of the
    # text. Requiring a consumed "\n" here would silently drop the last entry
    # whenever the text has no trailing newline (e.g. stripped input).
    pattern = r'^(\d+)\.*?\s*(.*?)(?=\n\d+\.|\Z)'

    # findall returns one (number, description) tuple per entry.
    entries = re.findall(pattern, catalog, re.MULTILINE | re.DOTALL)

    catalog_data = []

    for number, description in entries:
        # The lazy `\.*?` leaves the "." that follows the number inside the
        # captured description, so the type markers to look for are `.*`
        # and `."` at the very start of the description.
        if description.startswith('.*'):
            entry_type = "planche"
        elif description.startswith('."'):
            entry_type = "sketch"
        else:
            entry_type = "no_planche"

        catalog_data.append({
            "number": number.strip(),
            "type": entry_type,
            "description": description.strip()
        })

    return catalog_data


In [None]:
def get_limc_structure(file):
    """Read a notice text file and split it into its main sections.

    The sections are located with the headings ``LITERARY SOURCES``,
    ``BIBLIOGRAPHY``, ``CATALOGUE`` and ``COMMENTARY``. When a section cannot
    be located, a ``"Missing ..."`` placeholder string is used instead.

    Args:
        file: Path of the notice text file.

    Returns:
        tuple: ``(notice_name, introduction, literary_source, bibliography,
        catalog_data)``. ``catalog_data`` is the list built by
        ``catalog_content`` when that function is defined, otherwise the raw
        catalogue text.
    """
    text = open_txt(file)

    # Notice name: the leading run of capitals, spaces and hyphens on the
    # first non-blank line. Only horizontal whitespace is allowed inside the
    # name so the match cannot cross the line break and swallow the first
    # capital letter of the following line.
    first_line_match = re.match(r'\s*([A-Z-][A-Z \t-]*)', text)
    notice_name = first_line_match.group(1).strip() if first_line_match else "Unknown"

    # Introduction: everything from the start of the text up to the line
    # starting with "LITERARY SOURCES".
    intro_match = re.search(r'^(.*?)\nLITERARY SOURCES', text, re.DOTALL)
    introduction = intro_match.group(1).strip() if intro_match else "Missing introduction"

    # Literary sources: between "LITERARY SOURCES" and a line starting with
    # "BIBLIOGRAPHY".
    literary_source_match = re.search(r'LITERARY SOURCES\s*(.*?)\nBIBLIOGRAPHY', text, re.DOTALL)
    literary_source = literary_source_match.group(1).strip() if literary_source_match else "Missing literary source"

    # Bibliography: from the first ":" after "BIBLIOGRAPHY" up to a line
    # starting with "CATALOGUE".
    bibliography_match = re.search(r'BIBLIOGRAPHY\s*.*?:\s*(.*?)(?=\nCATALOGUE)', text, re.DOTALL)
    bibliography = bibliography_match.group(1).strip() if bibliography_match else "Missing bibliography"

    # Catalogue: between "CATALOGUE" and a line starting with "COMMENTARY".
    catalog_match = re.search(r'CATALOGUE\s*(.*?)(?=\nCOMMENTARY)', text, re.DOTALL)
    catalog = catalog_match.group(1).strip() if catalog_match else "Missing catalog"

    # Parse the catalogue into entries when the parser cell has been run;
    # otherwise fall back to the raw catalogue text.
    catalog_data = catalog_content(catalog) if 'catalog_content' in globals() else catalog

    return notice_name, introduction, literary_source, bibliography, catalog_data

In [None]:
def create_json(file, output_folder):
    """Parse a notice text file and save its structure as a JSON file.

    The JSON file is named after the input file, with its extension replaced
    by ``.json``, and is written into ``output_folder`` (created if needed).

    Args:
        file: Path of the notice text file to parse.
        output_folder: Folder that receives the generated JSON file.
    """
    notice_name, introduction, literary_source, bibliography, catalog_data = get_limc_structure(file)

    # JSON structure: a single top-level key, the notice name.
    notice = {
        notice_name: {
            "Introduction": introduction,
            "Literary source": literary_source,
            "Bibliography": bibliography,
            "Catalog": catalog_data,
        }
    }

    # Swap only the final extension. A plain str.replace('.txt', '.json')
    # would rewrite every ".txt" occurrence in the name and would leave a
    # name without ".txt" unchanged, so the output could overwrite the input.
    base_name = os.path.splitext(os.path.basename(file))[0]
    json_filename = base_name + '.json'
    output_path = os.path.join(output_folder, json_filename)

    os.makedirs(output_folder, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as result:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(notice, result, ensure_ascii=False, indent=4)

    print(f"{json_filename} saved in {output_folder}")

In [None]:
# Mount Google Drive under /content/drive (requires the google.colab package,
# i.e. a Colab runtime) so the paths used below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Input notice text file and the folder that receives the generated JSON.
file = '/content/drive/MyDrive/Hackhaton_Hercule/data/Herakles_cleaned.txt'
output_folder = '/content/drive/MyDrive/Hackhaton_Hercule/output'

In [None]:
# Parse the notice and write its JSON file into output_folder.
create_json(file, output_folder)

test
Herakles_cleaned.json saved in /content/drive/MyDrive/Hackhaton_Hercule/output
