In [None]:
# Updated first cell: read PDFs instead of database
import json
import pandas as pd
from pathlib import Path
import ollama
import PyPDF2

# Folder that holds the regulation PDFs.
# The r'' (raw string) prefix is required: in a normal literal the "\U" of
# "C:\Users" starts a \UXXXXXXXX unicode escape and the whole cell fails with
# a SyntaxError before anything runs.
folder = Path(r'C:\Users\lena\OneDrive - De Vinci\RECOVER\S9\PLM\Hackathon\HACKATHON_The_regulatory-_GPS\pdf')

# glob() gives no ordering guarantee; sort so that positions in the list are
# stable from one run to the next (the processing loop slices by position).
pdf_files = sorted(folder.glob('*.pdf'))
print("PDF files found:", pdf_files)
if not pdf_files:
    # A missing or empty folder would otherwise only show up as "0 loaded".
    print(f"⚠ No PDF found in {folder} (does the folder exist?)")

# Read all PDFs into a list of {"filename", "content"} records.
pdf_texts = []
for pdf_path in pdf_files:
    with open(pdf_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # extract_text() may yield nothing for a page; fall back to "" so the
        # join never receives a non-string.
        text = "\n".join(page.extract_text() or "" for page in reader.pages)
        pdf_texts.append({"filename": pdf_path.name, "content": text})

print(f"Number of PDFs loaded: {len(pdf_texts)}")


PDF files found: [WindowsPath('R141e.pdf'), WindowsPath('R141r1am2e.pdf'), WindowsPath('R142am1e.pdf'), WindowsPath('R142am2e (1).pdf'), WindowsPath('R142am2e.pdf'), WindowsPath('R142e.pdf'), WindowsPath('R143am1e.pdf'), WindowsPath('R143e.pdf'), WindowsPath('R144am2e.pdf'), WindowsPath('R144e.pdf'), WindowsPath('R145am4e.pdf'), WindowsPath('R145e.pdf'), WindowsPath('R146e.pdf'), WindowsPath('R147e.pdf'), WindowsPath('R148am1e.pdf'), WindowsPath('R148am2e.pdf'), WindowsPath('R148am3e.pdf'), WindowsPath('R148am4e.pdf'), WindowsPath('R148am5e.pdf'), WindowsPath('R148am6e.pdf'), WindowsPath('R148e.pdf'), WindowsPath('R148r1am2E.pdf'), WindowsPath('R148rev.1am1e.pdf'), WindowsPath('R149am1e.pdf'), WindowsPath('R149am2e.pdf'), WindowsPath('R149am3e.pdf'), WindowsPath('R149am4e.pdf'), WindowsPath('R149am5E.pdf'), WindowsPath('R149am6e.pdf'), WindowsPath('R149am7e.pdf'), WindowsPath('R149am8e.pdf'), WindowsPath('R149am9e_0.pdf'), WindowsPath('R149e.pdf'), WindowsPath('R149r1am1e.pdf'), Window

In [7]:
def build_prompt(content, country):
    """Build the extraction prompt sent to the language model.

    Args:
        content: Regulatory text to analyse; inserted verbatim between the
            triple-quote delimiters near the end of the prompt.
        country: Value placed in the "pays" field of the JSON template shown
            to the model. It is rendered as a JSON string so quotes or
            non-ASCII characters cannot break the template.

    Returns:
        The full prompt as a single string.
    """
    # Previously the template always said "ONU" and this argument was ignored.
    country_json = json.dumps(country, ensure_ascii=False)
    return f"""
You are a regulation extractor. You must return STRICTLY valid JSON.

THE JSON STRUCTURE IS ABSOLUTELY FIXED.
YOU ARE FORBIDDEN TO:
- ADD new fields
- REMOVE fields
- RENAME fields
- CHANGE nesting
- CHANGE the order of the structure
- RETURN multiple JSON objects
- RETURN anything outside the exact structure shown below

DO NOT PRODUCE:
- ANY comment
- ANY descriptive text
- ANY explanation
- ANY line break outside the JSON
- ANY ```json``` or markdown

You must answer EXCLUSIVELY with valid JSON strictly and unconditionally following the structure below (no other keys and/or type of values will be accepted):

{{
  "pays": {country_json},
  "nom_norme": "string or 'inconnu'",
  "date_publication": "string or 'inconnu'",
  "loi_id": "string or 'inconnu'",
  "pieces_concernees": ["liste de strings"],
  "exigences": [
    {{
      "description": "string",
      "obligation": "obligatoire/interdit/recommandé/inconnu",
      "tests_requis": ["liste ou vide"],
      "chiffres_cles": ["liste ou vide"],
      "article_source": "string ou inconnu",
      "degre_importance": "low/medium/high/unknown"
    }}
  ]
}}

This JSON structure MUST NEVER change. YOU REALLY CAN'T CHANGE THIS STRUCTURE AND MUST NOT ADD OTHER KEYS. 
If a piece of information does not exist, set it to "inconnu".

Some of the PDFs contain graphs, figures, or visual elements.
These MAY contain useful regulatory information.
You MUST interpret and extract information from text, tables, graphs, and figures whenever possible.

The goal is not to produce legalistic text but clear, actionable constraints.

Here is the regulatory text to analyze:
\"\"\"
{content}
\"\"\"

ANSWER ONLY WITH VALID JSON.
"""


In [3]:
# ------------------------------------------------------
# 3. Function that queries the LLM (Mistral) via Ollama
# ------------------------------------------------------

def run_llama(prompt, model="mistral", temperature=0.1):
    """Send a prompt to an Ollama model and return the generated text.

    Despite the function name, the default model is "mistral".

    Args:
        prompt: Full prompt text passed to ``ollama.generate``.
        model: Name of the Ollama model to query. Defaults to "mistral",
            the value that used to be hard-coded.
        temperature: Sampling temperature forwarded in ``options``. Defaults
            to 0.1, the value that used to be hard-coded.

    Returns:
        The value stored under the "response" key of the result returned by
        ``ollama.generate``.
    """
    response = ollama.generate(
        model=model,
        prompt=prompt,
        options={"temperature": temperature},  # kept very low to limit hallucinations
    )
    return response["response"]

In [8]:
# ------------------------------------------------------
# 4. Loop over extracted PDF contents (custom slice)
# ------------------------------------------------------

# Number of PDFs handled in this run; set to None to process every loaded PDF.
MAX_PDFS = 10


def _parse_llm_json(raw_output):
    """Parse the model output as JSON, salvaging an embedded object if needed.

    First tries the output as-is. If that fails, retries on the span from the
    first "{" to the last "}", which drops any text the model added around
    the JSON.

    Raises:
        ValueError: if no brace pair exists or the salvaged span is still not
            valid JSON (json.JSONDecodeError is a ValueError subclass).
    """
    try:
        return json.loads(raw_output)
    except json.JSONDecodeError:
        print("⚠ Invalid JSON, attempting recovery…")
    start = raw_output.index("{")
    end = raw_output.rindex("}") + 1
    return json.loads(raw_output[start:end])


results = []
failed_files = []  # filenames whose model output could not be parsed

for idx, item in enumerate(pdf_texts[:MAX_PDFS]):
    content = item["content"]
    prompt = build_prompt(content, country="ONU")

    print(f"\nProcessing PDF {idx} ({item['filename']})...")

    raw_output = run_llama(prompt)

    try:
        json_obj = _parse_llm_json(raw_output)
    except ValueError:
        # Only parsing failures are handled here; anything else (including
        # Ctrl-C) propagates instead of being hidden by a bare except.
        print("❌ Unable to parse JSON.")
        failed_files.append(item["filename"])
        continue

    results.append(json_obj)

if failed_files:
    print(f"\n⚠ {len(failed_files)} PDF(s) produced no usable JSON: {failed_files}")



Processing PDF 0 (R141e.pdf)...

Processing PDF 1 (R141r1am2e.pdf)...

Processing PDF 2 (R142am1e.pdf)...

Processing PDF 3 (R142am2e (1).pdf)...

Processing PDF 4 (R142am2e.pdf)...

Processing PDF 5 (R142e.pdf)...

Processing PDF 6 (R143am1e.pdf)...

Processing PDF 7 (R143e.pdf)...
⚠ Invalid JSON, attempting recovery…
❌ Unable to parse JSON.

Processing PDF 8 (R144am2e.pdf)...

Processing PDF 9 (R144e.pdf)...


In [10]:
# ------------------------------------------------------
# 5. Export final
# ------------------------------------------------------

# Serialise every extracted record first, then write the text out in one go.
output_path = "structured_norms.json"
serialized_results = json.dumps(results, ensure_ascii=False, indent=2)

with open(output_path, "w", encoding="utf-8") as out_file:
    out_file.write(serialized_results)

print(f"\n✔ Export terminé → {output_path}")


✔ Export terminé → structured_norms.json
