In [None]:
!pip install -q -U google-generativeai
!pip install PyPDF2 google-generativeai

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:


# --- IMPORTAR LIBRERÍAS ---
import PyPDF2
import json
import os
import google.generativeai as genai
from google.colab import files

# --- CONFIGURE API ---
# Prefer the GEMINI_API_KEY environment variable so the key never has to be
# saved inside the notebook. The empty-string fallback can still be replaced
# by a pasted key for a quick throwaway run (do not commit it).
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")  # ← or paste your API key here
if not GEMINI_API_KEY:
    # Warn early instead of letting the first request be the place the
    # missing key is discovered.
    print("⚠️ GEMINI_API_KEY no está definida.")
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel("gemini-1.5-flash")

# --- FUNCIONES ---
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF, joined with newlines.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages that yielded non-empty text, separated by
        newlines (an empty string when no page yielded any), or None if the
        file could not be opened or parsed.
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Extract each page exactly once: extraction is the costly step,
            # so it must not be repeated just to filter out empty pages.
            page_texts = (page.extract_text() for page in pdf_reader.pages)
            return "\n".join(page_text for page_text in page_texts if page_text)
    except Exception as e:
        # Best-effort by design: report the problem and signal failure with None.
        print(f"❌ Error extrayendo texto del PDF: {str(e)}")
        return None

def generate_json_with_gemini(text, max_chars=30000):
    """Ask the Gemini model to restructure ``text`` as JSON.

    Args:
        text: Plain text to analyse.
        max_chars: Maximum number of characters of ``text`` embedded in the
            prompt. Defaults to 30000 (the limit this notebook has always
            used); pass None to send the whole text.

    Returns:
        The raw text of the model's reply, or None if the request failed.
    """
    try:
        # Truncation used to happen silently; tell the user when part of the
        # document is not going to be analysed.
        if max_chars is not None and len(text) > max_chars:
            print(f"⚠️ Texto truncado a {max_chars} caracteres (original: {len(text)}).")
        excerpt = text[:max_chars]

        prompt = f"""
Analiza el siguiente texto y devuélvelo directamente como JSON estructurado con el siguiente formato:

{{
  "metadata": {{
    "titulo": "string",
    "autor": "string",
    "fecha": "string"
  }},
  "contenido": {{
    "secciones": [
      {{
        "titulo_seccion": "string",
        "texto": "string",
        "keywords": ["string"]
      }}
    ]
  }}
}}

Texto:
{excerpt}
"""
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        # Best-effort by design: any failure while building or sending the
        # request is reported and turned into None.
        print(f"❌ Error con Gemini: {str(e)}")
        return None

def clean_and_validate_json(gemini_output):
    """Pull the JSON document out of a model reply and pretty-print it.

    Args:
        gemini_output: Raw reply text; may be bare JSON, JSON wrapped in a
            Markdown code fence, or JSON surrounded by extra prose.

    Returns:
        The JSON re-serialised with two-space indentation and non-ASCII
        characters preserved, or None if no valid JSON could be parsed.
    """
    try:
        candidate = gemini_output.strip()

        # Remove a Markdown fence only when it encloses the whole reply.
        # Deleting every "```" in the text would also strip backticks that
        # legitimately appear inside JSON string values.
        if candidate.startswith("```"):
            candidate = candidate[3:]
            if candidate.startswith("json"):
                candidate = candidate[4:]
            if candidate.endswith("```"):
                candidate = candidate[:-3]
            candidate = candidate.strip()

        try:
            json_obj = json.loads(candidate)
        except json.JSONDecodeError:
            # The reply has text around the JSON: decode the first object or
            # array found and ignore whatever follows it.
            starts = [pos for pos in (candidate.find("{"), candidate.find("[")) if pos != -1]
            if not starts:
                raise
            json_obj, _ = json.JSONDecoder().raw_decode(candidate[min(starts):])

        return json.dumps(json_obj, ensure_ascii=False, indent=2)
    except Exception as e:
        # Best-effort by design: report the problem and signal failure with None.
        print(f"❌ Error validando JSON: {str(e)}")
        return None

def process_pdf_and_generate_json(pdf_path, output_dir="/content/output"):
    """Run the full pipeline for one PDF: extract, query Gemini, validate, save.

    Args:
        pdf_path: Path of the PDF file to process.
        output_dir: Directory that receives the generated ``.json`` file
            (created if missing). Defaults to the location this notebook has
            always used.

    Returns:
        True when the JSON file was written and handed to ``files.download``;
        None when any stage failed.
    """
    try:
        print(f"📄 Procesando archivo: {pdf_path}")
        text = extract_text_from_pdf(pdf_path)
        if not text:
            # None means extraction already reported an error; an empty
            # string means the PDF simply had no extractable text, which
            # would otherwise abort without any message.
            if text is not None:
                print("⚠️ El PDF no contiene texto extraíble.")
            return None

        print("🤖 Enviando a Gemini...")
        gemini_output = generate_json_with_gemini(text)
        if not gemini_output:
            return None

        print("🧪 Validando JSON...")
        validated_json = clean_and_validate_json(gemini_output)
        if not validated_json:
            return None

        os.makedirs(output_dir, exist_ok=True)
        # The output keeps the PDF's base name, with the extension swapped to .json.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        output_path = os.path.join(output_dir, f"{stem}.json")

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(validated_json)

        print(f"✅ JSON guardado en: {output_path}")
        files.download(output_path)
        return True
    except Exception as e:
        # Best-effort by design: report the problem and signal failure with None.
        print(f"❌ Error general: {str(e)}")
        return None

# --- MAIN EXECUTION ---
print("⬆️ Sube tu archivo PDF")
uploaded = files.upload()

if uploaded:
    for filename in uploaded:
        # Uploaded files are saved under their own name relative to the
        # current working directory, which is /content only by default.
        # Resolve against the cwd rather than hard-coding that prefix.
        process_pdf_and_generate_json(os.path.abspath(filename))
else:
    print("⚠️ No se subió ningún archivo.")


⬆️ Sube tu archivo PDF


Saving formato conciliacion.pdf to formato conciliacion.pdf
📄 Procesando archivo: /content/formato conciliacion.pdf
🤖 Enviando a Gemini...
🧪 Validando JSON...
✅ JSON guardado en: /content/output/formato conciliacion.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>