In [9]:
import fitz  # PyMuPDF
import os
import re
import pandas as pd

# 📁 Folder that holds the PDF files to process.
# NOTE(review): this is an absolute Windows path tied to one user's machine;
# adjust it before running elsewhere.
pdf_folder_path = 'C:\\Users\\CAROLINA\\Documents\\02_GitHub\\CV_Structure'

# Extract the text of a PDF and save it as .txt
def extract_text_and_save_as_txt(pdf_path, output_txt_path, max_pages=300):
    """Extract plain text from a PDF and write it to a UTF-8 text file.

    Errors are reported on stdout instead of being raised, so one broken PDF
    does not abort a batch run. When an error occurs the output file may not
    have been written.

    Args:
        pdf_path: Path of the PDF to read.
        output_txt_path: Path of the text file to create or overwrite.
        max_pages: Upper bound on the number of leading pages to read.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            # Join once instead of growing a string with += page by page.
            text = "".join(
                page.get_text("text") for page in doc[:max_pages]
            )
        finally:
            # Always release the document, even if a page fails to parse;
            # otherwise the handle on the PDF stays open.
            doc.close()

        with open(output_txt_path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f"✅ Texto guardado como .txt: {output_txt_path}")
    except Exception as e:
        # Deliberate best-effort boundary: report and let the caller go on.
        print(f"❌ Error al procesar {pdf_path}: {e}")

# Extract a specific section from the text
def extract_section(text, section_keywords):
    """Return the lines that follow a section header, joined by spaces.

    A line containing any of ``section_keywords`` (case-insensitive substring
    match) starts collection and is itself skipped. Collection ends at the
    first later line that contains one of the built-in section markers below.
    Returns an empty string when no header line is found.
    """
    start_markers = tuple(keyword.lower() for keyword in section_keywords)
    # Words that signal the beginning of some other section of the document.
    stop_markers = (
        "educación", "educacion", "estudios",
        "experiencia", "idiomas", "lenguas", "lenguajes",
        "actividades", "voluntariado", "otros",
    )

    collected = []
    inside_section = False

    for raw_line in text.split("\n"):
        stripped = raw_line.strip()
        lowered = stripped.lower()

        # Header check comes first, so a repeated header is skipped rather
        # than treated as a stop marker.
        if any(marker in lowered for marker in start_markers):
            inside_section = True
            continue

        if not inside_section:
            continue

        if any(marker in lowered for marker in stop_markers):
            break

        collected.append(stripped)

    return " ".join(collected).strip()

# Extract the key fields from the .txt content
def extract_fields_from_txt(text, filename):
    """Build one record of candidate fields from extracted text.

    The first non-empty line is taken as the candidate name. E-mail, phone
    and LinkedIn URL are located with regular expressions, and the remaining
    fields come from ``extract_section``.

    Args:
        text: Full text of the document; ``None`` is treated as empty.
        filename: Name stored in the "Archivo" column.

    Returns:
        dict with the keys "Nombre", "Correo", "Teléfono", "LinkedIn",
        "Educación", "Experiencia", "Idiomas", "Extras" and "Archivo". Every
        value is a string, empty when nothing was found.
    """
    stripped_lines = (line.strip() for line in (text or "").split("\n"))
    lines = [line for line in stripped_lines if line]
    full_text = "\n".join(lines)

    candidate_name = lines[0] if lines else ""

    email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', full_text)
    # Only same-line whitespace is allowed between digits. A plain \s also
    # matches "\n", which let the number swallow digits from the next line.
    phone_match = re.search(r'\+?\d(?:[\d\-]|[^\S\r\n]){7,20}', full_text)
    # Host names are case-insensitive ("LinkedIn.com/in/..." is common).
    linkedin_match = re.search(
        r'linkedin\.com/in/[^\s]+', full_text, re.IGNORECASE
    )

    education_text = extract_section(full_text, [
        "educación", "educacion", "estudios", "educación/estudios", "educacion/estudios"
    ])

    experience_text = extract_section(full_text, [
        "experiencia laboral", "experiencia profesional"
    ])

    language_text = extract_section(full_text, [
        "idiomas", "lenguas", "lenguajes"
    ])

    extras_text = extract_section(full_text, [
        "actividades extracurriculares", "voluntariado", "voluntariados"
    ])

    # Force empty strings when nothing was found
    return {
        "Nombre": candidate_name,
        "Correo": email_match.group() if email_match else "",
        # The pattern may end on a separator space; drop it.
        "Teléfono": phone_match.group().strip() if phone_match else "",
        "LinkedIn": linkedin_match.group() if linkedin_match else "",
        "Educación": education_text or "",
        "Experiencia": experience_text or "",
        "Idiomas": language_text or "",
        "Extras": extras_text or "",
        "Archivo": filename or "",
    }

# Process everything: PDF → TXT → CSV
def process_pdfs_to_csv(folder_path, max_pages=300):
    """Convert every PDF in a folder into one row of a summary CSV.

    Each PDF is first dumped to a temporary ``.txt`` file next to it, the
    text is parsed into fields, and all rows are written to
    ``resumen_postulantes.csv`` inside ``folder_path``. The temporary text
    files are removed afterwards, even if building or writing the CSV fails.

    NOTE: a pre-existing ``.txt`` file that shares its base name with a PDF
    in the folder is overwritten and then deleted.

    Args:
        folder_path: Folder that contains the PDFs and receives the CSV.
        max_pages: Maximum number of pages read from each PDF.
    """
    data = []
    # (original PDF file name, temporary .txt path) pairs. Keeping the real
    # file name avoids rebuilding it from the .txt name, which rewrote every
    # ".txt" inside the name and lost the extension's original case.
    txt_jobs = []

    try:
        for filename in os.listdir(folder_path):
            if not filename.lower().endswith(".pdf"):
                continue
            pdf_path = os.path.join(folder_path, filename)
            txt_name = os.path.splitext(filename)[0] + ".txt"
            txt_path = os.path.join(folder_path, txt_name)
            extract_text_and_save_as_txt(pdf_path, txt_path, max_pages)
            # The extractor reports failures itself; only queue files that
            # exist so a failed PDF does not trigger follow-up errors.
            if os.path.isfile(txt_path):
                txt_jobs.append((filename, txt_path))

        for filename, txt_path in txt_jobs:
            try:
                with open(txt_path, "r", encoding="utf-8") as f:
                    text = f.read()
                data.append(extract_fields_from_txt(text, filename))
            except Exception as e:
                print(f"⚠️ No se pudo procesar {txt_path}: {e}")

        # Save the CSV
        df = pd.DataFrame(data)
        csv_path = os.path.join(folder_path, "resumen_postulantes.csv")
        df.to_csv(csv_path, index=False, encoding="utf-8-sig")
        print(f"\n📄 CSV generado con éxito: {csv_path}")
    finally:
        # Delete the temporary .txt files, whatever happened above
        for _, txt_path in txt_jobs:
            try:
                os.remove(txt_path)
                print(f"🗑️ Borrado: {txt_path}")
            except OSError as e:
                print(f"❌ No se pudo borrar {txt_path}: {e}")

# Run the whole pipeline on the configured folder
process_pdfs_to_csv(pdf_folder_path)

✅ Texto guardado como .txt: C:\Users\CAROLINA\Documents\02_GitHub\CV_Structure\Ana Paulina CV.txt
✅ Texto guardado como .txt: C:\Users\CAROLINA\Documents\02_GitHub\CV_Structure\Antony Patino CV.txt
✅ Texto guardado como .txt: C:\Users\CAROLINA\Documents\02_GitHub\CV_Structure\CV - Bruno Fabricio Florian Oliveros - currículum vítae.txt
✅ Texto guardado como .txt: C:\Users\CAROLINA\Documents\02_GitHub\CV_Structure\CV - Christopher Antony Bolo Añorga.txt
✅ Texto guardado como .txt: C:\Users\CAROLINA\Documents\02_GitHub\CV_Structure\CV Carlos Soller.txt
✅ Texto guardado como .txt: C:\Users\CAROLINA\Documents\02_GitHub\CV_Structure\CV David Silva Jaime.txt
✅ Texto guardado como .txt: C:\Users\CAROLINA\Documents\02_GitHub\CV_Structure\CV Fabiola Medina_3.txt
✅ Texto guardado como .txt: C:\Users\CAROLINA\Documents\02_GitHub\CV_Structure\CV- ANGELA SAAVEDRA VITE (2024)- OFICIAL.txt
✅ Texto guardado como .txt: C:\Users\CAROLINA\Documents\02_GitHub\CV_Structure\CV- Fiorella Velasquez.txt

📄 CS

In [None]:
# Delete the generated .txt files
# `txt_files` only exists inside process_pdfs_to_csv, so the list is rebuilt
# here: one .txt path per PDF found in the folder.
txt_files = [
    os.path.join(pdf_folder_path, os.path.splitext(pdf_name)[0] + ".txt")
    for pdf_name in os.listdir(pdf_folder_path)
    if pdf_name.lower().endswith(".pdf")
]
for txt_path in txt_files:
    if not os.path.isfile(txt_path):
        # Nothing left to clean up for this PDF.
        continue
    try:
        os.remove(txt_path)
        print(f"🗑️ Borrado: {txt_path}")
    except OSError as e:
        print(f"❌ No se pudo borrar {txt_path}: {e}")