In [9]:
import os
import re
import pandas as pd
from docx import Document
import spacy
from datetime import datetime
import unicodedata
from pdf2image import convert_from_path
from pytesseract import image_to_string
from PIL import Image

# Load spaCy's Spanish NER model
#python -m spacy download es_core_news_lg

nlp = spacy.load("es_core_news_lg")

# Tesseract OCR setup (Ensure Tesseract is installed and path is correct)
TESSDATA_PREFIX = "/usr/share/tesseract-ocr/4.00/tessdata"
os.environ["TESSDATA_PREFIX"] = TESSDATA_PREFIX

def extract_text_from_docx(file_path):
    """
    Extract all text from a Word document.
    """
    doc = Document(file_path)
    return "\n".join(paragraph.text for paragraph in doc.paragraphs)

def extract_text_from_pdf(file_path):
    """
    Extract text from a PDF file using OCR.
    Converts each page of the PDF to an image and extracts text using Tesseract.
    """
    pages = convert_from_path(file_path)
    text = ""
    for page in pages:
        text += image_to_string(page, lang="spa") + "\n"
    return text

def extract_text_from_png(file_path):
    """
    Extract text from a PNG file using OCR.
    """
    image = Image.open(file_path)
    return image_to_string(image, lang="spa")

def remove_accents(text):
    return ''.join(
        (c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')
    )

def extract_cc(text):
    """
    Extract CC (Cédula de Ciudadanía) from the given text.
    """
    # Normalize the text by removing accents
    text_normalized = remove_accents(text).lower()
    print(f"Normalized Text: {text_normalized}")  # Debugging

    # Enhanced regex to handle different formats
    cc_match = re.search(
        r"(?:cc|cedula(?: de ciudadania)?|cedula|ciudadania|documento)(?:\s*(?:numero|num|no\.?|no:|número|#)?\s*[:\s]*)?([\d\s\-\.]{6,})",
        text_normalized,
        re.IGNORECASE,
    )

    if cc_match:
        # Extract the raw CC part from the match
        cc_raw = cc_match.group(1)
        print(f"Raw CC Match: {cc_raw}")  # Debugging
        
        # Clean up to remove non-digit characters
        cc_cleaned = re.sub(r"[^\d]", "", cc_raw)
        print(f"Cleaned CC: {cc_cleaned}")  # Debugging

        # Return if it meets a reasonable length criterion
        if len(cc_cleaned) >= 6:  # Adjust the minimum length if needed
            return cc_cleaned

    # Return None if no match is found
    return None

def extract_name(text):
    """
    Extracts a person's name from the text.

    Args:
        text (str): Input text.

    Returns:
        str: Extracted name or None if no name is found.
    """
    # Common words or titles that should be removed before extracting the name
    stop_words = {"la señora", "el señor", "señor", "señora", "la", "el", "El", "La", "identificado", "con", "cedula", "de", "ciudadania"}

    # Step 1: Normalize and clean the text
    text = " ".join(text.split())

    # Step 2: Use SpaCy's Named Entity Recognition to find the person
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "PER":  # Look for "Person" entities
            # Clean the name by removing any stop words (like titles)
            cleaned_name = " ".join(
                word for word in ent.text.split() if word.lower() not in stop_words
            )
            return cleaned_name.strip()

    # Step 3: Regex fallback for structured patterns like "el señor [NAME]"
    # This regex looks for common patterns in Spanish like "señor [NAME]" or "identificado con cédula de [NAME]"
    context_pattern = r"(?i)(?:el\s+señor|la\s+señora|señor(?:a)?|identificado\s+con\s+c[eé]dula\s+de)\s+([A-ZÁÉÍÓÚÑ][A-Za-zÁÉÍÓÚÑ]*(?:\s+[A-ZÁÉÍÓÚÑ][A-Za-zÁÉÍÓÚÑ]*)+)"
    match = re.search(context_pattern, text)
    if match:
        candidate_name = match.group(1).strip()
        return candidate_name.title()

    # Step 4: Return None if no name is found
    return None


import re
from datetime import datetime
import dateparser

import re
from datetime import datetime
import dateparser

def extract_tiempo_laborado(text):
    """
    Extracts the start and end dates of a working period from text.
    Handles phrases like "desde [day] [month] [year]" and "hasta [day] [month] [year]".
    
    Args:
        text (str): Input text containing work details.

    Returns:
        tuple: (from_date, to_date) in 'YYYY-MM-DD' format.
    """
    # Extract the current date
    current_date = datetime.now()

    # Initialize from_date and to_date
    from_date, to_date = None, None

    # Extract the start date using regex for the pattern "desde el día [day] [month] [year]" or "desde [day] [month] [year]"
    desde_match = re.search(r"desde\s+el?\s*(día\s*)?(\d{1,2})\s*de\s*(\w+)\s*de\s*(\d{4})", text, re.IGNORECASE)
    
    if desde_match:
        # Extract day, month, and year
        day = desde_match.group(2)
        month = desde_match.group(3)
        year = desde_match.group(4)
        
        # Format the start date as 'YYYY-MM-DD'
        from_date = dateparser.parse(f"{day} {month} {year}", languages=['es']).strftime('%Y-%m-%d')
    
    # Extract the end date using regex for the pattern "hasta el [day] [month] [year]"
    hasta_match = re.search(r"hasta\s+el?\s*(\d{1,2})\s*de\s*(\w+)\s*de\s*(\d{4})", text, re.IGNORECASE)
    
    if hasta_match:
        # Extract day, month, and year
        day = hasta_match.group(1)
        month = hasta_match.group(2)
        year = hasta_match.group(3)
        
        # Format the end date as 'YYYY-MM-DD'
        to_date = dateparser.parse(f"{day} {month} {year}", languages=['es']).strftime('%Y-%m-%d')
    
    # Handle "hasta hoy" or similar phrases
    present_terms = ["hasta hoy", "hoy", "actualidad", "presente", "hasta la fecha"]
    if any(term in text.lower() for term in present_terms):
        to_date = current_date.strftime('%Y-%m-%d')  # Set "to_date" to today's date
    
    # Fallback if no "hasta" date was found, default to the current date
    if not to_date:
        to_date = current_date.strftime('%Y-%m-%d')

    # If no "desde" date was found, attempt to use the first year mentioned
    if not from_date:
        year_match = re.search(r"(\d{4})", text)
        if year_match:
            from_date = f"{year_match.group(1)}-01-01"  # Default to January 1st of the year

    return from_date, to_date


import spacy
import re


def extract_name(text):
    """
    Extracts a person's name from the text after removing common titles and roles.

    Args:
        text (str): Input text.

    Returns:
        str: Extracted name or None if no name is found.
    """
    # Define stop words (titles, roles, etc.)
    stop_words = {
        "la señora", "el señor", "señor", "señora", "la", "el", "El", "La",
        "identificado(a)", "certifica", "que", "director", "directora",
        "coordinador", "coordinadora", "gerente", "jefe", "jefa", "cargo", "responsable", "empleado"
    }

    # Step 1: Clean the text by removing unwanted words
    cleaned_text = text
    for word in stop_words:
        cleaned_text = re.sub(r'\b' + re.escape(word) + r'\b', '', cleaned_text, flags=re.IGNORECASE)

    # Step 2: Use SpaCy's Named Entity Recognition (NER) to extract person names
    doc = nlp(cleaned_text)
    for ent in doc.ents:
        if ent.label_ == "PER":  # Look for "Person" entities
            return ent.text.strip()

    # Step 3: Regex fallback for structured patterns like "el señor [NAME]"
    context_pattern = r"(?i)(?:señor(?:a)?\s+)([A-ZÁÉÍÓÚÑ][A-Za-zÁÉÍÓÚÑ]*(?:\s+[A-ZÁÉÍÓÚÑ][A-Za-zÁÉÍÓÚÑ]*)+)"
    match = re.search(context_pattern, cleaned_text)
    if match:
        candidate_name = match.group(1).strip()
        return candidate_name.title()

    # Step 4: Return None if no match is found
    return None

def extract_salario(text):
    """
    Extract salary information from the given text using regex.
    Removes all non-numeric characters from the salary value.
    """
    # Regex to capture salary patterns
    pattern = r"(?:salario\s*[:de]*\s*)([\d.,]+)|[\$€]\s?[\d.,]+"
    
    # Search for the first match
    match = re.search(pattern, text, re.IGNORECASE)
    
    if match:
        # Extract the numeric portion
        salary_str = match.group(1) if match.group(1) else match.group(0)
        
        # Remove all non-numeric characters
        salary_str = re.sub(r"[^\d]", "", salary_str)
        
        try:
            return int(salary_str)  # Convert to integer if possible
        except ValueError:
            return salary_str  # Return raw value if conversion fails

    return None  # Return None if no salary is found

def extract_with_matcher(text):
    results = {"Nombre": None, "CC": None, "Salario": None, "from_tiempo_laborado": None, "to_tiempo_laborado": None}
    results["Nombre"] = extract_name(text)
    results["CC"] = extract_cc(text)
    results["Salario"] = extract_salario(text)
    results["from_tiempo_laborado"], results["to_tiempo_laborado"] = extract_tiempo_laborado(text)
    return results

def process_docs_folder():
    current_dir = os.getcwd()
    
    docs_folder = os.path.join(current_dir, "docs")
    if not os.path.exists(docs_folder) or not os.path.isdir(docs_folder):
        print(f"Subfolder 'docs' does not exist in the current directory: {current_dir}")
        return

    files = [f for f in os.listdir(docs_folder) if f.endswith((".docx", ".pdf", ".png"))]
    if not files:
        print(f"No documents found in the 'docs' subfolder: {docs_folder}")
        return

    data = []
    for file_name in files:
        file_path = os.path.join(docs_folder, file_name)
        print(f"Processing file: {file_name}")
        if file_name.endswith(".docx"):
            text = extract_text_from_docx(file_path)
        elif file_name.endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
        elif file_name.endswith(".png"):
            text = extract_text_from_png(file_path)
        else:
            continue
        extracted_data = extract_with_matcher(text)
        extracted_data["File Name"] = file_name
        data.append(extracted_data)

    df = pd.DataFrame(data)
    output_csv = os.path.join(current_dir, "extracted_data.csv")
    df.to_csv(output_csv, index=False)
    print(f"\nExtracted data saved to: {output_csv}")
    return df

if __name__ == "__main__":
    df_result = process_docs_folder()
    print("\nExtracted DataFrame:")
    print(df_result)


Subfolder 'docs' does not exist in the current directory: /Users/carloscuartas/Library/CloudStorage/Box-Box/MM-TI-Análitica/03. Proyectos/Fraudubot Final/Notebooks/docs

Extracted DataFrame:
None


In [5]:
current_dir = os.getcwd()
print(current_dir)

/Users/carloscuartas/Library/CloudStorage/Box-Box/MM-TI-Análitica/03. Proyectos/Fraudubot Final/Notebooks


In [6]:
docs_folder = os.path.join(current_dir, "docs")
print(docs_folder)

/Users/carloscuartas/Library/CloudStorage/Box-Box/MM-TI-Análitica/03. Proyectos/Fraudubot Final/Notebooks/docs


In [8]:
cd /Users/carloscuartas/Library/CloudStorage/Box-Box/MM-TI-Análitica/03. Proyectos/Fraudubot Final/Notebooks/docs

/Users/carloscuartas/Library/CloudStorage/Box-Box/MM-TI-Análitica/03. Proyectos/Fraudubot Final/Notebooks/docs


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [16]:
print(current_dir)

/Users/carloscuartas/Library/CloudStorage/Box-Box/MM-TI-Análitica/03. Proyectos/Fraudubot Final/Notebooks


In [2]:
import pytesseract
from PIL import Image
pytesseract.pytesseract.tesseract_cmd = '/opt/homebrew/bin/tesseract'
print(pytesseract.image_to_string(Image.open('/Users/carloscuartas/Library/CloudStorage/Box-Box/MM-TI-Análitica/03. Proyectos/Fraudubot Final/Docs2/Carta8.png')))


Mar saccmce
ia

sistema eneriza 10 émparas de 181Y gor 4
/ Dia, Never, comoutedor porttl

CERTIFICACION LABORAL

ELECTRICOS J Y M certifica que el sefior JUAN CARLOS VARGAS
BELTRAN, colombiano, identificado con la cédula de ciudadania No.
1.069.742.674 y residente en Fusagasuga, Cundinamarca, laboré para
esta empresa desde el 10 de enero del 2018 hasta el 10 de diciembre
del 2018, con un contrato de prestacién de servicios, desempefando el
cargo de INSTALADOR DE REDES ELECTRICAS.

Durante este tiempo, el sefior JUAN CARLOS VARGAS BELTRAN recibia
una asignacién mensual a titulo de honorarios de OCHOCIENTOS
CATORCE MIL PESOS M/CTE ($814.000), sin un horario definido.

En virtud del contrato de prestacién de servicios el sefor JUAN CARLOS
VARGAS BELTRAN realizaba instalaciones eléctricas domiciliarias,
instalacién de cableado interno e instalaciones de tomas e interruptores.

Se expide la presente certificaci6n en Fusagasuga, Cundinamarca, a los
11 dias del mes de junio del 2021.

MARIBEL

In [19]:
!ls

In [3]:
import os
print(os.path.exists('/Users/carloscuartas/Library/CloudStorage/Box-Box/MM-TI-Análitica/03. Proyectos/Fraudubot Final/Docs2/Carta8.png'))



True


In [11]:
import pandas as pd
cedula=1110549584
fraude = pd.read_excel("../Datos/BASE_PARA_PREDICT.xlsx")
caso = fraude[fraude['CEDULA'] == cedula]



In [12]:
caso

Unnamed: 0,N,PRESTACIONES_SOCIALES,COINCIDE_DIRECCION,CONTEO_TELEFONO_BASEFRAUDE,CONTEO_CORREO_BASEFRAUDE,OTRAS_SOLICITUDES,CONTEO_TELEFONO_RECONOCER,CONTEO_EMAIL_RECONOCER,CANT_MORA30_ULT12MESES_HISTORICO,CANT_MORA60_ULT12MESES_HISTORICO,CANT_MORA90_ULT12MESES_HISTORICO,CANT_MORA120_ULT12MESES_HISTORICO,RESULTADO_SCORE,PERSONAS_CARGO,DIFERENCIA_meses_mail,DIFERENCIA_meses_tel,INGRESOS_CONSOLIDADO_smlv,EGRESOS_CONSOLIDADO_smlv,ENDEUDAMIENTO_SMLV,CEDULA
0,8273,1,0,0,0,0,1,2,0,0,0,0,52.84,0,-1,2,733,0,0,1110549584
1,10519,1,0,0,0,0,3,1,1,1,1,1,70.83,0,-1,1,0,0,0,1110549584
2,10593,1,1,0,0,1,2,2,0,0,0,0,52.58,0,1,1,932,0,379,1110549584


In [7]:
# After the PDF file is uploaded
with open(temp_file_path, "rb") as f:
    content = f.read(200)  # Read the first 200 bytes
    print(content)


NameError: name 'temp_file_path' is not defined

In [8]:
import os
import pytesseract
from pdf2image import convert_from_path
from word2number import w2n
import spacy
import unicodedata
from PIL import Image

# Set the TESSDATA_PREFIX environment variable
os.environ['TESSDATA_PREFIX'] = '/opt/homebrew/share/'  # Adjust accordingly

# Set the path to Tesseract executable
pytesseract.pytesseract.tesseract_cmd = '/opt/homebrew/bin/tesseract'  # Adjust accordingly

# Check if this is being set correctly
print(f"TESSDATA_PREFIX is set to: {os.getenv('TESSDATA_PREFIX')}")
print(f"Tesseract is located at: {pytesseract.pytesseract.tesseract_cmd}")

# Test with your OCR functions now


TESSDATA_PREFIX is set to: /opt/homebrew/share/
Tesseract is located at: /opt/homebrew/bin/tesseract
