In [3]:
import os
import sys
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import mammoth
import json
from urllib.parse import urlparse, urljoin, unquote
import re
import PyPDF2
import pdfplumber
import docx
import pypandoc

from tld import get_fld

INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [4]:
import pypandoc
#pypandoc.download_pandoc()

In [5]:
# Create a folder for downloads
DOWNLOAD_DIR = "999_downloaded_documents_link1"
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

In [6]:
# Read URLs from file
def read_urls(file_name='999.web_urls.txt'):
    with open(file_name, 'r') as f:
        return [line.strip() for line in f.readlines()]

In [7]:
# ---> Función auxiliar para verificar si un enlace es interno (pertenece al mismo dominio)
# Esta funcion es un componente del proceso en bloque 7 que "scrape and store document links in JSON with parent page path segments" ---> esta funcion evita enviar request a links por fuera del dominio al que estamos scrapping


# Helper function to check if a link is internal (within the same domain)
def is_internal_link(url, base_url):
    # Parse both URLs
    parsed_url = urlparse(url)
    parsed_base_url = urlparse(base_url)

    # Check if the domain of the URL matches the base URL's domain
    # TRUE si el dominio es el mismo que el base pero FULL URL, RELATIVE PATH, SAME DOMAIN BUT DIFFERENT PROTOCOL else FALSE
    return parsed_url.netloc == parsed_base_url.netloc or parsed_url.netloc == ''

In [8]:
# Verifica si el contenido de una respuesta es un formato binario (como PDF o Word)
#'application/pdf' for PDFs,'application/msword' for Word documents, and 'application/vnd' for various document formats (e.g., Excel or OpenDocument).
# THIS FUNCTION IS A COMPONENT FOR get_document_links() Function

def is_binary_content(response):
    content_type = response.headers.get('Content-Type', '').lower()
    return 'application/pdf' in content_type or 'application/msword' in content_type or 'application/vnd' in content_type

In [9]:
# Función para limpiar nombres de archivos extraídos de URLs
# The cleaned file_name is returned, making it suitable for use as a filename.
# THIS FUNCTION IS A COMPONENT FOR download_file() Function

# Function to clean filenames
def clean_filename(href):
    # Extract the file name from the URL
    file_name = unquote(os.path.basename(urlparse(href).path))
    # Remove invalid characters for filenames using a regex
    file_name = re.sub(r'[<>:"/\\|?*]', '_', file_name)
    # Fallback if the URL doesn't have a file name
    if not file_name:
        file_name = "downloaded_file"
    return file_name

In [10]:
# Función para obtener enlaces de documentos desde un sitio web, navegando recursivamente por enlaces internos.
# THIS FUNCTION IS A COMPONEN FOR scrape_and_store_links() and scrape_documents_from_website() FUNCTIONS

# Recursively fetch document links from a website, traversing internal links.
def get_document_links(url, base_url, visited=None):
    if visited is None:
        visited = set()

    doc_links = []

    # Ensure we do not revisit the same page more than once
    if url in visited:
        return doc_links
    visited.add(url)

    try:
        sys.stdout.write(f"Processing: {url}\r")
        sys.stdout.flush()

        response = requests.get(url)

        # Skip binary files (like PDFs, DOCs) to avoid trying to parse them
        if is_binary_content(response):
            sys.stdout.write(f"Skipping binary file: {url}\r")
            sys.stdout.flush()
            return doc_links

        # Parse HTML content with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all document links on the page (pdf, doc, docx)
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.startswith(('tel:', 'mailto:')):
                continue
            if href.endswith(('pdf', 'doc', 'docx','doc')):
                full_url = href if href.startswith('http') else urljoin(url, href)
                # Append both the parent page (url) and the document link (full_url)
                doc_links.append([url, (link, full_url)])  # Return parent page and document link as 2D array

        # Now, find all internal links to recursively navigate
        for link in soup.find_all('a', href=True):
            href = link['href']
            full_url = href if href.startswith('http') else urljoin(url, href)
            if is_internal_link(full_url, base_url) and full_url not in visited:
                doc_links.extend(get_document_links(full_url, base_url, visited))

    except Exception as e:
        print(f"\nError fetching {url}: {e}")

    return doc_links


Purpose
The function get_document_links() returns a list of document links found on the site, with each link paired with its originating page. This approach is useful for collecting downloadable files from an entire website, starting from a base URL and exploring all reachable internal pages.

OUTCOME EXAMPLE: 

doc_links = 
[
    [
        'https://example.com/page1', 
        ('Document 1', 'https://example.com/files/doc1.pdf')
    ],
    [
        'https://example.com/page1', 
        ('Document 2', 'https://example.com/files/doc2.doc')
    ],
    [
        'https://example.com/page2', 
        ('Research Paper', 'https://example.com/files/research_paper.pdf')
    ],
    [
        'https://example.com/page3', 
        ('Report 2023', 'https://example.com/files/report.docx')
    ]
]


In [11]:
# Función para convertir una cadena de texto a formato camelCase
# THIS FUNCTION IS A COMPONENT OF scrape_and_store_links() function
# camelCase, un estilo común de escritura en programación, especialmente útil para nombres de variables y funciones.
# La función convierte texto a camelCase (ej. nombreCompleto), ideal para nombres de variables y claves en JSON, siguiendo convenciones de programación

def to_camel_case(text):
    # Return empty string if the input is empty or None
    if not text:
        return ''
    text = re.sub(r'[_\-]+', ' ', text)
    words = text.split()
    if not words:
        return ''
    camel_case_text = words[0].lower() + ''.join(word.capitalize() for word in words[1:])
    
    return camel_case_text

In [12]:
# Función para extraer y almacenar enlaces de documentos en JSON, incluyendo los segmentos de ruta de la página principal
# Esta funcion crea el archivo .json "scraped_links_y-m-d-H-M-S) 
# Extrae metadata parent_page, tittle, href y file_url
# Esta funcion se activa y es componente del ultimo bloque de codigo main() function

def scrape_and_store_links(urls):
    data = []  # List to hold the scraped links
    timestamp =datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    
    ##timestamp = datetime.now().isoformat()  # Timestamp for saving the file

    for url in urls:
        print(f"Processing URL: {url}")
        base_url = url  # This is the parent page (where the link is found)

        doc_links = get_document_links(base_url, base_url, visited=set())  # Scrape the links from the base URL

        for parent_page, (link, doc_url) in doc_links:  # Extract parent_page and document link from the 2D array
            # Get the title from the link element (use link text)
            title = link.string or link.get_text().strip() or "No title"  # Fallback to "No title" if empty

            # Extract the path segments from the parent page (base_url)
            parsed_url = urlparse(parent_page)  # Use the parent_page returned by get_document_links
            path_segments = parsed_url.path.strip('/').split('/')
            
            # If path is empty, use domain (netloc) as the folder name
            if not path_segments or path_segments == ['']:
                path_segments = [parsed_url.netloc]  # Fallback to the domain if the path is empty
            else:
                # Convert to camelCase and skip empty segments
                path_segments = [to_camel_case(segment) for segment in path_segments if segment]

            # Append the details of each link to the data list, including the parent page path segments
            data.append({
                "timestamp": timestamp,
                "parent_page": path_segments,  # Store the path segments as an array
                "title": title,
                "href": link['href'],  # Extract href directly from the <a> tag (the document link)
                "file_url": doc_url,  # Full URL of the document
            })

    # Convert the list of links into a DataFrame and save as JSON
    df = pd.DataFrame(data)
    output_file = f"scraped_links_{timestamp}.json"
    df.to_json(output_file, orient="records", indent=4)
    print(f"Scraping complete. Links saved to {output_file}")
    return output_file


La función scrape_and_store_links extrae la siguiente metadata para cada documento encontrado:

timestamp: La fecha y hora exacta de la extracción, en formato YYYY-MM-DD_HH-MM-SS. Esto facilita saber cuándo se realizó el scraping.

parent_page: Los segmentos de la ruta de la página principal donde se encontró el enlace. Cada segmento de la URL se guarda como un elemento en una lista, usando formato camelCase (ej. ["seccionPrincipal", "subseccion"]). Si no hay ruta, se usa el dominio.

title: El texto del enlace al documento, que generalmente describe el documento o su título. Si está vacío, se establece como "No title".

href: La URL parcial extraída directamente del atributo href del enlace (<a>) en la página.

file_url: La URL completa del documento (PDF, DOC, etc.), que se puede utilizar para descargar el archivo.

Este conjunto de metadata organiza los documentos y su procedencia, siendo útil para estructurar y consultar enlaces descargables.

In [13]:
# Main function to initiate scraping for documents
# Función principal para iniciar la extracción de documentos desde un sitio web
# Esta funcion se activa y es componente del ultimo bloque de codigo main() function



def scrape_documents_from_website(url):
    base_url = url  # The starting URL
    visited = set()  # To keep track of visited URLs
    doc_links = get_document_links(base_url, base_url, visited)

    print(f"Found {len(doc_links)} document links:")
    for title, link in doc_links:
        print(f"- {title}: {link}")

    return doc_links

LOGICA DE scrape_documents_from_website(x):

Si llamamos a la función con un sitio web como https://example.com, el output en la consola podría verse así:
scrape_documents_from_website("https://example.com")

OUTPUT IS A LIST OF TUPPLES
Each tuple in the list contains two elements: the title of the document and the document’s URL.

OUTPUT:
Found 3 document links:
- Document 1: https://example.com/files/doc1.pdf
- Report 2022: https://example.com/files/report2022.doc
- Guide: https://example.com/downloads/guide.docx

Explicación
La función muestra:

El total de enlaces de documentos encontrados (en este caso, 3).
Una lista donde cada línea representa un documento, mostrando su título (Document 1, Report 2022, Guide) y el enlace directo al archivo (doc1.pdf, report2022.doc, guide.docx).
Este formato hace fácil ver qué documentos fueron encontrados y sus URLs exactas, útiles para confirmar que los enlaces fueron correctamente detectados.

In [1]:
# Function to create subdirectories based on the parent page path segments
# Función para crear subdirectorios basados en los segmentos de ruta de la página principal
# This function organizes documents into a structured folder system based on the webpage hierarchy, making it easier to manage downloaded files according to their origin.
# This function is called and a component for download_file() function


def create_directory_structure(parent_page_segments, doc_url):
    # Convert the path segments array into a directory path
    directory_path = os.path.join(DOWNLOAD_DIR, *parent_page_segments)

    # Create the directories if they don't exist
    os.makedirs(directory_path, exist_ok=True)

    return directory_path

In [15]:
# Function to download a document and save it in the appropriate subdirectory
# Función para descargar un documento y guardarlo en el subdirectorio correspondiente
# Esta funcion se activa y es componente del ultimo bloque de codigo main() function

def download_file(base_url, doc_url, context_href):
    # Use the href to generate a meaningful name
    file_name = clean_filename(context_href)

    # Create the directory structure based on Page and Section
    directory_path = create_directory_structure(base_url, doc_url)

    # Create the full file path to save the file
    file_path = os.path.join(directory_path, file_name)

    try:
        response = requests.get(doc_url)
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f"\nDownloaded: {file_name} to {directory_path}")
        return file_path
    except Exception as e:
        print(f"\nFailed to download {doc_url}: {e}")
        return None

In [16]:
import logging

# Set up logging configuration
logging.basicConfig(
    filename="scraping_log.txt",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

logging.info("Starting the scraping process...")

def download_file(base_url, doc_url, context_href):
    # Use the href to generate a meaningful name
    file_name = clean_filename(context_href)
    directory_path = create_directory_structure(base_url, doc_url)
    file_path = os.path.join(directory_path, file_name)

    try:
        response = requests.get(doc_url, timeout=10)  # Add timeout here
        with open(file_path, 'wb') as f:
            f.write(response.content)
        logging.info(f"Downloaded: {file_name} to {directory_path}")
        return file_path
    except requests.exceptions.Timeout:
        logging.warning(f"Timeout occurred for {doc_url}")
        return None
    except Exception as e:
        logging.error(f"Failed to download {doc_url}: {e}")
        return None


INFO:root:Starting the scraping process...


In [17]:
# Extract text from files (.docx, .doc, and .pdf supported)
# Funciones para extraer texto de archivos (.docx, .doc y .pdf)
# Componente que se llama cuando se aplica download_documents_from_json()


def extract_text_from_file(file_path):
    if file_path.endswith(".docx"):
        return extract_text_from_docx(file_path)
    elif file_path.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    elif file_path.endswith(".doc"):
        return extract_text_from_doc(file_path)
    return None

# Extract text from .docx files using mammoth
def extract_text_from_docx(file_path):
    try:
        with open(file_path, "rb") as docx_file:
            result = mammoth.extract_raw_text(docx_file)
            return result.value
    except Exception as e:
        print(f"Error extracting text from .docx file {file_path}: {e}")
        return None

# Extract text from .pdf files using PyPDF2 or pdfplumber
def extract_text_from_pdf(file_path):
    try:
        # Using pdfplumber for more robust text extraction from PDFs
        with pdfplumber.open(file_path) as pdf:
            text = ""
            for page in pdf.pages:
                text += page.extract_text() or ""  # Ensure we add non-None content
            return text if text else "No text found in the PDF"
    except Exception as e:
        print(f"Error extracting text from PDF file {file_path}: {e}")
        return None

# Extract text from .doc files using pypandoc
def extract_text_from_doc(file_path):
    try:
        # Using pypandoc to convert .doc to text
        return pypandoc.convert_file(file_path, 'plain')
    except Exception as e:
        print(f"Error extracting text from .doc file {file_path}: {e}")
        return None

In [18]:
import win32com.client as win32
from docx import Document  # For extracting text from .docx
import mammoth
import pdfplumber
import os
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)

def extract_text_from_file(file_path):
    if file_path.endswith(".docx"):
        return extract_text_from_docx(file_path)
    elif file_path.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    elif file_path.endswith(".doc"):
        return extract_text_from_doc(file_path)
    return None

# Extract text from .docx files using mammoth
def extract_text_from_docx(file_path):
    try:
        with open(file_path, "rb") as docx_file:
            result = mammoth.extract_raw_text(docx_file)
            return result.value
    except Exception as e:
        logging.error(f"Error extracting text from .docx file {file_path}: {e}")
        return None

# Extract text from .pdf files using pdfplumber
def extract_text_from_pdf(file_path):
    try:
        with pdfplumber.open(file_path) as pdf:
            text = ""
            for page in pdf.pages:
                text += page.extract_text() or ""  # Ensure we add non-None content
            return text if text else "No text found in the PDF"
    except Exception as e:
        logging.error(f"Error extracting text from PDF file {file_path}: {e}")
        return None

# Extract text from .doc files directly using win32com.client
def extract_text_from_doc(file_path):
    try:
        word = win32.Dispatch("Word.Application")
        word.Visible = False  # Optional: Keep Word hidden
        doc = word.Documents.Open(os.path.abspath(file_path))
        doc_text = doc.Content.Text
        doc.Close()
        word.Quit()
        return doc_text

    except Exception as e:
        logging.error(f"Error extracting text from .doc file {file_path}: {e}")
        return None


In [19]:
import win32com.client as win32
from docx import Document  # For extracting text from .docx
import mammoth
import pdfplumber
import os
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)

def extract_text_from_file(file_path):
    if file_path.endswith(".docx"):
        return extract_text_from_docx(file_path)
    elif file_path.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    elif file_path.endswith(".doc"):
        return extract_text_from_doc(file_path)
    return None

# Extract text from .docx files using mammoth with UTF-8 encoding
def extract_text_from_docx(file_path):
    try:
        with open(file_path, "rb") as docx_file:
            result = mammoth.extract_raw_text(docx_file)
            text = result.value
            return text.encode("utf-8").decode("utf-8")  # Ensure UTF-8 encoding
    except Exception as e:
        logging.error(f"Error extracting text from .docx file {file_path}: {e}")
        return None

# Extract text from .pdf files using pdfplumber with UTF-8 encoding
def extract_text_from_pdf(file_path):
    try:
        with pdfplumber.open(file_path) as pdf:
            text = ""
            for page in pdf.pages:
                page_text = page.extract_text() or ""
                text += page_text.encode("utf-8").decode("utf-8")  # UTF-8 encode/decode
            return text if text else "No text found in the PDF"
    except Exception as e:
        logging.error(f"Error extracting text from PDF file {file_path}: {e}")
        return None

# Extract text from .doc files directly using win32com.client with UTF-8 encoding
def extract_text_from_doc(file_path):
    try:
        word = win32.Dispatch("Word.Application")
        word.Visible = False  # Optional: Keep Word hidden
        doc = word.Documents.Open(os.path.abspath(file_path))
        doc_text = doc.Content.Text
        doc.Close()
        word.Quit()
        return doc_text.encode("utf-8").decode("utf-8")  # UTF-8 encode/decode

    except Exception as e:
        logging.error(f"Error extracting text from .doc file {file_path}: {e}")
        return None


In [20]:
# Extract the main domain (base URL) using the tld library
# Extraer el dominio principal (URL base) usando la biblioteca tld
# NO ESTAMOS USANDO ESTA FUNCION
# NO ES ACTIVADA  

def get_base_domain(url):
    try:
        # Extract the top-level domain (e.g., comunidadandina.org)
        return get_fld(url, fix_protocol=True)
    except Exception as e:
        print(f"Error extracting base domain from {url}: {e}")
        return urlparse(url).netloc

In [21]:
# Function to read the stored links and download documents into organized directories
# Función para leer los enlaces almacenados, descargar documentos y organizar en directorios



def download_documents_from_json(json_file):
    with open(json_file, 'r') as f:
        links_data = json.load(f)

    for link_data in links_data:
        doc_url = link_data['file_url']
        href = link_data['href']
        parent_page_segments = link_data['parent_page']  # Array of path segments

        sys.stdout.write(f"Downloading: {doc_url}\r")
        sys.stdout.flush()

        # Use the parent page path segments to create the folder structure
        file_path = download_file(parent_page_segments, doc_url, href)  # Download the document using the parent page path segments

        if file_path:
            extracted_text = extract_text_from_file(file_path)  # Extract text from the file (if applicable)
            # Update the JSON structure with the downloaded file path and content
            link_data['downloaded_path'] = file_path
            link_data['extracted_content'] = extracted_text or "N/A"

    # Save updated data back to the JSON file with results
    output_file = f"results_with_download_{datetime.now().isoformat().replace(':', '-')}.json"
    with open(output_file, 'w') as f:
        json.dump(links_data, f, indent=4)

    print(f"Downloads complete. Updated results saved to {output_file}")

This function, download_documents_from_json, reads a JSON file with document links, downloads each document, organizes them into folders, and extracts content if possible. Here’s how it works:

Load JSON Data:

with open(json_file, 'r') as f: links_data = json.load(f): Opens and reads the JSON file, loading the document link data into links_data.
Process Each Document:

For each document link in links_data, it retrieves:
doc_url: The full URL of the document.
href: The direct link to the document in the HTML.
parent_page_segments: An array of path segments from the parent page URL, used to organize folders.
Download and Extract Content:

file_path = download_file(parent_page_segments, doc_url, href): Downloads the document, organizing it by folder based on the parent page’s path segments. If successful, the file path is returned.
If the file is downloaded, extract_text_from_file(file_path) attempts to extract text from the document (if it’s a .pdf, .doc, or .docx).
Update Metadata and Save:

The downloaded path (downloaded_path) and extracted text (extracted_content) are added to link_data.
Finally, the updated data is saved back into a new JSON file, results_with_download_<timestamp>.json, with each link entry containing paths and extracted content.

In [22]:
from tqdm import tqdm

# Assuming 'get_document_links' and 'to_camel_case' functions are defined

def scrape_and_store_links(urls):
    data = []  # List to hold the scraped links
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    for url in tqdm(urls, desc="Processing URLs"):  # Add tqdm progress bar to URLs loop
        print(f"Processing URL: {url}")
        base_url = url  # This is the parent page (where the link is found)

        doc_links = get_document_links(base_url, base_url, visited=set())  # Scrape the links from the base URL

        for parent_page, (link, doc_url) in tqdm(doc_links, desc="Extracting links", leave=False):  # Inner progress bar for each URL
            # Get the title from the link element (use link text)
            title = link.string or link.get_text().strip() or "No title"  # Fallback to "No title" if empty

            # Extract the path segments from the parent page (base_url)
            parsed_url = urlparse(parent_page)  # Use the parent_page returned by get_document_links
            path_segments = parsed_url.path.strip('/').split('/')

            # If path is empty, use domain (netloc) as the folder name
            if not path_segments or path_segments == ['']:
                path_segments = [parsed_url.netloc]  # Fallback to the domain if the path is empty
            else:
                # Convert to camelCase and skip empty segments
                path_segments = [to_camel_case(segment) for segment in path_segments if segment]

            # Append the details of each link to the data list, including the parent page path segments
            data.append({
                "timestamp": timestamp,
                "parent_page": path_segments,  # Store the path segments as an array
                "title": title,
                "href": link['href'],  # Extract href directly from the <a> tag (the document link)
                "file_url": doc_url,  # Full URL of the document
            })

    # Convert the list of links into a DataFrame and save as JSON
    df = pd.DataFrame(data)
    output_file = f"scraped_links_{timestamp}.json"
    df.to_json(output_file, orient="records", indent=4)
    print(f"Scraping complete. Links saved to {output_file}")
    return output_file


In [23]:
import win32com.client as win32
from docx import Document  # For extracting text from .docx
import mammoth
import pdfplumber
import os
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)

def extract_text_from_file(file_path):
    if file_path.endswith(".docx"):
        return extract_text_from_docx(file_path)
    elif file_path.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    elif file_path.endswith(".doc"):
        return extract_text_from_doc(file_path)
    return None

# Extract text from .docx files using mammoth with UTF-8 handling for accented characters
def extract_text_from_docx(file_path):
    try:
        with open(file_path, "rb") as docx_file:
            result = mammoth.extract_raw_text(docx_file)
            text = result.value
            return text.encode("utf-8").decode("utf-8")  # Ensure UTF-8 encoding
    except Exception as e:
        logging.error(f"Error extracting text from .docx file {file_path}: {e}")
        return None

# Extract text from .pdf files using pdfplumber with UTF-8 encoding and explicit decode handling
def extract_text_from_pdf(file_path):
    try:
        with pdfplumber.open(file_path) as pdf:
            text = ""
            for page in pdf.pages:
                page_text = page.extract_text() or ""
                text += page_text  # Concatenate raw text
            # Encode and decode entire text as UTF-8 after extraction to handle accented characters
            return text.encode("utf-8").decode("utf-8") if text else "No text found in the PDF"
    except Exception as e:
        logging.error(f"Error extracting text from PDF file {file_path}: {e}")
        return None

# Extract text from .doc files directly using win32com.client with UTF-8 encoding
def extract_text_from_doc(file_path):
    try:
        word = win32.Dispatch("Word.Application")
        word.Visible = False  # Optional: Keep Word hidden
        doc = word.Documents.Open(os.path.abspath(file_path))
        doc_text = doc.Content.Text
        doc.Close()
        word.Quit()
        # Handle UTF-8 encoding for accented characters
        return doc_text.encode("utf-8", "ignore").decode("utf-8", "ignore")  # Ignore errors if encoding fails

    except Exception as e:
        logging.error(f"Error extracting text from .doc file {file_path}: {e}")
        return None


In [24]:
import win32com.client as win32
from docx import Document  # For extracting text from .docx
import mammoth
import pdfplumber
import os
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)

def extract_text_from_file(file_path):
    if file_path.endswith(".docx"):
        return extract_text_from_docx(file_path)
    elif file_path.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    elif file_path.endswith(".doc"):
        return extract_text_from_doc(file_path)
    return None

# Extract text from .docx files using mammoth with UTF-8-SIG encoding
def extract_text_from_docx(file_path):
    try:
        with open(file_path, "rb") as docx_file:
            result = mammoth.extract_raw_text(docx_file)
            text = result.value
            return text.encode("utf-8-sig").decode("utf-8-sig")  # Ensure UTF-8-SIG encoding
    except Exception as e:
        logging.error(f"Error extracting text from .docx file {file_path}: {e}")
        return None

# Extract text from .pdf files using pdfplumber with UTF-8-SIG encoding
def extract_text_from_pdf(file_path):
    try:
        with pdfplumber.open(file_path) as pdf:
            text = ""
            for page in pdf.pages:
                page_text = page.extract_text() or ""
                text += page_text  # Concatenate raw text
            # Encode and decode entire text as UTF-8-SIG after extraction
            return text.encode("utf-8-sig").decode("utf-8-sig") if text else "No text found in the PDF"
    except Exception as e:
        logging.error(f"Error extracting text from PDF file {file_path}: {e}")
        return None

# Extract text from .doc files directly using win32com.client with UTF-8-SIG encoding
def extract_text_from_doc(file_path):
    try:
        word = win32.Dispatch("Word.Application")
        word.Visible = False  # Optional: Keep Word hidden
        doc = word.Documents.Open(os.path.abspath(file_path))
        doc_text = doc.Content.Text
        doc.Close()
        word.Quit()
        # Handle UTF-8-SIG encoding for accented characters
        return doc_text.encode("utf-8-sig", "ignore").decode("utf-8-sig", "ignore")

    except Exception as e:
        logging.error(f"Error extracting text from .doc file {file_path}: {e}")
        return None

#test

In [None]:
import win32com.client as win32
from docx import Document  # For extracting text from .docx
import mammoth
import pdfplumber
import os
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)

def extract_text_from_file(file_path):
    if file_path.endswith(".docx"):
        return extract_text_from_docx(file_path)
    elif file_path.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    elif file_path.endswith(".doc"):
        return extract_text_from_doc(file_path)
    return None

# Extract text from .docx files using mammoth
def extract_text_from_docx(file_path):
    try:
        with open(file_path, "rb") as docx_file:
            result = mammoth.extract_raw_text(docx_file)
            return result.value
    except Exception as e:
        logging.error(f"Error extracting text from .docx file {file_path}: {e}")
        return None

# Extract text from .pdf files using pdfplumber and clean Unicode escapes
def extract_text_from_pdf(file_path):
    try:
        with pdfplumber.open(file_path) as pdf:
            text = ""
            for page in pdf.pages:
                page_text = page.extract_text() or ""
                text += page_text  # Concatenate raw text

            # Decode any Unicode escape sequences (e.g., \u00f3) to readable text
            return text.encode().decode('unicode_escape') if text else "No text found in the PDF"
    except Exception as e:
        logging.error(f"Error extracting text from PDF file {file_path}: {e}")
        return None

# Extract text from .doc files directly using win32com.client and clean Unicode escapes
def extract_text_from_doc(file_path):
    try:
        word = win32.Dispatch("Word.Application")
        word.Visible = False  # Optional: Keep Word hidden
        doc = word.Documents.Open(os.path.abspath(file_path))
        doc_text = doc.Content.Text
        doc.Close()
        word.Quit()
        
        # Clean up Unicode escape sequences
        return doc_text.encode("utf-8", "replace").decode("unicode_escape", "replace")
    except Exception as e:
        logging.error(f"Error extracting text from .doc file {file_path}: {e}")
        return None

# Example usage
file_path = "your_file_path_here"
extracted_text = extract_text_from_file(file_path)
print(extracted_text)


In [26]:
import win32com.client as win32
from docx import Document  # For extracting text from .docx
import mammoth
import pdfplumber
import os
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)

def extract_text_from_file(file_path):
    if file_path.endswith(".docx"):
        return extract_text_from_docx(file_path)
    elif file_path.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    elif file_path.endswith(".doc"):
        return extract_text_from_doc(file_path)
    return None

# Extract text from .docx files using mammoth and clean Unicode escapes
def extract_text_from_docx(file_path):
    try:
        with open(file_path, "rb") as docx_file:
            result = mammoth.extract_raw_text(docx_file)
            text = result.value
            # Decode any Unicode escape sequences (e.g., \u00f3) to readable text
            return text.encode().decode('unicode_escape') if text else "No text found in the DOCX file"
    except Exception as e:
        logging.error(f"Error extracting text from .docx file {file_path}: {e}")
        return None

# Extract text from .pdf files using pdfplumber and clean Unicode escapes
def extract_text_from_pdf(file_path):
    try:
        with pdfplumber.open(file_path) as pdf:
            text = ""
            for page in pdf.pages:
                page_text = page.extract_text() or ""
                text += page_text  # Concatenate raw text
            # Decode any Unicode escape sequences (e.g., \u00f3) to readable text
            return text.encode().decode('unicode_escape') if text else "No text found in the PDF"
    except Exception as e:
        logging.error(f"Error extracting text from PDF file {file_path}: {e}")
        return None

# Extract text from .doc files directly using win32com.client and clean Unicode escapes
def extract_text_from_doc(file_path):
    try:
        word = win32.Dispatch("Word.Application")
        word.Visible = False  # Optional: Keep Word hidden
        doc = word.Documents.Open(os.path.abspath(file_path))
        doc_text = doc.Content.Text
        doc.Close()
        word.Quit()
        
        # Clean up Unicode escape sequences
        return doc_text.encode("utf-8", "replace").decode("unicode_escape", "replace")
    except Exception as e:
        logging.error(f"Error extracting text from .doc file {file_path}: {e}")
        return None

# Example usage
file_path = "your_file_path_here"
extracted_text = extract_text_from_file(file_path)
print(extracted_text)


None


In [28]:
import win32com.client as win32
from docx import Document  # For extracting text from .docx
import mammoth
import pdfplumber
import os
import logging
import unicodedata

# Setup logging
logging.basicConfig(level=logging.INFO)

def extract_text_from_file(file_path):
    if file_path.endswith(".docx"):
        return normalize_text(extract_text_from_docx(file_path))
    elif file_path.endswith(".pdf"):
        return normalize_text(extract_text_from_pdf(file_path))
    elif file_path.endswith(".doc"):
        return normalize_text(extract_text_from_doc(file_path))
    return None

# Normalize extracted text to handle Unicode sequences
def normalize_text(text):
    if text:
        # Normalize to NFC (Canonical Composition) to handle accented characters
        return unicodedata.normalize('NFC', text)
    return text

# Extract text from .docx files using mammoth
def extract_text_from_docx(file_path):
    try:
        with open(file_path, "rb") as docx_file:
            result = mammoth.extract_raw_text(docx_file)
            return result.value
    except Exception as e:
        logging.error(f"Error extracting text from .docx file {file_path}: {e}")
        return None

# Extract text from .pdf files using pdfplumber
def extract_text_from_pdf(file_path):
    try:
        with pdfplumber.open(file_path) as pdf:
            text = ""
            for page in pdf.pages:
                page_text = page.extract_text() or ""
                text += page_text  # Concatenate raw text
            return text if text else "No text found in the PDF"
    except Exception as e:
        logging.error(f"Error extracting text from PDF file {file_path}: {e}")
        return None

# Extract text from .doc files directly using win32com.client
def extract_text_from_doc(file_path):
    try:
        word = win32.Dispatch("Word.Application")
        word.Visible = False  # Optional: Keep Word hidden
        doc = word.Documents.Open(os.path.abspath(file_path))
        doc_text = doc.Content.Text
        doc.Close()
        word.Quit()
        return doc_text
    except Exception as e:
        logging.error(f"Error extracting text from .doc file {file_path}: {e}")
        return None

# Example usage
file_path = "your_file_path_here"
extracted_text = extract_text_from_file(file_path)
print(extracted_text)


None


In [31]:
import win32com.client as win32
from docx import Document  # For extracting text from .docx
import mammoth
import pdfplumber
import os
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)

def extract_text_from_file(file_path):
    if file_path.endswith(".docx"):
        return extract_text_from_docx(file_path)
    elif file_path.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    elif file_path.endswith(".doc"):
        return extract_text_from_doc(file_path)
    return None

# Function to clean up any remaining unicode sequences
def clean_unicode_sequences(text):
    try:
        return text.encode('utf-8').decode('utf-8-sig')
    except Exception as e:
        logging.error(f"Error during Unicode cleanup: {e}")
        return text

# Extract text from .docx files using mammoth
def extract_text_from_docx(file_path):
    try:
        with open(file_path, "rb") as docx_file:
            result = mammoth.extract_raw_text(docx_file)
            text = result.value
            return clean_unicode_sequences(text)  # Clean up Unicode sequences
    except Exception as e:
        logging.error(f"Error extracting text from .docx file {file_path}: {e}")
        return None

# Extract text from .pdf files using pdfplumber
def extract_text_from_pdf(file_path):
    try:
        with pdfplumber.open(file_path) as pdf:
            text = ""
            for page in pdf.pages:
                page_text = page.extract_text() or ""
                text += page_text  # Concatenate raw text
            return clean_unicode_sequences(text) if text else "No text found in the PDF"
    except Exception as e:
        logging.error(f"Error extracting text from PDF file {file_path}: {e}")
        return None

# Extract text from .doc files directly using win32com.client
def extract_text_from_doc(file_path):
    try:
        word = win32.Dispatch("Word.Application")
        word.Visible = False  # Optional: Keep Word hidden
        doc = word.Documents.Open(os.path.abspath(file_path))
        doc_text = doc.Content.Text
        doc.Close()
        word.Quit()
        return clean_unicode_sequences(doc_text)
    except Exception as e:
        logging.error(f"Error extracting text from .doc file {file_path}: {e}")
        return None

# Example usage
file_path = "your_file_path_here"
extracted_text = extract_text_from_file(file_path)
print(extracted_text)


None


In [35]:
import win32com.client as win32
from docx import Document  # For extracting text from .docx
import mammoth
import pdfplumber
import os
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)

def extract_text_from_file(file_path):
    if file_path.endswith(".docx"):
        return extract_text_from_docx(file_path)
    elif file_path.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    elif file_path.endswith(".doc"):
        return extract_text_from_doc(file_path)
    return None

# Function to clean up any remaining unicode sequences
def clean_unicode_sequences(text):
    try:
        return text.encode('utf-8').decode('utf-8-sig')
    except Exception as e:
        logging.error(f"Error during Unicode cleanup: {e}")
        return text

# Extract text from .docx files using mammoth
def extract_text_from_docx(file_path):
    try:
        with open(file_path, "rb") as docx_file:
            result = mammoth.extract_raw_text(docx_file)
            text = result.value
            return clean_unicode_sequences(text)  # Clean up Unicode sequences
    except Exception as e:
        logging.error(f"Error extracting text from .docx file {file_path}: {e}")
        return None

# Extract text from .pdf files using pdfplumber
def extract_text_from_pdf(file_path):
    try:
        with pdfplumber.open(file_path) as pdf:
            text = ""
            for page in pdf.pages:
                page_text = page.extract_text() or ""
                text += page_text  # Concatenate raw text
            return clean_unicode_sequences(text) if text else "No text found in the PDF"
    except Exception as e:
        logging.error(f"Error extracting text from PDF file {file_path}: {e}")
        return None

# Extract text from .doc files directly using win32com.client
def extract_text_from_doc(file_path):
    try:
        word = win32.Dispatch("Word.Application")
        word.Visible = False  # Optional: Keep Word hidden
        doc = word.Documents.Open(os.path.abspath(file_path))
        doc_text = doc.Content.Text
        doc.Close()
        word.Quit()
        return clean_unicode_sequences(doc_text)
    except Exception as e:
        logging.error(f"Error extracting text from .doc file {file_path}: {e}")
        return None

# Example usage
file_path = "your_file_path_here"
extracted_text = extract_text_from_file(file_path)
print(extracted_text)


None


In [38]:
import win32com.client as win32
from docx import Document  # For extracting text from .docx
import mammoth
import pdfplumber
import os
import logging
import re

# Setup logging
logging.basicConfig(level=logging.INFO)

def extract_text_from_file(file_path):
    if file_path.endswith(".docx"):
        return replace_unicode_escapes(extract_text_from_docx(file_path))
    elif file_path.endswith(".pdf"):
        return replace_unicode_escapes(extract_text_from_pdf(file_path))
    elif file_path.endswith(".doc"):
        return replace_unicode_escapes(extract_text_from_doc(file_path))
    return None

# Replace specific Unicode escapes with actual characters
def replace_unicode_escapes(text):
    if text:
        unicode_replacements = {
            r'\u00e1': 'á', r'\u00e9': 'é', r'\u00ed': 'í', r'\u00f3': 'ó', r'\u00fa': 'ú',
            r'\u00f1': 'ñ', r'\u00c1': 'Á', r'\u00c9': 'É', r'\u00cd': 'Í', r'\u00d3': 'Ó',
            r'\u00da': 'Ú', r'\u00d1': 'Ñ'
        }
        for unicode_seq, char in unicode_replacements.items():
            text = re.sub(unicode_seq, char, text)
    return text

# Extract text from .docx files using mammoth
def extract_text_from_docx(file_path):
    try:
        with open(file_path, "rb") as docx_file:
            result = mammoth.extract_raw_text(docx_file)
            return result.value
    except Exception as e:
        logging.error(f"Error extracting text from .docx file {file_path}: {e}")
        return None

# Extract text from .pdf files using pdfplumber
def extract_text_from_pdf(file_path):
    try:
        with pdfplumber.open(file_path) as pdf:
            text = ""
            for page in pdf.pages:
                page_text = page.extract_text() or ""
                text += page_text  # Concatenate raw text
            return text if text else "No text found in the PDF"
    except Exception as e:
        logging.error(f"Error extracting text from PDF file {file_path}: {e}")
        return None

# Extract text from .doc files directly using win32com.client
def extract_text_from_doc(file_path):
    try:
        word = win32.Dispatch("Word.Application")
        word.Visible = False  # Optional: Keep Word hidden
        doc = word.Documents.Open(os.path.abspath(file_path))
        doc_text = doc.Content.Text
        doc.Close()
        word.Quit()
        return doc_text
    except Exception as e:
        logging.error(f"Error extracting text from .doc file {file_path}: {e}")
        return None

# Example usage
file_path = "your_file_path_here"
extracted_text = extract_text_from_file(file_path)
print(extracted_text)


None


In [39]:
# Function to read the stored links and download documents into organized directories
# Función para leer los enlaces almacenados, descargar documentos y organizar en directorios



def download_documents_from_json(json_file):
    with open(json_file, 'r') as f:
        links_data = json.load(f)

    for link_data in links_data:
        doc_url = link_data['file_url']
        href = link_data['href']
        parent_page_segments = link_data['parent_page']  # Array of path segments

        sys.stdout.write(f"Downloading: {doc_url}\r")
        sys.stdout.flush()

        # Use the parent page path segments to create the folder structure
        file_path = download_file(parent_page_segments, doc_url, href)  # Download the document using the parent page path segments

        if file_path:
            extracted_text = extract_text_from_file(file_path)  # Extract text from the file (if applicable)
            # Update the JSON structure with the downloaded file path and content
            link_data['downloaded_path'] = file_path
            link_data['extracted_content'] = extracted_text or "N/A"

    # Save updated data back to the JSON file with results
    output_file = f"results_with_download_{datetime.now().isoformat().replace(':', '-')}.json"
    with open(output_file, 'w') as f:
        json.dump(links_data, f, indent=4)

    print(f"Downloads complete. Updated results saved to {output_file}")

In [None]:
# scrape_and_store_link

urls = read_urls()
scrape_and_store_links(urls)

In [41]:
download_documents_from_json('scraped_links_2024-10-15_20-26-53.json')

Downloading: https://www.comunidadandina.org/documents/quienes-somos/acuerdocartagena.pdf

INFO:root:Downloaded: acuerdocartagena.pdf to 999_downloaded_documents_link1\quienesSomos


Downloading: https://www.comunidadandina.org/documents/quienes-somos/secretaria-general/dec409.doc

INFO:root:Downloaded: dec409.doc to 999_downloaded_documents_link1\quienesSomos\secretariaGeneralDeLaComunidadAndina


Downloading: https://www.comunidadandina.org/StaticFiles/DocOf/DEC726.pdf

INFO:root:Downloaded: DEC726.pdf to 999_downloaded_documents_link1\temas\dgDec\cooperacionTecnica


Downloading: https://www.comunidadandina.org/DocOficialesFiles/Gacetas/Gace1787.pdf

INFO:root:Downloaded: Gace1787.pdf to 999_downloaded_documents_link1\temas\dgDec\cooperacionTecnica


Downloads complete. Updated results saved to results_with_download_2024-10-17T18-19-41.369908.json


In [45]:
import re
import json

def replace_characters_in_json_file(output_file):
    # Define the replacements for common problematic characters
    replacements = {
        r'\u00e1': 'á', r'\u00e9': 'é', r'\u00ed': 'í', r'\u00f3': 'ó', r'\u00fa': 'ú',
        r'\u00f1': 'ñ', r'\u00c1': 'Á', r'\u00c9': 'É', r'\u00cd': 'Í', r'\u00d3': 'Ó',
        r'\u00da': 'Ú', r'\u00d1': 'Ñ'
    }
    
    try:
        # Step 1: Read the JSON file
        with open(output_file, 'r', encoding='utf-8') as file:
            json_data = json.load(file)
        
        # Step 2: Convert JSON to string and perform replacements
        json_str = json.dumps(json_data)  # Convert JSON data to string
        
        # Perform replacements using the dictionary
        for unicode_seq, char in replacements.items():
            json_str = re.sub(unicode_seq, char, json_str)
        
        # Step 3: Convert the string back to JSON format
        updated_json_data = json.loads(json_str)
        
        # Step 4: Write the updated JSON back to the file
        with open(output_file, 'w', encoding='utf-8') as file:
            json.dump(updated_json_data, file, ensure_ascii=False, indent=4)
        
        print("Character replacements in JSON file completed successfully.")
    
    except Exception as e:
        print(f"An error occurred while processing the JSON file: {e}")

# Usage
replace_characters_in_json_file('results_with_download_2024-10-17T18-19-41.369908.json')


Character replacements in JSON file completed successfully.


In [None]:
# Función principal que organiza el flujo de scraping, almacenamiento y descarga de documentos
def main():
    urls = read_urls()  # Read URLs from the text file

    # Phase 1: Scrape and store links in JSON
    links_json_file = scrape_and_store_links(urls)
    #links_json_file = "scraped_links_2024-10-09.json"

    # Phase 2: Download documents using the stored links
    download_documents_from_json(links_json_file)


if __name__ == "__main__":
    main()
