In [46]:
import os
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import json
from docx import Document
import re
import glob
import openpyxl
import zipfile
import urllib.request
import mammoth
import pdfplumber
import win32com.client as win32
import logging
from tld import get_fld
import pypandoc
import sys
from datetime import datetime
from urllib.parse import urlparse, urljoin, unquote

In [47]:
# Root folder for downloaded documents; create_directory_structure() builds
# per-page sub-folders beneath it. exist_ok=True makes re-running this cell safe.
DOWNLOAD_DIR = "999_downloaded_documents_link1"
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

In [48]:
# Read URLs from file
def read_urls(file_name='999.web_urls.txt'):
    """Load the list of start URLs from a text file, one URL per line.

    Args:
        file_name: Path of the text file to read.

    Returns:
        list[str]: The stripped, non-empty lines of the file, in file order.
    """
    # utf-8-sig drops a byte-order mark if one is present; otherwise the BOM
    # would be glued to the first URL and make it unusable.
    with open(file_name, 'r', encoding='utf-8-sig') as f:
        # Blank lines (e.g. a trailing newline at the end of the file) are
        # skipped so callers never try to request an empty URL.
        return [line.strip() for line in f if line.strip()]

In [49]:
# ---> Helper function that checks whether a link is internal (belongs to the same domain).
# It is a component of the process in block 7 that scrapes and stores document links in JSON with the parent page's path segments: it keeps the crawler from sending requests to links outside the domain being scraped.


# Helper function to check if a link is internal (within the same domain)
def is_internal_link(url, base_url):
    """Tell whether ``url`` points at the same host as ``base_url``.

    Args:
        url: Link to classify; may be absolute, protocol-relative or relative.
        base_url: URL whose host defines what counts as "internal".

    Returns:
        bool: True for relative links and for http(s) links whose host equals the
        host of ``base_url`` (the protocol may differ); False otherwise.
    """
    parsed_url = urlparse(url)
    parsed_base_url = urlparse(base_url)

    # Links such as mailto:, tel: or javascript: also have an empty netloc, but
    # they are not pages on this site, so they must not be reported as internal.
    if parsed_url.scheme not in ('', 'http', 'https'):
        return False

    # An empty netloc means a relative link; host names are case-insensitive,
    # so compare them lower-cased.
    return (
        parsed_url.netloc == ''
        or parsed_url.netloc.lower() == parsed_base_url.netloc.lower()
    )

In [50]:
# Verifica si el contenido de una respuesta es un formato binario (como PDF o Word)
#'application/pdf' for PDFs,'application/msword' for Word documents, and 'application/vnd' for various document formats (e.g., Excel or OpenDocument).
# THIS FUNCTION IS A COMPONENT FOR get_document_links() Function

def is_binary_content(response):
    """Report whether a response declares a document (non-HTML) content type.

    Args:
        response: Object exposing a ``headers`` mapping (e.g. a requests response).

    Returns:
        bool: True when the lower-cased ``Content-Type`` header contains
        ``application/pdf``, ``application/msword`` or ``application/vnd``.
    """
    declared_type = response.headers.get('Content-Type', '').lower()
    binary_markers = ('application/pdf', 'application/msword', 'application/vnd')
    return any(marker in declared_type for marker in binary_markers)

In [51]:
# Función para limpiar nombres de archivos extraídos de URLs
# The cleaned file_name is returned, making it suitable for use as a filename.
# THIS FUNCTION IS A COMPONENT FOR download_file() Function

# Function to clean filenames
def clean_filename(href):
    """Derive a file-system-safe file name from a link.

    Args:
        href: URL or path whose last path component names the file.

    Returns:
        str: The URL-decoded last path component with the characters
        ``< > : " / \\ | ? *`` replaced by ``_``, or ``"downloaded_file"`` when
        the link has no last path component.
    """
    url_path = urlparse(href).path
    decoded_name = unquote(os.path.basename(url_path))
    safe_name = re.sub(r'[<>:"/\\|?*]', '_', decoded_name)
    return safe_name if safe_name else "downloaded_file"

In [52]:
# Función para obtener enlaces de documentos desde un sitio web, navegando recursivamente por enlaces internos.
# THIS FUNCTION IS A COMPONENT OF THE scrape_and_store_links() AND scrape_documents_from_website() FUNCTIONS

# Recursively fetch document links from a website, traversing internal links.
def get_document_links(url, base_url, visited=None):
    """Crawl ``url`` and the same-site pages it links to, collecting document links.

    The page is fetched, every ``<a href>`` that points at a PDF/Word document is
    recorded, and then every internal link is followed recursively (depth-first).

    NOTE(review): a later cell defines ``get_document_links`` again with an extra
    ``link_limit`` parameter; when both cells are run, that later definition is
    the one in effect.

    Args:
        url: Page to fetch on this call.
        base_url: Start URL of the crawl; only links on its host are followed.
        visited: Set of URLs already fetched, shared across the recursion.
            A new set is created when omitted.

    Returns:
        list: One ``[parent_page_url, (anchor_tag, absolute_document_url)]`` entry
        per document link found. An error while fetching or parsing a page is
        printed, and that page contributes only what was gathered before it.
    """
    if visited is None:
        visited = set()

    doc_links = []

    # Ensure we do not revisit the same page more than once
    if url in visited:
        return doc_links
    visited.add(url)

    # Lower-cased endings that mark a link as a document.
    doc_suffixes = ('pdf', 'doc', 'docx')

    try:
        sys.stdout.write(f"Processing: {url}\r")
        sys.stdout.flush()

        # A timeout keeps one unresponsive server from hanging the whole crawl.
        response = requests.get(url, timeout=10)

        # Skip binary files (like PDFs, DOCs) to avoid trying to parse them
        if is_binary_content(response):
            sys.stdout.write(f"Skipping binary file: {url}\r")
            sys.stdout.flush()
            return doc_links

        soup = BeautifulSoup(response.text, 'html.parser')
        anchors = soup.find_all('a', href=True)

        # Pass 1: record the document links present on this page.
        for link in anchors:
            href = link['href']
            if href.startswith(('tel:', 'mailto:')):
                continue
            # Compare case-insensitively and also look at the path alone, so that
            # "REPORT.PDF" and "report.pdf?download=1" are recognised as well.
            if (href.lower().endswith(doc_suffixes)
                    or urlparse(href).path.lower().endswith(doc_suffixes)):
                full_url = href if href.startswith('http') else urljoin(url, href)
                doc_links.append([url, (link, full_url)])

        # Pass 2: follow internal links recursively.
        for link in anchors:
            href = link['href']
            full_url = href if href.startswith('http') else urljoin(url, href)
            # A fragment only addresses a spot inside a page; dropping it avoids
            # fetching the same page again for every "#section" variant.
            full_url = full_url.split('#', 1)[0]
            # mailto:, tel:, javascript: ... cannot be fetched over HTTP.
            if urlparse(full_url).scheme not in ('http', 'https'):
                continue
            if is_internal_link(full_url, base_url) and full_url not in visited:
                doc_links.extend(get_document_links(full_url, base_url, visited))

    except Exception as e:
        # Best effort: report the failure and keep crawling the rest of the site.
        print(f"\nError fetching {url}: {e}")

    return doc_links


In [53]:
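## TEST ONLY - DELETE LATER: this cell redefines get_document_links with a link_limit and overrides the definition in the previous cell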

def _collect_document_links(url, base_url, visited, remaining):
    """Depth-first crawl helper: return at most ``remaining`` document links found from ``url``."""
    doc_links = []

    # Nothing left to collect, or the page was already processed.
    if remaining <= 0 or url in visited:
        return doc_links
    visited.add(url)

    # Lower-cased endings that mark a link as a document.
    doc_suffixes = ('pdf', 'doc', 'docx')

    try:
        sys.stdout.write(f"Processing: {url}\r")
        sys.stdout.flush()

        # A timeout keeps one unresponsive server from hanging the whole crawl.
        response = requests.get(url, timeout=10)

        # Skip binary files (like PDFs, DOCs) to avoid trying to parse them
        if is_binary_content(response):
            sys.stdout.write(f"Skipping binary file: {url}\r")
            sys.stdout.flush()
            return doc_links

        soup = BeautifulSoup(response.text, 'html.parser')
        anchors = soup.find_all('a', href=True)

        # Pass 1: record the document links present on this page.
        for link in anchors:
            href = link['href']
            if href.startswith(('tel:', 'mailto:')):
                continue
            # Compare case-insensitively and also look at the path alone, so that
            # "REPORT.PDF" and "report.pdf?download=1" are recognised as well.
            if (href.lower().endswith(doc_suffixes)
                    or urlparse(href).path.lower().endswith(doc_suffixes)):
                full_url = href if href.startswith('http') else urljoin(url, href)
                doc_links.append([url, (link, full_url)])
                if len(doc_links) >= remaining:
                    return doc_links

        # Pass 2: follow internal links, handing each sub-crawl only the budget
        # that is still unused so the total can never exceed the limit.
        for link in anchors:
            href = link['href']
            full_url = href if href.startswith('http') else urljoin(url, href)
            # A fragment only addresses a spot inside a page; dropping it avoids
            # fetching the same page again for every "#section" variant.
            full_url = full_url.split('#', 1)[0]
            # mailto:, tel:, javascript: ... cannot be fetched over HTTP.
            if urlparse(full_url).scheme not in ('http', 'https'):
                continue
            if is_internal_link(full_url, base_url) and full_url not in visited:
                doc_links.extend(
                    _collect_document_links(
                        full_url, base_url, visited, remaining - len(doc_links)
                    )
                )
                if len(doc_links) >= remaining:
                    return doc_links

    except Exception as e:
        # Best effort: report the failure and keep what was gathered so far.
        print(f"\nError fetching {url}: {e}")

    return doc_links


def get_document_links(url, base_url, visited=None, link_limit=6):
    """Crawl a site for document links, stopping once ``link_limit`` were found.

    Args:
        url: Page where the crawl starts.
        base_url: Start URL of the crawl; only links on its host are followed.
        visited: Set of URLs already fetched. A new set is created when omitted.
        link_limit: Maximum number of document links to return. ``None`` removes
            the limit.

    Returns:
        list: At most ``link_limit`` entries of the form
        ``[parent_page_url, (anchor_tag, absolute_document_url)]``.
    """
    if visited is None:
        visited = set()

    remaining = float('inf') if link_limit is None else link_limit
    doc_links = _collect_document_links(url, base_url, visited, remaining)

    # Reported once, with the limit that was actually requested.
    if link_limit is not None and len(doc_links) >= link_limit:
        print(f"Limit of {link_limit} document links reached. Stopping.")

    return doc_links


In [54]:
# Función para convertir una cadena de texto a formato camelCase
# THIS FUNCTION IS A COMPONENT OF scrape_and_store_links() function
# camelCase, un estilo común de escritura en programación, especialmente útil para nombres de variables y funciones.
# La función convierte texto a camelCase (ej. nombreCompleto), ideal para nombres de variables y claves en JSON, siguiendo convenciones de programación

def to_camel_case(text):
    """Convert a string to camelCase.

    Underscores and hyphens are treated as word separators, like whitespace.

    Args:
        text: String to convert; may be empty or None.

    Returns:
        str: The first word lower-cased followed by the remaining words
        capitalised, or ``''`` when the input is empty or has no words.
    """
    if not text:
        return ''

    tokens = re.sub(r'[_\-]+', ' ', text).split()
    if not tokens:
        return ''

    head, *tail = tokens
    return head.lower() + ''.join(map(str.capitalize, tail))

In [55]:
# Function that extracts document links and stores them in JSON, including the path segments of the parent page
# This function creates the .json file "scraped_links_<Y-m-d_H-M-S>.json"
# It extracts the metadata parent_page, title, href and file_url
# This function is invoked by, and is a component of, the last code block (the main() function)

def scrape_and_store_links(urls):
    """Crawl each start URL for document links and save them to a JSON file.

    NOTE(review): later cells define ``scrape_and_store_links`` again; when all
    cells are run, the last definition is the one in effect.

    Args:
        urls: Iterable of start URLs; each one is crawled independently.

    Returns:
        str: Name of the JSON file written, ``scraped_links_<timestamp>.json``.
        Each record holds ``timestamp``, ``parent_page`` (list of path segments),
        ``title``, ``href`` and ``file_url``.
    """
    data = []  # One dict per document link found
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    for url in urls:
        print(f"Processing URL: {url}")
        base_url = url  # The crawl stays on this URL's host

        doc_links = get_document_links(base_url, base_url, visited=set())

        for parent_page, (link, doc_url) in doc_links:
            # Strip before testing: a whitespace-only link text is truthy and
            # would otherwise be stored instead of falling back to "No title".
            title = link.get_text().strip() or "No title"

            # Path segments of the page on which the link was found
            parsed_url = urlparse(parent_page)
            path_segments = parsed_url.path.strip('/').split('/')

            # If path is empty, use domain (netloc) as the folder name
            if not path_segments or path_segments == ['']:
                path_segments = [parsed_url.netloc]
            else:
                # Convert to camelCase and skip empty segments
                path_segments = [to_camel_case(segment) for segment in path_segments if segment]

            data.append({
                "timestamp": timestamp,
                "parent_page": path_segments,
                "title": title,
                "href": link['href'],  # href exactly as written in the <a> tag
                "file_url": doc_url,   # absolute URL of the document
            })

    df = pd.DataFrame(data)
    output_file = f"scraped_links_{timestamp}.json"
    # force_ascii=False writes accented characters as themselves instead of
    # \uXXXX escapes, which keeps the file readable.
    df.to_json(output_file, orient="records", indent=4, force_ascii=False)
    print(f"Scraping complete. Links saved to {output_file}")
    return output_file


In [56]:
# Main function to initiate scraping for documents
# Función principal para iniciar la extracción de documentos desde un sitio web
# Esta funcion se activa y es componente del ultimo bloque de codigo main() function



def scrape_documents_from_website(url):
    """Crawl one website for document links and print a summary of what was found.

    Args:
        url: Start URL of the crawl.

    Returns:
        list: The entries returned by ``get_document_links``, each of the form
        ``[parent_page_url, (anchor_tag, absolute_document_url)]``.
    """
    base_url = url  # The starting URL
    visited = set()  # To keep track of visited URLs
    doc_links = get_document_links(base_url, base_url, visited)

    print(f"Found {len(doc_links)} document links:")
    # Each entry is [parent_page, (anchor_tag, document_url)]; unpack it fully so
    # the link text and the document URL are printed rather than a raw tuple.
    for parent_page, (link, doc_url) in doc_links:
        title = link.get_text().strip() or "No title"
        print(f"- {title}: {doc_url}")

    return doc_links

In [57]:
# Function to create subdirectories based on the parent page path segments
# Función para crear subdirectorios basados en los segmentos de ruta de la página principal
# This function organizes documents into a structured folder system based on the webpage hierarchy, making it easier to manage downloaded files according to their origin.
# This function is called and a component for download_file() function


def create_directory_structure(parent_page_segments, doc_url):
    """Create (if needed) the download folder for a parent page and return its path.

    Args:
        parent_page_segments: Sequence of folder names, one per path segment of
            the page on which the document link was found.
        doc_url: URL of the document. Not used by this function; kept so existing
            callers keep working.

    Returns:
        str: Path of the folder below ``DOWNLOAD_DIR``.
    """
    # The segments come from scraped URLs, so make each one a harmless folder
    # name: replace characters that are invalid in file names and drop empty,
    # "." and ".." entries, which would otherwise let the path leave DOWNLOAD_DIR.
    safe_segments = []
    for segment in parent_page_segments:
        cleaned = re.sub(r'[<>:"/\\|?*]', '_', str(segment)).strip()
        if cleaned in ('', '.', '..'):
            continue
        safe_segments.append(cleaned)

    directory_path = os.path.join(DOWNLOAD_DIR, *safe_segments)

    # Create the directories if they don't exist
    os.makedirs(directory_path, exist_ok=True)

    return directory_path

In [58]:
# Function to download a document and save it in the appropriate subdirectory
# Función para descargar un documento y guardarlo en el subdirectorio correspondiente
# Esta funcion se activa y es componente del ultimo bloque de codigo main() function

def download_file(base_url, doc_url, context_href):
    """Download one document into the folder derived from its parent page.

    NOTE(review): a later cell defines ``download_file`` again (logging instead of
    printing); when both cells are run, that later definition is the one in effect.

    Args:
        base_url: Despite the name, the list of parent-page path segments that is
            passed on to ``create_directory_structure``.
        doc_url: Absolute URL of the document to fetch.
        context_href: Link from which the local file name is derived.

    Returns:
        str | None: Path of the saved file, or None when the download failed.
    """
    # Use the href to generate a meaningful name
    file_name = clean_filename(context_href)

    directory_path = create_directory_structure(base_url, doc_url)
    file_path = os.path.join(directory_path, file_name)

    try:
        # A timeout keeps one unresponsive server from blocking the whole run.
        response = requests.get(doc_url, timeout=10)
        # Without this check a 404/500 error page would be saved as the document.
        response.raise_for_status()
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f"\nDownloaded: {file_name} to {directory_path}")
        return file_path
    except Exception as e:
        print(f"\nFailed to download {doc_url}: {e}")
        return None

In [59]:
# Set up logging configuration: messages of level INFO and above are written to
# the file "scraping_log.txt", each prefixed with a timestamp and its level.
logging.basicConfig(
    filename="scraping_log.txt",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Marker entry emitted each time this cell is executed.
logging.info("Starting the scraping process...")

def download_file(base_url, doc_url, context_href):
    """Download one document into the folder derived from its parent page.

    Outcomes are written to the log instead of being printed.

    Args:
        base_url: Despite the name, the list of parent-page path segments that is
            passed on to ``create_directory_structure``.
        doc_url: Absolute URL of the document to fetch.
        context_href: Link from which the local file name is derived.

    Returns:
        str | None: Path of the saved file, or None when the download timed out
        or failed.
    """
    # Use the href to generate a meaningful name
    file_name = clean_filename(context_href)
    directory_path = create_directory_structure(base_url, doc_url)
    file_path = os.path.join(directory_path, file_name)

    try:
        response = requests.get(doc_url, timeout=10)
        # Without this check a 404/500 error page would be saved as the document
        # and later handed to the text extractors.
        response.raise_for_status()
        with open(file_path, 'wb') as f:
            f.write(response.content)
        logging.info(f"Downloaded: {file_name} to {directory_path}")
        return file_path
    except requests.exceptions.Timeout:
        logging.warning(f"Timeout occurred for {doc_url}")
        return None
    except Exception as e:
        logging.error(f"Failed to download {doc_url}: {e}")
        return None


In [60]:

# Setup logging
# NOTE(review): logging.basicConfig() is already called in the previous cell (with
# filename="scraping_log.txt"). Per the logging documentation a repeated call does
# nothing once the root logger has handlers, so when the cells run in order this
# line appears to have no effect - confirm whether it can be removed.
logging.basicConfig(level=logging.INFO)

def extract_text_from_file(file_path):
    """Extract the text of a downloaded document, choosing the extractor by extension.

    Args:
        file_path: Path of the file; ``.docx``, ``.pdf`` and ``.doc`` are supported.

    Returns:
        str | None: Whatever the matching extractor returns, or None when the
        extension is not supported.
    """
    # Extensions on the web are often upper-case ("INFORME.PDF"), so compare
    # against a lower-cased copy while passing the real path on unchanged.
    lowered = file_path.lower()
    if lowered.endswith(".docx"):
        return extract_text_from_docx(file_path)
    if lowered.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    if lowered.endswith(".doc"):
        return extract_text_from_doc(file_path)
    return None

# Extract text from .docx files using mammoth with UTF-8-SIG encoding
def extract_text_from_docx(file_path):
    """Return the raw text of a .docx file, read through mammoth.

    Args:
        file_path: Path of the .docx file.

    Returns:
        str | None: The extracted text after a UTF-8-SIG encode/decode round
        trip, or None when anything fails (the error is logged).
    """
    try:
        with open(file_path, "rb") as handle:
            raw_text = mammoth.extract_raw_text(handle).value
        # Round trip through UTF-8-SIG, as the rest of the extractors do
        return raw_text.encode("utf-8-sig").decode("utf-8-sig")
    except Exception as e:
        logging.error(f"Error extracting text from .docx file {file_path}: {e}")
        return None

# Extract text from .pdf files using pdfplumber with UTF-8-SIG encoding
def extract_text_from_pdf(file_path):
    """Return the text of a PDF file, read page by page through pdfplumber.

    Args:
        file_path: Path of the PDF file.

    Returns:
        str | None: The text of all pages joined by newlines, the string
        ``"No text found in the PDF"`` when no page yields text, or None when
        anything fails (the error is logged).
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may return None for a page without text
            page_texts = [page.extract_text() or "" for page in pdf.pages]

        # Join with a newline: plain concatenation fuses the last word of one
        # page with the first word of the next.
        text = "\n".join(page_text for page_text in page_texts if page_text)

        # Encode and decode entire text as UTF-8-SIG after extraction
        return text.encode("utf-8-sig").decode("utf-8-sig") if text else "No text found in the PDF"
    except Exception as e:
        logging.error(f"Error extracting text from PDF file {file_path}: {e}")
        return None

# Extract text from .doc files directly using win32com.client with UTF-8-SIG encoding
def extract_text_from_doc(file_path):
    """Return the text of a legacy .doc file by automating Microsoft Word.

    Args:
        file_path: Path of the .doc file; it is made absolute before being
            handed to Word.

    Returns:
        str | None: The document text after a UTF-8-SIG encode/decode round trip
        that ignores unencodable characters, or None when anything fails (the
        error is logged).
    """
    word = None
    doc = None
    try:
        word = win32.Dispatch("Word.Application")
        word.Visible = False  # Optional: Keep Word hidden
        doc = word.Documents.Open(os.path.abspath(file_path))
        doc_text = doc.Content.Text
        # Handle UTF-8-SIG encoding for accented characters
        return doc_text.encode("utf-8-sig", "ignore").decode("utf-8-sig", "ignore")

    except Exception as e:
        logging.error(f"Error extracting text from .doc file {file_path}: {e}")
        return None

    finally:
        # Always release the document and the Word instance, even when opening
        # or reading failed; otherwise every failure leaves a hidden Word
        # process running.
        if doc is not None:
            try:
                doc.Close()
            except Exception as e:
                logging.warning(f"Could not close .doc file {file_path}: {e}")
        if word is not None:
            try:
                word.Quit()
            except Exception as e:
                logging.warning(f"Could not quit Word after {file_path}: {e}")


In [61]:
# Extract the main domain (base URL) using the tld library
# Extraer el dominio principal (URL base) usando la biblioteca tld
# NO ESTAMOS USANDO ESTA FUNCION
# NO ES ACTIVADA  

def get_base_domain(url):
    """Return the first-level domain of a URL (e.g. ``comunidadandina.org``).

    Args:
        url: URL to analyse; ``get_fld`` is asked to fix a missing protocol.

    Returns:
        str: The value returned by ``get_fld``; when that raises, the error is
        printed and the URL's network location (netloc) is returned instead.
    """
    try:
        registered_domain = get_fld(url, fix_protocol=True)
    except Exception as e:
        print(f"Error extracting base domain from {url}: {e}")
        registered_domain = urlparse(url).netloc
    return registered_domain

In [62]:
def download_documents_from_json(json_file):
    """Download every document listed in a scraped-links JSON file and extract its text.

    NOTE(review): a later cell defines ``download_documents_from_json`` again;
    when both cells are run, that later definition is the one in effect.

    Args:
        json_file: Path of a JSON file holding a list of records with the keys
            ``file_url``, ``href`` and ``parent_page``.

    Returns:
        None. The records, extended with ``downloaded_path`` and
        ``extracted_content`` for every successful download, are written to
        ``results_with_download_<timestamp>.json`` in the working directory.
    """
    with open(json_file, 'r', encoding='utf-8') as f:
        links_data = json.load(f)

    for link_data in links_data:
        doc_url = link_data['file_url']
        href = link_data['href']
        parent_page_segments = link_data['parent_page']  # Array of path segments

        sys.stdout.write(f"Downloading: {doc_url}\r")
        sys.stdout.flush()

        # The parent page path segments decide the folder the file is saved in
        file_path = download_file(parent_page_segments, doc_url, href)

        if file_path:
            extracted_text = extract_text_from_file(file_path)
            link_data['downloaded_path'] = file_path
            link_data['extracted_content'] = extracted_text or "N/A"

    # Replace newlines with spaces in every text field. This used to be attempted
    # with regex substitutions on the json.dumps() output, which never matched:
    # that string holds escaped sequences, not real newlines or accented
    # characters. Accents need no handling here because ensure_ascii=False below
    # already writes them as themselves.
    for link_data in links_data:
        for key, value in list(link_data.items()):
            if isinstance(value, str):
                link_data[key] = value.replace('\n', ' ')

    # ':' is not allowed in Windows file names, hence the replacement
    output_file = f"results_with_download_{datetime.now().isoformat().replace(':', '-')}.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(links_data, f, ensure_ascii=False, indent=4)

    print(f"Downloads complete. Updated results saved to {output_file}")

In [63]:
import re
import json
from urllib.parse import unquote, urlparse
from datetime import datetime

def clean_encoded_text(text):
    """Remove ``(cid:NNN)`` placeholders from extracted text.

    Args:
        text: Extracted text; may be None or empty when extraction produced
            nothing.

    Returns:
        str: The text without any ``(cid:<digits>)`` sequences; ``''`` when the
        input is None or empty.
    """
    # extract_text_from_file() returns None for unsupported or unreadable files;
    # re.sub() would raise TypeError on that.
    if not text:
        return ''
    # Use a regex to find and remove all (cid:xxx) patterns
    return re.sub(r'\(cid:\d+\)', '', text)

def download_documents_from_json(json_file):
    """Download every document listed in a scraped-links JSON file and extract its text.

    Args:
        json_file: Path of a JSON file holding a list of records with the keys
            ``file_url``, ``href`` and ``parent_page``.

    Returns:
        None. The records, extended with ``downloaded_path`` and
        ``extracted_content`` (with ``(cid:NNN)`` placeholders removed) for every
        successful download, are written to
        ``results_with_download_<timestamp>.json`` in the working directory.
    """
    with open(json_file, 'r', encoding='utf-8') as f:
        links_data = json.load(f)

    for link_data in links_data:
        doc_url = link_data['file_url']
        href = link_data['href']
        parent_page_segments = link_data['parent_page']  # Array of path segments

        sys.stdout.write(f"Downloading: {doc_url}\r")
        sys.stdout.flush()

        # The parent page path segments decide the folder the file is saved in
        file_path = download_file(parent_page_segments, doc_url, href)

        if file_path:
            extracted_text = extract_text_from_file(file_path)
            # Extraction yields None for unsupported or unreadable files; only
            # real text is cleaned, so a missing result ends up as "N/A" instead
            # of crashing the whole run.
            cleaned_text = clean_encoded_text(extracted_text) if extracted_text else extracted_text
            link_data['downloaded_path'] = file_path
            link_data['extracted_content'] = cleaned_text or "N/A"

    # Replace newlines with spaces in every text field. This used to be attempted
    # with regex substitutions on the json.dumps() output, which never matched:
    # that string holds escaped sequences, not real newlines or accented
    # characters. Accents need no handling here because ensure_ascii=False below
    # already writes them as themselves.
    for link_data in links_data:
        for key, value in list(link_data.items()):
            if isinstance(value, str):
                link_data[key] = value.replace('\n', ' ')

    # ':' is not allowed in Windows file names, hence the replacement
    output_file = f"results_with_download_{datetime.now().isoformat().replace(':', '-')}.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(links_data, f, ensure_ascii=False, indent=4)

    print(f"Downloads complete. Updated results saved to {output_file}")


In [64]:
def scrape_and_store_links(urls):
    """Crawl each start URL for document links and save them to a JSON file.

    Progress bars are shown when tqdm is installed.

    NOTE(review): a later cell defines ``scrape_and_store_links`` again; when all
    cells are run, that later definition is the one in effect.

    Args:
        urls: Iterable of start URLs; each one is crawled independently.

    Returns:
        str: Name of the JSON file written, ``scraped_links_<timestamp>.json``.
        Each record holds ``timestamp``, ``parent_page`` (list of path segments),
        ``title``, ``href`` and ``file_url``.
    """
    # tqdm is not imported anywhere in the notebook, so calling this function
    # raised NameError. Import it here and fall back to plain iteration when the
    # package is not installed.
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable, **_kwargs):
            return iterable

    data = []  # One dict per document link found
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    for url in tqdm(urls, desc="Processing URLs"):
        print(f"Processing URL: {url}")
        base_url = url  # The crawl stays on this URL's host

        doc_links = get_document_links(base_url, base_url, visited=set())

        for parent_page, (link, doc_url) in tqdm(doc_links, desc="Extracting links", leave=False):
            # Strip before testing: a whitespace-only link text is truthy and
            # would otherwise be stored instead of falling back to "No title".
            title = link.get_text().strip() or "No title"

            # Path segments of the page on which the link was found
            parsed_url = urlparse(parent_page)
            path_segments = parsed_url.path.strip('/').split('/')

            # If path is empty, use domain (netloc) as the folder name
            if not path_segments or path_segments == ['']:
                path_segments = [parsed_url.netloc]
            else:
                # Convert to camelCase and skip empty segments
                path_segments = [to_camel_case(segment) for segment in path_segments if segment]

            data.append({
                "timestamp": timestamp,
                "parent_page": path_segments,
                "title": title,
                "href": link['href'],  # href exactly as written in the <a> tag
                "file_url": doc_url,   # absolute URL of the document
            })

    df = pd.DataFrame(data)
    output_file = f"scraped_links_{timestamp}.json"
    # force_ascii=False writes accented characters as themselves instead of
    # \uXXXX escapes, which keeps the file readable.
    df.to_json(output_file, orient="records", indent=4, force_ascii=False)
    print(f"Scraping complete. Links saved to {output_file}")
    return output_file


In [65]:
### DELETE LATER: temporary variant of scrape_and_store_links that stops after 6 links

def scrape_and_store_links(urls, max_links=6):
    """Crawl start URLs for document links, stopping after ``max_links`` records.

    Args:
        urls: Iterable of start URLs; each one is crawled independently.
        max_links: Maximum number of links to store across all URLs. ``None``
            removes the limit. Defaults to 6, the value that was hard-coded before.

    Returns:
        str: Name of the JSON file written, ``scraped_links_<timestamp>.json``.
        Each record holds ``timestamp``, ``parent_page`` (list of path segments),
        ``title``, ``href`` and ``file_url``.
    """
    data = []  # One dict per document link found
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    limit_reached = False

    for url in urls:
        print(f"Processing URL: {url}")
        base_url = url  # The crawl stays on this URL's host

        doc_links = get_document_links(base_url, base_url, visited=set())

        for parent_page, (link, doc_url) in doc_links:
            # Strip before testing: a whitespace-only link text is truthy and
            # would otherwise be stored instead of falling back to "No title".
            title = link.get_text().strip() or "No title"

            # Path segments of the page on which the link was found
            parsed_url = urlparse(parent_page)
            path_segments = parsed_url.path.strip('/').split('/')

            # If path is empty, use domain (netloc) as the folder name
            if not path_segments or path_segments == ['']:
                path_segments = [parsed_url.netloc]
            else:
                # Convert to camelCase and skip empty segments
                path_segments = [to_camel_case(segment) for segment in path_segments if segment]

            data.append({
                "timestamp": timestamp,
                "parent_page": path_segments,
                "title": title,
                "href": link['href'],
                "file_url": doc_url,
            })

            # Stop as soon as the requested number of links has been stored
            if max_links is not None and len(data) >= max_links:
                print(f"Limit of {max_links} links reached. Stopping.")
                limit_reached = True
                break

        if limit_reached:
            break

    df = pd.DataFrame(data)
    output_file = f"scraped_links_{timestamp}.json"
    # force_ascii=False writes accented characters as themselves instead of
    # \uXXXX escapes, which keeps the file readable.
    df.to_json(output_file, orient="records", indent=4, force_ascii=False)
    print(f"Scraping complete. Links saved to {output_file}")
    return output_file

In [66]:
# Run scrape_and_store_links: read the start URLs from the default URL file and
# crawl them. The function returns the name of the JSON file it wrote, which the
# notebook displays as this cell's result.

urls = read_urls()
scrape_and_store_links(urls)

Processing URL: https://produccion.gob.bo/
Limit of 6 document links reached. Stopping.
Limit of 6 links reached. Stopping.
Scraping complete. Links saved to scraped_links_2024-10-18_00-30-06.json


'scraped_links_2024-10-18_00-30-06.json'

In [67]:
# NOTE(review): the input file name is hard-coded and carries an older timestamp
# (2024-10-15_20-26-53) than the file the previous cell reported writing
# (2024-10-18_00-30-06) - confirm which file is intended.
download_documents_from_json('scraped_links_2024-10-15_20-26-53.json')

Downloads complete. Updated results saved to results_with_download_2024-10-18T00-31-11.313225.json
