In [18]:
import os
import sys
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import mammoth
import json
from urllib.parse import urlparse, urljoin, unquote
import re
import PyPDF2
import pdfplumber
import docx
import pypandoc
from tld import get_fld

In [19]:
# Create a folder for downloads
DOWNLOAD_DIR = "999_downloaded_documents"
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

In [20]:
# Read URLs from file
def read_urls(file_name='999.web_urls.txt'):
    with open(file_name, 'r') as f:
        return [line.strip() for line in f.readlines()]

In [21]:
# Helper function to check if a link is internal (within the same domain)
def is_internal_link(url, base_url):
    # Parse both URLs
    parsed_url = urlparse(url)
    parsed_base_url = urlparse(base_url)

    # Check if the domain of the URL matches the base URL's domain
    return parsed_url.netloc == parsed_base_url.netloc or parsed_url.netloc == ''

In [22]:
def is_binary_content(response):
    content_type = response.headers.get('Content-Type', '').lower()
    return 'application/pdf' in content_type or 'application/msword' in content_type or 'application/vnd' in content_type

In [23]:
# Function to clean filenames
def clean_filename(href):
    # Extract the file name from the URL
    file_name = unquote(os.path.basename(urlparse(href).path))
    # Remove invalid characters for filenames using a regex
    file_name = re.sub(r'[<>:"/\\|?*]', '_', file_name)
    # Fallback if the URL doesn't have a file name
    if not file_name:
        file_name = "downloaded_file"
    return file_name

In [24]:
# Recursively fetch document links from a website, traversing internal links.
def get_document_links(url, base_url, visited=None):
    if visited is None:
        visited = set()

    doc_links = []

    # Ensure we do not revisit the same page
    if url in visited:
        return doc_links
    visited.add(url)

    try:
        sys.stdout.write(f"Processing: {url}\r")
        sys.stdout.flush()

        response = requests.get(url)

        # Skip binary files (like PDFs, DOCs) to avoid trying to parse them
        if is_binary_content(response):
            sys.stdout.write(f"Skipping binary file: {url}\r")
            sys.stdout.flush()
            return doc_links

        # Parse HTML content with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all document links on the page (pdf, doc, docx)
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.startswith(('tel:', 'mailto:')):
                continue
            if href.endswith(('pdf', 'doc', 'docx','doc')):
                full_url = href if href.startswith('http') else urljoin(url, href)
                # Append both the parent page (url) and the document link (full_url)
                doc_links.append([url, (link, full_url)])  # Return parent page and document link as 2D array

        # Now, find all internal links to recursively navigate
        for link in soup.find_all('a', href=True):
            href = link['href']
            full_url = href if href.startswith('http') else urljoin(url, href)
            if is_internal_link(full_url, base_url) and full_url not in visited:
                doc_links.extend(get_document_links(full_url, base_url, visited))

    except Exception as e:
        print(f"\nError fetching {url}: {e}")

    return doc_links


In [25]:
# Function to convert a string to camelCase
def to_camel_case(text):
    # Return empty string if the input is empty or None
    if not text:
        return ''
    text = re.sub(r'[_\-]+', ' ', text)
    words = text.split()
    if not words:
        return ''
    camel_case_text = words[0].lower() + ''.join(word.capitalize() for word in words[1:])
    
    return camel_case_text

In [26]:
# Function to scrape and store document links in JSON with parent page path segments
def scrape_and_store_links(urls):
    data = []  # List to hold the scraped links
    timestamp =datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    
    ##timestamp = datetime.now().isoformat()  # Timestamp for saving the file

    for url in urls:
        print(f"Processing URL: {url}")
        base_url = url  # This is the parent page (where the link is found)

        doc_links = get_document_links(base_url, base_url, visited=set())  # Scrape the links from the base URL

        for parent_page, (link, doc_url) in doc_links:  # Extract parent_page and document link from the 2D array
            # Get the title from the link element (use link text)
            title = link.string or link.get_text().strip() or "No title"  # Fallback to "No title" if empty

            # Extract the path segments from the parent page (base_url)
            parsed_url = urlparse(parent_page)  # Use the parent_page returned by get_document_links
            path_segments = parsed_url.path.strip('/').split('/')
            
            # If path is empty, use domain (netloc) as the folder name
            if not path_segments or path_segments == ['']:
                path_segments = [parsed_url.netloc]  # Fallback to the domain if the path is empty
            else:
                # Convert to camelCase and skip empty segments
                path_segments = [to_camel_case(segment) for segment in path_segments if segment]

            # Append the details of each link to the data list, including the parent page path segments
            data.append({
                "timestamp": timestamp,
                "parent_page": path_segments,  # Store the path segments as an array
                "title": title,
                "href": link['href'],  # Extract href directly from the <a> tag (the document link)
                "file_url": doc_url,  # Full URL of the document
            })

    # Convert the list of links into a DataFrame and save as JSON
    df = pd.DataFrame(data)
    output_file = f"scraped_links_{timestamp}.json"
    df.to_json(output_file, orient="records", indent=4)
    print(f"Scraping complete. Links saved to {output_file}")
    return output_file


In [27]:
# Main function to initiate scraping for documents
def scrape_documents_from_website(url):
    base_url = url  # The starting URL
    visited = set()  # To keep track of visited URLs
    doc_links = get_document_links(base_url, base_url, visited)

    print(f"Found {len(doc_links)} document links:")
    for title, link in doc_links:
        print(f"- {title}: {link}")

    return doc_links

In [28]:
# Function to create subdirectories based on the parent page path segments
def create_directory_structure(parent_page_segments, doc_url):
    # Convert the path segments array into a directory path
    directory_path = os.path.join(DOWNLOAD_DIR, *parent_page_segments)

    # Create the directories if they don't exist
    os.makedirs(directory_path, exist_ok=True)

    return directory_path

In [29]:
# Function to download a document and save it in the appropriate subdirectory
def download_file(base_url, doc_url, context_href):
    # Use the href to generate a meaningful name
    file_name = clean_filename(context_href)

    # Create the directory structure based on Page and Section
    directory_path = create_directory_structure(base_url, doc_url)

    # Create the full file path to save the file
    file_path = os.path.join(directory_path, file_name)

    try:
        response = requests.get(doc_url)
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f"\nDownloaded: {file_name} to {directory_path}")
        return file_path
    except Exception as e:
        print(f"\nFailed to download {doc_url}: {e}")
        return None

In [30]:
# Extract text from files (.docx, .doc, and .pdf supported)
def extract_text_from_file(file_path):
    if file_path.endswith(".docx"):
        return extract_text_from_docx(file_path)
    elif file_path.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    elif file_path.endswith(".doc"):
        return extract_text_from_doc(file_path)
    return None

# Extract text from .docx files using mammoth
def extract_text_from_docx(file_path):
    try:
        with open(file_path, "rb") as docx_file:
            result = mammoth.extract_raw_text(docx_file)
            return result.value
    except Exception as e:
        print(f"Error extracting text from .docx file {file_path}: {e}")
        return None

# Extract text from .pdf files using PyPDF2 or pdfplumber
def extract_text_from_pdf(file_path):
    try:
        # Using pdfplumber for more robust text extraction from PDFs
        with pdfplumber.open(file_path) as pdf:
            text = ""
            for page in pdf.pages:
                text += page.extract_text() or ""  # Ensure we add non-None content
            return text if text else "No text found in the PDF"
    except Exception as e:
        print(f"Error extracting text from PDF file {file_path}: {e}")
        return None

# Extract text from .doc files using pypandoc
def extract_text_from_doc(file_path):
    try:
        # Using pypandoc to convert .doc to text
        return pypandoc.convert_file(file_path, 'plain')
    except Exception as e:
        print(f"Error extracting text from .doc file {file_path}: {e}")
        return None

In [31]:
# Extract the main domain (base URL) using the tld library
def get_base_domain(url):
    try:
        # Extract the top-level domain (e.g., comunidadandina.org)
        return get_fld(url, fix_protocol=True)
    except Exception as e:
        print(f"Error extracting base domain from {url}: {e}")
        return urlparse(url).netloc

In [32]:
# Function to read the stored links and download documents into organized directories
def download_documents_from_json(json_file):
    with open(json_file, 'r') as f:
        links_data = json.load(f)

    for link_data in links_data:
        doc_url = link_data['file_url']
        href = link_data['href']
        parent_page_segments = link_data['parent_page']  # Array of path segments

        sys.stdout.write(f"Downloading: {doc_url}\r")
        sys.stdout.flush()

        # Use the parent page path segments to create the folder structure
        file_path = download_file(parent_page_segments, doc_url, href)  # Download the document using the parent page path segments

        if file_path:
            extracted_text = extract_text_from_file(file_path)  # Extract text from the file (if applicable)
            # Update the JSON structure with the downloaded file path and content
            link_data['downloaded_path'] = file_path
            link_data['extracted_content'] = extracted_text or "N/A"

    # Save updated data back to the JSON file with results
    output_file = f"results_with_download_{datetime.now().isoformat().replace(':', '-')}.json"
    with open(output_file, 'w') as f:
        json.dump(links_data, f, indent=4)

    print(f"Downloads complete. Updated results saved to {output_file}")

In [17]:
def main():
    urls = read_urls()  # Read URLs from the text file

    # Phase 1: Scrape and store links in JSON
    links_json_file = scrape_and_store_links(urls)
    #links_json_file = "scraped_links_2024-10-09.json"

    # Phase 2: Download documents using the stored links
    download_documents_from_json(links_json_file)


if __name__ == "__main__":
    main()


Processing URL: https://www.comunidadandina.org/
Processing: tel:+5117106573unidadandina.org/archivo-de-convocatorias/e-la-secretaria-general-can//a/
Error fetching tel:+5117106573: No connection adapters were found for 'tel:+5117106573'
Processing: mailto:bibliocan@comunidadandina.org
Error fetching mailto:bibliocan@comunidadandina.org: No connection adapters were found for 'mailto:bibliocan@comunidadandina.org'
Processing: tel:5117106400munidadandina.org/images/quienes-somos/simbolos-de-la-can/files/MAPA_CAN_PDF_155x230.zipA-CERTIFICACION-Y-VERIFICACION-DE-ORIGEN-2024.xlsxfdf
Error fetching tel:5117106400: No connection adapters were found for 'tel:5117106400'
Processing: mailto:contacto@conhu.org.perg/quienes-somos/organismo-andino-de-salud-convenio-hipolito-unanue/s/dfA-PRO-TEMPORE-DEL-ECUADOR.-Publicación.pdfcamino-para-enfrentar-la-pandemia/
Error fetching mailto:contacto@conhu.org.pe: No connection adapters were found for 'mailto:contacto@conhu.org.pe'
Processing: mailto:infocaf