In [26]:
import os
import re
import shutil
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Constants
# Site root; project links scraped from the listing pages are appended to it.
BASE_URL = "https://www.thegef.org"
# Listing URL; the page number is appended to the end of this string.
DATABASE_URL = "https://www.thegef.org/projects-operations/database?page="
# Folder that downloaded documents are written to.
PROJECTS_FOLDER = "GEF_docs"


In [27]:

# Create the folder that stores downloaded documents. exist_ok=True makes this
# a no-op when the folder already exists and avoids the check-then-create race.
os.makedirs(PROJECTS_FOLDER, exist_ok=True)
    
def sanitize_filename(filename):
    """Return ``filename`` with every invalid character replaced by an underscore."""
    # Characters treated as invalid: < > : " / \ | ? *
    invalid_chars = re.compile(r'[<>:"/\\|?*]')
    return invalid_chars.sub('_', filename)

def download_file(url, filename, retries=3, delay=5):
    """
    Download ``url`` and save the response body to ``filename``.

    The body is streamed to a temporary ``<filename>.part`` file and moved to
    ``filename`` only after the whole response has been written, so a failed
    or interrupted attempt never leaves a truncated file under the final name.

    Args:
        url: Address of the file to download; it is re-quoted before the request.
        filename: Path the downloaded content is written to.
        retries: Maximum number of attempts.
        delay: Seconds to wait after the first failed attempt; the wait is
            doubled after every further failure.

    Returns:
        True if the file was downloaded, False if every attempt failed.
        Request errors are printed, not raised; other errors (for example
        OSError while writing) still propagate.
    """
    # Properly encode the URL
    encoded_url = requests.utils.requote_uri(url)
    partial_path = f"{filename}.part"

    for attempt in range(retries):
        try:
            with requests.get(encoded_url, stream=True, timeout=10) as response:
                response.raise_for_status()  # Check for HTTP errors
                with open(partial_path, 'wb') as out_file:
                    # iter_content undoes gzip/deflate transfer encoding and
                    # reports mid-stream failures as requests exceptions, so
                    # they are retried below; copying response.raw does neither.
                    for chunk in response.iter_content(chunk_size=64 * 1024):
                        if chunk:
                            out_file.write(chunk)
            # Publish the file under its final name only once it is complete.
            os.replace(partial_path, filename)
            return True
        except requests.exceptions.RequestException as e:
            print(f"Error downloading {url} on attempt {attempt + 1}: {e}")
            if attempt < retries - 1:  # If it's not the last attempt
                time.sleep(delay)  # Wait for some time before retrying
                delay *= 2  # Double the delay for exponential backoff
        finally:
            # Drop whatever a failed attempt managed to write.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    print(f"Failed to download {url} after {retries} attempts.")
    return False


def _fetch_soup(url, timeout=10):
    """Fetch ``url`` and return its parsed HTML, or None if the request fails."""
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    return BeautifulSoup(response.content, 'html.parser')


# Main scraper
def scrape_gef_documents(start_page=0, end_page=197):
    """
    Download the documents attached to every project in the listing pages.

    For each listing page the project links are taken from the title column;
    each project page is then fetched, its GEF Project ID read, and every
    linked document saved in PROJECTS_FOLDER as ``<project id>_<file name>``.

    Args:
        start_page: First listing page to process.
        end_page: Listing page to stop before (exclusive). The default of 197
            processes pages up to and including 196.

    Pages that cannot be fetched are reported and skipped so one network
    error does not end the whole run.
    """
    # Loop over all the pages
    for page in range(start_page, end_page):
        soup = _fetch_soup(DATABASE_URL + str(page))
        if soup is None:
            continue

        # Fetch all the links from the project title column, ignoring anchors
        # that carry no href.
        hrefs = (anchor.get('href') for anchor in soup.select('.views-field-title a'))
        project_links = [href for href in hrefs if href]

        for link in project_links:
            # urljoin copes with both site-relative and absolute links.
            project_url = urljoin(BASE_URL, link)
            project_soup = _fetch_soup(project_url)
            if project_soup is None:
                continue

            # Get the GEF Project ID
            project_id_element = project_soup.select_one("div.field--name-field-gef-project-id .field__item")
            if not project_id_element:
                print(f"Failed to find Project ID at URL: {project_url}")
                continue
            project_id = project_id_element.text.strip()

            # Get the project document links
            doc_links = project_soup.select("div.field--name-field-document-url .field__item a")
            for doc_link in doc_links:
                doc_url = doc_link.get('href')
                if not doc_url:
                    continue  # Anchor without a target: nothing to download
                original_filename = sanitize_filename(doc_url.split('/')[-1])
                download_path = os.path.join(PROJECTS_FOLDER, f"{project_id}_{original_filename}")
                print(f"Attempting to download {doc_url} to {download_path}")
                # download_file handles and prints request errors itself and
                # does not raise them, so the outcome is read from its return
                # value. Only an explicit False counts as a failure; any other
                # return value (including None) is reported as a download.
                if download_file(doc_url, download_path) is not False:
                    print(f"Downloaded {doc_url} to {download_path}")


# Entry point: run the scraper starting at listing page 149 instead of page 0.
# NOTE(review): presumably resuming an earlier, interrupted run — confirm
# before reusing this value.
if __name__ == "__main__":
    scrape_gef_documents(start_page=149)


Attempting to download https://publicpartnershipdata.azureedge.net/gef/PMISGEFDocuments/Multi-focal%20Area/CHAD%20-%20Community%20Based%20Integrated%20Ecosystem%20Management/3-4-04%20Endorsement%20Ltr.pdf to GEF_docs\1855_3-4-04%20Endorsement%20Ltr.pdf
Downloaded https://publicpartnershipdata.azureedge.net/gef/PMISGEFDocuments/Multi-focal%20Area/CHAD%20-%20Community%20Based%20Integrated%20Ecosystem%20Management/3-4-04%20Endorsement%20Ltr.pdf to GEF_docs\1855_3-4-04%20Endorsement%20Ltr.pdf
Attempting to download https://publicpartnershipdata.azureedge.net/gef/PMISGEFDocuments/Multi-focal%20Area/CHAD%20-%20Community%20Based%20Integrated%20Ecosystem%20Management/3-4-04%20Chad%20Exec%20Summary.doc to GEF_docs\1855_3-4-04%20Chad%20Exec%20Summary.doc
Downloaded https://publicpartnershipdata.azureedge.net/gef/PMISGEFDocuments/Multi-focal%20Area/CHAD%20-%20Community%20Based%20Integrated%20Ecosystem%20Management/3-4-04%20Chad%20Exec%20Summary.doc to GEF_docs\1855_3-4-04%20Chad%20Exec%20Summary.