In [5]:
import os
import requests
from bs4 import BeautifulSoup
import shutil
import re
import time

# Constants
# Site root, used to turn site-relative project links into absolute URLs.
BASE_URL = "https://www.adaptation-fund.org"
# Listing page whose inline "var projects" script carries the project links.
PROJECTS_PAGE = "https://www.adaptation-fund.org/projects-programmes/"
# Destination folder for downloaded documents. The AF_DOCS_FOLDER environment
# variable, when set, overrides the original machine-specific default so the
# notebook can be re-run elsewhere without editing the code.
PROJECTS_FOLDER = os.environ.get(
    "AF_DOCS_FOLDER",
    "C:\\Users\\david\\My Drive\\data\\analysis_git_data\\cgiar\\AF_docs",
)

# Create the folder that stores downloaded documents. exist_ok=True makes this
# safe to re-run and avoids the race between a separate exists() check and the
# makedirs() call.
os.makedirs(PROJECTS_FOLDER, exist_ok=True)

def sanitize_filename(filename):
    """Return ``filename`` with each of ``< > : " / \\ | ? *`` replaced by ``_``.

    All other characters (including spaces) are left unchanged.
    """
    forbidden_chars = r'[<>:"/\\|?*]'
    return re.sub(forbidden_chars, '_', filename)

def download_file(url, filename, retries=3, delay=5):
    """Download ``url`` to ``filename``, retrying with exponential backoff.

    The body is streamed into ``<filename>.part`` and moved over ``filename``
    only after a complete download, so a failed attempt never leaves a
    truncated file at the destination and never clobbers an existing one.

    Args:
        url: Document URL; it is re-quoted so spaces and similar characters
            are percent-encoded before the request is made.
        filename: Destination path of the downloaded file.
        retries: Maximum number of attempts.
        delay: Seconds to wait after the first failed attempt; the wait is
            doubled after every further failure.

    Returns:
        True if the file was downloaded, False if every attempt failed.
    """
    encoded_url = requests.utils.requote_uri(url)
    temp_path = f"{filename}.part"
    for attempt in range(1, retries + 1):
        try:
            with requests.get(encoded_url, stream=True, timeout=10) as response:
                response.raise_for_status()
                with open(temp_path, 'wb') as out_file:
                    # iter_content (unlike copying response.raw) decodes
                    # gzip/deflate bodies and re-raises mid-stream connection
                    # errors as requests exceptions, so the handler below can
                    # retry them.
                    for chunk in response.iter_content(chunk_size=64 * 1024):
                        out_file.write(chunk)
            os.replace(temp_path, filename)
            return True
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt}/{retries} to download {url} failed. Error: {e}")
            if attempt < retries:
                time.sleep(delay)
                delay *= 2
        finally:
            # Drop any partial download; after a successful os.replace the
            # temporary file no longer exists and this is a no-op.
            if os.path.exists(temp_path):
                os.remove(temp_path)
    print(f"Failed to download {url} after {retries} attempts.")
    return False

def scrape_af_documents():
    """Download every document linked from every project page.

    Steps:
      1. Fetch PROJECTS_PAGE and collect the ``"link"`` values found in the
         inline script that contains ``var projects``.
      2. Fetch each project page and take every anchor matched by
         ``tbody tr td a`` as a document link.
      3. Save each document into PROJECTS_FOLDER as
         ``<second-to-last URL segment>_<last URL segment>``.

    A project page that cannot be fetched is reported and skipped, so one bad
    page does not abort the whole crawl.

    Raises:
        requests.exceptions.RequestException: if the projects listing page
            itself cannot be fetched (nothing can be scraped without it).
    """
    request_timeout = 30  # seconds; without a timeout requests.get can block forever

    response = requests.get(PROJECTS_PAGE, timeout=request_timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    project_links = []
    for script in soup.find_all("script"):
        if "var projects" in script.text:
            matches = re.findall(r'"link":"(.*?)"', script.text)
            # The links are JSON-escaped ("\/"); restore plain slashes.
            project_links.extend(match.replace('\\/', '/') for match in matches)

    for project_url in project_links:
        if project_url.startswith(("http://", "https://")):
            full_url = project_url
        else:
            full_url = BASE_URL.rstrip('/') + '/' + project_url.lstrip('/')

        try:
            project_response = requests.get(full_url, timeout=request_timeout)
        except requests.exceptions.RequestException as e:
            print(f"Failed to get project page: {full_url}. Error: {e}")
            continue
        if project_response.status_code != 200:
            print(f"Failed to get project page: {full_url}")
            continue

        project_soup = BeautifulSoup(project_response.content, 'html.parser')

        for attachment in project_soup.select("tbody tr td a"):
            doc_url = attachment.get('href')
            if not doc_url:
                continue

            # Make the link absolute; anything that is not an http(s) resource
            # (page anchors such as "#", mailto:, javascript:, ...) cannot be
            # downloaded and has no ".../<project>/<file>" shape to name from.
            if doc_url.startswith("//"):
                doc_url = "https:" + doc_url
            elif doc_url.startswith("/"):
                doc_url = BASE_URL.rstrip('/') + doc_url
            elif not doc_url.startswith(("http://", "https://")):
                print(f"Skipping non-document link: {doc_url}")
                continue

            # An absolute http(s) URL always splits into at least three
            # segments, so indexing [-2] below cannot fail.
            url_segments = doc_url.split('/')
            original_filename = sanitize_filename(url_segments[-1])
            if not original_filename:
                # URL ends with "/": it points at a page, not at a file.
                print(f"Skipping non-document link: {doc_url}")
                continue
            project_number = sanitize_filename(url_segments[-2])
            download_path = os.path.join(PROJECTS_FOLDER, f"{project_number}_{original_filename}")
            print(f"Attempting to download {doc_url} to {download_path}")
            download_file(doc_url, download_path)

# Start the scrape only when this code runs as the main program
# (__name__ == "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    scrape_af_documents()


Attempting to download https://fifspubprd.azureedge.net/afdocuments/project/15081/CAF_Peru_Proposal.pdf to C:\Users\david\My Drive\data\analysis_git_data\cgiar\AF_docs\15081_CAF_Peru_Proposal.pdf
Attempting to download https://fifspubprd.azureedge.net/afdocuments/project/15070/SPC_Nauru_Resubmission_Clean.pdf to C:\Users\david\My Drive\data\analysis_git_data\cgiar\AF_docs\15070_SPC_Nauru_Resubmission_Clean.pdf
Attempting to download https://fifspubprd.azureedge.net/afdocuments/project/15049/IFAD_Libya_Proposal.pdf to C:\Users\david\My Drive\data\analysis_git_data\cgiar\AF_docs\15049_IFAD_Libya_Proposal.pdf
Attempting to download https://fifspubprd.azureedge.net/afdocuments/project/15051/15051_Mongolia AF Concept Note August 2022 - For submission clean.pdf to C:\Users\david\My Drive\data\analysis_git_data\cgiar\AF_docs\15051_15051_Mongolia AF Concept Note August 2022 - For submission clean.pdf
Attempting to download https://fifspubprd.azureedge.net/afdocuments/project/15051/UN-Habitat_M

Extract each project's link, its document download links, and its title, so the results can be joined with the project table.

In [11]:
import os
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Constants
# Site root, used to turn relative links into absolute URLs.
BASE_URL = "https://www.adaptation-fund.org"
# Listing page that get_project_links() fetches to discover the project pages.
PROJECTS_PAGE = "https://www.adaptation-fund.org/projects-programmes/"

def get_project_links():
    """Extract project links from the projects page.

    Fetches PROJECTS_PAGE, finds the inline script that contains
    ``var projects`` and collects every ``"link"`` value in it. JSON-escaped
    slashes (``\\/``) are unescaped and site-relative links are prefixed with
    BASE_URL using exactly one ``/`` between the two parts.

    Returns:
        list[str]: absolute project page URLs, in page order.

    Raises:
        requests.exceptions.RequestException: if the projects page cannot be
            fetched or answers with an HTTP error status.
    """
    print("Extracting project links...")
    # Without a timeout requests.get can block forever on a stalled connection.
    response = requests.get(PROJECTS_PAGE, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    project_links = []
    for script in soup.find_all("script"):
        if "var projects" not in script.text:
            continue
        for match in re.findall(r'"link":"(.*?)"', script.text):
            link = match.replace('\\/', '/')
            if not link:
                # An empty link would otherwise resolve to the site root.
                continue
            if not link.startswith(("http://", "https://")):
                # Normalise the joining slash so "project/x" and "/project/x"
                # both produce a well-formed URL.
                link = BASE_URL.rstrip('/') + '/' + link.lstrip('/')
            project_links.append(link)
    return project_links

def get_project_details(project_url):
    """Extract document links and the project title from a project page.

    Args:
        project_url: Absolute URL of the project page.

    Returns:
        tuple[str, list[str]]: the stripped text of the ``h1.entry-title``
        element (``"No Title Found"`` when it is missing) and the non-empty
        ``href`` values of all anchors matched by ``tbody tr td a``.
        When the page cannot be fetched, or answers with a status other than
        200, the failure is printed and ``("No Title Found", [])`` is
        returned so the caller can carry on with the next project.
    """
    try:
        # Without a timeout requests.get can block forever on a stalled connection.
        project_response = requests.get(project_url, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Failed to get project page: {project_url}. Error: {e}")
        return "No Title Found", []
    if project_response.status_code != 200:
        # Do not parse an error page as if it were a project page.
        print(f"Failed to get project page: {project_url}")
        return "No Title Found", []

    project_soup = BeautifulSoup(project_response.content, 'html.parser')
    title_tag = project_soup.find("h1", class_="entry-title")
    project_title = title_tag.text.strip() if title_tag else "No Title Found"
    hrefs = (attachment.get('href') for attachment in project_soup.select("tbody tr td a"))
    document_links = [href for href in hrefs if href]
    return project_title, document_links

def main():
    """Collect (project link, project title, document link) rows into a CSV.

    For every URL returned by get_project_links() the project page is read
    with get_project_details(); each document link becomes one row of
    ``project_documents_links_with_titles.csv`` in the current working
    directory.

    Relative document links are resolved against the project page URL.
    Links that cannot point at a document (page anchors, ``mailto:``,
    ``javascript:`` ...) are skipped. A project whose page raises a request
    error is reported and skipped so the rows gathered so far are not lost.
    """
    project_links = get_project_links()
    records = []

    for project_url in project_links:
        print(f"Processing project: {project_url}")
        try:
            project_title, document_links = get_project_details(project_url)
        except requests.exceptions.RequestException as e:
            print(f"Failed to get project page: {project_url}. Error: {e}")
            continue
        for doc_link in document_links:
            doc_link = doc_link.strip()
            if not doc_link or doc_link.startswith("#"):
                # In-page anchor, not a document.
                continue
            if not doc_link.startswith(("http://", "https://")):
                # urljoin handles "/root-relative", "path-relative" and
                # "//host/..." links; plain concatenation with the site root
                # only worked for the first form.
                doc_link = urljoin(project_url, doc_link)
                if not doc_link.startswith(("http://", "https://")):
                    # Other schemes (mailto:, javascript:, tel:, ...).
                    continue
            records.append({"project_link": project_url, "project_title": project_title, "document_link": doc_link})
            print(f"Added document link: {doc_link}")

    # Explicit columns keep the CSV header intact even when no rows were found.
    df = pd.DataFrame(records, columns=["project_link", "project_title", "document_link"])
    csv_path = "project_documents_links_with_titles.csv"
    df.to_csv(csv_path, index=False)
    print(f"CSV file has been saved to {csv_path}")

# Build the CSV only when this code runs as the main program
# (__name__ == "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()


Extracting project links...
Processing project: https://www.adaptation-fund.org/project/implementing-protection-technologies-to-foster-the-resilience-of-aquaculture-in-the-regions-of-huanuco-junin-and-puno-to-strengthen-food-security-in-the-context-of-extreme-events-associated-with-cli/
Added document link: https://fifspubprd.azureedge.net/afdocuments/project/15081/CAF_Peru_Proposal.pdf
Processing project: https://www.adaptation-fund.org/project/resilient-coastal-fisheries-and-aquaculture-in-nauru/
Added document link: https://fifspubprd.azureedge.net/afdocuments/project/15070/SPC_Nauru_Resubmission_Clean.pdf
Processing project: https://www.adaptation-fund.org/project/increasing-resilience-to-climate-aggravated-water-scarcity-in-the-agriculture-sector-in-libya/
Added document link: https://fifspubprd.azureedge.net/afdocuments/project/15049/IFAD_Libya_Proposal.pdf
Processing project: https://www.adaptation-fund.org/project/ger-community-resilience-project-gcrp/
Added document link: http