In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time

# --- Configuration -----------------------------------------------------------
# NOTE: machine-specific chromedriver location; adjust when running elsewhere.
CHROMEDRIVER_PATH = "/Users/felix.toutant/Desktop/chromedriver-mac-arm64/chromedriver"
LISTING_URL = 'https://voedseluithetbos.nl/en/projecten/page/{page}/'
LAST_PAGE = 23            # number of listing pages to visit (1..LAST_PAGE)
PAGE_LOAD_DELAY_S = 5     # fixed pause after navigation before reading the DOM
# CSS selector instead of an exact class-string match, so class order or an
# extra class on the <article> does not silently produce zero results.
PROJECT_CARD_SELECTOR = 'article.block.projectBlock.whiteBg'
OUTPUT_CSV = 'food_forests.csv'
CSV_COLUMNS = ['Name', 'By', 'Square metres', 'Province']

# --- Chrome setup ------------------------------------------------------------
chrome_options = Options()
service = Service(CHROMEDRIVER_PATH)
driver = webdriver.Chrome(service=service, options=chrome_options)

# One dict per project card, in the order the cards are encountered.
projects_data = []

# try/finally guarantees the browser is shut down even if the loop raises or
# the cell is interrupted; otherwise a Chrome process is left running.
try:
    for page in range(1, LAST_PAGE + 1):
        url = LISTING_URL.format(page=page)
        print(f"\nOpening page {page}: {url}")
        driver.get(url)
        time.sleep(PAGE_LOAD_DELAY_S)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        projects = soup.select(PROJECT_CARD_SELECTOR)
        print(f"Found {len(projects)} projects on page {page}.")

        for project in projects:
            # A card without an <h2> should not abort the whole run.
            heading = project.find('h2')
            name = heading.get_text(strip=True) if heading else 'N/A'

            # Each <h3> is treated as a label whose value is the next <p>
            # sibling; labels with no such sibling are skipped.
            info = {}
            for h3 in project.find_all('h3'):
                value_tag = h3.find_next_sibling('p')
                if value_tag is None:
                    continue
                label = h3.get_text(strip=True).replace(':', '')
                info[label] = value_tag.get_text(strip=True)

            # Add project data to the list
            projects_data.append({
                'Name': name,
                'By': info.get('By', 'N/A'),
                'Square metres': info.get('Square metres', 'N/A'),
                'Province': info.get('Province', 'N/A'),
            })
finally:
    driver.quit()

# Create DataFrame (explicit columns keep the CSV header even when no
# projects were collected)
df = pd.DataFrame(projects_data, columns=CSV_COLUMNS)

# Export to CSV
df.to_csv(OUTPUT_CSV, index=False)

print("\n✅ Scraping completed! CSV saved as 'food_forests.csv'.")




Opening page 1: https://voedseluithetbos.nl/en/projecten/page/1/


KeyboardInterrupt: 

In [11]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time

# Exception types used below to tell "page was slow" apart from other browser
# failures (kept with this cell so it runs on its own).
from selenium.common.exceptions import TimeoutException, WebDriverException

# --- Configuration -----------------------------------------------------------
# NOTE: machine-specific chromedriver location; adjust when running elsewhere.
CHROMEDRIVER_PATH = "/Users/felix.toutant/Desktop/chromedriver-mac-arm64/chromedriver"
LISTING_URL = 'https://voedseluithetbos.nl/en/projecten/page/{page}/'
LAST_PAGE = 23             # number of listing pages to visit (1..LAST_PAGE)
LISTING_TIMEOUT_S = 15     # max wait for the first project card on a listing page
DETAIL_TIMEOUT_S = 10      # max wait for a project detail tab to open and render
# The same selector is used for the explicit wait and for parsing, so both
# agree on what counts as a project card.
PROJECT_CARD_SELECTOR = "article.block.projectBlock.whiteBg"
OUTPUT_CSV = 'food_forests_with_profiles_optimized.csv'
CSV_COLUMNS = ['Name', 'By', 'Square metres', 'Province',
               'Food Forest URL', 'Profile URL']


def _parse_project_card(card):
    """Extract the listing-level fields from one project <article>.

    Args:
        card: BeautifulSoup tag for a single project card.

    Returns:
        dict with 'Name', 'By', 'Square metres', 'Province' and
        'Food Forest URL'; any field that cannot be found is 'N/A'.
    """
    heading = card.find('h2')
    name = heading.get_text(strip=True) if heading else 'N/A'

    # Each <h3> is treated as a label whose value is the next <p> sibling;
    # labels with no such sibling are skipped instead of raising.
    info = {}
    for h3 in card.find_all('h3'):
        value_tag = h3.find_next_sibling('p')
        if value_tag is None:
            continue
        label = h3.get_text(strip=True).replace(':', '')
        info[label] = value_tag.get_text(strip=True)

    # The first link with an href inside the card is taken as the project page.
    link_tag = card.find('a', href=True)

    return {
        'Name': name,
        'By': info.get('By', 'N/A'),
        'Square metres': info.get('Square metres', 'N/A'),
        'Province': info.get('Province', 'N/A'),
        'Food Forest URL': link_tag['href'] if link_tag else "N/A",
    }


def _fetch_profile_url(driver, project_url):
    """Open a project page in a second tab and return its profile link.

    The listing page stays loaded in the original tab. The extra tab is always
    closed and focus returned to the original tab, whether or not the lookup
    succeeded.

    Args:
        driver: the active Selenium WebDriver.
        project_url: URL of the project detail page.

    Returns:
        The href of the first <a class="info"> on the detail page, or "N/A" if
        the page could not be loaded or has no such link.
    """
    main_handle = driver.current_window_handle
    profile_url = "N/A"
    try:
        # Pass the URL as a script argument rather than formatting it into the
        # JavaScript source: a quote in a scraped href would otherwise break
        # (or inject into) the script.
        driver.execute_script("window.open(arguments[0], 'new_window');", project_url)

        # Wait until the new tab is actually registered before switching,
        # instead of assuming window_handles[1] already exists.
        WebDriverWait(driver, DETAIL_TIMEOUT_S).until(EC.number_of_windows_to_be(2))
        detail_handle = next(h for h in driver.window_handles if h != main_handle)
        driver.switch_to.window(detail_handle)

        WebDriverWait(driver, DETAIL_TIMEOUT_S).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        project_soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Find the profile link
        profile_link_tag = project_soup.find('a', href=True, class_="info")
        if profile_link_tag:
            profile_url = profile_link_tag['href']
    except TimeoutException:
        print(f"⚠️ Timeout loading project page: {project_url}")
    except WebDriverException as exc:
        # Best effort: one broken detail page must not abort the whole scrape,
        # but report it as what it is rather than as a timeout.
        print(f"⚠️ Could not load project page {project_url}: {exc.msg}")
    finally:
        # Close whatever extra tab(s) exist and return to the listing tab so
        # the next iteration starts from a known state.
        for handle in driver.window_handles:
            if handle != main_handle:
                driver.switch_to.window(handle)
                driver.close()
        driver.switch_to.window(main_handle)
    return profile_url


# --- Chrome setup ------------------------------------------------------------
chrome_options = Options()
chrome_options.add_argument("--headless=new")  # Run without a visible browser window
chrome_options.add_argument("--disable-gpu")  # Helps on some systems
# NOTE(review): --no-sandbox turns off Chrome's sandbox; it is usually only
# required in containers / when running as root. Consider dropping it on a
# desktop machine.
chrome_options.add_argument("--no-sandbox")
# NOTE(review): this flag targets small /dev/shm in Linux containers; it is
# presumably a no-op on macOS. Kept to preserve the original configuration.
chrome_options.add_argument("--disable-dev-shm-usage")
service = Service(CHROMEDRIVER_PATH)
driver = webdriver.Chrome(service=service, options=chrome_options)

# List to store results: one dict per project, in encounter order
projects_data = []

# try/finally guarantees the browser is shut down even if the loop raises or
# the cell is interrupted; otherwise a headless Chrome process is left running.
try:
    for page in range(1, LAST_PAGE + 1):
        url = LISTING_URL.format(page=page)
        print(f"\nOpening page {page}: {url}")

        try:
            driver.get(url)
            # Wait until at least one project card is present in the DOM
            WebDriverWait(driver, LISTING_TIMEOUT_S).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, PROJECT_CARD_SELECTOR))
            )
        except TimeoutException:
            print(f"⚠️ Timeout on page {page}, skipping...")
            continue  # Skip this page if it fails
        except WebDriverException as exc:
            print(f"⚠️ Could not load page {page} ({exc.msg}), skipping...")
            continue

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        projects = soup.select(PROJECT_CARD_SELECTOR)
        print(f"Found {len(projects)} projects on page {page}.")

        for project in projects:
            record = _parse_project_card(project)
            project_url = record['Food Forest URL']

            # Only visit the detail page when the card actually had a link.
            if project_url != "N/A":
                record['Profile URL'] = _fetch_profile_url(driver, project_url)
            else:
                record['Profile URL'] = "N/A"

            # Store data
            projects_data.append(record)
finally:
    driver.quit()

# Create DataFrame (explicit columns keep the CSV header even when no
# projects were collected)
df = pd.DataFrame(projects_data, columns=CSV_COLUMNS)

# Export to CSV
df.to_csv(OUTPUT_CSV, index=False)

print("\n✅ Scraping completed! CSV saved as 'food_forests_with_profiles_optimized.csv'.")





Opening page 1: https://voedseluithetbos.nl/en/projecten/page/1/
Found 20 projects on page 1.

Opening page 2: https://voedseluithetbos.nl/en/projecten/page/2/
Found 20 projects on page 2.

Opening page 3: https://voedseluithetbos.nl/en/projecten/page/3/
Found 20 projects on page 3.

Opening page 4: https://voedseluithetbos.nl/en/projecten/page/4/
Found 20 projects on page 4.

Opening page 5: https://voedseluithetbos.nl/en/projecten/page/5/
Found 20 projects on page 5.

Opening page 6: https://voedseluithetbos.nl/en/projecten/page/6/
Found 20 projects on page 6.

Opening page 7: https://voedseluithetbos.nl/en/projecten/page/7/
Found 20 projects on page 7.

Opening page 8: https://voedseluithetbos.nl/en/projecten/page/8/
Found 20 projects on page 8.

Opening page 9: https://voedseluithetbos.nl/en/projecten/page/9/
Found 20 projects on page 9.

Opening page 10: https://voedseluithetbos.nl/en/projecten/page/10/
Found 20 projects on page 10.

Opening page 11: https://voedseluithetbos.nl/e