# Scrape coworking spaces from coworker.com

In [14]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import csv

# --- Selenium / Chrome configuration ---------------------------------------
# Path to the ChromeDriver binary shipped inside the workspace.
chromedriver_path = "/workspaces/Coworking/chromedriver-linux64/chromedriver"

chrome_options = Options()

# Command-line switches, applied in order.
for flag in (
    "--no-sandbox",
    "--disable-dev-shm-usage",
    # Intended to keep the browser window visible.
    # NOTE(review): confirm Chrome really treats "=false" as non-headless.
    "--headless=false",
    # Anti-detection: turn off the Blink "AutomationControlled" feature.
    "--disable-blink-features=AutomationControlled",
):
    chrome_options.add_argument(flag)

# Anti-detection: drop the automation switch and the automation extension.
for option_name, option_value in (
    ("excludeSwitches", ["enable-automation"]),
    ("useAutomationExtension", False),
):
    chrome_options.add_experimental_option(option_name, option_value)

service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Override the user agent through the DevTools protocol so requests look
# like they come from a regular desktop Chrome.
user_agent_override = {
    "userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
driver.execute_cdp_cmd('Network.setUserAgentOverride', user_agent_override)

def _first_text(root, selectors):
    """Return the stripped text of the first element matching any selector.

    Args:
        root: A WebDriver or WebElement to search under.
        selectors: CSS selectors tried in order; the first selector that
            matches at least one element wins.

    Returns:
        The stripped text of the first match, or ``None`` when no selector
        matched (or every lookup raised). Lookups are best-effort so that a
        single missing field never aborts the scrape of a whole page.
    """
    for selector in selectors:
        try:
            matches = root.find_elements(By.CSS_SELECTOR, selector)
            if matches:
                return matches[0].text.strip()
        except Exception:
            # Best-effort field: fall through to the next selector.
            continue
    return None


try:
    # Open coworking page
    city = "madrid"
    base_url = f"https://www.coworker.com/spain/{city}?view=list"
    print(f"Opening URL: {base_url}")
    driver.get(base_url)

    # Wait for page to load
    print("Waiting for page to load...")
    time.sleep(10)

    # First, check if we're getting the main content (diagnostics only)
    print("Checking if main content is accessible...")
    try:
        main_elements = driver.find_elements(By.TAG_NAME, "main")
        if main_elements:
            print(f"Found {len(main_elements)} <main> elements")

        carousel_elements = driver.find_elements(By.CSS_SELECTOR, ".slick-track")
        if carousel_elements:
            print(f"Found {len(carousel_elements)} carousel tracks")
    except Exception as e:
        print(f"Error finding main elements: {e}")

    # Collect links to individual coworking spaces. The selector is built
    # from `city` so changing the city above is enough to retarget the scrape.
    print("\nExtracting coworking space links...")
    coworking_links = driver.find_elements(
        By.CSS_SELECTOR, f"a[href*='/spain/{city}/']"
    )

    # Filter out search/explore links and de-duplicate by href in one pass,
    # keeping the order in which the links appear on the page.
    unique_links = []
    seen_hrefs = set()
    for link in coworking_links:
        href = link.get_attribute("href")
        if not href or "/search?" in href or "/explore?" in href:
            continue
        if href in seen_hrefs:
            continue
        seen_hrefs.add(href)
        unique_links.append({"href": href, "text": link.text.strip()})

    print(f"Found {len(unique_links)} unique coworking space links")

    # Extract information for each coworking space
    coworking_spaces = []

    print("\nExtracting information for each coworking space...")
    # Process first 10 links for demonstration
    for i, link_data in enumerate(unique_links[:10]):
        try:
            print(f"Processing link {i+1}/{len(unique_links)}: {link_data['href']}")

            # Every detail page is opened by absolute URL, so there is no
            # need to navigate back to the list page between spaces.
            driver.get(link_data["href"])
            time.sleep(3)  # Wait for page to load

            space_data = {
                "url": link_data["href"],
                # Name: prefer the <h1>, fall back to any "title" class.
                "name": _first_text(driver, ["h1", "[class*='title']"]) or "",
                "address": _first_text(
                    driver, ["[class*='address'], [class*='location']"]
                ) or "",
                "description": _first_text(
                    driver, ["[class*='description'], [class*='about']"]
                ) or "",
                "amenities": [],
            }

            # Amenities (best-effort; keep the empty list on failure)
            try:
                amenity_elements = driver.find_elements(
                    By.CSS_SELECTOR, "[class*='amenity'], [class*='facility']"
                )
                # Read each element's text once; every `.text` access is a
                # round trip to the browser.
                amenity_texts = (el.text.strip() for el in amenity_elements)
                space_data["amenities"] = [text for text in amenity_texts if text]
            except Exception:
                pass

            coworking_spaces.append(space_data)
            print(f"  - Name: {space_data['name']}")
        except Exception as e:
            print(f"Error processing link {i+1}: {e}")

    # Save the extracted data
    print(f"\nSaving data for {len(coworking_spaces)} coworking spaces...")

    # Save as JSON (ensure_ascii=False keeps accented characters readable)
    with open("/workspaces/Coworking/coworking_spaces.json", "w", encoding="utf-8") as f:
        json.dump(coworking_spaces, f, ensure_ascii=False, indent=2)

    # Save as CSV; extrasaction="ignore" drops keys (e.g. "amenities") that
    # are not listed in `fieldnames`.
    with open("/workspaces/Coworking/coworking_spaces.csv", "w", encoding="utf-8", newline="") as f:
        fieldnames = ["name", "address", "url", "description"]
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(coworking_spaces)

    print("Data saved successfully!")

    # Alternative approach: Try to extract directly from the list page
    print("\nAttempting to extract data directly from list page...")
    driver.get(base_url)
    time.sleep(5)

    # Candidate selectors for list entries, tried in order.
    list_selectors = [
        "div.slick-track > div",  # The slider items we identified
        "div[class*='card']",     # Common card pattern
        "div[class*='listing']",  # Common listing pattern
        "div[class*='result']",   # Search result pattern
        "main > div > div"        # Generic nested divs that might contain listings
    ]

    list_items = []

    for selector in list_selectors:
        try:
            elements = driver.find_elements(By.CSS_SELECTOR, selector)
        except Exception:
            continue
        # Accept the first selector yielding a plausible number of items.
        if 5 < len(elements) < 100:
            print(f"Found {len(elements)} potential list items with selector: {selector}")
            list_items = elements
            break

    if list_items:
        print(f"Extracting data from {len(list_items)} list items...")
        list_data = []

        for i, item in enumerate(list_items[:10]):  # Process first 10 items
            try:
                item_data = {"position": i + 1}

                # Try to get the link
                links = item.find_elements(By.TAG_NAME, "a")
                if links:
                    item_data["url"] = links[0].get_attribute("href")

                # Try to get name/title (key is omitted when nothing matches)
                name = _first_text(
                    item,
                    ["h1, h2, h3, h4, strong, [class*='title'], [class*='name']"],
                )
                if name is not None:
                    item_data["name"] = name

                # Try to get address
                address = _first_text(
                    item, ["[class*='address'], [class*='location']"]
                )
                if address is not None:
                    item_data["address"] = address

                # Try to get image URL
                try:
                    img_els = item.find_elements(By.TAG_NAME, "img")
                    if img_els:
                        item_data["image_url"] = img_els[0].get_attribute("src")
                except Exception:
                    pass

                list_data.append(item_data)
                print(f"  Item {i+1}: {item_data.get('name', 'Unknown')}")
            except Exception as e:
                print(f"  Error processing list item {i+1}: {e}")

        # Save list data
        with open("/workspaces/Coworking/list_items.json", "w", encoding="utf-8") as f:
            json.dump(list_data, f, ensure_ascii=False, indent=2)

        print("List data saved successfully!")

except Exception as e:
    print(f"Error during extraction: {e}")

finally:
    # Clean up
    driver.quit()
    print("\nExtraction complete.")

Opening URL: https://www.coworker.com/spain/madrid?view=list
Waiting for page to load...
Checking if main content is accessible...
Found 1 <main> elements
Found 10 carousel tracks

Extracting coworking space links...
Found 10 unique coworking space links

Extracting information for each coworking space...
Processing link 1/10: https://www.coworker.com/spain/madrid/regus-madrid-la-moraleja
  - Name: Coworking Space: Regus - Madrid, La Moraleja in Madrid
Processing link 2/10: https://www.coworker.com/spain/madrid/regus-madrid-torre-de-cristal
  - Name: Coworking Space: Regus - Madrid, Torre de Cristal in Madrid
Processing link 3/10: https://www.coworker.com/spain/madrid/regus-madrid-ortega-y-gasset
  - Name: Coworking Space: Regus - Madrid, Ortega y Gasset in Madrid
Processing link 4/10: https://www.coworker.com/spain/madrid/regus-madrid-manoteras
  - Name: Coworking Space: Regus - Madrid, Manoteras in Madrid
Processing link 5/10: https://www.coworker.com/spain/madrid/regus-madrid-pinar-

In [20]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# ✅ Function to restart WebDriver if it crashes
def restart_driver():
    """Quit the current global ``driver`` (if any) and start a fresh one.

    Reads the module-level ``chromedriver_path`` and ``chrome_options``,
    rebinds the module-level ``driver`` and applies the desktop-Chrome
    user-agent override to the new session.

    Returns:
        The newly created Chrome WebDriver (also stored in global ``driver``).
    """
    global driver
    try:
        driver.quit()
    except Exception:
        # The driver may be undefined or already closed; either way there is
        # nothing left to clean up. `Exception` (rather than a bare except)
        # lets Ctrl-C still interrupt the notebook.
        pass

    service = Service(chromedriver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.execute_cdp_cmd('Network.setUserAgentOverride', {
        "userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    })
    return driver

# ✅ Setup Selenium WebDriver
chromedriver_path = "/workspaces/Coworking/chromedriver-linux64/chromedriver"
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Intended to run with a visible browser.
# NOTE(review): confirm Chrome really treats "=false" as non-headless.
chrome_options.add_argument("--headless=false")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)

# Start driver
driver = restart_driver()
wait = WebDriverWait(driver, 10)

# ✅ Base URL for pagination
city = "madrid"
base_url = f"https://www.coworker.com/spain/{city}?view=list"
current_page = 1
has_next_page = True

# Give up on a page after this many consecutive load failures instead of
# restarting the browser forever.
max_load_retries = 3
load_failures = 0

try:
    while has_next_page:
        page_url = f"{base_url}&page={current_page}" if current_page > 1 else base_url

        # ✅ Try loading the page (restart driver if necessary). The periodic
        # refresh lives inside the same try so that a dead session triggers
        # the restart path rather than crashing the loop.
        try:
            if current_page % 5 == 0:  # Refresh every 5 pages to keep session alive
                print("Refreshing browser to prevent timeouts...")
                driver.refresh()
                time.sleep(3)

            print(f"\nProcessing Page {current_page}: {page_url}")
            driver.get(page_url)
            time.sleep(5)
        except Exception as e:
            load_failures += 1
            if load_failures > max_load_retries:
                print(f"Giving up on page {current_page} after {max_load_retries} retries: {e}")
                break
            print(f"Error loading page {current_page}, restarting WebDriver...")
            driver = restart_driver()
            # Keep `wait` bound to the live session, not the one just quit.
            wait = WebDriverWait(driver, 10)
            continue
        load_failures = 0

        # ✅ Extract coworking links (selector follows `city`); read each
        # href once and de-duplicate while keeping page order.
        coworking_links = driver.find_elements(
            By.CSS_SELECTOR, f"a[href*='/spain/{city}/']"
        )
        hrefs = (link.get_attribute("href") for link in coworking_links)
        unique_links = list(dict.fromkeys(href for href in hrefs if href))
        print(f"Found {len(unique_links)} coworking spaces on page {current_page}")

        if not unique_links:
            print("No more coworking spaces found. Pagination finished.")
            break

        # ✅ Check for 'Next' button
        # NOTE(review): this XPath matches any pagination button whose text
        # does not contain '1', not specifically a "Next" control — confirm
        # against the live markup.
        try:
            next_button = driver.find_element(By.XPATH, "//button[contains(@class, 'Pagination_page_link') and not(contains(text(),'1'))]")
            if next_button.is_enabled():
                print("Next button found, moving to next page...")
                current_page += 1
            else:
                has_next_page = False
        except Exception:
            print("No 'Next' button found, ending pagination.")
            has_next_page = False
finally:
    # Always release the browser, even if the loop raised.
    driver.quit()



Processing Page 1: https://www.coworker.com/spain/madrid?view=list
Found 10 coworking spaces on page 1
Next button found, moving to next page...

Processing Page 2: https://www.coworker.com/spain/madrid?view=list&page=2
Found 10 coworking spaces on page 2
Next button found, moving to next page...

Processing Page 3: https://www.coworker.com/spain/madrid?view=list&page=3
Found 10 coworking spaces on page 3
Next button found, moving to next page...

Processing Page 4: https://www.coworker.com/spain/madrid?view=list&page=4
Found 10 coworking spaces on page 4
Next button found, moving to next page...
Refreshing browser to prevent timeouts...

Processing Page 5: https://www.coworker.com/spain/madrid?view=list&page=5
Found 10 coworking spaces on page 5
Next button found, moving to next page...

Processing Page 6: https://www.coworker.com/spain/madrid?view=list&page=6
Found 10 coworking spaces on page 6
Next button found, moving to next page...

Processing Page 7: https://www.coworker.com/sp

In [21]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
import json
import csv

# Function to restart WebDriver if needed
def restart_driver():
    """Quit the current global ``driver`` (if any) and start a fresh one.

    Reads the module-level ``chromedriver_path`` and ``chrome_options``,
    rebinds the module-level ``driver`` and applies the desktop-Chrome
    user-agent override to the new session.

    Returns:
        The newly created Chrome WebDriver (also stored in global ``driver``).
    """
    global driver
    try:
        driver.quit()
    except Exception:
        # The driver may be undefined or already closed; either way there is
        # nothing left to clean up. `Exception` (rather than a bare except)
        # lets Ctrl-C still interrupt the notebook.
        pass

    service = Service(chromedriver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.execute_cdp_cmd('Network.setUserAgentOverride', {
        "userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    })
    return driver

def _text_or_blank(root, selector):
    """Return the stripped text of the first element matching ``selector``.

    Best-effort: returns "" when nothing under ``root`` matches or the
    lookup raises, so one missing field never aborts a whole page.
    """
    try:
        matches = root.find_elements(By.CSS_SELECTOR, selector)
        return matches[0].text.strip() if matches else ""
    except Exception:
        return ""


# Setup Selenium WebDriver
chromedriver_path = "/workspaces/Coworking/chromedriver-linux64/chromedriver"
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Intended to run with a visible browser.
# NOTE(review): confirm Chrome really treats "=false" as non-headless.
chrome_options.add_argument("--headless=false")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)

# Start driver
driver = restart_driver()

city = "madrid"
base_url = f"https://www.coworker.com/spain/{city}?view=list"
current_page = 1
has_next_page = True
coworking_spaces = []

# Give up on a page after this many consecutive load failures instead of
# restarting the browser forever.
max_load_retries = 3
load_failures = 0

try:
    while has_next_page:
        page_url = f"{base_url}&page={current_page}" if current_page > 1 else base_url

        # Load the listing page; the periodic refresh sits inside the same
        # try so that a dead session triggers the restart path.
        try:
            if current_page % 5 == 0:
                print("Refreshing browser to prevent timeouts...")
                driver.refresh()
                time.sleep(3)

            print(f"\nProcessing Page {current_page}: {page_url}")
            driver.get(page_url)
            time.sleep(5)
        except Exception as e:
            load_failures += 1
            if load_failures > max_load_retries:
                print(f"Giving up on page {current_page} after {max_load_retries} retries: {e}")
                break
            driver = restart_driver()
            continue
        load_failures = 0

        # Collect detail-page links (selector follows `city`); read each href
        # once and de-duplicate while keeping page order, so the "first 10"
        # below is reproducible.
        coworking_links = driver.find_elements(
            By.CSS_SELECTOR, f"a[href*='/spain/{city}/']"
        )
        hrefs = (link.get_attribute("href") for link in coworking_links)
        unique_links = list(dict.fromkeys(href for href in hrefs if href))
        print(f"Found {len(unique_links)} coworking spaces on page {current_page}")

        # An empty page means we ran past the last one; without this guard
        # the loop never ends while a pagination button is still present.
        if not unique_links:
            print("No more coworking spaces found. Pagination finished.")
            break

        # Decide whether another page exists while still on the listing
        # page, so the answer does not depend on browser history after the
        # detail pages have been visited.
        # NOTE(review): this XPath matches any pagination button, not
        # specifically a "Next" control — confirm against the live markup.
        try:
            next_button = driver.find_element(By.XPATH, "//button[contains(@class, 'Pagination_page_link')]")
            next_available = next_button.is_enabled()
        except Exception:
            next_available = False

        for link in unique_links[:10]:  # Scrape first 10 spaces per page for demo
            try:
                driver.get(link)
                time.sleep(3)

                space_data = {
                    "url": link,
                    "name": _text_or_blank(driver, "h1"),
                    "address": _text_or_blank(driver, "[class*='address']"),
                    "description": _text_or_blank(driver, "[class*='description']"),
                    "amenities": [],
                }

                # Amenities (best-effort; keep the empty list on failure)
                try:
                    amenities = driver.find_elements(By.CSS_SELECTOR, "[class*='amenity']")
                    # Read each element's text once; every `.text` access is
                    # a round trip to the browser.
                    amenity_texts = (a.text.strip() for a in amenities)
                    space_data["amenities"] = [text for text in amenity_texts if text]
                except Exception:
                    pass

                coworking_spaces.append(space_data)
                print(f"  - Scraped: {space_data['name']}")
            except Exception as e:
                # Report the failure instead of skipping it silently.
                print(f"  - Failed to scrape {link}: {e}")
                continue

        if next_available:
            current_page += 1
        else:
            has_next_page = False
finally:
    # Release the browser and persist whatever was collected, even when the
    # loop was interrupted or raised.
    try:
        driver.quit()
    except Exception:
        pass

    # Save JSON
    with open("/workspaces/Coworking/coworking_spaces.json", "w", encoding="utf-8") as f:
        json.dump(coworking_spaces, f, ensure_ascii=False, indent=4)

    # Save CSV (amenities are flattened into one comma-separated cell)
    csv_file = "/workspaces/Coworking/coworking_spaces.csv"
    with open(csv_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["url", "name", "address", "description", "amenities"])
        writer.writeheader()
        for space in coworking_spaces:
            writer.writerow({
                "url": space["url"],
                "name": space["name"],
                "address": space["address"],
                "description": space["description"],
                "amenities": ", ".join(space["amenities"])
            })

    print(f"\nData saved: {len(coworking_spaces)} coworking spaces.")



Processing Page 1: https://www.coworker.com/spain/madrid?view=list
Found 10 coworking spaces on page 1
  - Scraped: Coworking Space: WeWork Eloy Gonzalo 27 in Madrid
  - Scraped: Coworking Space: Regus - Madrid, La Moraleja in Madrid
  - Scraped: Coworking Space: Regus - Madrid Pinar-Salamanca District in Madrid
  - Scraped: Coworking Space: Regus - Madrid, Torre de Cristal in Madrid
  - Scraped: Coworking Space: Regus - Madrid, Avenida America in Madrid
  - Scraped: Coworking Space: Regus - Madrid, Ortega y Gasset in Madrid
  - Scraped: Coworking Space: Regus - Madrid Financial District - Torre Europa in Madrid
  - Scraped: Coworking Space: Regus - LAS ROZAS, Las Rozas in Madrid
  - Scraped: Coworking Space: Regus - Madrid, Manoteras in Madrid
  - Scraped: Coworking Space: Regus - Madrid, Colon in Madrid

Processing Page 2: https://www.coworker.com/spain/madrid?view=list&page=2
Found 10 coworking spaces on page 2
  - Scraped: Coworking Space: Spaces - Madrid, Retiro in Madrid
  - Scr