# Scrap from coworker.com

In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import csv

# Setup Selenium WebDriver
chromedriver_path = "/workspaces/Coworking/chromedriver-linux64/chromedriver"
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--headless=false")  # Run with visible browser

# Anti-detection measures
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)

service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Set user agent to appear more like a regular browser
driver.execute_cdp_cmd('Network.setUserAgentOverride', {
    "userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
})

try:
    # Open coworking page
    city = "madrid"
    base_url = f"https://www.coworker.com/spain/{city}?view=list"
    print(f"Opening URL: {base_url}")
    driver.get(base_url)

    # Wait for page to load
    print("Waiting for page to load...")
    time.sleep(10)
    
    # First, check if we're getting the main content
    print("Checking if main content is accessible...")
    try:
        # Look for main sections
        main_elements = driver.find_elements(By.TAG_NAME, "main")
        if main_elements:
            print(f"Found {len(main_elements)} <main> elements")
            
        # Look for the slick carousel we identified
        carousel_elements = driver.find_elements(By.CSS_SELECTOR, ".slick-track")
        if carousel_elements:
            print(f"Found {len(carousel_elements)} carousel tracks")
    except Exception as e:
        print(f"Error finding main elements: {e}")
    
    # Now let's extract the coworking space links directly
    print("\nExtracting coworking space links...")
    coworking_links = driver.find_elements(By.CSS_SELECTOR, "a[href*='/spain/madrid/']")
    filtered_links = []
    
    for link in coworking_links:
        href = link.get_attribute("href")
        if href and "/search?" not in href and "/explore?" not in href:
            # This is likely a link to a specific coworking space
            filtered_links.append({
                "href": href,
                "text": link.text.strip()
            })
    
    # Remove duplicates based on href
    unique_links = []
    seen_hrefs = set()
    for link in filtered_links:
        if link["href"] not in seen_hrefs:
            seen_hrefs.add(link["href"])
            unique_links.append(link)
    
    print(f"Found {len(unique_links)} unique coworking space links")
    
    # Extract information for each coworking space
    coworking_spaces = []
    
    print("\nExtracting information for each coworking space...")
    for i, link_data in enumerate(unique_links):
        try:
            if i < 10:  # Process first 10 links for demonstration
                print(f"Processing link {i+1}/{len(unique_links)}: {link_data['href']}")
                
                # Open the coworking space page
                driver.get(link_data["href"])
                time.sleep(3)  # Wait for page to load
                
                # Extract data
                space_data = {
                    "url": link_data["href"],
                    "name": "",
                    "address": "",
                    "description": "",
                    "amenities": []
                }
                
                # Name (try different selectors)
                try:
                    name_element = driver.find_element(By.CSS_SELECTOR, "h1")
                    space_data["name"] = name_element.text.strip()
                except:
                    try:
                        name_element = driver.find_element(By.CSS_SELECTOR, "[class*='title']")
                        space_data["name"] = name_element.text.strip()
                    except:
                        pass
                
                # Address
                try:
                    address_element = driver.find_element(By.CSS_SELECTOR, "[class*='address'], [class*='location']")
                    space_data["address"] = address_element.text.strip()
                except:
                    pass
                
                # Description
                try:
                    desc_element = driver.find_element(By.CSS_SELECTOR, "[class*='description'], [class*='about']")
                    space_data["description"] = desc_element.text.strip()
                except:
                    pass
                
                # Amenities
                try:
                    amenity_elements = driver.find_elements(By.CSS_SELECTOR, "[class*='amenity'], [class*='facility']")
                    space_data["amenities"] = [el.text.strip() for el in amenity_elements if el.text.strip()]
                except:
                    pass
                
                coworking_spaces.append(space_data)
                print(f"  - Name: {space_data['name']}")
                
                # Go back to the list page if we need to process more spaces
                if i < len(unique_links) - 1:
                    driver.back()
                    time.sleep(2)
        except Exception as e:
            print(f"Error processing link {i+1}: {e}")
    
    # Save the extracted data
    print(f"\nSaving data for {len(coworking_spaces)} coworking spaces...")
    
    # Save as JSON
    with open("/workspaces/Coworking/coworking_spaces.json", "w", encoding="utf-8") as f:
        json.dump(coworking_spaces, f, indent=2)
    
    # Save as CSV
    with open("/workspaces/Coworking/coworking_spaces.csv", "w", encoding="utf-8", newline="") as f:
        fieldnames = ["name", "address", "url", "description"]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for space in coworking_spaces:
            # Create a new dict with just the fields we want for CSV
            csv_row = {field: space.get(field, "") for field in fieldnames}
            writer.writerow(csv_row)
    
    print("Data saved successfully!")
    
    # Alternative approach: Try to extract directly from the list page
    print("\nAttempting to extract data directly from list page...")
    driver.get(base_url)
    time.sleep(5)
    
    # The links are within a slider, but we also need to try finding standard list items
    list_selectors = [
        "div.slick-track > div",  # The slider items we identified
        "div[class*='card']",     # Common card pattern
        "div[class*='listing']",  # Common listing pattern
        "div[class*='result']",   # Search result pattern
        "main > div > div"        # Generic nested divs that might contain listings
    ]
    
    list_items = []
    working_selector = ""
    
    for selector in list_selectors:
        try:
            elements = driver.find_elements(By.CSS_SELECTOR, selector)
            if elements and len(elements) > 5 and len(elements) < 100:
                print(f"Found {len(elements)} potential list items with selector: {selector}")
                list_items = elements
                working_selector = selector
                break
        except:
            continue
    
    if list_items:
        print(f"Extracting data from {len(list_items)} list items...")
        list_data = []
        
        for i, item in enumerate(list_items[:10]):  # Process first 10 items
            try:
                item_data = {"position": i + 1}
                
                # Try to get the link
                links = item.find_elements(By.TAG_NAME, "a")
                if links:
                    item_data["url"] = links[0].get_attribute("href")
                
                # Try to get name/title
                try:
                    title_els = item.find_elements(By.CSS_SELECTOR, "h1, h2, h3, h4, strong, [class*='title'], [class*='name']")
                    if title_els:
                        item_data["name"] = title_els[0].text.strip()
                except:
                    pass
                
                # Try to get address
                try:
                    addr_els = item.find_elements(By.CSS_SELECTOR, "[class*='address'], [class*='location']")
                    if addr_els:
                        item_data["address"] = addr_els[0].text.strip()
                except:
                    pass
                
                # Try to get image URL
                try:
                    img_els = item.find_elements(By.TAG_NAME, "img")
                    if img_els:
                        item_data["image_url"] = img_els[0].get_attribute("src")
                except:
                    pass
                
                list_data.append(item_data)
                print(f"  Item {i+1}: {item_data.get('name', 'Unknown')}")
            except Exception as e:
                print(f"  Error processing list item {i+1}: {e}")
        
        # Save list data
        with open("/workspaces/Coworking/list_items.json", "w", encoding="utf-8") as f:
            json.dump(list_data, f, indent=2)
        
        print("List data saved successfully!")

except Exception as e:
    print(f"Error during extraction: {e}")

finally:
    # Clean up
    driver.quit()
    print("\nExtraction complete.")

Opening URL: https://www.coworker.com/spain/madrid?view=list
Waiting for page to load...
Checking if main content is accessible...
Found 1 <main> elements
Found 10 carousel tracks

Extracting coworking space links...
Found 10 unique coworking space links

Extracting information for each coworking space...
Processing link 1/10: https://www.coworker.com/spain/madrid/regus-madrid-la-moraleja
  - Name: Coworking Space: Regus - Madrid, La Moraleja in Madrid
Processing link 2/10: https://www.coworker.com/spain/madrid/regus-madrid-torre-de-cristal
  - Name: Coworking Space: Regus - Madrid, Torre de Cristal in Madrid
Processing link 3/10: https://www.coworker.com/spain/madrid/regus-madrid-ortega-y-gasset
  - Name: Coworking Space: Regus - Madrid, Ortega y Gasset in Madrid
Processing link 4/10: https://www.coworker.com/spain/madrid/regus-madrid-manoteras
  - Name: Coworking Space: Regus - Madrid, Manoteras in Madrid
Processing link 5/10: https://www.coworker.com/spain/madrid/regus-madrid-pinar-

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
import pandas as pd
import os

def scrape_coworker_data(driver, city="madrid"):
    base_url = f"https://www.coworker.com/spain/{city}?view=list"
    driver.get(base_url)

    time.sleep(5)  # Wait for content to load

    # ✅ Save page source for debugging
    with open(f"/workspaces/Coworking/page_source_{city}.html", "w", encoding="utf-8") as f:
        f.write(driver.page_source)

    # ✅ Print page title to confirm loading
    print(f"Page title: {driver.title}")

    coworkings, addresses, amenities, prices = [], [], [], []

    while True:
        wait = WebDriverWait(driver, 10)
        try:
            coworking_blocks = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.SearchResult_container__0EI6G")))
            print(f"Found {len(coworking_blocks)} coworking spaces on this page.")
        except Exception as e:
            print("❌ Error finding coworking blocks:", e)
            break

        for block in coworking_blocks:
            try:
                name = block.find_element(By.CSS_SELECTOR, "p.SearchResult_container__content__title__fSW47").text
                address = block.find_element(By.CSS_SELECTOR, "span.Transportation_name__CkTxv").text
                amenities_str = ", ".join([a.text for a in block.find_elements(By.CSS_SELECTOR, "div.amenities li")]) or "N/A"
                price = block.find_element(By.CSS_SELECTOR, "div.Prices_container__-8jBv p").text or "N/A"

                coworkings.append(name)
                addresses.append(address)
                amenities.append(amenities_str)
                prices.append(price)

            except Exception as e:
                print(f"⚠️ Error scraping a coworking space: {e}")

        try:
            next_button = driver.find_element(By.CSS_SELECTOR, "button.Pagination_page_link__4IZxn")
            next_button.click()
            time.sleep(5)
        except:
            print("✅ No more pages. Scraping complete.")
            break

    df = pd.DataFrame({
        "Coworking Name": coworkings,
        "Address": addresses,
        "Amenities": amenities,
        "Price (per month)": prices
    })

    print(df.head(), f"Total rows: {df.shape[0]}")

    save_path = "/workspaces/Coworking/src/results/Madrid/"
    os.makedirs(save_path, exist_ok=True)
    csv_filename = os.path.join(save_path, f"coworkings_{city}.csv")
    df.to_csv(csv_filename, index=False)

    print(f"✅ CSV file saved at {csv_filename}")

# ---------- Main Execution ----------
if __name__ == "__main__":
    chromedriver_path = "/workspaces/Coworking/chromedriver-linux64/chromedriver"

    if not os.path.exists(chromedriver_path):
        raise FileNotFoundError(f"ChromeDriver not found at {chromedriver_path}")

    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--headless=false")  # ✅ Run with a visible browser for debugging

    service = Service(chromedriver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)

    print("🚀 Starting web scraping...")
    scrape_coworker_data(driver, city="madrid")

    driver.quit()


🚀 Starting web scraping...
Page title: Just a moment...
❌ Error finding coworking blocks: Message: 

Empty DataFrame
Columns: [Coworking Name, Address, Amenities, Price (per month)]
Index: [] Total rows: 0
✅ CSV file saved at /workspaces/Coworking/src/results/Madrid/coworkings_madrid.csv


In [2]:
df.head()

NameError: name 'df' is not defined