# Extract the name, address, price, and description of New York City coworking spaces from coworker.com

In [None]:
import csv
import json
import os
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Function to restart WebDriver if needed
def restart_driver():
    """Shut down the current Chrome session, if any, and start a fresh one.

    Reads the module-level ``chromedriver_path`` and ``chrome_options`` and
    rebinds the module-level ``driver`` to the new session. A desktop Chrome
    user-agent string is applied to the new session through the DevTools
    ``Network.setUserAgentOverride`` command.

    Returns:
        The newly created ``webdriver.Chrome`` instance (also stored in the
        global ``driver``).
    """
    global driver
    try:
        driver.quit()
    except Exception:
        # Best effort: on the first call ``driver`` does not exist yet
        # (NameError), and a crashed session may refuse to quit. Catching
        # Exception rather than using a bare ``except`` lets Ctrl-C and
        # SystemExit propagate.
        pass

    service = Service(chromedriver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.execute_cdp_cmd('Network.setUserAgentOverride', {
        "userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    })
    return driver

# Chrome configuration shared by every session that restart_driver() creates
chromedriver_path = "/workspaces/Coworking/chromedriver-linux64/chromedriver"
chrome_options = Options()

# Command-line switches, applied in their original order
for _switch in (
    "--no-sandbox",
    "--disable-dev-shm-usage",
    # Intended to run with a visible browser.
    # NOTE(review): confirm Chrome actually treats "=false" as non-headless.
    "--headless=false",
    "--disable-blink-features=AutomationControlled",
):
    chrome_options.add_argument(_switch)

# Experimental options: drop the "enable-automation" switch and the
# automation extension
for _option_name, _option_value in (
    ("excludeSwitches", ["enable-automation"]),
    ("useAutomationExtension", False),
):
    chrome_options.add_experimental_option(_option_name, _option_value)

# Launch the first browser session
driver = restart_driver()

# --- Scrape configuration and accumulated results -------------------------
city = "new-york-city"
base_url = f"https://www.coworker.com/united-states/new-york/{city}?view=list"
current_page = 1
has_next_page = True
coworking_spaces = []

# Anchor selector for links into this city's section of the site; built from
# ``city`` so the URL and the selector cannot drift apart.
listing_link_selector = f"a[href*='/united-states/new-york/{city}']"
# Give up on a listing page after this many consecutive failed loads instead
# of restarting the browser forever.
max_page_load_attempts = 3
page_load_failures = 0
# Links already seen on earlier listing pages. Used to avoid duplicate rows
# and to stop when the site serves a page with nothing new.
seen_links = set()


def _first_text(selector):
    """Return the stripped text of the first element matching *selector* on
    the page currently loaded in ``driver``, or ``""`` if the lookup fails."""
    try:
        return driver.find_element(By.CSS_SELECTOR, selector).text.strip()
    except Exception:
        return ""


try:
    while has_next_page:
        page_url = f"{base_url}&page={current_page}" if current_page > 1 else base_url
        print(f"\nProcessing Page {current_page}: {page_url}")

        try:
            if current_page % 5 == 0:
                print("Refreshing browser to prevent timeouts...")
                driver.refresh()
                time.sleep(3)
            driver.get(page_url)
            time.sleep(5)
        except Exception as e:
            page_load_failures += 1
            print(f"Failed to load {page_url}: {str(e)}")
            if page_load_failures >= max_page_load_attempts:
                print(f"Giving up after {page_load_failures} failed attempts.")
                break
            driver = restart_driver()
            continue
        page_load_failures = 0

        # Extract price information from the search results page
        search_results = driver.find_elements(By.CSS_SELECTOR, "div.SearchResult_container__0EI6G")
        result_data = {}

        for i, result in enumerate(search_results, start=1):
            try:
                link = result.find_element(By.CSS_SELECTOR, listing_link_selector).get_attribute("href")
            except Exception:
                # Result card without a usable link: nothing to key a price on
                continue
            if not link:
                continue

            try:
                price = result.find_element(
                    By.CSS_SELECTOR,
                    "div:nth-child(2) > div:nth-child(3) > div:nth-child(1)",
                ).text.strip()
            except Exception:
                price = ""

            # The positional selector sometimes lands on a badge (for example
            # "POPULAR") instead of the price, so only accept text that
            # carries a currency symbol.
            if "$" in price or "€" in price:
                result_data[link] = price
                print(f"Found price on search page: {price} for {link}")
            else:
                print(f"No price found for result #{i}")
                result_data[link] = ""

        # Collect listing links in page order; dict.fromkeys de-duplicates
        # while keeping that order, so the slice below is deterministic.
        hrefs = [
            anchor.get_attribute("href")
            for anchor in driver.find_elements(By.CSS_SELECTOR, listing_link_selector)
        ]
        unique_links = list(dict.fromkeys(href for href in hrefs if href))
        print(f"Found {len(unique_links)} coworking spaces on page {current_page}")

        new_links = [href for href in unique_links if href not in seen_links]
        seen_links.update(unique_links)
        if not new_links:
            # Empty page, or the site is repeating earlier results: stop
            print("No new coworking spaces found; stopping pagination.")
            break

        # Decide about the next page now, while the listing page is still
        # loaded. After visiting detail pages the pagination control is no
        # longer guaranteed to be in the DOM.
        try:
            next_button = driver.find_element(By.XPATH, "//button[contains(@class, 'Pagination_page_link')]")
            has_next_page = next_button.is_enabled()
        except Exception:
            has_next_page = False

        for link in new_links[:10]:  # Scrape first 10 spaces per page for demo
            try:
                driver.get(link)
                time.sleep(3)

                space_data = {
                    "url": link,
                    "name": _first_text("h1"),
                    "address": _first_text("[class*='address']"),
                    "description": _first_text("[class*='description']"),
                    "amenities": [],
                    "price": result_data.get(link, "Price not found"),  # Use price from search page
                }

                # If price wasn't found on the search page, try to find it on the detail page
                if not space_data["price"] or space_data["price"] == "Price not found":
                    try:
                        # Try various selectors that might contain pricing information
                        price_elements = driver.find_elements(
                            By.CSS_SELECTOR,
                            "[class*='price'], [class*='pricing'], [class*='cost'], .rate, .membership, .plan",
                        )
                        for element in price_elements:
                            text = element.text.strip()
                            lowered = text.lower()
                            if ("€" in text or "$" in text) and ("month" in lowered or "/mo" in lowered):
                                space_data["price"] = text
                                break
                    except Exception:
                        # Detail-page price is optional; keep what we have
                        pass

                coworking_spaces.append(space_data)
                print(f"  - Scraped: {space_data['name']} - Price: {space_data['price']}")
                # No driver.back(): every later navigation uses an absolute URL
            except Exception as e:
                print(f"Error processing {link}: {str(e)}")
                continue

        if has_next_page:
            current_page += 1
finally:
    # Always release the browser, even if the scrape aborts unexpectedly
    try:
        driver.quit()
    except Exception as e:
        print(f"Error while closing the browser: {str(e)}")

# Persist the scraped records as JSON and CSV.
# The output directory is created first so that a missing folder cannot
# throw away the results of a long scrape.
output_dir = "/workspaces/Coworking/src/results/New York"
os.makedirs(output_dir, exist_ok=True)

# Save JSON (full records, including the "amenities" list)
with open(f"{output_dir}/New_York_coworking_spaces.json", "w", encoding="utf-8") as f:
    json.dump(coworking_spaces, f, ensure_ascii=False, indent=4)

# Save CSV (flat columns only; keys not listed in fieldnames, such as
# "amenities", are skipped via extrasaction="ignore")
csv_file = f"{output_dir}/New_York_coworking_spaces.csv"
with open(csv_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(
        f,
        fieldnames=["url", "name", "address", "description", "price"],
        extrasaction="ignore",
    )
    writer.writeheader()
    writer.writerows(coworking_spaces)

print(f"\nData saved: {len(coworking_spaces)} coworking spaces.")


Processing Page 1: https://www.coworker.com/united-states/new-york/new-york-city?view=list
Found price on search page: POPULAR for https://www.coworker.com/united-states/new-york/new-york-city/jocale-studio
No price found for result #2
Found price on search page: Private Office
 from $ 615
/month for https://www.coworker.com/united-states/new-york/new-york-city/regus-new-york-forest-hills-queens-forest-hills-tower-queens
Found price on search page: Private Office
 from $ 1600
/month for https://www.coworker.com/united-states/new-york/new-york-city/the-yard-flatiron-north
Found price on search page: Private Office
 from $ 3200
/month for https://www.coworker.com/united-states/new-york/new-york-city/the-yard
Found price on search page: Private Office
 from $ 473
/month for https://www.coworker.com/united-states/new-york/new-york-city/industrious-brooklyn
Found price on search page: Private Office
 from $ 775
/month for https://www.coworker.com/united-states/new-york/new-york-city/the-ya