In [12]:
!pip install playwright
!playwright install

Collecting playwright
  Downloading playwright-1.57.0-py3-none-macosx_11_0_universal2.whl (42.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 MB[0m [31m219.7 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:06[0m
[?25hCollecting pyee<14,>=13
  Downloading pyee-13.0.0-py3-none-any.whl (15 kB)
Collecting greenlet<4.0.0,>=3.1.1
  Downloading greenlet-3.2.4-cp39-cp39-macosx_11_0_universal2.whl (269 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m269.9/269.9 kB[0m [31m312.2 kB/s[0m eta [36m0:00:00[0m kB/s[0m eta [36m0:00:01[0m:01[0m
Installing collected packages: pyee, greenlet, playwright
  Attempting uninstall: greenlet
    Found existing installation: greenlet 1.1.1
    Uninstalling greenlet-1.1.1:
      Successfully uninstalled greenlet-1.1.1
Successfully installed greenlet-3.2.4 playwright-1.57.0 pyee-13.0.0
Downloading Chromium 143.0.7499.4 (playwright build v1200)[2m from https://cdn.pl

In [29]:
import re
import asyncio
import random
import pandas as pd
from playwright.async_api import async_playwright

# Destinations to search on Airbnb; the scraper walks them in this order.
CITIES = [
    "Paris",
    "Barcelona",
    "Tokyo",
    "New York",
    "London",
    "Rome",
    "Amsterdam",
    "Sydney",
    "Bangkok",
    "Istanbul",
    "Cape Town",
    "Rio de Janeiro",
    "Venice",
    "Los Angeles",
]

# Patterns used to pull fields out of a listing card's visible text.
# Comma-grouped number such as "8,638" or "1,234,567" (whole number, not a slice of it).
_COMMA_PRICE_RE = re.compile(r"(?<!\d)\d{1,3}(?:,\d{3})+(?!\d)")
# Un-grouped number directly followed by the currency marker, e.g. "950 EGP".
_CURRENCY_PRICE_RE = re.compile(r"(\d+)\s?(?:ج\.م|EGP)")
# Rating followed by the review count in parentheses, e.g. "4.92 (87)" or "4.92 (1,204)".
_RATING_RE = re.compile(r"(\d\.\d+)\s\((\d[\d,]*)\)")
# The word "New" on its own (not "Newly", "Newport", ...).
_NEW_BADGE_RE = re.compile(r"\bNew\b")


def _parse_listing_text(text_content: str) -> tuple:
    """Extract ``(price, rating, reviews)`` strings from one card's visible text.

    Returns:
        price:   digits only (thousands separators removed) or ``"N/A"``.
        rating:  e.g. ``"4.92"``, ``"New"`` or ``"N/A"``.
        reviews: digits only; ``"0"`` when no "rating (count)" pair is found.
    """
    rating, reviews = "N/A", "0"
    price_text = text_content

    rating_match = _RATING_RE.search(text_content)
    if rating_match:
        rating = rating_match.group(1)
        reviews = rating_match.group(2).replace(",", "")
        # Remove the "4.9 (1,204)" span before looking for prices so that a
        # comma-grouped review count can never be picked up as the price.
        price_text = (
            text_content[:rating_match.start()] + "\n" + text_content[rating_match.end():]
        )
    elif _NEW_BADGE_RE.search(text_content):
        # NOTE(review): a title containing the word "New" (e.g. "New York")
        # still lands here for unrated listings - confirm against the card layout.
        rating = "New"

    price = "N/A"
    comma_prices = _COMMA_PRICE_RE.findall(price_text)
    if comma_prices:
        # When several amounts are shown, the last one is used.
        price = comma_prices[-1].replace(",", "")
    else:
        # Small amounts have no thousands separator; rely on the currency marker.
        currency_match = _CURRENCY_PRICE_RE.search(price_text)
        if currency_match:
            price = currency_match.group(1)

    return price, rating, reviews


async def _scrape_city(page, city: str, max_per_city: int) -> list:
    """Collect up to ``max_per_city`` records for ``city`` from the already-loaded search page.

    Pages through results by clicking the "Next" control and returns a list of
    dicts with the keys City, Price_EGP, Rating, Reviews and Link.
    """
    # Local import: the notebook's import cell only brings in `async_playwright`.
    from playwright.async_api import Error as PlaywrightError
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

    listing_selector = 'div[itemprop="itemListElement"]'
    next_selector = 'a[aria-label="Next"], button[aria-label="Next"]'

    records = []
    seen_links = set()  # per-city duplicate tracker, keyed by the link path
    page_num = 1

    while len(records) < max_per_city:
        # Wait for the listings grid of the current results page.
        try:
            await page.wait_for_selector(listing_selector, timeout=10000)
        except PlaywrightError:
            print("  > No listings grid found.")
            break

        listings = await page.locator(listing_selector).all()
        print(f"  > Page {page_num}: Found {len(listings)} listings on screen.")

        for listing in listings:
            if len(records) >= max_per_city:
                break

            # --- 1. Link & duplicate check ---
            rel_link = None
            try:
                link_el = listing.locator('a[href^="/rooms/"]')
                # count() answers immediately; calling get_attribute() on a
                # missing element would sit out the default locator timeout.
                if await link_el.count() > 0:
                    rel_link = await link_el.first.get_attribute("href")
            except PlaywrightError:
                rel_link = None

            if rel_link:
                clean_id = rel_link.split("?")[0]  # drop the query string
                if clean_id in seen_links:
                    continue
                seen_links.add(clean_id)
                full_link = f"https://www.airbnb.com{clean_id}"
            else:
                # Cards without a room link cannot be de-duplicated; keep them
                # with a placeholder link.
                full_link = "N/A"

            # --- 2. Parse the card text ---
            try:
                text_content = await listing.inner_text()
            except PlaywrightError as exc:
                # One unreadable card should not abort the whole scrape.
                print(f"    (skipping a card that could not be read: {exc})")
                continue

            price, rating, reviews = _parse_listing_text(text_content)

            # Only listings with a recognisable price are kept.
            if price != "N/A":
                records.append({
                    "City": city,
                    "Price_EGP": price,
                    "Rating": rating,
                    "Reviews": reviews,
                    "Link": full_link,
                })
                print(f"    {len(records)}/{max_per_city} | {price} EGP | {rating}*")

        # --- Pagination: click "Next" ---
        if len(records) >= max_per_city:
            break

        next_btn = page.locator(next_selector).first
        try:
            try:
                # Give the pagination control time to render instead of
                # checking visibility at a single instant.
                await next_btn.wait_for(state="visible", timeout=5000)
            except PlaywrightTimeoutError:
                print("  > No Next button found. End of city.")
                break
            if not await next_btn.is_enabled():
                print("  > Next button is disabled. End of city.")
                break
            await next_btn.click()
            print("  > Clicking Next...")
            # Let the next page of results replace the old one.
            await page.wait_for_timeout(4000)
            page_num += 1
        except PlaywrightError as e:
            print(f"  > Error clicking next: {e}")
            break

    return records


async def scrape_airbnb_clicker(cities=None, max_per_city: int = 100, headless: bool = False) -> pd.DataFrame:
    """Scrape Airbnb search results city by city, paging via the "Next" button.

    Args:
        cities: iterable of city names to search; defaults to the module-level ``CITIES``.
        max_per_city: maximum number of priced listings to keep per city.
        headless: passed to ``chromium.launch``; the default ``False`` shows the browser window.

    Returns:
        DataFrame with the columns City, Price_EGP, Rating, Reviews and Link
        (all values are strings). The columns are present even when nothing
        was collected.
    """
    # Local import: the notebook's import cell only brings in `async_playwright`.
    from playwright.async_api import Error as PlaywrightError

    if cities is None:
        cities = CITIES

    columns = ["City", "Price_EGP", "Rating", "Reviews", "Link"]
    all_data = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        try:
            context = await browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
            page = await context.new_page()

            for city in cities:
                print(f"\n--- Starting City: {city} ---")

                # Open the city's main search page (no offset parameters).
                url = f"https://www.airbnb.com/s/{city}/homes?currency=EGP&locale=en"
                try:
                    await page.goto(url, timeout=60000)
                    await page.wait_for_timeout(3000)  # let the page settle
                except PlaywrightError as exc:
                    print(f"  > Failed to load {city}. Skipping. ({exc})")
                    continue

                all_data.extend(await _scrape_city(page, city, max_per_city))
        finally:
            # Close the browser even when a city fails part-way through.
            await browser.close()

    return pd.DataFrame(all_data, columns=columns)

# Run
df = await scrape_airbnb_clicker()

# Verify
print("\n--- DONE ---")
print(f"Total Rows: {len(df)}")
print(f"Duplicates: {df.duplicated().sum()}") # Should be 0 now
df.to_csv("airbnb_final_clean.csv", index=False)


--- Starting City: Paris ---
  > Page 1: Found 18 listings on screen.
    1/100 | 16045 EGP | New*
    2/100 | 16301 EGP | 4.82*
    3/100 | 29213 EGP | 5.0*
    4/100 | 20334 EGP | New*
    5/100 | 16365 EGP | N/A*
    6/100 | 18602 EGP | 4.8*
    7/100 | 12018 EGP | New*
    8/100 | 22949 EGP | 4.93*
    9/100 | 12785 EGP | 4.97*
    10/100 | 10755 EGP | 5.0*
    11/100 | 15981 EGP | 4.84*
    12/100 | 17899 EGP | 4.96*
    13/100 | 23971 EGP | 4.95*
    14/100 | 13105 EGP | 4.88*
    15/100 | 42189 EGP | 5.0*
    16/100 | 9589 EGP | 5.0*
    17/100 | 12785 EGP | 4.92*
    18/100 | 13360 EGP | 4.92*
  > No Next button found. End of city.

--- Starting City: Barcelona ---
  > Page 1: Found 18 listings on screen.
    1/100 | 9147 EGP | New*
    2/100 | 15981 EGP | 5.0*
    3/100 | 9589 EGP | 4.92*
    4/100 | 8683 EGP | 5.0*
    5/100 | 7484 EGP | 4.74*
    6/100 | 9205 EGP | 4.75*
    7/100 | 15278 EGP | 4.96*
    8/100 | 9333 EGP | 4.79*
    9/100 | 14063 EGP | 4.84*
    10/100 | 93

Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed')>
playwright._impl._errors.TargetClosedError: Target page, context or browser has been closed


  > Page 2: Found 18 listings on screen.
    19/100 | 11174 EGP | 5.0*
    20/100 | 8950 EGP | 4.88*
    21/100 | 13104 EGP | 5.0*
    22/100 | 9589 EGP | 4.92*
    23/100 | 7084 EGP | 4.78*
    24/100 | 9589 EGP | 4.83*
    25/100 | 9333 EGP | 4.78*
    26/100 | 13424 EGP | 4.9*
    27/100 | 13424 EGP | 4.8*
    28/100 | 11762 EGP | 5.0*
    29/100 | 26848 EGP | 4.83*
    30/100 | 11698 EGP | 4.83*
    31/100 | 19045 EGP | New*
    32/100 | 11187 EGP | 4.63*
    33/100 | 11187 EGP | New*
    34/100 | 10164 EGP | New*
    35/100 | 9589 EGP | 4.67*
    36/100 | 6393 EGP | N/A*
  > Clicking Next...
  > Page 3: Found 18 listings on screen.
    37/100 | 9589 EGP | 4.77*
    38/100 | 12785 EGP | 4.98*
    39/100 | 14703 EGP | 5.0*
    40/100 | 22966 EGP | New*
    41/100 | 6000 EGP | 4.47*
    42/100 | 8935 EGP | 4.7*
    43/100 | 9589 EGP | 4.89*
    44/100 | 12529 EGP | 4.75*
    45/100 | 8985 EGP | 4.85*
    46/100 | 8630 EGP | 4.56*
    47/100 | 8246 EGP | 5.0*
    48/100 | 9589 EGP | 4

In [31]:
# Number of rows that exactly repeat an earlier row.
df.duplicated(keep="first").sum()

0

In [33]:
# Non-null values per column for every city (shows which cities came up short).
df.groupby(by="City").count()

Unnamed: 0_level_0,Price_EGP,Rating,Reviews,Link
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Amsterdam,100,100,100,100
Bangkok,100,100,100,100
Barcelona,100,100,100,100
Cape Town,100,100,100,100
Istanbul,100,100,100,100
London,100,100,100,100
Los Angeles,100,100,100,100
New York,100,100,100,100
Paris,18,18,18,18
Rio de Janeiro,100,100,100,100


In [34]:
import re
import asyncio
import random
import pandas as pd
from playwright.async_api import async_playwright

# 1. LOAD & CLEAN
# Works on the in-memory `df` produced by the first scrape.
# To start from the saved CSV instead, uncomment the next line:
# df = pd.read_csv("airbnb_final_clean.csv")

print(f"Original Count: {df.shape[0]}")
# Keep every city except Paris, which gets re-scraped below.
df_clean = df.loc[df["City"].ne("Paris")].copy()
print(f"Count after dropping Paris: {df_clean.shape[0]}")

# 2. DEFINE THE SCRAPER (Just for Paris)
async def scrape_paris_only(max_listings: int = 100, max_grid_retries: int = 3) -> pd.DataFrame:
    """Re-scrape Airbnb search results for Paris only.

    Args:
        max_listings: maximum number of priced listings to collect.
        max_grid_retries: how many consecutive reloads to attempt when the
            listings grid does not appear before giving up.

    Returns:
        DataFrame with the columns City, Price_EGP, Rating, Reviews, Dates and
        Link (all values are strings). The columns are present even when
        nothing was collected.

    Raises:
        playwright.async_api.Error: if the initial page load fails.
    """
    # Local import: the notebook's import cell only brings in `async_playwright`.
    from playwright.async_api import Error as PlaywrightError
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

    city = "Paris"
    columns = ["City", "Price_EGP", "Rating", "Reviews", "Dates", "Link"]
    listing_selector = 'div[itemprop="itemListElement"]'
    next_selector = 'a[aria-label="Next"], button[aria-label="Next"]'
    months = ("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")

    # Comma-grouped number such as "8,638" or "1,234,567" (whole number, not a slice of it).
    comma_price_re = re.compile(r"(?<!\d)\d{1,3}(?:,\d{3})+(?!\d)")
    # Un-grouped number directly followed by the currency marker, e.g. "950 EGP".
    currency_price_re = re.compile(r"(\d+)\s?(?:ج\.م|EGP)")
    # Rating followed by the review count, e.g. "4.92 (87)" or "4.92 (1,204)".
    rating_re = re.compile(r"(\d\.\d+)\s\((\d[\d,]*)\)")
    # The word "New" on its own (not "Newly", "Newport", ...).
    new_badge_re = re.compile(r"\bNew\b")

    paris_data = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            context = await browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
            page = await context.new_page()

            print(f"\n--- Starting Rescue Mission: {city} ---")

            # Go to Paris, forcing English & EGP
            url = f"https://www.airbnb.com/s/{city}/homes?currency=EGP&locale=en"
            await page.goto(url, timeout=60000)
            await page.wait_for_timeout(3000)

            seen_links = set()
            grid_retries = 0  # consecutive reloads without a listings grid

            while len(paris_data) < max_listings:
                # Wait for the grid; reload a bounded number of times if it never shows up.
                try:
                    await page.wait_for_selector(listing_selector, timeout=15000)
                except PlaywrightTimeoutError:
                    grid_retries += 1
                    if grid_retries > max_grid_retries:
                        print("  > Grid not found after repeated reloads. Giving up.")
                        break
                    print("  > Grid not found. Retrying page load...")
                    await page.reload()
                    await page.wait_for_timeout(5000)
                    continue
                grid_retries = 0

                listings = await page.locator(listing_selector).all()
                print(f"  > Found {len(listings)} listings on screen.")

                for listing in listings:
                    if len(paris_data) >= max_listings:
                        break

                    # Link first: it is the de-duplication key. Cards without
                    # a room link are skipped.
                    try:
                        link_el = listing.locator('a[href^="/rooms/"]')
                        # count() answers immediately; get_attribute() on a
                        # missing element would sit out the default timeout.
                        if await link_el.count() == 0:
                            continue
                        rel_link = await link_el.first.get_attribute("href")
                    except PlaywrightError:
                        continue
                    if not rel_link:
                        continue

                    clean_id = rel_link.split("?")[0]  # drop the query string
                    if clean_id in seen_links:
                        continue
                    seen_links.add(clean_id)
                    full_link = f"https://www.airbnb.com{clean_id}"

                    try:
                        text_content = await listing.inner_text()
                    except PlaywrightError as exc:
                        # One unreadable card should not abort the whole scrape.
                        print(f"    (skipping a card that could not be read: {exc})")
                        continue

                    # Rating / review count
                    rating, reviews = "N/A", "0"
                    price_text = text_content
                    rating_match = rating_re.search(text_content)
                    if rating_match:
                        rating = rating_match.group(1)
                        reviews = rating_match.group(2).replace(",", "")
                        # Remove the rating span so a comma-grouped review
                        # count can never be picked up as the price.
                        price_text = (
                            text_content[:rating_match.start()]
                            + "\n"
                            + text_content[rating_match.end():]
                        )
                    elif new_badge_re.search(text_content):
                        rating = "New"

                    # Price: last comma-grouped number, else a number next to
                    # the currency marker (same rules as the main scraper,
                    # whose rows this data is merged with).
                    price = "N/A"
                    comma_prices = comma_price_re.findall(price_text)
                    if comma_prices:
                        price = comma_prices[-1].replace(",", "")
                    else:
                        currency_match = currency_price_re.search(price_text)
                        if currency_match:
                            price = currency_match.group(1)

                    # Dates: first line that mentions a month abbreviation and contains a digit.
                    dates = next(
                        (
                            line
                            for line in text_content.split("\n")
                            if any(m in line for m in months) and any(c.isdigit() for c in line)
                        ),
                        "N/A",
                    )

                    if price != "N/A":
                        paris_data.append({
                            "City": city,
                            "Price_EGP": price,
                            "Rating": rating,
                            "Reviews": reviews,
                            "Dates": dates,
                            "Link": full_link,
                        })
                        print(f"    {len(paris_data)}/{max_listings} | {price} EGP | {rating}*")

                # Click Next
                if len(paris_data) >= max_listings:
                    break

                next_btn = page.locator(next_selector).first
                try:
                    try:
                        # Give the pagination control time to render instead
                        # of checking visibility at a single instant.
                        await next_btn.wait_for(state="visible", timeout=5000)
                    except PlaywrightTimeoutError:
                        print("  > No Next button. Done.")
                        break
                    if not await next_btn.is_enabled():
                        print("  > Next button is disabled. Done.")
                        break
                    await next_btn.click()
                    print("  > Clicking Next...")
                    await page.wait_for_timeout(4000)
                except PlaywrightError as exc:
                    print(f"  > Error clicking next: {exc}")
                    break
        finally:
            # Close the browser even when the scrape fails part-way through.
            await browser.close()

    return pd.DataFrame(paris_data, columns=columns)

# 3. RUN & MERGE
df_paris = await scrape_paris_only()
print(f"\nParis Scrape Complete. Found {len(df_paris)} records.")

# Concatenate old clean data + new Paris data
df_final = pd.concat([df_clean, df_paris], ignore_index=True)

# 4. VERIFY & SAVE
print(f"\n--- Final Dataset ---")
print(f"Total Rows: {len(df_final)}")
print(df_final['City'].value_counts())
df_final.to_csv("airbnb_complete_fixed.csv", index=False)

Original Count: 1318
Count after dropping Paris: 1300

--- Starting Rescue Mission: Paris ---
  > Found 18 listings on screen.
    1/100 | 16045 EGP | New*
    2/100 | 16301 EGP | 4.82*
    3/100 | 20334 EGP | New*
    4/100 | 29213 EGP | 5.0*
    5/100 | 10036 EGP | 4.92*
    6/100 | 18602 EGP | 4.8*
    7/100 | 12018 EGP | New*
    8/100 | 22949 EGP | 4.93*
    9/100 | 15981 EGP | 4.84*
    10/100 | 11826 EGP | 4.87*
    11/100 | 10755 EGP | 5.0*
    12/100 | 12785 EGP | 4.97*
    13/100 | 13105 EGP | 4.88*
    14/100 | 17899 EGP | 4.96*
    15/100 | 23971 EGP | 4.95*
    16/100 | 42189 EGP | 5.0*
    17/100 | 9589 EGP | 5.0*
    18/100 | 9013 EGP | 4.97*
  > Clicking Next...
  > Found 18 listings on screen.
    19/100 | 13296 EGP | 4.97*
    20/100 | 12727 EGP | 4.88*
    21/100 | 9589 EGP | 4.84*
    22/100 | 11506 EGP | 4.79*
    23/100 | 12465 EGP | 4.88*
    24/100 | 9589 EGP | 4.87*
    25/100 | 12146 EGP | 4.82*
    26/100 | 8982 EGP | 4.97*
    27/100 | 76068 EGP | 5.0*
    2

In [40]:
# The first scrape never stored a "Dates" column, so only the re-scraped Paris
# rows carry one. It is not needed for our purposes, so drop it.
# errors="ignore" keeps this cell from failing when the column is already absent.
df_final_final = df_final.drop(columns=["Dates"], errors="ignore")

In [41]:
# Persist the merged dataset (without the Dates column) for downstream use.
df_final_final.to_csv(path_or_buf="airbnb_complete_fixed2.csv", index=False)