In [12]:
!pip install playwright
!playwright install

Collecting playwright
  Downloading playwright-1.57.0-py3-none-macosx_11_0_universal2.whl (42.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 MB[0m [31m219.7 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:06[0m
[?25hCollecting pyee<14,>=13
  Downloading pyee-13.0.0-py3-none-any.whl (15 kB)
Collecting greenlet<4.0.0,>=3.1.1
  Downloading greenlet-3.2.4-cp39-cp39-macosx_11_0_universal2.whl (269 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m269.9/269.9 kB[0m [31m312.2 kB/s[0m eta [36m0:00:00[0m kB/s[0m eta [36m0:00:01[0m:01[0m
Installing collected packages: pyee, greenlet, playwright
  Attempting uninstall: greenlet
    Found existing installation: greenlet 1.1.1
    Uninstalling greenlet-1.1.1:
      Successfully uninstalled greenlet-1.1.1
Successfully installed greenlet-3.2.4 playwright-1.57.0 pyee-13.0.0
Downloading Chromium 143.0.7499.4 (playwright build v1200)[2m from https://cdn.pl

In [1]:
import re
import asyncio
import random
import pandas as pd
from playwright.async_api import async_playwright

# Search terms handed to Airbnb, one scrape pass per entry, in this order.
CITIES = [
    "Paris",
    "Barcelona",
    "Tokyo",
    "New York",
    "London",
    "Rome",
    "Amsterdam",
    "Sydney",
    "Bangkok",
    "Istanbul",
    "Cairo",
    "Rio de Janeiro",
    "Venice",
    "Los Angeles",
]

def _parse_listing_text(text_content):
    """Pull ``(price, rating, reviews)`` out of the visible text of one listing card.

    All three values are returned as strings.  ``price`` is ``"N/A"`` when no
    price could be recognised, ``rating`` is ``"N/A"`` or ``"New"`` when no
    ``<rating> (<count>)`` pair is present, and ``reviews`` defaults to ``"0"``.
    """
    price = "N/A"
    # Thousands-separated numbers; the last one in the card is taken as the price.
    # One or more ",ddd" groups are accepted so "1,234,567" is not cut down to "1,234".
    comma_prices = re.findall(r"(\d{1,3}(?:,\d{3})+)", text_content)
    if comma_prices:
        price = comma_prices[-1].replace(',', '')
    else:
        # Fallback for amounts without a separator, written right before the currency.
        match = re.search(r"(\d+)\s?(?:ج\.م|EGP)", text_content)
        if match:
            price = match.group(1)

    rating = "N/A"
    reviews = "0"
    rt_match = re.search(r"(\d\.\d+)\s\((\d+)\)", text_content)
    if rt_match:
        rating, reviews = rt_match.groups()
    elif "New" in text_content:
        # NOTE(review): plain substring test, so any card text containing "New"
        # (e.g. "New York") also lands here — confirm this is acceptable.
        rating = "New"

    return price, rating, reviews


async def _collect_city_listings(page, city, limit):
    """Page through the search results already open in ``page`` and return up to ``limit`` row dicts for ``city``."""
    rows = []
    seen_links = set()      # listing keys already handled for this city
    page_num = 1
    stalled_pages = 0       # result pages in a row that added no record
    max_stalled_pages = 3   # give up after this many, otherwise the loop could never end

    while len(rows) < limit:
        try:
            await page.wait_for_selector('div[itemprop="itemListElement"]', timeout=10000)
        except Exception:
            print("  > No listings grid found.")
            break

        listings = await page.locator('div[itemprop="itemListElement"]').all()
        print(f"  > Page {page_num}: Found {len(listings)} listings on screen.")

        new_items_on_page = 0

        for listing in listings:
            if len(rows) >= limit:
                break

            try:
                link_el = listing.locator('a[href^="/rooms/"]').first
                rel_link = await link_el.get_attribute("href")
                clean_id = rel_link.split('?')[0]
                full_link = f"https://www.airbnb.com{clean_id}"
            except Exception:
                # No usable room link: keep the card anyway, under a throwaway
                # key so the duplicate check below cannot discard it.
                full_link = "N/A"
                clean_id = str(random.random())

            if clean_id in seen_links:
                continue
            seen_links.add(clean_id)

            text_content = await listing.inner_text()
            price, rating, reviews = _parse_listing_text(text_content)

            # Cards without a recognisable price are not recorded.
            if price != "N/A":
                rows.append({
                    "City": city,
                    "Price_EGP": price,
                    "Rating": rating,
                    "Reviews": reviews,
                    "Link": full_link,
                })
                new_items_on_page += 1
                print(f"    {len(rows)}/{limit} | {price} EGP | {rating}*")

        if len(rows) >= limit:
            break

        # Stop when paging no longer produces records instead of clicking forever.
        if new_items_on_page:
            stalled_pages = 0
        else:
            stalled_pages += 1
            if stalled_pages >= max_stalled_pages:
                print("  > No new listings on recent pages. End of city.")
                break

        try:
            next_btn = page.locator('a[aria-label="Next"], button[aria-label="Next"]').first

            if await next_btn.is_visible():
                await next_btn.click()
                print("  > Clicking Next...")
                await page.wait_for_timeout(4000)
                page_num += 1
            else:
                print("  > No Next button found. End of city.")
                break
        except Exception as e:
            print(f"  > Error clicking next: {e}")
            break

    return rows


async def scrape_airbnb_clicker(cities=None, per_city_limit=100):
    """Scrape priced Airbnb search results for several cities into one DataFrame.

    Launches a non-headless Chromium window, opens each city's search page
    with prices requested in EGP, and follows the "Next" control until
    ``per_city_limit`` listings have been recorded for that city or the
    results run out.  A city whose search page fails to load is skipped.

    Args:
        cities: iterable of search terms; defaults to the module-level ``CITIES``.
        per_city_limit: maximum number of listings recorded per city.

    Returns:
        pandas.DataFrame with one row per recorded listing and the string
        columns ``City``, ``Price_EGP``, ``Rating``, ``Reviews`` and ``Link``.
    """
    if cities is None:
        cities = CITIES
    all_data = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )

        page = await context.new_page()

        for city in cities:
            print(f"\n--- Starting City: {city} ---")

            url = f"https://www.airbnb.com/s/{city}/homes?currency=EGP&locale=en"
            try:
                await page.goto(url, timeout=60000)
                await page.wait_for_timeout(3000)  # Let it settle
            except Exception as exc:
                # Report the cause; `except Exception` also lets cancellation and
                # kernel interrupts propagate instead of being swallowed.
                print(f"  > Failed to load {city}: {exc}. Skipping.")
                continue

            all_data.extend(await _collect_city_listings(page, city, per_city_limit))

        await browser.close()
        return pd.DataFrame(all_data)

df = await scrape_airbnb_clicker()

print("\n--- DONE ---")
print(f"Total Rows: {len(df)}")
print(f"Duplicates: {df.duplicated().sum()}") 
df.to_csv("airbnb_cairo_addition.csv", index=False)


--- Starting City: Paris ---
  > Failed to load Paris. Skipping.

--- Starting City: Barcelona ---
  > Failed to load Barcelona. Skipping.

--- Starting City: Tokyo ---
  > Failed to load Tokyo. Skipping.

--- Starting City: New York ---
  > Failed to load New York. Skipping.

--- Starting City: London ---
  > Failed to load London. Skipping.

--- Starting City: Rome ---
  > Failed to load Rome. Skipping.

--- Starting City: Amsterdam ---
  > Failed to load Amsterdam. Skipping.

--- Starting City: Sydney ---
  > Failed to load Sydney. Skipping.

--- Starting City: Bangkok ---
  > Failed to load Bangkok. Skipping.

--- Starting City: Istanbul ---
  > Failed to load Istanbul. Skipping.

--- Starting City: Cairo ---
  > Failed to load Cairo. Skipping.

--- Starting City: Rio de Janeiro ---
  > Failed to load Rio de Janeiro. Skipping.

--- Starting City: Venice ---
  > Failed to load Venice. Skipping.

--- Starting City: Los Angeles ---
  > Failed to load Los Angeles. Skipping.

--- DONE 

In [31]:
# Number of rows in `df` that exactly repeat an earlier row.
df.duplicated().sum()

0

In [33]:
# Non-null value count of every column, per city, to spot cities with too few rows.
df.groupby('City').count()

Unnamed: 0_level_0,Price_EGP,Rating,Reviews,Link
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Amsterdam,100,100,100,100
Bangkok,100,100,100,100
Barcelona,100,100,100,100
Cape Town,100,100,100,100
Istanbul,100,100,100,100
London,100,100,100,100
Los Angeles,100,100,100,100
New York,100,100,100,100
Paris,18,18,18,18
Rio de Janeiro,100,100,100,100


In [34]:
print(f"Original Count: {len(df)}")
# Keep every row whose city is not Paris; work on an independent copy.
df_clean = df.loc[df['City'].ne('Paris')].copy()
print(f"Count after dropping Paris: {len(df_clean)}")

async def scrape_paris_only():
    """Scrape up to 100 priced Airbnb search results for Paris.

    Launches a non-headless Chromium window, opens the Paris search page with
    prices requested in EGP and follows the "Next" control until 100 listings
    are recorded or the results run out.  Cards without a room link or without
    a recognisable price are skipped.

    Returns:
        pandas.DataFrame with one row per recorded listing and the string
        columns ``City``, ``Price_EGP``, ``Rating``, ``Reviews`` and ``Link``.
    """
    city = "Paris"
    paris_data = []
    max_grid_retries = 3    # reloads allowed in a row while the results grid is missing
    max_stalled_pages = 3   # result pages in a row allowed to add no record

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        page = await context.new_page()

        print(f"\n--- Starting Rescue Mission: {city} ---")

        url = f"https://www.airbnb.com/s/{city}/homes?currency=EGP&locale=en"
        await page.goto(url, timeout=60000)
        await page.wait_for_timeout(3000)

        city_listings_collected = 0
        seen_links = set()
        grid_retries = 0
        stalled_pages = 0

        while city_listings_collected < 100:

            try:
                await page.wait_for_selector('div[itemprop="itemListElement"]', timeout=15000)
            except Exception:
                # Bounded retry: without the cap a page that never renders the
                # grid would keep this loop reloading forever.
                grid_retries += 1
                if grid_retries > max_grid_retries:
                    print("  > Grid still not found. Giving up.")
                    break
                print("  > Grid not found. Retrying page load...")
                await page.reload()
                await page.wait_for_timeout(5000)
                continue
            grid_retries = 0

            listings = await page.locator('div[itemprop="itemListElement"]').all()
            print(f"  > Found {len(listings)} listings on screen.")

            new_items = 0
            for listing in listings:
                if city_listings_collected >= 100:
                    break

                try:
                    link_el = listing.locator('a[href^="/rooms/"]').first
                    rel_link = await link_el.get_attribute("href")
                    clean_id = rel_link.split('?')[0]
                    full_link = f"https://www.airbnb.com{clean_id}"
                except Exception:
                    # No usable room link on this card: skip it.
                    continue

                if clean_id in seen_links:
                    continue
                seen_links.add(clean_id)

                text_content = await listing.inner_text()

                # Thousands-separated numbers; the last one in the card is taken as
                # the price.  One or more ",ddd" groups are accepted so
                # "1,234,567" is not cut down to "1,234".
                comma_prices = re.findall(r"(\d{1,3}(?:,\d{3})+)", text_content)
                price = "N/A"
                if comma_prices:
                    price = comma_prices[-1].replace(',', '')

                rating = "N/A"
                reviews = "0"
                rt_match = re.search(r"(\d\.\d+)\s\((\d+)\)", text_content)
                if rt_match:
                    rating = rt_match.group(1)
                    reviews = rt_match.group(2)
                elif "New" in text_content:
                    # NOTE(review): plain substring test — any card text containing
                    # "New" is labelled this way; confirm this is acceptable.
                    rating = "New"

                # Cards without a recognisable price are not recorded.
                if price != "N/A":
                    paris_data.append({
                        "City": city,
                        "Price_EGP": price,
                        "Rating": rating,
                        "Reviews": reviews,
                        "Link": full_link
                    })
                    city_listings_collected += 1
                    new_items += 1
                    print(f"    {city_listings_collected}/100 | {price} EGP | {rating}*")

            if city_listings_collected < 100:
                # Stop when paging no longer produces records instead of clicking forever.
                if new_items:
                    stalled_pages = 0
                else:
                    stalled_pages += 1
                    if stalled_pages >= max_stalled_pages:
                        print("  > No new listings on recent pages. Done.")
                        break

                try:
                    next_btn = page.locator('a[aria-label="Next"], button[aria-label="Next"]').first
                    if await next_btn.is_visible():
                        await next_btn.click()
                        print("  > Clicking Next...")
                        await page.wait_for_timeout(4000)
                    else:
                        print("  > No Next button. Done.")
                        break
                except Exception as exc:
                    print(f"  > Error clicking next: {exc}")
                    break

        await browser.close()
        return pd.DataFrame(paris_data)

df_paris = await scrape_paris_only()
print(f"\nParis Scrape Complete. Found {len(df_paris)} records.")

df_final = pd.concat([df_clean, df_paris], ignore_index=True)

print(f"\n--- Final Dataset ---")
print(f"Total Rows: {len(df_final)}")
print(df_final['City'].value_counts())
df_final.to_csv("airbnb_complete_fixed.csv", index=False)

Original Count: 1318
Count after dropping Paris: 1300

--- Starting Rescue Mission: Paris ---
  > Found 18 listings on screen.
    1/100 | 16045 EGP | New*
    2/100 | 16301 EGP | 4.82*
    3/100 | 20334 EGP | New*
    4/100 | 29213 EGP | 5.0*
    5/100 | 10036 EGP | 4.92*
    6/100 | 18602 EGP | 4.8*
    7/100 | 12018 EGP | New*
    8/100 | 22949 EGP | 4.93*
    9/100 | 15981 EGP | 4.84*
    10/100 | 11826 EGP | 4.87*
    11/100 | 10755 EGP | 5.0*
    12/100 | 12785 EGP | 4.97*
    13/100 | 13105 EGP | 4.88*
    14/100 | 17899 EGP | 4.96*
    15/100 | 23971 EGP | 4.95*
    16/100 | 42189 EGP | 5.0*
    17/100 | 9589 EGP | 5.0*
    18/100 | 9013 EGP | 4.97*
  > Clicking Next...
  > Found 18 listings on screen.
    19/100 | 13296 EGP | 4.97*
    20/100 | 12727 EGP | 4.88*
    21/100 | 9589 EGP | 4.84*
    22/100 | 11506 EGP | 4.79*
    23/100 | 12465 EGP | 4.88*
    24/100 | 9589 EGP | 4.87*
    25/100 | 12146 EGP | 4.82*
    26/100 | 8982 EGP | 4.97*
    27/100 | 76068 EGP | 5.0*
    2

In [41]:
# NOTE(review): `df_final_final` is not assigned in any cell visible in this
# notebook; presumably it was built in a cell that was since removed — confirm
# where it comes from, otherwise this line cannot run on a fresh kernel.
df_final_final.to_csv("airbnb_complete_fixed2.csv", index=False)

In [5]:
# Start again from the dataset saved to disk.
df = pd.read_csv("airbnb_complete_fixed2.csv")

print(f"Original Count: {len(df)}")
# Keep every row whose city is not Cape Town; work on an independent copy.
df_clean = df.loc[df['City'].ne('Cape Town')].copy()
print(f"Count after dropping Cape Town: {len(df_clean)}")

async def scrape_cairo_only():
    """Scrape up to 100 priced Airbnb search results for Cairo.

    Launches a non-headless Chromium window, opens the Cairo search page with
    prices requested in EGP and follows the "Next" control until 100 listings
    are recorded or the results run out.  Cards without a room link or without
    a recognisable price are skipped.

    Returns:
        pandas.DataFrame with one row per recorded listing and the string
        columns ``City``, ``Price_EGP``, ``Rating``, ``Reviews`` and ``Link``.
    """
    city = "Cairo"
    cairo_data = []
    max_grid_retries = 3    # reloads allowed in a row while the results grid is missing
    max_stalled_pages = 3   # result pages in a row allowed to add no record

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        page = await context.new_page()

        url = f"https://www.airbnb.com/s/{city}/homes?currency=EGP&locale=en"
        await page.goto(url, timeout=60000)
        await page.wait_for_timeout(3000)

        city_listings_collected = 0
        seen_links = set()
        grid_retries = 0
        stalled_pages = 0

        while city_listings_collected < 100:

            try:
                await page.wait_for_selector('div[itemprop="itemListElement"]', timeout=15000)
            except Exception:
                # Bounded retry: without the cap a page that never renders the
                # grid would keep this loop reloading forever.
                grid_retries += 1
                if grid_retries > max_grid_retries:
                    print("  > Grid still not found. Giving up.")
                    break
                print("  > Grid not found. Retrying page load...")
                await page.reload()
                await page.wait_for_timeout(5000)
                continue
            grid_retries = 0

            listings = await page.locator('div[itemprop="itemListElement"]').all()
            print(f"  > Found {len(listings)} listings on screen.")

            new_items = 0
            for listing in listings:
                if city_listings_collected >= 100:
                    break

                try:
                    link_el = listing.locator('a[href^="/rooms/"]').first
                    rel_link = await link_el.get_attribute("href")
                    clean_id = rel_link.split('?')[0]
                    full_link = f"https://www.airbnb.com{clean_id}"
                except Exception:
                    # No usable room link on this card: skip it.
                    continue

                if clean_id in seen_links:
                    continue
                seen_links.add(clean_id)

                text_content = await listing.inner_text()

                # Thousands-separated numbers; the last one in the card is taken as
                # the price.  One or more ",ddd" groups are accepted so
                # "1,234,567" is not cut down to "1,234".
                comma_prices = re.findall(r"(\d{1,3}(?:,\d{3})+)", text_content)
                price = "N/A"
                if comma_prices:
                    price = comma_prices[-1].replace(',', '')

                rating = "N/A"
                reviews = "0"
                rt_match = re.search(r"(\d\.\d+)\s\((\d+)\)", text_content)
                if rt_match:
                    rating = rt_match.group(1)
                    reviews = rt_match.group(2)
                elif "New" in text_content:
                    # NOTE(review): plain substring test — any card text containing
                    # "New" is labelled this way; confirm this is acceptable.
                    rating = "New"

                # Cards without a recognisable price are not recorded.
                if price != "N/A":
                    cairo_data.append({
                        "City": city,
                        "Price_EGP": price,
                        "Rating": rating,
                        "Reviews": reviews,
                        "Link": full_link
                    })
                    city_listings_collected += 1
                    new_items += 1
                    print(f"    {city_listings_collected}/100 | {price} EGP | {rating}*")

            if city_listings_collected < 100:
                # Stop when paging no longer produces records instead of clicking forever.
                if new_items:
                    stalled_pages = 0
                else:
                    stalled_pages += 1
                    if stalled_pages >= max_stalled_pages:
                        print("  > No new listings on recent pages. Done.")
                        break

                try:
                    next_btn = page.locator('a[aria-label="Next"], button[aria-label="Next"]').first
                    if await next_btn.is_visible():
                        await next_btn.click()
                        print("  > Clicking Next...")
                        await page.wait_for_timeout(4000)
                    else:
                        print("  > No Next button. Done.")
                        break
                except Exception as exc:
                    print(f"  > Error clicking next: {exc}")
                    break

        await browser.close()
        return pd.DataFrame(cairo_data)

df_cairo = await scrape_cairo_only()
print(f"\n Cairo Scrape Complete. Found {len(df_cairo)} records.")

df_final = pd.concat([df_clean, df_cairo], ignore_index=True)

print(f"\n--- Final Dataset ---")
print(f"Total Rows: {len(df_final)}")
print(df_final['City'].value_counts())
df_final.to_csv("airbnb_scraped_dataset.csv", index=False)

Original Count: 1400
Count after dropping Cape Town: 1300
  > Found 18 listings on screen.
    1/100 | 10695 EGP | New*
    2/100 | 5134 EGP | 4.7*
    3/100 | 6618 EGP | 4.97*
    4/100 | 12359 EGP | 5.0*
    5/100 | 7920 EGP | 5.0*
    6/100 | 2869 EGP | 4.65*
    7/100 | 3636 EGP | 4.64*
    8/100 | 6005 EGP | 5.0*
    9/100 | 9493 EGP | 4.8*
    10/100 | 5425 EGP | 4.95*
    11/100 | 14483 EGP | 5.0*
    12/100 | 5967 EGP | 5.0*
    13/100 | 3526 EGP | 5.0*
    14/100 | 4882 EGP | 4.93*
    15/100 | 8951 EGP | 4.87*
    16/100 | 5425 EGP | 4.88*
    17/100 | 4611 EGP | 4.97*
    18/100 | 5425 EGP | 4.99*
  > Clicking Next...
  > Found 18 listings on screen.
    19/100 | 7947 EGP | 5.0*
    20/100 | 7567 EGP | 5.0*
    21/100 | 15460 EGP | 4.94*
    22/100 | 8319 EGP | New*
    23/100 | 7898 EGP | 5.0*
    24/100 | 13904 EGP | 4.9*
    25/100 | 13019 EGP | 5.0*
    26/100 | 3743 EGP | 4.91*
    27/100 | 8342 EGP | 4.93*
    28/100 | 11663 EGP | 5.0*
    29/100 | 5425 EGP | 4.97*
   