In [13]:
import asyncio
import time
import random
import urllib.request
import ssl
from dotenv import load_dotenv
import os

In [14]:
load_dotenv()

BRD_PROXY_USERNAME = os.getenv("BRD_PROXY_USERNAME")
BRD_SERVER = os.getenv("BRD_SERVER").replace("https://", "")
BRD_PROXY_PASSWORD = os.getenv("BRD_PROXY_PASSWORD")
proxy = f"http://{BRD_PROXY_USERNAME}:{BRD_PROXY_PASSWORD}@{BRD_SERVER}"

MIN_DELAY_SECONDS = 5
MAX_DELAY_SECONDS = 15
NUM_URLS_TO_SCRAPE = 10

print(BRD_SERVER)
print(BRD_PROXY_USERNAME)
print(BRD_PROXY_PASSWORD)

http://brd.superproxy.io:33335
brd-customer-hl_51e27139-zone-residential_proxy1
dbgb6ojk1bp8


In [15]:
def generate_random_user_agent():
    """
    Generate a random plausible Chrome-based user agent string.
    """
    # OS options
    os_options = [
        "Windows NT 10.0; Win64; x64",
        "Windows NT 10.0; WOW64",
        "Windows NT 6.1; Win64; x64",
        "Macintosh; Intel Mac OS X 10_15_7",
        "Macintosh; Intel Mac OS X 11_2_3",
        "X11; Linux x86_64",
    ]
    os_str = random.choice(os_options)

    # Chrome version
    chrome_major = random.randint(90, 120)
    chrome_minor = random.randint(0, 0)
    chrome_build = random.randint(4400, 5800)
    chrome_patch = random.randint(50, 200)
    chrome_version = f"{chrome_major}.0.{chrome_build}.{chrome_patch}"

    # AppleWebKit version
    webkit_major = 537
    webkit_minor = random.randint(36, 50)
    webkit_version = f"{webkit_major}.{webkit_minor}"

    # Safari version
    safari_major = 537
    safari_minor = random.randint(36, 50)
    safari_version = f"{safari_major}.{safari_minor}"

    user_agent = (
        f"Mozilla/5.0 ({os_str}) "
        f"AppleWebKit/{webkit_version} (KHTML, like Gecko) "
        f"Chrome/{chrome_version} Safari/{safari_version}"
    )
    return user_agent


def generate_random_accept_language():
    # Common Accept-Language header values
    languages = [
        "en-US,en;q=0.9",
        "en-GB,en;q=0.8",
        "en-US,en;q=0.8,fr;q=0.6",
        "en-US,en;q=0.7,es;q=0.3",
        "en-US,en;q=0.9,fr-CA;q=0.7,fr;q=0.6",
        "en-US,en;q=0.9,es-ES;q=0.7,es;q=0.6",
    ]
    return random.choice(languages)


def generate_random_accept_encoding():
    # Common Accept-Encoding header values
    encodings = [
        "gzip, deflate, br",
        "gzip, deflate",
        "br, gzip, deflate",
        "gzip",
    ]
    return random.choice(encodings)


def generate_random_accept():
    # Common Accept header values
    accepts = [
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "text/html,application/xml;q=0.9,*/*;q=0.8",
    ]
    return random.choice(accepts)


def generate_random_referer(url):
    # If the URL looks like a property page, set referer to a plausible search page
    # Otherwise, use a generic referer
    if "streeteasy.com" in url:
        # Try to extract the neighborhood from the URL
        import re

        m = re.search(r"/for-rent/([^/]+)", url)
        if m:
            neighborhood = m.group(1)
            referer = f"https://streeteasy.com/for-rent/{neighborhood}?sort_by=se_score"
        else:
            referer = "https://streeteasy.com/for-rent/nyc"
    else:
        referer = "https://www.google.com/"
    return referer

In [16]:
def generate_random_headers(url=None):
    headers = {
        "User-Agent": generate_random_user_agent(),
        # "Accept-Language": generate_random_accept_language(),
        # "Accept-Encoding": generate_random_accept_encoding(),
        # "Accept": generate_random_accept(),
    }
    if url:
        headers["Referer"] = generate_random_referer(url)
    return headers

In [17]:
def visit_url_with_proxy(url, proxy, headers=None):
    """
    Visit a URL using the specified proxy and optional headers.
    Returns the response content as a string.
    """
    if headers is None:
        headers = generate_random_headers(url)
    req = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener(
        urllib.request.ProxyHandler({"https": proxy, "http": proxy}),
        # NOTE: Using _create_unverified_context bypasses SSL cert checks, be cautious.
        urllib.request.HTTPSHandler(context=ssl._create_unverified_context()),
    )
    try:
        with opener.open(req) as response:
            return response.read().decode()
    except Exception as e:
        print(f"Error: {e}")
        return None

In [18]:
price_range = [2000, 5000]
beds = [0, 1, 2, 3, 4]
neighborhoods = [
    "battery-park-city",
    "financial-district",
    "greenwich-village",
    "soho",
    "tribeca",
    "east-village",
    "west-village",
    "chelsea",
    "flatiron",
    "gramercy",
    "hudson-square",
    "noho",
    "nolita",
    "soho",
    "tribeca",
    "west-village",
]

In [19]:
def get_streeteasy_url(
    price_range: tuple[int, int], beds: list[int], neighborhoods: list[str]
):
    url = f"https://streeteasy.com/for-rent/{neighborhoods}/price:{price_range[0]}-{price_range[1]}|beds:{beds}?sort_by=se_score"
    return url

In [20]:
def get_random_streeteasy_url(
    price_range: tuple[int, int], beds: list[int], neighborhoods: list[str]
):
    neighborhood = random.choice(neighborhoods)
    min_price = random.randint(price_range[0], price_range[1])
    max_price = random.randint(min_price, price_range[1])

    # HACK: Round prices to the nearest $50 to look more human
    min_price = round(min_price / 50) * 50
    max_price = round(max_price / 50) * 50

    beds = random.choice(beds)
    url = f"https://streeteasy.com/for-rent/{neighborhood}/price:{min_price}-{max_price}|beds:{beds}?sort_by=se_score"
    print(url)
    return url

In [21]:
url = get_random_streeteasy_url(price_range, beds, neighborhoods)
result = visit_url_with_proxy(url, proxy)
print(result)

https://streeteasy.com/for-rent/battery-park-city/price:2750-3100|beds:0?sort_by=se_score
Error: HTTP Error 403: Forbidden
None


In [22]:
def scrape_single_url(price_range, beds, neighborhoods, proxy):
    # Introduce a random delay before making the request
    delay = random.uniform(MIN_DELAY_SECONDS, MAX_DELAY_SECONDS)
    print(f"Sleeping for {delay:.2f} seconds before scraping...")
    time.sleep(delay)
    url = get_random_streeteasy_url(price_range, beds, neighborhoods)
    result = visit_url_with_proxy(url, proxy)
    print(result)
    return result


def scrape_random_urls(n, price_range, beds, neighborhoods, proxy):
    return [
        scrape_single_url(price_range, beds, neighborhoods, proxy) for _ in range(n)
    ]

In [23]:
async def a_scrape_single_url(price_range, beds, neighborhoods, proxy):
    # Introduce a random delay before making the request
    delay = random.uniform(MIN_DELAY_SECONDS, MAX_DELAY_SECONDS)
    print(f"Sleeping for {delay:.2f} seconds before scraping...")
    await asyncio.sleep(delay)
    url = get_random_streeteasy_url(price_range, beds, neighborhoods)
    result = visit_url_with_proxy(url, proxy)
    print(result)
    return result


def a_scrape_random_urls_parallel(n, price_range, beds, neighborhoods, proxy):
    async def run_parallel():
        tasks = [
            a_scrape_single_url(price_range, beds, neighborhoods, proxy)
            for _ in range(n)
        ]
        results = await asyncio.gather(*tasks)
        return results

    return asyncio.run(run_parallel())

## Price of Request Scraping vs. Headless Browser Scraping
Prices are much lower for request scraping, but the quality of the data is lower since images are not scraped.
This means a much greater volume of text data can be requested holding price constant.

In [24]:
scrape_random_urls(NUM_URLS_TO_SCRAPE, price_range, beds, neighborhoods, proxy)

Sleeping for 10.87 seconds before scraping...
https://streeteasy.com/for-rent/financial-district/price:4000-4500|beds:4?sort_by=se_score
Error: HTTP Error 403: Forbidden
None
Sleeping for 8.29 seconds before scraping...
https://streeteasy.com/for-rent/hudson-square/price:4800-4850|beds:4?sort_by=se_score
Error: HTTP Error 403: Forbidden
None
Sleeping for 7.91 seconds before scraping...
https://streeteasy.com/for-rent/east-village/price:3900-4800|beds:2?sort_by=se_score
Error: HTTP Error 403: Forbidden
None
Sleeping for 5.91 seconds before scraping...
https://streeteasy.com/for-rent/noho/price:2850-3700|beds:0?sort_by=se_score
Error: HTTP Error 403: Forbidden
None
Sleeping for 6.46 seconds before scraping...


KeyboardInterrupt: 