In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re

# Define the base Zillow URL 
base_url = "https://www.zillow.com/tx/?searchQueryState=%7B%22isMapVisible%22%3Afalse%2C%22mapBounds%22%3A%7B%22west%22%3A-106.844420625%2C%22east%22%3A-93.309264375%2C%22south%22%3A24.969911847575055%2C%22north%22%3A37.26862098795179%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A54%2C%22regionType%22%3A2%7D%5D%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A6%7D"

# Define headers for the HTTP request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Referer": "https://www.zillow.com/"
}

# Function to introduce random delays
def random_delay(min_delay=0.2, max_delay=0.5):
    delay = random.uniform(min_delay, max_delay)
    print(f"Sleeping for {delay:.2f} seconds...")
    time.sleep(delay)

# list to store extracted data
data = []

# Loop to scrape multiple pages
page_number = 1  # Starting page number
while True:
    # Construct the URL for the current page
    url = base_url.format(page_number=page_number)
    
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        print(f"Successfully retrieved page {page_number}")
    else:
        print(f"Failed to retrieve page {page_number}. Status code: {response.status_code}")
        break 
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the property cards where class contains "StyledPropertyCardDataWrapper"
    property_cards = soup.find_all("div", class_=re.compile(".*StyledPropertyCardDataWrapper.*"))

    if not property_cards:  # If no property cards are found, break the loop
        print("No property cards found. Stopping scraping.")
        break

    # Extract details for each property card
    for idx, card in enumerate(property_cards, start=1):
        try:
            # Extract property link
            link = card.find("a", class_="StyledPropertyCardDataArea-c11n-8-107-0__sc-10i1r6-0")['href']
            full_link = f"https://www.zillow.com{link}" if link.startswith('/') else link
            
            # Extract address
            address = card.find("address", {"data-test": "property-card-addr"}).get_text(strip=True)
            
            # Extract price
            price = card.find("span", {"data-test": "property-card-price"}).get_text(strip=True)
            
            # Extract bedrooms, bathrooms, and size
            details = card.find("ul", class_="StyledPropertyCardHomeDetailsList-c11n-8-107-0__sc-1j0som5-0")
            bds, ba, sqft = [li.get_text(strip=True) for li in details.find_all("li")[:3]]
            
            # Append the extracted data
            data.append({
                "Address": address,
                "Price": price,
                "Bedrooms": bds,
                "Bathrooms": ba,
                "Size": sqft,
                "Link": full_link
            })
            print(f"Extracted property {idx + (page_number-1)*20}: {address}")
            
            # Introduce a delay between each property extraction
            random_delay()

        except Exception as e:
            print(f"Error extracting data for property {idx}: {e}")

    # Find the next page link by searching for the "Next" button in the <li> element
    next_page_li = soup.find("li", class_="PaginationJumpItem-c11n-8-107-0__sc-h97wcm-0 kdGqlo")
    next_page_a = next_page_li.find("a", rel="next") if next_page_li else None

    if next_page_a:
        # Extract the next page URL and update the page number
        next_page_url = next_page_a['href']
        page_number = int(re.search(r'(\d+)_p', next_page_url).group(1))
        print(f"Moving to next page: {page_number}")
    else:
        # If no next page link is found, break the loop
        print("No next page found. Stopping scraping.")
        break

# Convert to a DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv("zillow_properties_with_delays.csv", index=False)
print("Data successfully saved to 'zillow_properties_with_delays.csv'")


Successfully retrieved page 1
Extracted property 1: 3407 Amber Forest Dr, Houston, TX 77068
Sleeping for 0.40 seconds...
Extracted property 2: 1920 CREEK HOLW, San Antonio, TX 78259
Sleeping for 0.32 seconds...
Extracted property 3: 10799 Aaron St, El Paso, TX 79924
Sleeping for 0.24 seconds...
Extracted property 4: 2203 BLUEBERRY HILL ST, San Antonio, TX 78232
Sleeping for 0.46 seconds...
Extracted property 5: 6217 Decatur Ct, Frisco, TX 75035
Sleeping for 0.46 seconds...
Extracted property 6: 8783 TIMBER POINT ST, San Antonio, TX 78250
Sleeping for 0.27 seconds...
Extracted property 7: 8328 Meadow Sweet Ln, Fort Worth, TX 76123
Sleeping for 0.36 seconds...
Extracted property 8: 6313 Truman Dr, Fort Worth, TX 76112
Sleeping for 0.35 seconds...
Extracted property 9: 8442 Horsepen Bend Dr, Conroe, TX 77385
Sleeping for 0.22 seconds...
No next page found. Stopping scraping.
Data successfully saved to 'zillow_properties_with_delays.csv'
