In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
import os  # Needed to check if file exists

def _count_beds_baths(features_text):
    """Return ``(beds, baths)`` as ints taken from a feature summary string.

    The first run of digits is used as the bedroom count and the second as
    the bathroom count; a missing number becomes 0.
    NOTE(review): this relies on the site listing beds before baths -- confirm
    against the live markup.
    """
    numbers = re.findall(r'\d+', features_text)
    beds = int(numbers[0]) if len(numbers) >= 1 else 0
    baths = int(numbers[1]) if len(numbers) >= 2 else 0
    return beds, baths


def _parse_listing_card(card, base_url="https://propertypro.ng"):
    """Turn one ``property-listing`` element into a row dict, or ``None``.

    ``None`` is returned when any element the row depends on (title block,
    price block, or one of their h3/a/p/h6 children, or the link's ``href``)
    is missing, so the caller can skip incomplete cards.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from urllib.parse import urljoin

    title_div = card.find('div', class_='pl-title')
    price_div = card.find('div', class_='pl-price')
    if title_div is None or price_div is None:
        return None

    heading = title_div.find('h3')
    anchor = title_div.find('a')
    place = title_div.find('p')
    price_heading = price_div.find('h3')
    features = price_div.find('h6')
    if None in (heading, anchor, place, price_heading, features):
        return None

    href = anchor.get('href')
    if href is None:
        return None

    beds, baths = _count_beds_baths(features.text.strip())
    return {
        "Title": heading.text.strip(),
        "Location": place.text.strip(),
        "Price": price_heading.text.strip(),
        "Bedrooms": beds,
        "Bathrooms": baths,
        # urljoin copes with relative paths (with or without a leading
        # slash) as well as hrefs that are already absolute URLs.
        "URL": urljoin(base_url, href),
    }


def scrape_propertypro_production(target_count=10000,
                                  csv_filename="propertypro_lagos_10k.csv"):
    """Scrape Lagos for-sale listings from propertypro.ng into a CSV file.

    Pages are fetched sequentially with a random 2-5 second pause before each
    request. Every page's rows are appended to the CSV as soon as the page is
    parsed, so an interrupted run keeps what was already collected. Any
    existing file at ``csv_filename`` is deleted first (fresh start).

    Args:
        target_count: Approximate number of listings wanted. It is only used
            to estimate how many pages to request (assuming 20 per page), so
            the final row count can differ from it.
        csv_filename: Path of the CSV file to (re)create.

    Returns:
        The number of listings written to the CSV.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Referer": "https://www.google.com/"
    }

    # Assume ~20 listings per page and round UP, so a small or non-multiple
    # target still requests enough pages (floor division gave 0 pages for
    # targets below 20). Negative targets are clamped to zero pages.
    est_pages = max(0, -(-target_count // 20))
    print(f"üöÄ Starting Large Scale Scrape. Aiming for {target_count} listings (~{est_pages} pages).")

    total_scraped = 0

    # Start from a clean file; the per-page append below recreates it.
    if os.path.exists(csv_filename):
        os.remove(csv_filename)

    for page in range(1, est_pages + 1):
        url = f"https://www.propertypro.ng/property-for-sale/in/lagos?page={page}"

        try:
            # Random delay between requests (2 to 5 seconds).
            time.sleep(random.uniform(2, 5))

            response = requests.get(url, headers=headers, timeout=15)
            if response.status_code != 200:
                print(f"‚ö†Ô∏è Page {page} failed. Status: {response.status_code}")
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            listings = soup.find_all('div', class_='property-listing')

            if not listings:
                # An empty page is treated as the end of the result set.
                print(f"‚ö†Ô∏è Page {page} was empty. Stopping early.")
                break

            page_data = []
            for card in listings:
                try:
                    record = _parse_listing_card(card)
                except (AttributeError, KeyError, TypeError):
                    # Unexpected card markup: skip this card only, but let
                    # KeyboardInterrupt and genuine bugs propagate.
                    continue
                if record is not None:
                    page_data.append(record)

            # Append this page's rows immediately so progress survives a crash.
            if page_data:
                df = pd.DataFrame(page_data)
                # Write the header only when the file is being created.
                header = not os.path.exists(csv_filename)
                df.to_csv(csv_filename, mode='a', header=header, index=False)

                total_scraped += len(page_data)
                print(f"‚úÖ Page {page}/{est_pages} done. Total: {total_scraped} houses.")

        except Exception as e:
            # Best-effort per page: report, back off, and move on to the next.
            print(f"‚ùå Error on page {page}: {e}")
            time.sleep(10)  # Wait longer if there is an error

    print(f"üéâ DONE! Saved {total_scraped} listings to {csv_filename}")
    return total_scraped

if __name__ == "__main__":
    # Script entry point: a 5000-listing run. Raise the argument to 10000
    # once you are ready to let it run for ~1 hour.
    scrape_propertypro_production(5000)
    

üöÄ Starting Large Scale Scrape. Aiming for 5000 listings (~250 pages).
‚úÖ Page 1/250 done. Total: 22 houses.
‚úÖ Page 2/250 done. Total: 44 houses.
‚úÖ Page 3/250 done. Total: 66 houses.
‚úÖ Page 4/250 done. Total: 88 houses.
‚úÖ Page 5/250 done. Total: 110 houses.
‚úÖ Page 6/250 done. Total: 132 houses.
‚úÖ Page 7/250 done. Total: 154 houses.
‚úÖ Page 8/250 done. Total: 176 houses.
‚úÖ Page 9/250 done. Total: 198 houses.
‚úÖ Page 10/250 done. Total: 220 houses.
‚úÖ Page 11/250 done. Total: 242 houses.
‚úÖ Page 12/250 done. Total: 264 houses.
‚úÖ Page 13/250 done. Total: 286 houses.
‚úÖ Page 14/250 done. Total: 308 houses.
‚úÖ Page 15/250 done. Total: 330 houses.
‚úÖ Page 16/250 done. Total: 352 houses.
‚úÖ Page 17/250 done. Total: 374 houses.
‚úÖ Page 18/250 done. Total: 396 houses.
‚úÖ Page 19/250 done. Total: 418 houses.
‚úÖ Page 20/250 done. Total: 440 houses.
‚úÖ Page 21/250 done. Total: 462 houses.
‚úÖ Page 22/250 done. Total: 484 houses.
‚úÖ Page 23/250 done. Total: 506 house