**Scraping Nigeria Rent and Sale Property from PropertyPro**

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

# Function to extract property details
def extract_property_details(soup):
    """Build one record per listing card found in *soup*.

    Every ``div`` with class ``property-listing-grid`` yields a dict with the
    keys ``Title``, ``Price``, ``PID`` and ``Details``.  A field whose lookup
    raises ``AttributeError`` (for example because an expected tag is absent)
    is stored as ``'N/A'`` so one incomplete card does not abort the page.

    Returns:
        A list of dicts, in the order the cards appear in *soup*.
    """

    def _read(getter):
        # Run one lookup chain; an AttributeError anywhere in it means "not available"
        try:
            return getter()
        except AttributeError:
            return 'N/A'

    records = []
    for card in soup.find_all('div', class_='property-listing-grid'):
        # Price text lives in the <h3> inside the pl-price container
        asking_price = _read(
            lambda: card.find('div', class_='pl-price').find('h3').text.strip()
        )
        headline = _read(lambda: card.find('div', class_='pl-title').text.strip())
        # Property ID is read from the first <p>, minus its "PID :" label
        listing_id = _read(lambda: card.find('p').text.strip().replace("PID :", ""))
        # Beds/baths summary is read from the first <h6>
        summary = _read(lambda: card.find('h6').text.strip())

        records.append({
            'Title': headline,
            'Price': asking_price,
            'PID': listing_id,
            'Details': summary,
        })

    return records

# URL template (with page number placeholder)
url_template = 'https://www.propertypro.ng/property-for-sale?page={}'

# Number of pages to crawl (adjust based on the total pages)
num_pages = 719

# File to save progress
output_file = 'propertypro_sale_listings.csv'
last_page_file = 'last_page.txt'

# Rows scraped since the last flush to disk
all_properties = []


def _save_checkpoint(rows, page_num):
    """Append *rows* to the CSV, then record *page_num* as the last finished page.

    The rows are written before the page marker, and the buffer is emptied as
    soon as they are on disk.  The marker therefore never points past data that
    was not saved, which is what made interrupted runs lose rows before.
    """
    if rows:
        pd.DataFrame(rows).to_csv(
            output_file,
            mode='a',
            # Only a file that does not exist yet gets the column header
            header=not os.path.exists(output_file),
            index=False,
        )
        rows.clear()
    with open(last_page_file, 'w') as f:
        f.write(str(page_num))


def _fetch_page(page_num, attempts=3):
    """Return the listings of one page, or ``None`` if every attempt failed.

    Network errors and non-200 responses are both retried, with a 5 second
    pause between attempts.
    """
    url = url_template.format(page_num)
    for attempt in range(1, attempts + 1):
        try:
            # Timeout keeps a stalled connection from hanging the whole crawl
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException as e:
            print(f"Error on page {page_num}: {e}")
        else:
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                return extract_property_details(soup)
            print(f"Failed to fetch page {page_num}, status code: {response.status_code}")

        if attempt < attempts:
            time.sleep(5)
    return None


# Check if there's already a last saved page to resume from
if os.path.exists(last_page_file):
    with open(last_page_file, 'r') as f:
        last_page = int(f.read().strip()) + 1
else:
    last_page = 1

# Highest page fully handled in this run (nothing yet)
last_completed = last_page - 1

# Crawl through multiple pages, starting from the last saved page
try:
    for page_num in range(last_page, num_pages + 1):
        print(f"Scraping page {page_num}...")

        properties = _fetch_page(page_num)
        if properties is not None:
            all_properties.extend(properties)
        # A page that still fails after its retries is skipped, as before
        last_completed = page_num

        # Save progress every 20 pages
        if page_num % 20 == 0:
            _save_checkpoint(all_properties, page_num)
            print(f"Saved progress at page {page_num}.")

        # Delay to avoid overloading the server
        time.sleep(2)
finally:
    # Runs on normal completion and on interruption alike, so buffered rows
    # reach the CSV before the page marker moves forward.
    if last_completed >= last_page:
        had_rows = bool(all_properties)
        _save_checkpoint(all_properties, last_completed)
        if had_rows:
            print(f"Final data saved to '{output_file}'.")

print("Scraping complete.")

In [None]:
   # Function to extract property details for rent listings
def extract_rental_details(soup):
    """Build one record per rental listing card found in *soup*.

    Every ``div`` with class ``property-listing-grid`` yields a dict with the
    keys ``Title``, ``Price``, ``PID`` and ``Details``.  A field whose lookup
    raises ``AttributeError`` is stored as ``'N/A'``.

    Returns:
        A list of dicts, in the order the cards appear in *soup*.
    """

    def _read(getter):
        # Run one lookup chain; an AttributeError anywhere in it means "not available"
        try:
            return getter()
        except AttributeError:
            return 'N/A'

    rentals = []
    for card in soup.find_all('div', class_='property-listing-grid'):
        # Price text lives in the <h3> inside the pl-price container
        rent = _read(
            lambda: card.find('div', class_='pl-price').find('h3').text.strip()
        )
        headline = _read(lambda: card.find('div', class_='pl-title').text.strip())
        # Property ID is read from the first <p>, minus its "PID :" label
        listing_id = _read(lambda: card.find('p').text.strip().replace("PID :", ""))
        # Beds/baths summary is read from the first <h6>
        summary = _read(lambda: card.find('h6').text.strip())

        rentals.append({
            'Title': headline,
            'Price': rent,
            'PID': listing_id,
            'Details': summary,
        })

    return rentals

# URL template for houses for rent (with page number placeholder)
url_template = 'https://www.propertypro.ng/property-for-rent?page={}'

# Number of pages to crawl (you can adjust this)
num_pages = 293

# List to store all scraped properties
all_rental_properties = []

# Crawl through multiple pages for rent listings
try:
    for page_num in range(1, num_pages + 1):
        print(f"Scraping page {page_num} for rent listings...")
        url = url_template.format(page_num)

        try:
            # Timeout keeps a stalled connection from hanging the whole crawl
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException as e:
            # A single network failure skips this page instead of ending the run
            print(f"Error on page {page_num}: {e}")
            time.sleep(5)
            continue

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            rental_properties = extract_rental_details(soup)
            all_rental_properties.extend(rental_properties)
            time.sleep(2)
        else:
            print(f"Failed to fetch page {page_num}, status code: {response.status_code}")
finally:
    # Runs on normal completion and on interruption alike, so whatever was
    # collected is written out rather than lost with the process.
    # Convert the data into a DataFrame
    df_rentals = pd.DataFrame(all_rental_properties)

    if all_rental_properties:
        # Save the data into a CSV file
        df_rentals.to_csv('propertypro_rent_listings.csv', index=False)
        print("Data saved to 'propertypro_rent_listings.csv'")
    else:
        # Skip the write so an empty run cannot overwrite an earlier result
        print("No rental listings were scraped; nothing saved.")

**Scraping Rent and Sale Properties for Kenya**

In [21]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import json
import os

# Function to extract property details
def extract_property_details(soup):
    """Build one record per listing panel found in *soup*.

    Every ``div`` with class ``sc_panelWrapper`` yields a dict with the keys
    ``Title``, ``Price`` and ``Details``.  A field whose lookup raises
    ``AttributeError`` is stored as ``'N/A'``.

    Returns:
        A list of dicts, in the order the panels appear in *soup*.
    """

    def _read(getter):
        # Run one lookup chain; an AttributeError anywhere in it means "not available"
        try:
            return getter()
        except AttributeError:
            return 'N/A'

    records = []
    for panel in soup.find_all('div', class_='sc_panelWrapper'):
        asking_price = _read(lambda: panel.find('div', class_='p24_price').text.strip())
        headline = _read(lambda: panel.find('div', class_='p24_regularTile').text.strip())
        # Property details (e.g., bedrooms, bathrooms)
        summary = _read(
            lambda: panel.find('span', class_='js_listingTileImageHolder').text.strip()
        )

        records.append({
            'Title': headline,
            'Price': asking_price,
            'Details': summary,
        })

    return records

# Listing URL; the three slots are province name, province ID and page number
base_url = 'https://www.property24.co.ke/property-for-sale-in-{}-p{}?Page={}'

# Province name -> numeric ID inserted after "-p" in the listing URL.
# NOTE(review): 'taita–taveta' is spelled with an en dash while the other
# compound names use '-'; confirm that is intended before relying on its URL.
provinces = {
    'mombasa': 93,
    'kwale': 85,
    'kilifi': 80,
    'tana river': 105,
    'lamu': 87,
    'taita–taveta': 104,
    'garissa': 73,
    'wajir': 111,
    'mandera': 89,
    'marsabit': 90,
    'isiolo': 75,
    'meru': 91,
    'tharaka-nithi': 106,
    'embu': 72,
    'kitui': 84,
    'machakos': 66,
    'makueni': 88,
    'nyandarua': 100,
    'nyeri': 101,
    'kirinyaga': 81,
    'muranga': 94,
    'kiambu': 79,
    'turkana': 108,
    'west pokot': 112,
    'samburu': 102,
    'trans-nzoia': 107,
    'uasin gishu': 109,
    'elgeyo-marakwet': 71,
    'nandi': 97,
    'baringo': 67,
    'laikipia': 86,
    'nakuru': 96,
    'narok': 98,
    'kajiado': 76,
    'kericho': 78,
    'bomet': 68,
    'kakamega': 77,
    'vihiga': 110,
    'bungoma': 69,
    'busia': 70,
    'siaya': 103,
    'kisumu': 83,
    'homa bay': 74,
    'migori': 92,
    'kisii': 82,
    'nyamira': 99,
    'nairobi': 95
    # Add more provinces and their IDs here...
}

# JSON file holding the per-province page marker between runs
progress_file = 'scraping_progress.json'


def _load_progress(path, names):
    """Return the saved progress mapping, or page 1 for every name if no file exists."""
    if not os.path.exists(path):
        return dict.fromkeys(names, 1)
    with open(path, 'r') as file:
        return json.load(file)


progress = _load_progress(progress_file, provinces)

# Accumulates every row scraped during this run
all_properties = []

# Function to scrape a specific province
def scrape_province(province, province_id):
    """Scrape the listing pages of one province, resuming from saved progress.

    Args:
        province: Province name; used in the URL, in log messages and as the
            key into ``progress``.
        province_id: Numeric ID placed after ``-p`` in the URL.

    Side effects:
        * Rows of each page are appended to ``property24_kenya_listings.csv``
          and to the module-level ``all_properties`` list.
        * ``progress[province]`` is set to the next page to fetch and the whole
          ``progress`` mapping is rewritten to ``progress_file``.

    The loop stops on a network error, a non-200 response, a page without
    listings, or a page without a ``li.pagelink`` element.
    """
    output_csv = 'property24_kenya_listings.csv'
    page_num = progress.get(province, 1)  # Page to fetch first

    while True:
        print(f"Scraping {province}, Page {page_num}...")

        # Construct the URL with the province and page number
        url = base_url.format(province, province_id, page_num)

        # Request the page; the timeout keeps a stalled connection from hanging the run
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException as e:
            # Progress is already on disk, so a later run picks up at this page
            print(f"Error fetching {url}: {e}")
            break

        # Check if the request was successful
        if response.status_code != 200:
            print(f"Failed to fetch {url}, status code: {response.status_code}")
            break

        # Parse the content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract property details
        properties = extract_property_details(soup)

        if not properties:
            print(f"No more listings found for {province}. Stopping at page {page_num}.")
            break

        # Add properties to the global list
        all_properties.extend(properties)

        # Append only this page's rows.  Rewriting the file from all_properties
        # would drop rows from earlier runs, because that list starts empty on
        # every restart, and would rewrite the whole file on every page.
        pd.DataFrame(properties).to_csv(
            output_csv,
            mode='a',
            header=not os.path.exists(output_csv),  # header only for a new file
            index=False,
        )

        # Record the NEXT page only after the rows are on disk, so a resumed
        # run neither skips this page nor scrapes it a second time.
        progress[province] = page_num + 1
        with open(progress_file, 'w') as file:
            json.dump(progress, file)

        # Check if a 'next' page link exists
        next_button = soup.find('li', class_='pagelink')
        if not next_button:
            print(f"Finished scraping {province} after {page_num} pages.")
            break

        # Delay between requests to avoid overloading the server
        time.sleep(2)

        # Increment page number
        page_num += 1

# Loop through all provinces
for province, province_id in provinces.items():
    scrape_province(province, province_id)

print("Scraping complete. Data saved to 'property24_kenya_listings.csv'.")

Scraping mombasa, Page 1...
Scraping mombasa, Page 2...
Scraping mombasa, Page 3...
Scraping mombasa, Page 4...
Scraping mombasa, Page 5...
Scraping mombasa, Page 6...
Scraping mombasa, Page 7...
Scraping mombasa, Page 8...
Scraping mombasa, Page 9...
Scraping mombasa, Page 10...
Scraping mombasa, Page 11...
Scraping mombasa, Page 12...
Scraping mombasa, Page 13...
Scraping mombasa, Page 14...
Scraping mombasa, Page 15...
Scraping mombasa, Page 16...
Scraping mombasa, Page 17...
Scraping mombasa, Page 18...
Scraping mombasa, Page 19...
Scraping mombasa, Page 20...
Scraping mombasa, Page 21...
Scraping mombasa, Page 22...
Scraping mombasa, Page 23...
Scraping mombasa, Page 24...
Scraping mombasa, Page 25...
Scraping mombasa, Page 26...
Scraping mombasa, Page 27...
Scraping mombasa, Page 28...
Scraping mombasa, Page 29...
Scraping mombasa, Page 30...
Scraping mombasa, Page 31...
Scraping mombasa, Page 32...
Scraping mombasa, Page 33...
Scraping mombasa, Page 34...
Scraping mombasa, Page 

KeyboardInterrupt: 

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor

# Function to extract property details
def extract_property_details(soup):
    """Extract title, price and details text from Property24 listing panels.

    The pages parsed in this cell are fetched from property24.co.ke, so the
    markup is addressed with that site's class names (``sc_panelWrapper``,
    ``p24_price``, ``p24_regularTile``, ``js_listingTileImageHolder``), the
    same ones the sequential Kenya scraper in this notebook uses.  The
    PropertyPro class names used here before belong to a different site.

    Args:
        soup: Parsed page exposing ``find_all``; each listing exposes ``find``.

    Returns:
        A list of dicts with the keys ``Title``, ``Price`` and ``Details``.
        A field whose lookup raises ``AttributeError`` is stored as ``'N/A'``.
    """

    def _read(getter):
        # Run one lookup chain; an AttributeError anywhere in it means "not available"
        try:
            return getter()
        except AttributeError:
            return 'N/A'

    properties = []

    # Find all property listings
    for listing in soup.find_all('div', class_='sc_panelWrapper'):
        price = _read(lambda: listing.find('div', class_='p24_price').text.strip())
        title = _read(lambda: listing.find('div', class_='p24_regularTile').text.strip())
        details = _read(
            lambda: listing.find('span', class_='js_listingTileImageHolder').text.strip()
        )

        properties.append({
            'Title': title,
            'Price': price,
            'Details': details,
        })

    return properties

# Function to scrape a single page and handle failures gracefully
def scrape_page(province_name, province_code, page_num, all_properties):
    """Fetch one listing page and add its rows to *all_properties*.

    Args:
        province_name: Name inserted into the URL and the log messages.
        province_code: Numeric ID placed after ``-p`` in the URL.
        page_num: Page number requested via the ``Page`` query parameter.
        all_properties: List that receives the extracted rows (mutated in place).

    A non-200 response or a ``requests.RequestException`` is printed and the
    list is left unchanged; nothing is returned either way.
    """
    url = f'https://www.property24.co.ke/property-for-sale-in-{province_name}-p{province_code}?Page={page_num}'
    print(f"Scraping {province_name}, Page {page_num}...")

    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch {url}, status code: {response.status_code}")
            return

        page_soup = BeautifulSoup(response.content, 'html.parser')
        # Add this page's rows to the caller-supplied list
        all_properties.extend(extract_property_details(page_soup))
    except requests.RequestException as err:
        print(f"Error fetching {url}: {err}")

# Function to scrape all pages for a province, with an option to start from a specific page
def scrape_province_parallel(province_name, province_code, max_pages, start_page=1, num_threads=5):
    """Scrape pages ``start_page``..``max_pages`` of one province with a thread pool.

    Args:
        province_name: Name inserted into the URLs and the output file name.
        province_code: Numeric ID placed after ``-p`` in the URLs.
        max_pages: Last page number to request (inclusive).
        start_page: First page number to request.
        num_threads: Maximum number of worker threads.

    Writes ``property24_<province_name>_listings_resume.csv`` with the rows in
    page order.  Nothing is returned.
    """
    page_numbers = range(start_page, max_pages + 1)

    # One private bucket per page: workers never append to a shared list, and
    # the final row order follows page numbers instead of thread timing.
    rows_by_page = {page_num: [] for page_num in page_numbers}

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = {
            page_num: executor.submit(
                scrape_page, province_name, province_code, page_num, rows_by_page[page_num]
            )
            for page_num in page_numbers
        }

    # Leaving the ``with`` block waits for every task.  An exception raised in
    # a worker is stored on its future and stays invisible unless it is asked for.
    for page_num, future in futures.items():
        error = future.exception()
        if error is not None:
            print(f"Unexpected error while scraping {province_name}, Page {page_num}: {error!r}")

    all_properties = [row for page_num in page_numbers for row in rows_by_page[page_num]]

    # Save to CSV after scraping
    df = pd.DataFrame(all_properties)
    df.to_csv(f'property24_{province_name}_listings_resume.csv', index=False)
    print(f"Data saved to 'property24_{province_name}_listings_resume.csv'")

# Province dictionary: Province names and codes
provinces = {'nairobi': 95}  # Only focusing on Nairobi in this case

# Upper page bound for this run and the page to pick up from
max_pages_to_scrape = 1000  # Limit to 1000 pages for Nairobi
last_scraped_page = 600  # Start from page 600 (based on where you last stopped)

# Resume scraping for Nairobi; the last page requested is capped at 9333
final_page = min(9333, max_pages_to_scrape)
scrape_province_parallel(
    'nairobi',
    95,
    final_page,
    start_page=last_scraped_page,
    num_threads=10,
)