In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

# Function to extract property details
def extract_property_details(soup):
    """Collect one record per listing card found in a parsed results page.

    Args:
        soup: Parsed page object exposing ``find_all``; each returned card
            must expose ``find`` (BeautifulSoup objects satisfy this).

    Returns:
        list[dict]: One dict per ``div.property-listing-grid`` element with
        the keys ``'Title'``, ``'Price'``, ``'PID'`` and ``'Details'``. A
        field whose lookup raises ``AttributeError`` (typically because an
        expected element is absent and ``find`` returned ``None``) is set
        to ``'N/A'``.
    """

    def _value_or_na(lookup):
        # Run a zero-argument lookup; a missing element surfaces as an
        # AttributeError on None, which is mapped to the 'N/A' placeholder.
        try:
            return lookup()
        except AttributeError:
            return 'N/A'

    records = []

    for card in soup.find_all('div', class_='property-listing-grid'):
        records.append({
            # Text of the div.pl-title element.
            'Title': _value_or_na(
                lambda: card.find('div', class_='pl-title').text.strip()
            ),
            # Text of the <h3> nested inside div.pl-price.
            'Price': _value_or_na(
                lambda: card.find('div', class_='pl-price').find('h3').text.strip()
            ),
            # First <p> of the card with the "PID :" label removed; whatever
            # followed the label (including whitespace) is kept unchanged.
            'PID': _value_or_na(
                lambda: card.find('p').text.strip().replace("PID :", "")
            ),
            # First <h6> of the card (the original notes describe this as
            # the beds/baths summary).
            'Details': _value_or_na(
                lambda: card.find('h6').text.strip()
            ),
        })

    return records

# URL template (with page number placeholder)
url_template = 'https://www.propertypro.ng/property-for-sale?page={}'

# Number of pages to crawl (adjust based on the total pages)
num_pages = 719

# File to save progress
output_file = 'propertypro_sale_listings.csv'
last_page_file = 'last_page.txt'

# How many pages to buffer in memory between CSV flushes, and how many times
# a page is attempted before it is given up on.
save_every = 20
max_attempts = 3

# List to store scraped properties that have not been written to disk yet
all_properties = []

# Pages that could not be scraped in this run (reported at the end)
failed_pages = []


def _flush_sale_rows(rows, page_num, csv_path, checkpoint_path):
    """Append ``rows`` to the CSV, then record ``page_num`` as the checkpoint.

    The checkpoint is written only after the rows are on disk, so the page
    number in ``checkpoint_path`` never runs ahead of the data in
    ``csv_path``. The header row is emitted only when the CSV does not exist
    yet or is empty.
    """
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    pd.DataFrame(rows).to_csv(csv_path, mode='a', header=needs_header, index=False)
    with open(checkpoint_path, 'w') as f:
        f.write(str(page_num))


# Check if there's already a last saved page to resume from
if os.path.exists(last_page_file):
    with open(last_page_file, 'r') as f:
        last_page = int(f.read().strip()) + 1
else:
    last_page = 1

# Crawl through multiple pages, starting from the page after the checkpoint
for page_num in range(last_page, num_pages + 1):
    print(f"Scraping page {page_num}...")
    url = url_template.format(page_num)

    for attempt in range(1, max_attempts + 1):
        try:
            # Send a GET request; the timeout keeps a stalled connection
            # from blocking the crawl indefinitely.
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException as e:
            print(f"Error on page {page_num} (attempt {attempt}/{max_attempts}): {e}")
            # Back off briefly, then try the same page again
            time.sleep(5)
            continue

        if response.status_code == 200:
            # Parse the HTML content and collect the listings on this page
            soup = BeautifulSoup(response.content, 'html.parser')
            all_properties.extend(extract_property_details(soup))

            # Delay to avoid overloading the server
            time.sleep(2)
        else:
            print(f"Failed to fetch page {page_num}, status code: {response.status_code}")
            failed_pages.append(page_num)
        break
    else:
        # Every attempt raised a request error; move on but remember the page
        failed_pages.append(page_num)

    # Save progress every `save_every` pages. The checkpoint is advanced
    # together with the CSV write so that resuming never skips pages whose
    # rows were still only in memory.
    if page_num % save_every == 0 and all_properties:
        _flush_sale_rows(all_properties, page_num, output_file, last_page_file)
        all_properties = []  # Clear list after saving
        print(f"Saved progress at page {page_num}.")

# Final save of any remaining data
if all_properties:
    _flush_sale_rows(all_properties, num_pages, output_file, last_page_file)
    all_properties = []
    print(f"Final data saved to '{output_file}'.")

if failed_pages:
    print(f"Pages that could not be scraped: {failed_pages}")

print("Scraping complete.")

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Saved progress at page 20.
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Saved progress at page 40.
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 4

In [None]:
# Function to extract property details for rent listings
def extract_rental_details(soup):
    """Build a list of rental listing records from a parsed results page.

    Args:
        soup: Parsed page object exposing ``find_all``; each listing it
            returns must expose ``find`` (BeautifulSoup objects do).

    Returns:
        list[dict]: One dict per ``div.property-listing-grid`` element, with
        keys ``'Title'``, ``'Price'``, ``'PID'`` and ``'Details'`` in that
        order. Any field whose lookup raises ``AttributeError`` is reported
        as ``'N/A'``.
    """
    # Output column -> how to read it from a single listing element.
    field_readers = (
        ('Title', lambda tile: tile.find('div', class_='pl-title').text.strip()),
        ('Price', lambda tile: tile.find('div', class_='pl-price').find('h3').text.strip()),
        # Only the "PID :" label is removed; the remainder is left as found.
        ('PID', lambda tile: tile.find('p').text.strip().replace("PID :", "")),
        ('Details', lambda tile: tile.find('h6').text.strip()),
    )

    def _read(tile, reader):
        # A missing element makes find() return None, so the attribute
        # access raises AttributeError; that becomes the 'N/A' placeholder.
        try:
            return reader(tile)
        except AttributeError:
            return 'N/A'

    tiles = soup.find_all('div', class_='property-listing-grid')
    return [
        {column: _read(tile, reader) for column, reader in field_readers}
        for tile in tiles
    ]

# URL template for houses for rent (with page number placeholder)
url_template = 'https://www.propertypro.ng/property-for-rent?page={}'

# Number of pages to crawl (you can adjust this)
num_pages = 293

# List to store all scraped properties
all_rental_properties = []

# Crawl through multiple pages for rent listings
for page_num in range(1, num_pages + 1):
    print(f"Scraping page {page_num} for rent listings...")
    url = url_template.format(page_num)

    try:
        # The timeout keeps a stalled connection from hanging the crawl.
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException as e:
        # Nothing is written to disk until the loop finishes, so a network
        # error must not abort the run: log it, pause, and move on to the
        # next page so the rows collected so far still get saved.
        print(f"Error on page {page_num}: {e}")
        time.sleep(5)
        continue

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        rental_properties = extract_rental_details(soup)
        all_rental_properties.extend(rental_properties)
        # Delay to avoid overloading the server
        time.sleep(2)
    else:
        print(f"Failed to fetch page {page_num}, status code: {response.status_code}")

# Convert the data into a DataFrame
df_rentals = pd.DataFrame(all_rental_properties)

# Save the data into a CSV file
df_rentals.to_csv('propertypro_rent_listings.csv', index=False)
print("Data saved to 'propertypro_rent_listings.csv'")

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

# Function to extract property details
def extract_property_details(soup):
    """Extract title, price and URL for every listing tile on a results page.

    Args:
        soup: Parsed page object exposing ``find_all``; each tile it returns
            must expose ``find`` (BeautifulSoup objects satisfy this).

    Returns:
        list[dict] | None: ``None`` when the page contains no
        ``div.WarringPurloin.js_listingTile`` elements (callers use this as
        the end-of-results signal). Otherwise one dict per tile with the
        keys ``'Title'``, ``'Price'`` and ``'Property URL'``; any field that
        cannot be read is set to ``'N/A'``.
    """
    # Lookup failures show up in three ways: attribute access on None
    # (AttributeError), subscripting None when no <a> exists (TypeError),
    # and a missing HTML attribute on an existing <a> (KeyError).
    missing = (AttributeError, TypeError, KeyError)

    properties = []

    # Find all property listings
    listings = soup.find_all('div', class_='WarringPurloin js_listingTile')

    if not listings:
        return None  # Return None if there are no listings (end of pages)

    for listing in listings:
        try:
            # Extract price
            price = listing.find('span', class_='p24_price').text.strip()
        except missing:
            price = 'N/A'

        # The first <a> of the tile supplies both the title and the link.
        anchor = listing.find('a')

        try:
            # Extract title (property type)
            title = anchor['title']
        except missing:
            title = 'N/A'

        try:
            # Extract the property URL (href is appended to the site root)
            property_url = anchor['href']
            full_url = f"https://www.property24.co.ke{property_url}"
        except missing:
            full_url = 'N/A'

        properties.append({
            'Title': title,
            'Price': price,
            'Property URL': full_url
        })

    return properties

# URL template (with page number placeholder and province ID placeholder)
url_template = 'https://www.property24.co.ke/property-for-sale?provinceids={}&page={}'

# List of province IDs (you can expand this list with more province IDs).
# Every entry needs its own trailing comma: two adjacent string literals are
# silently concatenated by Python (e.g. '82' '99' becomes '8299').
province_ids = ['93', '85', '80', '105', '87', '104', '73', '111', '89', '90', '75', '91', '106', '72', '84', '66', '88', '100', '101', '81', '94',
                '79', '108', '112', '102', '107', '109', '71', '97', '67', '86', '96', '98', '76', '78', '68', '77', '110', '69', '70', '103', '83', '74', '92', '82',
                '99', '95']  # Add all provinces you want to scrape

# File to save progress
output_file = 'kenya_property_listings.csv'
last_page_file = 'kenya_last_page.txt'

# How many pages to buffer between CSV flushes, and how many times a page is
# attempted before the province is abandoned.
save_every = 5
max_attempts = 3

# List to store scraped properties that have not been written to disk yet
all_properties = []


def _flush_kenya_rows(rows, province_id, page_num, csv_path, checkpoint_path):
    """Append ``rows`` to the CSV, then checkpoint ``province_id,page_num``.

    The checkpoint is written only after the rows are on disk, so it never
    points past data that exists only in memory. The header row is emitted
    only when the CSV does not exist yet or is empty.
    """
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    pd.DataFrame(rows).to_csv(csv_path, mode='a', header=needs_header, index=False)
    with open(checkpoint_path, 'w') as f:
        f.write(f"{province_id},{page_num}")


# Check if there's already a checkpoint to resume from. The checkpoint holds
# "<province_id>,<page>" for the last page whose rows were saved; the resume
# page applies to that province only, later provinces start at page 1.
resume_province = None
last_page = 1
if os.path.exists(last_page_file):
    with open(last_page_file, 'r') as f:
        checkpoint = f.read().strip()
    saved_province, separator, saved_page = checkpoint.partition(',')
    if separator and saved_province in province_ids and saved_page.isdigit():
        resume_province = saved_province
        last_page = int(saved_page) + 1
    else:
        # A bare page number (older checkpoint format) does not say which
        # province it belongs to, so it cannot be applied safely.
        print(f"Ignoring unrecognised checkpoint {checkpoint!r}; starting from the first province.")

start_index = province_ids.index(resume_province) if resume_province is not None else 0

# Loop through each province (from the checkpointed one onward) and scrape it
for province_id in province_ids[start_index:]:
    print(f"Scraping province ID: {province_id}...")

    page_num = last_page if province_id == resume_province else 1
    while True:
        print(f"Scraping page {page_num} for province {province_id}...")

        # Generate the URL for the current page and province
        url = url_template.format(province_id, page_num)

        # Fetch the page, retrying a bounded number of times on network errors
        response = None
        for attempt in range(1, max_attempts + 1):
            try:
                response = requests.get(url, timeout=10)  # timeout guards against stalled connections
                break
            except requests.exceptions.RequestException as e:
                print(f"Error on page {page_num} for province {province_id}: {e}")
                time.sleep(5)

        if response is None:
            print(f"Giving up on page {page_num} for province {province_id} after {max_attempts} attempts.")
            break

        if response.status_code != 200:
            print(f"Failed to fetch page {page_num}, status code: {response.status_code}")
            break  # Stop if the page fails to load

        # Parse the HTML content and extract property details from the page
        soup = BeautifulSoup(response.content, 'html.parser')
        properties = extract_property_details(soup)

        # None (or an empty result) marks the end of this province's pages
        if not properties:
            print(f"No more listings found at page {page_num}. Stopping province {province_id}.")
            break

        # Add the scraped data to the list
        all_properties.extend(properties)

        # Save progress every `save_every` pages
        if page_num % save_every == 0:
            _flush_kenya_rows(all_properties, province_id, page_num, output_file, last_page_file)
            all_properties = []  # Clear list after saving
            print(f"Saved progress at page {page_num} for province {province_id}.")

        # Delay to avoid overloading the server
        time.sleep(2)

        # Move to the next page
        page_num += 1

    # Flush whatever this province still has buffered. page_num is the page
    # that ended the loop, so the last page actually scraped is page_num - 1.
    if all_properties:
        _flush_kenya_rows(all_properties, province_id, page_num - 1, output_file, last_page_file)
        all_properties = []
        print(f"Saved progress at page {page_num - 1} for province {province_id}.")

print("Scraping complete.")