In [10]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np

# Function to fetch a page and parse its content
def fetch_page(url):
    """Download ``url`` and return its HTML parsed with BeautifulSoup.

    Args:
        url: Address of the page to request.

    Returns:
        A ``BeautifulSoup`` document built with the ``html.parser`` backend,
        or ``None`` when the request fails (connection error, timeout after
        10 seconds, or a non-success HTTP status). Failures are reported on
        stdout rather than raised.
    """
    try:
        page = requests.get(url, timeout=10)
        # Turn 4xx/5xx responses into exceptions so they share the error path.
        page.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Error fetching {url}: {err}")
        return None
    else:
        return BeautifulSoup(page.text, 'html.parser')

# Function to extract property details from a page
def extract_property_info(soup):
    """Extract property fields from a parsed search-results page.

    Every listing card is read on its own so the five returned lists stay
    row-aligned: a card that lacks a field (for example no bedroom count)
    contributes ``None`` in that column instead of letting later values slide
    up into the wrong row.

    Args:
        soup: Parsed page (as returned by ``fetch_page``), or ``None``.

    Returns:
        A 5-tuple of lists ``(locations, areas, prices, bedrooms, bathrooms)``
        holding stripped text values. All five lists are empty when ``soup``
        is falsy or the listings ``<ul>`` is not found.
    """
    if not soup:
        return [], [], [], [], []

    # NOTE(review): "e20beb46" looks like a generated CSS class name; confirm
    # it still identifies the listings <ul> if the scraper starts returning
    # nothing.
    listing_container = soup.find('ul', class_="e20beb46")
    if listing_container is None:
        return [], [], [], [], []

    # (tag name, aria-label) for each output column, in return order.
    field_selectors = (
        ('div', 'Location'),
        ('span', 'Area'),
        ('span', 'Price'),
        ('span', 'Beds'),
        ('span', 'Baths'),
    )

    locations, areas, prices, bedrooms, bathrooms = [], [], [], [], []
    columns = (locations, areas, prices, bedrooms, bathrooms)

    # Assumes each listing card is a direct <li> child of the container --
    # TODO confirm against the live markup.
    for card in listing_container.find_all('li', recursive=False):
        row = []
        for tag_name, label in field_selectors:
            tag = card.find(tag_name, {'aria-label': label})
            row.append(tag.text.strip() if tag is not None else None)

        # An <li> carrying none of the fields is not a listing; skip it so it
        # does not become an all-empty row.
        if all(value is None for value in row):
            continue

        for column, value in zip(columns, row):
            column.append(value)

    if not any(columns):
        # No card yielded data, so the markup is not shaped as assumed above.
        # Fall back to page-wide extraction (columns may then differ in
        # length) rather than returning nothing.
        for column, (tag_name, label) in zip(columns, field_selectors):
            column.extend(
                tag.text.strip()
                for tag in listing_container.find_all(tag_name, {'aria-label': label})
            )

    return locations, areas, prices, bedrooms, bathrooms

# Function to pad lists to the same length
def pad_lists(lists):
    """Pad every sequence with ``None`` so all share the longest length.

    Args:
        lists: Iterable of sequences (lists, tuples, ...). May be empty.

    Returns:
        A new list of new lists, in the same order as the input, each
        extended on the right with ``None`` up to the longest input length.
        An empty input yields ``[]``. The inputs are not modified.
    """
    # Copy up front: this accepts tuples/generators and lets the input be
    # traversed twice even if it is a one-shot iterable.
    columns = [list(lst) for lst in lists]
    # default=0 keeps an empty outer list from raising ValueError in max().
    max_length = max((len(column) for column in columns), default=0)
    return [column + [None] * (max_length - len(column)) for column in columns]

# Main scraping function
def scrape_zameen_data(pages=50, delay=2):
    """Scrape Karachi home listings from consecutive zameen.com result pages.

    Columns are padded to equal length page by page. If one page yields a
    short column, any misalignment is therefore confined to that page instead
    of shifting the rows of every later page.

    Args:
        pages: Number of result pages to request, starting from page 1.
        delay: Seconds to wait between consecutive requests.

    Returns:
        A 5-tuple of equal-length lists
        ``(Locations, Areas, Prices, Bedrooms, Bathrooms)``; missing values
        are ``None``. Pages that fail to download contribute no rows.
    """
    base_url = "https://www.zameen.com/Homes/Karachi-2-{}.html"
    Locations, Areas, Prices, Bedrooms, Bathrooms = [], [], [], [], []

    for page_number in range(1, pages + 1):
        if page_number > 1:
            # Throttle between requests only; there is nothing to wait for
            # before the first request or after the last one.
            time.sleep(delay)

        soup = fetch_page(base_url.format(page_number))
        if soup is None:
            # fetch_page has already reported the failure.
            continue

        # Equalise this page's columns before appending so the accumulated
        # lists always have matching lengths.
        loc, area, price, bed, bath = pad_lists(list(extract_property_info(soup)))
        Locations.extend(loc)
        Areas.extend(area)
        Prices.extend(price)
        Bedrooms.extend(bed)
        Bathrooms.extend(bath)

    return Locations, Areas, Prices, Bedrooms, Bathrooms

# Run the scraper and create a DataFrame
if __name__ == "__main__":
    # Column headers, in the same order as the lists returned by the scraper.
    column_names = ("Location", "Area", "Price", "Bedrooms", "Bathrooms")

    Locations, Areas, Prices, Bedrooms, Bathrooms = scrape_zameen_data()

    # Pair each header with its column of scraped values.
    data = dict(zip(column_names, (Locations, Areas, Prices, Bedrooms, Bathrooms)))

    # Build the frame and swap the None padding for NaN in a single chain.
    df = pd.DataFrame(data).replace({None: np.nan})

    # Write the result to an Excel workbook without the index column.
    output_file = "zameen_data.xlsx"
    df.to_excel(output_file, sheet_name="Properties", index=False)

    print(f"Data saved to {output_file}")

Data saved to zameen_data.xlsx


In [11]:
# Preview the first rows of the scraped DataFrame as a quick sanity check.
df.head()

Unnamed: 0,Location,Area,Price,Bedrooms,Bathrooms
0,"Bahria Heights, Bahria Town Karachi",122 Sq. Yd.,53 Lakh,2,2
1,"Falcon Complex New Malir, Malir",500 Sq. Yd.,10.25 Crore,5,5
2,"Askari 5 - Sector J, Askari 5",367 Sq. Yd.,4.85 Crore,4,4
3,"Askari 5 - Sector J, Askari 5",276 Sq. Yd.,3.35 Crore,3,3
4,"Askari 5 - Sector J, Askari 5",276 Sq. Yd.,25.45 Lakh,3,3
