## OpenSooq scraping

## Import Required Libraries

In [59]:
import requests
from bs4 import BeautifulSoup
import time
import re
import pandas as pd

## Configuration Setup

In [60]:
# Base URL of the property listings
base_url = 'https://om.opensooq.com/en/property/property-for-sale'

# Headers to mimic a browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

# Output filename for CSV
output_filename = 'opensooq_properties.csv'

## Data Extraction Function

In [61]:
def extract_property_data(listing):
    """Extract property details from a single listing card"""
    # Extract title
    title_elem = listing.find('h2')
    title = title_elem.text.strip() if title_elem else None
    
    # Extract price (usually the last line in the card)
    card_text = listing.get_text(separator='\n', strip=True)
    lines = [line.strip() for line in card_text.split('\n') if line.strip()]
    price = lines[-1] if lines else None
    
    # Extract location (usually 2 lines before "Chat")
    location = None
    for i, line in enumerate(lines):
        if 'Chat' in line and i >= 2:
            location = f"{lines[i-2]}, {lines[i-1]}"
            break
    
    # Extract size (from Surface Area)
    size = None
    p_tag = listing.find('p')
    if p_tag:
        match = re.search(r'Surface\s*Area:\s*([0-9,]+)\s*m2', p_tag.text)
        if match:
            size = match.group(1).replace(',', '') + ' m²'
    
    # Extract link
    relative_link = listing.get('href', '')
    link = f"https://om.opensooq.com{relative_link}" if relative_link else None
    
    return {
        "source": "OpenSooq",
        "title": title,
        "price": price,
        "location": location,
        "size": size,
        "listing_type": "Property for Sale",
        "link": link
    }

## Pagination Handling

In [62]:
def handle_pagination(soup, current_url):
    """Handle pagination and return next page URL or None if no more pages"""
    next_page_anchor = soup.find('a', attrs={'data-id': 'nextPageArrow'})
    if not next_page_anchor:
        print("No next page found. Done scraping.")
        return None
    
    next_page_relative_url = next_page_anchor.get('href', '')
    if not next_page_relative_url:
        print("No valid next page URL. Stopping.")
        return None
    
    new_url = f"https://om.opensooq.com{next_page_relative_url}"
    
    # Prevent infinite loop if same page repeats
    if new_url == current_url:
        print("Next page URL is same as current. Stopping.")
        return None
    
    return new_url

## Main Scraping Function

In [63]:
def scrape_opensooq_properties():
    """Main function to scrape property listings"""
    opensooq_data = []
    current_page_url = base_url
    
    print("Starting OpenSooq Oman property scraping...")
    
    while True:
        print(f"Scraping page: {current_page_url}")
        
        try:
            response = requests.get(current_page_url, headers=headers)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page {current_page_url}: {e}")
            break
        
        soup = BeautifulSoup(response.content, 'html.parser')
        listings = soup.find_all('a', class_=lambda x: x and 'postListItemData' in x)
        
        if not listings:
            print("No listings found on page. Stopping.")
            break
        
        for listing in listings:
            property_data = extract_property_data(listing)
            opensooq_data.append(property_data)
        
        current_page_url = handle_pagination(soup, current_page_url)
        if not current_page_url:
            break
        
        time.sleep(1)  # Polite delay between requests
    
    return opensooq_data

## Execute Scraping and Save Results

In [64]:
# Execute scraping
property_data = scrape_opensooq_properties()

# Convert to DataFrame
df = pd.DataFrame(property_data)

# Display first 5 rows
df.head()

Starting OpenSooq Oman property scraping...
Scraping page: https://om.opensooq.com/en/property/property-for-sale
Scraping page: https://om.opensooq.com/en/property/property-for-sale?page=2
Scraping page: https://om.opensooq.com/en/property/property-for-sale?page=3
Scraping page: https://om.opensooq.com/en/property/property-for-sale?page=4
Scraping page: https://om.opensooq.com/en/property/property-for-sale?page=5
Scraping page: https://om.opensooq.com/en/property/property-for-sale?page=6
Scraping page: https://om.opensooq.com/en/property/property-for-sale?page=7
Scraping page: https://om.opensooq.com/en/property/property-for-sale?page=8
Scraping page: https://om.opensooq.com/en/property/property-for-sale?page=9
Scraping page: https://om.opensooq.com/en/property/property-for-sale?page=10
Scraping page: https://om.opensooq.com/en/property/property-for-sale?page=11
Scraping page: https://om.opensooq.com/en/property/property-for-sale?page=12
Scraping page: https://om.opensooq.com/en/proper

Unnamed: 0,source,title,price,location,size,listing_type,link
0,OpenSooq,118 m2 3 Bedrooms Apartments for Sale in Musca...,"45,000 OMR",", Bosher, 964439XX",118 m²,Property for Sale,https://om.opensooq.com/en/search/264252105
1,OpenSooq,Building for Sale in Muscat Ghala,"950,000 OMR",", Ghala, 924618XX",2200 m²,Property for Sale,https://om.opensooq.com/en/search/253985415
2,OpenSooq,77 m2 2 Bedrooms Apartments for Sale in Muscat...,"31,000 OMR",", Azaiba, 715560XX",77 m²,Property for Sale,https://om.opensooq.com/en/search/266186647
3,OpenSooq,52 m2 Studio Apartments for Sale in Muscat Al ...,"29,000 OMR",", Al Mawaleh, 964439XX",52 m²,Property for Sale,https://om.opensooq.com/en/search/265496647
4,OpenSooq,320 m2 5 Bedrooms Villa for Sale in Muscat Qur...,"100,000 OMR",", Quriyat, 985055XX",320 m²,Property for Sale,https://om.opensooq.com/en/search/266519035


## Save Scraping Data into CSV File

In [65]:
# Save to CSV
df.to_csv(output_filename, index=False)
print(f"Data successfully saved to {output_filename}")

# Show summary
print(f"\nScraped {len(df)} properties")
df.info()

Data successfully saved to opensooq_properties.csv

Scraped 8155 properties
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8155 entries, 0 to 8154
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   source        8155 non-null   object
 1   title         8155 non-null   object
 2   price         8155 non-null   object
 3   location      8148 non-null   object
 4   size          3943 non-null   object
 5   listing_type  8155 non-null   object
 6   link          8155 non-null   object
dtypes: object(7)
memory usage: 446.1+ KB
