# Mawa Scraping:
This notebook demonstrates, step by step, how to scrape rental listings from mawa.om: setting up a session, requesting the listing page, parsing the returned HTML, extracting the listing fields, and saving the results to a CSV file.

In [18]:
# import necessary libraries
import csv
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# 1.CONFIG

In [22]:
# Listing page; requested once up front to obtain cookies and the CSRF token.
BASE_PAGE       = "https://www.mawa.om/en/rent"
# Endpoint that fetch_page() POSTs the search form to.
LISTING_API     = "https://www.mawa.om/en/PropertyListing"
# Destination path of the CSV written by the save step.
OUTPUT_FILE     = "mawa_rent_listings.csv"
# Pagination stops once this many listings have been collected.
DESIRED_RECORDS = 1000
# NOTE(review): not referenced by any code visible in this notebook; the
# recorded run advances by 12 listings per page, not 50 — confirm or remove.
PER_PAGE        = 50        
# Pause between consecutive page requests, in seconds (passed to time.sleep).
SLEEP_BETWEEN   = 0.1       

# 2.start a session & pretend to be Chrome 

In [24]:
# Browser-like request headers: the User-Agent string advertises desktop
# Chrome on Windows, and the Accept headers are sent on every request
# made through the session.
BROWSER_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/138.0.7204.101 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;"
        "q=0.9,image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
}

# A single Session is reused for every request so that cookies and the
# headers above carry over from one call to the next.
session = requests.Session()
session.headers.update(BROWSER_HEADERS)

# 3.GET the rent page HTML for cookies + CSRF 

In [25]:
# Request the listing page through the shared session so that any cookies
# it sets are kept for the later POSTs. A timeout (matching the one used by
# fetch_page) prevents the notebook from hanging on an unresponsive server.
r = session.get(BASE_PAGE, timeout=10)
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

# The CSRF token is read from <meta name="csrf-token" content="...">.
# Fail with a clear message when the tag or its content is absent instead
# of an opaque "'NoneType' object is not subscriptable" error.
csrf_meta = soup.select_one('meta[name="csrf-token"]')
if csrf_meta is None or not csrf_meta.get("content"):
    raise RuntimeError(
        f"No csrf-token <meta> tag found on {BASE_PAGE}; "
        "the page layout may have changed or the request was blocked."
    )
csrf = csrf_meta["content"]

# prepare headers for the AJAX call
session.headers.update({
    "Accept": "text/html, */*; q=0.01",
    "X-Requested-With": "XMLHttpRequest",
    "X-CSRF-TOKEN": csrf,
    "Referer": BASE_PAGE,
})

# 4.fetch one page of listing‐cards HTML 

In [26]:
def fetch_page(page_num):
    """POST the search form for one results page and return the response body.

    Args:
        page_num: Value sent as the form's "page" field.

    Returns:
        The response text (an HTML fragment consumed by parse_listings).

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # Search filters, all left blank except the sort order. The tuple order
    # fixes the order of the fields in the submitted form.
    filter_fields = (
        "user_type",
        "furnished_type",
        "sort_by",
        "location",
        "min_area",
        "max_area",
        "min_price",
        "max_price",
        "amenities_id",
        "bed",
        "bath",
        "property_type",
        "keyword",
    )
    payload = dict.fromkeys(filter_fields, "")
    payload["sort_by"] = "newest"

    payload["property_for"] = 3      # 3 = rent
    payload["page"] = page_num
    payload["_token"] = csrf

    response = session.post(LISTING_API, data=payload, timeout=10)
    response.raise_for_status()
    return response.text

# 5.parse the HTML fragment into Python dicts 

In [27]:
def _text_of(element):
    """Return the stripped text of a parsed element, or "" when it is None."""
    return element.get_text(strip=True) if element is not None else ""


def _icon_parent_text(card, icon_selector):
    """Return the text of the element enclosing the icon matched by icon_selector, or ""."""
    icon = card.select_one(icon_selector)
    return _text_of(icon.parent) if icon is not None else ""


def parse_listings(html_fragment, base_url="https://www.mawa.om/"):
    """Extract one dict per listing card from an HTML fragment.

    Cards are the ``div.col-sm-6.col-lg-4`` elements of the fragment; a card
    without a title link (``.inner_feature_cntnt_heading a``) is skipped.

    Args:
        html_fragment: HTML text as returned by fetch_page.
        base_url: Base against which relative listing links are resolved.

    Returns:
        A list of dicts with the keys "URL", "Title", "Location", "Price",
        "Bedrooms", "Bathrooms", "Area" and "PropertyType". Any field whose
        element is missing from the card is the empty string.
    """
    frag_soup = BeautifulSoup(html_fragment, "html.parser")
    out = []
    for card in frag_soup.select("div.col-sm-6.col-lg-4"):
        link = card.select_one(".inner_feature_cntnt_heading a")
        if link is None:
            continue

        # urljoin leaves absolute links untouched and resolves every relative
        # form (with or without a leading slash, or protocol-relative)
        # correctly, which plain string concatenation does not.
        href = (link.get("href") or "").strip()
        url = urljoin(base_url, href) if href else ""

        # Prefer the card-image price; fall back to any price block in the card.
        price_el = card.select_one(".prop_img .price p") or card.select_one(".price p")

        out.append({
            "URL":           url,
            "Title":         _text_of(link.find(["h2", "h3"])),
            "Location":      _text_of(card.select_one(".view_map_flag p")),
            "Price":         _text_of(price_el),
            "Bedrooms":      _icon_parent_text(card, "i.fa-bed"),
            "Bathrooms":     _icon_parent_text(card, "i.fa-bath"),
            "Area":          _icon_parent_text(card, "i.fa-area-chart"),
            "PropertyType":  _icon_parent_text(card, "i.fa-building-o"),
        })
    return out

# 6.loop pages until we have enough 

In [28]:
# Walk the result pages until enough unique listings have been collected.
# Results are requested newest-first, so the feed can shift between two
# requests and show the same listing on consecutive pages; listings are
# therefore de-duplicated by URL.
all_listings = []
seen_urls = set()
page = 1
while len(all_listings) < DESIRED_RECORDS:
    print(f"Fetching page {page}… (have {len(all_listings)})")
    fragment = fetch_page(page)
    listings = parse_listings(fragment)
    if not listings:
        print("No more listings returned—stopping early.")
        break

    new_listings = []
    for listing in listings:
        listing_url = listing["URL"]
        # Records without a URL cannot be compared, so they are always kept.
        if listing_url and listing_url in seen_urls:
            continue
        seen_urls.add(listing_url)
        new_listings.append(listing)

    # A page that adds nothing new means the server is repeating itself
    # (e.g. past the last page); stop rather than loop on duplicates.
    if not new_listings:
        print(f"Page {page} contained only already-collected listings—stopping early.")
        break

    all_listings.extend(new_listings)
    if len(all_listings) >= DESIRED_RECORDS:
        # Target reached: skip the page increment and the trailing sleep.
        break

    page += 1
    time.sleep(SLEEP_BETWEEN)

Fetching page 1… (have 0)
Fetching page 2… (have 12)
Fetching page 3… (have 24)
Fetching page 4… (have 36)
Fetching page 5… (have 48)
Fetching page 6… (have 60)
Fetching page 7… (have 72)
Fetching page 8… (have 84)
Fetching page 9… (have 96)
Fetching page 10… (have 108)
Fetching page 11… (have 120)
Fetching page 12… (have 132)
Fetching page 13… (have 144)
Fetching page 14… (have 156)
Fetching page 15… (have 168)
Fetching page 16… (have 180)
Fetching page 17… (have 192)
Fetching page 18… (have 204)
Fetching page 19… (have 216)
Fetching page 20… (have 228)
Fetching page 21… (have 240)
Fetching page 22… (have 252)
Fetching page 23… (have 264)
Fetching page 24… (have 276)
Fetching page 25… (have 288)
Fetching page 26… (have 300)
Fetching page 27… (have 312)
Fetching page 28… (have 324)
Fetching page 29… (have 336)
Fetching page 30… (have 348)
Fetching page 31… (have 360)
Fetching page 32… (have 372)
Fetching page 33… (have 384)
Fetching page 34… (have 396)
Fetching page 35… (have 408)
Fetc

# 7. Trim to at most DESIRED_RECORDS listings

In [29]:
# The last page fetched can push the total past the target, so cap the
# list at DESIRED_RECORDS entries before reporting the final count.
all_listings = all_listings[0:DESIRED_RECORDS]
print(
    f"Collected {len(all_listings)} listings total."
)

Collected 1000 listings total.


# 8.SAVE TO CSV 

In [None]:
if not all_listings:
    print(" No data scraped—check if the site structure changed.")
else:
    # The CSV columns follow the key order of the first record.
    column_names = list(all_listings[0])
    # newline="" lets the csv module control line endings itself.
    with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        csv_writer.writeheader()
        for record in all_listings:
            csv_writer.writerow(record)
    print(f" Saved {len(all_listings)} records to '{OUTPUT_FILE}'")