In [None]:
from pyspark.sql import SparkSession
from dotenv import load_dotenv
import os

# Load variables from the local .env file into the process environment.
load_dotenv()

spark = (
    SparkSession.builder
    .appName("RealEstate_Mubawab_Docker")
    .getOrCreate()
)

storage_account = "strealestatehamza"
container = "realestate"

# The storage key comes from the environment; stop early when it is absent.
adls_key = os.getenv("ADLS_ACCOUNT_KEY")
if not adls_key:
    raise RuntimeError("ADLS_ACCOUNT_KEY missing from .env")

# Register the same account key for the DFS and the Blob endpoints.
_account_key_settings = (
    f"fs.azure.account.key.{storage_account}.dfs.core.windows.net",
    f"fs.azure.account.key.{storage_account}.blob.core.windows.net",
)
for _setting in _account_key_settings:
    spark.conf.set(_setting, adls_key)

In [None]:
# ========================================
# CELLULE 1 : 
# ========================================
import re
import csv
import json
import time
import random
from typing import List, Dict, Any
from urllib.parse import urljoin
from datetime import datetime

import requests
from bs4 import BeautifulSoup

# Site root; root-relative links found while scraping are joined onto it.
BASE_URL = "https://www.mubawab.ma"

# HTTP headers passed to requests.get by fetch_html (desktop Chrome User-Agent).
HEADERS = {
    "User-Agent": "".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ",
            "AppleWebKit/537.36 (KHTML, like Gecko) ",
            "Chrome/120.0 Safari/537.36",
        )
    )
}

In [None]:
# ========================================
# CELLULE 2 : Fonctions basiques
# ========================================

def fetch_html(url: str) -> str:
    """Download a Mubawab page and return its HTML as text.

    The request is a GET sent with the module-level ``HEADERS`` and a
    20-second timeout. ``raise_for_status`` is called on the response, so an
    HTTP error status surfaces as an exception instead of being returned.
    """
    response = requests.get(url, timeout=20, headers=HEADERS)
    response.raise_for_status()
    html = response.text
    return html


def extract_id_from_url(url: str) -> str:
    """Return the numeric Mubawab ad ID embedded in an ad URL.

    Example: ``https://www.mubawab.ma/fr/a/8256920/...`` -> ``"8256920"``.
    Only the part of the URL before the first ``?`` is searched. An empty
    string is returned when no ``/a/<digits>`` segment is found.
    """
    before_query, _, _ = url.partition("?")
    found = re.search(r"/a/(\d+)", before_query)
    if found is None:
        return ""
    return found.group(1)

In [None]:
# ========================================
# CELLULE 3 CORRIGÉE : breadcrumbs, catégorie, localisation & date
# ========================================

def get_breadcrumbs(soup: BeautifulSoup) -> Dict[str, Any]:
    """Collect the breadcrumb trail of an ad page.

    Three sources are tried in order; each one is used only when the
    previous ones produced nothing:

    1. ``a.darkblue`` links carrying an ``href`` inside the first ``div``
       whose class matches ``adBread``;
    2. the ``li`` items of the first ``ul`` whose class matches
       ``breadcrumb`` (the text of the item's link when it has one,
       otherwise the text of the item itself);
    3. the ``li`` items of the first ``nav`` whose ``aria-label`` contains
       "Fil" (case-insensitive).

    Returns:
        A dict with ``breadcrumbs_list`` (list of non-empty labels) and
        ``breadcrumbs`` (the same labels joined with " > ", or "" when
        there are none).
    """

    def non_empty(texts):
        # Keep labels in document order, dropping empty ones.
        return [text for text in texts if text]

    trail: List[str] = []

    # 1) Mubawab-specific breadcrumb container
    bread_div = soup.find("div", class_=re.compile(r"adBread"))
    if bread_div:
        trail = non_empty(
            link.get_text(strip=True)
            for link in bread_div.find_all("a", class_="darkblue", href=True)
        )

    # 2) Fallback: classic ul.breadcrumb markup
    if not trail:
        crumb_list = soup.find("ul", class_=re.compile(r"breadcrumb"))
        if crumb_list:
            trail = non_empty(
                (item.find("a") or item).get_text(strip=True)
                for item in crumb_list.find_all("li")
            )

    # 3) Fallback: nav element labelled as a breadcrumb trail
    if not trail:
        nav = soup.find("nav", attrs={"aria-label": re.compile("Fil", re.I)})
        if nav:
            trail = non_empty(
                item.get_text(strip=True) for item in nav.find_all("li")
            )

    return {
        "breadcrumbs_list": trail,
        "breadcrumbs": " > ".join(trail),
    }


def get_category_label(soup: BeautifulSoup) -> str:
    """Build a category label such as 'Villa, à vendre' or 'Appartement, à louer'.

    The property type is the value of the first "Type de bien" entry in the
    general-characteristics block. The offer suffix is guessed from the
    lower-cased text of the whole page.

    Returns:
        ``"<type>, à louer"``, ``"<type>, à vendre"``, the bare type when
        neither phrase occurs on the page, or "" when no property type was
        found.
    """
    type_bien = ""

    # Look for the "Type de bien" row of the general characteristics block.
    for feature in soup.select("div.caractBlockProp div.adMainFeature"):
        label_el = feature.select_one("p.adMainFeatureContentLabel")
        value_el = feature.select_one("p.adMainFeatureContentValue")
        if not label_el or not value_el:
            continue
        if "Type de bien" in label_el.get_text(strip=True):
            type_bien = value_el.get_text(strip=True)
            break

    # Without a property type the label is empty regardless of the suffix,
    # so the full-page text extraction can be skipped.
    if not type_bien:
        return ""

    # Guess rent/sale from the page text. Both the accented and the
    # unaccented spellings are accepted.
    # NOTE(review): this is a page-wide search and rental is tested first, so
    # a page mentioning both phrases is labelled as rental - confirm that
    # this is acceptable.
    full_text = soup.get_text(" ", strip=True).lower()
    suffix = ""
    if "à louer" in full_text or "a louer" in full_text:
        suffix = ", à louer"
    elif "à vendre" in full_text or "a vendre" in full_text:
        suffix = ", à vendre"

    return type_bien + suffix


def get_location_and_date(soup: BeautifulSoup) -> Dict[str, str]:
    """Return the ad location together with the scraping timestamps.

    Returns:
        A dict with:
          - ``location``: text of the first ``h3`` whose class matches
            ``greyTit`` (e.g. 'Hay Targa à Marrakech'), or "" when absent;
          - ``published_date``: the current UTC time formatted as
            ``YYYY-MM-DD HH:MM:SS`` (the scraping time is used as the
            publication date);
          - ``scraping_time``: the same instant in ISO 8601 form.
    """
    location_heading = soup.find("h3", class_=re.compile(r"greyTit"))
    location = location_heading.get_text(" ", strip=True) if location_heading else ""

    # A single clock read feeds both timestamp fields.
    scraped_at = datetime.utcnow()

    return {
        "location": location,
        "published_date": scraped_at.strftime("%Y-%m-%d %H:%M:%S"),
        "scraping_time": scraped_at.isoformat(),
    }

In [None]:
# ========================================
# CELLULE 4 : titre, prix, description, images
# ========================================

def get_title_and_price(soup: BeautifulSoup) -> Dict[str, str]:
    """Extract the ad title and the raw price text.

    The title is the text of the first ``h1`` whose class matches
    ``searchTitle``; the price is the text of the first ``h3`` whose class
    matches ``orangeTit``. Either value is "" when its element is missing.
    """
    title_heading = soup.find("h1", class_=re.compile(r"searchTitle"))
    price_heading = soup.find("h3", class_=re.compile(r"orangeTit"))

    return {
        "title": title_heading.get_text(strip=True) if title_heading else "",
        "price_text": price_heading.get_text(strip=True) if price_heading else "",
    }


def get_description(soup: BeautifulSoup) -> str:
    """Return the ad description with whitespace collapsed.

    The description is expected in a block shaped like
    ``<div class="blockProp"><h1 class="searchTitle">..</h1><p>...</p></div>``.
    Only the first ``blockProp`` div containing such an ``h1`` is inspected,
    and its first ``<p>`` supplies the text. When that yields nothing, the
    first ``<p>`` of the whole page is used instead. Returns "" when no
    paragraph is found.
    """
    # First blockProp div that holds the title heading, if any.
    titled_block = next(
        (
            block
            for block in soup.find_all("div", class_=re.compile(r"blockProp"))
            if block.find("h1", class_=re.compile(r"searchTitle"))
        ),
        None,
    )

    desc = ""
    if titled_block is not None:
        paragraph = titled_block.find("p")
        if paragraph:
            desc = paragraph.get_text(" ", strip=True)

    if not desc:
        # Fallback: first paragraph anywhere on the page.
        paragraph = soup.find("p")
        if paragraph:
            desc = paragraph.get_text(" ", strip=True)

    return re.sub(r"\s+", " ", desc).strip()


def get_images(soup: BeautifulSoup) -> List[str]:
    """Return the ad's image URLs, de-duplicated, in discovery order.

    Images are visited in this order: the photo overlay, the pictures
    gallery, the pictures slider, then every ``img`` with a ``src`` on the
    page. Only sources containing 'mubawab-media.com/ad/' are kept, and each
    URL appears once, at the position where it was first seen.
    """
    candidates: List[Any] = []
    for selector in (
        "div.overlayPhoto img[src]",   # main overlay
        "#picturesGallery img[src]",   # gallery
        "#picturesSlider img[src]",    # slider
    ):
        candidates.extend(soup.select(selector))
    # Generic fallback: any image on the page that carries a src attribute.
    candidates.extend(soup.find_all("img", src=True))

    gallery: List[str] = []
    for tag in candidates:
        source = tag.get("src")
        if source and "mubawab-media.com/ad/" in source and source not in gallery:
            gallery.append(source)
    return gallery


In [None]:
# ========================================
# CELLULE 5 : attributs, équipements, infos vendeur
# ========================================

def get_attributes_and_equipments(soup: BeautifulSoup) -> Dict[str, Any]:
    """Extract the attribute table and the equipment list of an ad.

    Returns:
        A dict with:
          - ``attributes``: mapping such as
            ``{"Type de bien": "Villa", "Etat": "Bon état", ...}``, built from
            the general-characteristics block, the header detail spans
            (keyed "Surface", "Salle de bain", "Pièces", or by their own
            text) and the latitude / longitude / location inputs;
          - ``equipments``: list such as
            ``["Terrasse", "Piscine", "Climatisation", ...]``.
    """
    attributes: Dict[str, str] = {}

    # 1) General characteristics: label/value pairs.
    for feature in soup.select("div.caractBlockProp div.adMainFeature"):
        label_el = feature.select_one("p.adMainFeatureContentLabel")
        value_el = feature.select_one("p.adMainFeatureContentValue")
        if not label_el or not value_el:
            continue
        label = label_el.get_text(" ", strip=True)
        value = value_el.get_text(" ", strip=True)
        if label and value:
            attributes[label] = value

    # 2) Header details (surface, bathrooms, rooms, ...). The span text is
    #    stored whole under a normalised key.
    for span in soup.select("div.adDetails div.adDetailFeature span"):
        txt = span.get_text(" ", strip=True)
        if not txt:
            continue

        if "m²" in txt or "m2" in txt:
            key = "Surface"
        elif "Salle de bain" in txt or "Salles de bain" in txt:
            # Matches "Salle(s) de bain(s)" in all four spellings.
            key = "Salle de bain"
        elif "Pièce" in txt:
            key = "Pièces"
        else:
            key = txt  # unknown detail: keep the text itself as the key

        attributes[key] = txt

    # 3) Coordinates and location identifiers read from <input> values.
    input_fields = (
        ("Latitude", "input#latField"),
        ("Longitude", "input#lngField"),
        ("Location ID", "input#locationId"),
        ("Location Type", "input#locationType"),
    )
    for key, selector in input_fields:
        field = soup.select_one(selector)
        if field and field.get("value"):
            attributes[key] = field["value"].strip()

    # 4) Equipments listed inside the general characteristics block.
    equipment_texts = (
        feat.get_text(" ", strip=True)
        for feat in soup.select(
            "div.caractBlockProp div.adFeatures div.adFeature span"
        )
    )
    equipments: List[str] = [txt for txt in equipment_texts if txt]

    return {
        "attributes": attributes,
        "equipments": equipments,
    }


def get_seller_info(soup: BeautifulSoup) -> Dict[str, Any]:
    """Extract the seller's name, profile URL and professional status.

    The seller block is ``div.businessInfo``. The text of its
    ``span.businessName`` is classified case-insensitively:

    - contains "particulier": private seller, ``seller_is_store`` is False;
    - contains "agence": agency, ``seller_is_store`` is True;
    - otherwise the text is used unchanged as the name and
      ``seller_is_store`` stays False.

    The matched tag is removed from the name in any letter case, and
    surrounding " -|," characters are trimmed.

    Returns:
        A dict with ``seller_name`` (str), ``seller_url`` (absolute URL of
        the first link in the block, or "") and ``seller_is_store`` (bool).
    """
    seller_name = ""
    seller_url = ""
    seller_is_store = False  # True for an agency / professional seller

    business_info = soup.select_one("div.businessInfo")
    if business_info:
        name_el = business_info.select_one("span.businessName")
        if name_el:
            full_txt = name_el.get_text(" ", strip=True)
            txt_lower = full_txt.lower()

            if "particulier" in txt_lower:
                tag = "particulier"
                seller_is_store = False
            elif "agence" in txt_lower:
                tag = "agence"
                seller_is_store = True
            else:
                tag = ""

            if tag:
                # Detection above ignores case, so removal must too;
                # otherwise "PARTICULIER - X" would keep the tag in the name.
                seller_name = re.sub(
                    tag, "", full_txt, flags=re.IGNORECASE
                ).strip(" -|,")
            else:
                seller_name = full_txt

        # Link to the agency / professional page, when there is one.
        link = business_info.find("a", href=True)
        if link:
            seller_url = urljoin(BASE_URL, link["href"])

    return {
        "seller_name": seller_name,
        "seller_url": seller_url,
        "seller_is_store": seller_is_store,
    }


In [None]:
# ========================================
# CELLULE 6
# ========================================
def parse_mubawab_ad(url: str) -> Dict[str, Any]:
    """Scrape one Mubawab ad page (``/fr/a/...``) into a flat dict.

    The page is downloaded with ``fetch_html`` and parsed with the built-in
    ``html.parser``; each extractor of this notebook is then run on the
    parsed page and the results are merged.

    Returns:
        A dict with the keys ``id``, ``url``, ``title``, ``price_text``,
        ``location``, ``published_date``, ``scraping_time``,
        ``breadcrumbs_list``, ``breadcrumbs``, ``category_label``,
        ``description``, ``images``, ``attributes``, ``equipments``,
        ``seller_name``, ``seller_url`` and ``seller_is_store``.
    """
    soup = BeautifulSoup(fetch_html(url), "html.parser")

    ad_id = extract_id_from_url(url)

    # Run the extractors once each, in a fixed order.
    crumbs = get_breadcrumbs(soup)
    cat_label = get_category_label(soup)
    loc_date = get_location_and_date(soup)
    title_price = get_title_and_price(soup)
    description = get_description(soup)
    images = get_images(soup)
    attrs_equip = get_attributes_and_equipments(soup)
    seller = get_seller_info(soup)

    def pick(source: Dict[str, Any], *keys: str) -> Dict[str, Any]:
        # Copy only the named keys, in the order given.
        return {key: source[key] for key in keys}

    record: Dict[str, Any] = {"id": ad_id, "url": url}
    record.update(pick(title_price, "title", "price_text"))
    record.update(pick(loc_date, "location", "published_date", "scraping_time"))
    record.update(pick(crumbs, "breadcrumbs_list", "breadcrumbs"))
    record["category_label"] = cat_label
    record["description"] = description
    record["images"] = images
    record.update(pick(attrs_equip, "attributes", "equipments"))
    record.update(pick(seller, "seller_name", "seller_url", "seller_is_store"))

    return record

In [None]:
# ========================================
# CELLULE 7 : extraire URLs depuis listing
# ========================================

def get_ad_urls_from_listing(listing_url: str) -> List[str]:
    """Return the sorted, de-duplicated ad URLs found on a listing page.

    Every ``<a href>`` of the page is considered. Root-relative links are
    made absolute against ``BASE_URL``. The query string and the fragment
    are then removed, and the cleaned URL is kept only when it contains
    "mubawab.ma" and the ad path marker "/fr/a/".

    Args:
        listing_url: URL of the Mubawab listing page to download.

    Returns:
        A sorted list of unique ad URLs without query string or fragment.
    """
    html = fetch_html(listing_url)
    soup = BeautifulSoup(html, "html.parser")

    unique_urls = set()

    for a in soup.find_all("a", href=True):
        href = a["href"].strip()

        # Relative link -> absolute
        if href.startswith("/"):
            href = urljoin(BASE_URL, href)

        # Drop the query string and the fragment BEFORE filtering. Otherwise
        # a non-ad URL carrying "/fr/a/" only in its query would be kept,
        # and the same ad linked with different fragments would be listed
        # several times.
        clean = href.split("?")[0].split("#")[0]

        # Keep only Mubawab links
        if "mubawab.ma" not in clean:
            continue

        # Keep only ad links (/fr/a/)
        if "/fr/a/" in clean:
            unique_urls.add(clean)

    ad_urls = sorted(unique_urls)
    print(f"🌐 Trouvé {len(ad_urls)} annonces sur {listing_url}")
    return ad_urls



In [None]:

# ========================================
# CELLULE 8 :
# ========================================
def save_ads_to_csv(ad_dicts: List[Dict[str, Any]], filename: str) -> None:
    """Write a list of ad dicts to a UTF-8 CSV file.

    Nothing is written (and a message is printed) when ``ad_dicts`` is
    empty. Otherwise the file is created or overwritten with a header row
    and one row per ad. Missing scalar fields become "". ``attributes`` and
    ``breadcrumbs_list`` are stored as JSON text, and ``equipments`` as a
    "; "-separated string (empty when the value is missing or ``None``).
    Keys of an ad that are not in the column list, such as ``images``, are
    not exported.

    Args:
        ad_dicts: ads as produced by ``parse_mubawab_ad``.
        filename: path of the CSV file to write.
    """
    if not ad_dicts:
        print(f"Aucune annonce à sauvegarder pour {filename}")
        return

    fieldnames = [
        "id",
        "url",
        "title",
        "price_text",
        "location",
        "published_date",
        "scraping_time",
        "breadcrumbs",
        "breadcrumbs_list",
        "category_label",
        "description",
        "attributes",
        "equipments",
        "seller_name",
        "seller_url",
        "seller_is_store",
    ]
    # Columns that need serialising instead of a plain copy.
    structured = ("attributes", "equipments", "breadcrumbs_list")

    with open(filename, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

        for ad in ad_dicts:
            row: Dict[str, Any] = {
                key: ad.get(key, "") for key in fieldnames if key not in structured
            }

            row["attributes"] = json.dumps(
                ad.get("attributes", {}), ensure_ascii=False
            )
            # `or []` also covers an explicit None, which str.join rejects.
            row["equipments"] = "; ".join(ad.get("equipments") or [])
            row["breadcrumbs_list"] = json.dumps(
                ad.get("breadcrumbs_list", []), ensure_ascii=False
            )

            writer.writerow(row)

    print(f"✅ Sauvegardé {len(ad_dicts)} annonces dans {filename}")


In [None]:
# ========================================
# CELLULE 9 : scraper listing -> liste
# ========================================

def scrape_listing_to_list(listing_url: str) -> List[Dict[str, Any]]:
    """Scrape every ad linked from one Mubawab listing page.

    Only the page at ``listing_url`` is read; no pagination is followed.
    Ads are scraped one by one with a random 1.0-2.5 s pause between
    requests. An ad that fails to scrape is reported on stdout and skipped,
    so one broken page does not abort the run.

    Returns:
        The list of ad dicts that were scraped successfully.
    """
    ad_urls = get_ad_urls_from_listing(listing_url)

    all_ads: List[Dict[str, Any]] = []
    total = len(ad_urls)

    for i, ad_url in enumerate(ad_urls, 1):
        print(f"[{i}/{total}] Scraping {ad_url}")
        try:
            all_ads.append(parse_mubawab_ad(ad_url))
        except Exception as e:
            # Deliberate best-effort: log the failure and move on.
            print(f"  ❌ Erreur sur {ad_url}: {e}")

        # Pause between requests only; nothing follows the last ad.
        if i < total:
            time.sleep(random.uniform(1.0, 2.5))

    return all_ads

In [None]:
# ========================================
# CELLULE 10 : Write RAW Mubawab in PARQUET (RECOMMENDED)
# ========================================
from datetime import datetime
from pyspark.sql import functions as F

# ---- ADLS config ----
storage_account = "strealestatehamza"
container = "realestate"

# ---- Scraping listing URLs ----
ventes_url = (
    "https://www.mubawab.ma/fr/cc/"
    "immobilier-a-vendre-all:o:n:sc:"
    "apartment-sale,commercial-sale,farm-sale,house-sale,land-sale,"
    "office-sale,other-sale,riad-sale,villa-sale"
)

locations_url = (
    "https://www.mubawab.ma/fr/cc/"
    "immobilier-a-louer-all:o:n:sc:"
    "apartment-rent,commercial-rent,farm-rent,house-rent,land-rent,"
    "office-rent,other-rent,riad-rent,room-rent,villa-rent"
)

# ---- Scrape ----
raw_ventes_ads = scrape_listing_to_list(ventes_url)
raw_locations_ads = scrape_listing_to_list(locations_url)

print(f"Ventes récupérées: {len(raw_ventes_ads)}")
print(f"Locations récupérées: {len(raw_locations_ads)}")

# spark.createDataFrame cannot infer a schema from an empty list. Stop here
# with an explicit message instead of an opaque schema-inference error.
if not raw_ventes_ads or not raw_locations_ads:
    raise RuntimeError(
        "Scraping returned no ads for at least one listing "
        f"(ventes={len(raw_ventes_ads)}, locations={len(raw_locations_ads)}); "
        "nothing was written to ADLS."
    )

# ---- Create Spark DataFrames directly from Python dicts ----
ventes_df = spark.createDataFrame(raw_ventes_ads)
locations_df = spark.createDataFrame(raw_locations_ads)

# One clock read for the whole run, so the ingest_ts column and the output
# folder name always refer to the same instant.
run_ts = datetime.utcnow()
now = run_ts.isoformat()

ventes_df = (
    ventes_df
    .withColumn("source_site", F.lit("mubawab"))
    .withColumn("offre", F.lit("vente"))
    .withColumn("ingest_ts", F.lit(now))
)

locations_df = (
    locations_df
    .withColumn("source_site", F.lit("mubawab"))
    .withColumn("offre", F.lit("location"))
    .withColumn("ingest_ts", F.lit(now))
)

ventes_df.printSchema()
locations_df.printSchema()

# ---- Build ADLS path ----
date_path = run_ts.strftime("%Y/%m/%d/%H%M%S")
base_path = f"abfss://{container}@{storage_account}.dfs.core.windows.net/raw"

ventes_path = f"{base_path}/mubawab/ventes/{date_path}"
locations_path = f"{base_path}/mubawab/locations/{date_path}"

# ---- WRITE AS PARQUET (NOT CSV) ----
# coalesce(1) asks Spark for a single partition before each write.
(
    ventes_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .parquet(ventes_path)
)

(
    locations_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .parquet(locations_path)
)

print("✅ RAW Mubawab Ventes (PARQUET) ->", ventes_path)
print("✅ RAW Mubawab Locations (PARQUET) ->", locations_path)
