## scraping

In [5]:
# import libraries
import requests
from bs4 import BeautifulSoup
import csv
import time
import random
import re
import os
from urllib.parse import urljoin
import logging
from typing import Set ,List, Optional ,Dict

In [4]:
# For extracting listing links

# Root-logger setup for the link-extraction step: every record at INFO or
# above goes both to a log file and to the console.
#
# force=True (Python 3.8+) makes basicConfig drop and close whatever handlers
# the root logger already has. Without it the call is a silent no-op whenever
# logging was configured earlier (e.g. the cell is re-run, or another cell
# configured logging first), and the FileHandler built below would be
# discarded while still holding an open file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('mubawab_link_extractor.log'),
        logging.StreamHandler(),
    ],
    force=True,
)

class MubawabLinkExtractor:
    """Collect property-listing URLs from paginated Mubawab result pages.

    Result pages are requested as ``f"{base_url}:p:{page}"``. Every listing
    URL found is stripped of its query string/fragment, de-duplicated in
    ``all_links`` and written to ``output_file`` after each page.
    """

    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7',
    }

    # Root used to turn relative hrefs into absolute URLs.
    _SITE_ROOT = 'https://www.mubawab.ma'
    # Path shape of a listing link ("/fr/a/<id>" or "/fr/p/<id>").
    _LISTING_PATH_RE = re.compile(r'/fr/[pa]/\d+')
    # Absolute listing URL; dots are escaped so look-alike hosts are rejected.
    _LISTING_URL_RE = re.compile(r'https://www\.mubawab\.ma/fr/[pa]/\d+')
    # href values embedded in inline <script> text.
    _SCRIPT_HREF_RE = re.compile(r'href=[\'"]?([^\'" >]+)')

    def __init__(self, base_url: str, output_file: str = "mubawab_links.csv"):
        """Prepare an HTTP session and make sure the output directory exists.

        Args:
            base_url: Search-results URL without the ``:p:<n>`` page suffix.
            output_file: CSV path the collected links are written to.
        """
        self.base_url = base_url
        self.headers = self.DEFAULT_HEADERS.copy()
        self.session = requests.Session()
        self.all_links: Set[str] = set()
        self.output_file = output_file

        self._ensure_parent_dir(output_file)

    @staticmethod
    def _ensure_parent_dir(path: str) -> None:
        """Create the directory that will contain *path*, if it names one."""
        parent = os.path.dirname(path)
        # A bare filename has an empty dirname and os.makedirs('') raises
        # FileNotFoundError, so only create a directory when there is one.
        if parent:
            os.makedirs(parent, exist_ok=True)

    def _request_with_retry(self, url: str, retries: int = 3) -> Optional[str]:
        """GET *url*, retrying with exponential back-off (1s, 2s, 4s, ...).

        Returns:
            The response body, or ``None`` when every attempt failed (or
            ``retries`` is not positive).
        """
        for attempt in range(retries):
            try:
                response = self.session.get(url, headers=self.headers, timeout=10)
                response.raise_for_status()
                return response.text
            except requests.RequestException as e:
                if attempt < retries - 1:
                    wait_time = 2 ** attempt
                    logging.warning(f"Attempt {attempt + 1} failed for {url}. Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    logging.error(f"Failed to fetch {url} after {retries} attempts: {e}")
        return None

    @classmethod
    def _normalize_listing_url(cls, href: Optional[str]) -> Optional[str]:
        """Return the absolute, query-free listing URL for *href*.

        Returns ``None`` when *href* is empty or does not point at a listing.
        """
        if not href:
            return None
        href = href.strip()
        if not href.startswith('http'):
            href = urljoin(cls._SITE_ROOT, href)
        # Drop query string and fragment so the same listing de-duplicates.
        clean_url = re.split(r'[?#]', href, maxsplit=1)[0]
        return clean_url if cls._LISTING_URL_RE.match(clean_url) else None

    @staticmethod
    def _candidate_hrefs(listing) -> List[str]:
        """List the hrefs a listing node may carry, most specific first.

        *listing* is either a parsed tag or, for the inline-script fallback,
        a plain href string.
        """
        if not hasattr(listing, 'attrs'):
            return [listing]

        candidates = []
        # The anchor fallback yields <a> tags whose own href is the link.
        if listing.name == 'a' and listing.get('href'):
            candidates.append(listing['href'])
        nested = listing.find('a', href=True)
        if nested:
            candidates.append(nested['href'])
        linkref = listing.get('linkref')
        if linkref:
            candidates.append(linkref)
        return candidates

    def extract_links_from_page(self, page_url: str) -> Set[str]:
        """Return the set of listing URLs found on one results page.

        Three lookups are tried in order until one yields nodes: listing
        ``div`` boxes, anchors whose href looks like a listing path, then
        hrefs embedded in an inline script that mentions ``listingBox``.
        An empty set is returned when the page cannot be fetched.
        """
        html_content = self._request_with_retry(page_url)
        if not html_content:
            return set()

        soup = BeautifulSoup(html_content, 'html.parser')

        listings = soup.find_all('div', class_=['listingBox', 'listingBoxsPremium'])
        if not listings:
            listings = soup.find_all('a', href=self._LISTING_PATH_RE)

        if not listings:
            for script in soup.find_all('script', type='text/javascript'):
                script_text = str(script)
                if 'listingBox' in script_text:
                    listings = [
                        href
                        for href in self._SCRIPT_HREF_RE.findall(script_text)
                        if self._LISTING_PATH_RE.match(href)
                    ]
                    break

        new_links: Set[str] = set()
        for listing in listings:
            # Take the first candidate that really is a listing URL, so an
            # unrelated nested anchor does not hide a valid `linkref`.
            for href in self._candidate_hrefs(listing):
                clean_url = self._normalize_listing_url(href)
                if clean_url:
                    new_links.add(clean_url)
                    break

        return new_links

    def extract_listing_links(self, max_pages: int = 37) -> Set[str]:
        """Walk result pages 1..max_pages and accumulate listing URLs.

        The CSV is rewritten after every page so partial progress survives an
        interruption. Crawling stops early when a page past the fifth adds no
        new link.

        Returns:
            All unique listing URLs collected so far (``self.all_links``).
        """
        for page in range(1, max_pages + 1):
            page_url = f"{self.base_url}:p:{page}"
            logging.info(f"Scraping page {page}/{max_pages}: {page_url}")

            new_links = self.extract_links_from_page(page_url)
            new_count = len(new_links - self.all_links)
            self.all_links.update(new_links)

            logging.info(f"Found {len(new_links)} links, {new_count} new unique links")
            logging.info(f"Total unique links: {len(self.all_links)}")

            self.save_links_to_csv()

            if new_count == 0 and page > 5:
                logging.info("No new links found on multiple pages. Stopping early.")
                break

            # Polite delay between requests; pointless once nothing follows.
            if page < max_pages:
                time.sleep(random.uniform(3, 4))

        return self.all_links

    def save_links_to_csv(self):
        """Overwrite ``output_file`` with a ``URL`` header and sorted links."""
        with open(self.output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['URL'])
            writer.writerows([link] for link in sorted(self.all_links))
        logging.info(f"Saved {len(self.all_links)} unique links to {self.output_file}")

if __name__ == "__main__":
    # Entry point of the link-extraction step: crawl the Rabat "for sale"
    # result pages and store the listing URLs under data/.
    OUTPUT_FILE = "data/mubawab_links.csv"
    BASE_URL = "https://www.mubawab.ma/fr/ct/rabat/immobilier-a-vendre"

    extractor = MubawabLinkExtractor(base_url=BASE_URL, output_file=OUTPUT_FILE)
    extractor.extract_listing_links()
    logging.info("Link extraction completed!")

In [10]:
# Root-logger setup for the data-extraction step: every record at INFO or
# above goes both to a log file and to the console.
#
# force=True (Python 3.8+) replaces any handlers already installed on the root
# logger. Without it this call does nothing when logging was configured
# earlier in the same process, so 'mubawab_data_extractor.log' would be
# created (FileHandler opens it eagerly) but never receive a record.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('mubawab_data_extractor.log'),
        logging.StreamHandler(),
    ],
    force=True,
)


class MubawabDataExtractor:
    """Scrape the detail page of every listing URL stored in a CSV file.

    URLs are read from ``input_file`` (first column, header row skipped) and
    one row per listing is appended to ``output_file``; rows carry a
    ``status`` column set to ``success`` or ``failed``.
    """

    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7',
    }

    # Column order of the output CSV.
    _FIELDNAMES = [
        'id', 'url', 'title', 'price', 'location', 'type',
        'area', 'rooms', 'bedrooms', 'bathrooms', 'description',
        'property_state', 'jardin', 'piscine', 'cuisine_equiped',
        'terrasse', 'garage', 'ascenseur', 'quartier', 'status'
    ]

    # Property kinds looked for in the URL/title; the first match wins.
    _PROPERTY_TYPES = ('appartement', 'maison', 'villa', 'terrain', 'bureau', 'studio')

    # Output field -> CSS class of the icon that marks the numeric feature.
    _FEATURE_ICONS = {
        'area': 'icon-triangle',
        'rooms': 'icon-house-boxes',
        'bedrooms': 'icon-bed',
        'bathrooms': 'icon-bath',
    }

    # (output field, icon CSS class, keyword accepted in the feature label).
    # A keyword of None means only the icon identifies the amenity. Order
    # matters: the first entry matching a feature block is the one recorded.
    _AMENITY_MARKERS = (
        ('cuisine_equiped', 'icon-fullKitchen', None),
        ('jardin', 'icon-garden', 'jardin'),
        ('piscine', 'icon-pool', 'piscine'),
        ('terrasse', 'icon-terrace', 'terrasse'),
        ('garage', 'icon-garage', 'garage'),
        ('ascenseur', 'icon-elevator', 'ascenseur'),
    )

    def __init__(self, input_file: str = "data/mubawab_links.csv",
                 output_file: str = "data/mubawab_properties.csv"):
        """Prepare an HTTP session and make sure the output directory exists.

        Args:
            input_file: CSV of listing URLs (one per row, after a header).
            output_file: CSV the scraped rows are appended to.
        """
        self.headers = self.DEFAULT_HEADERS.copy()
        self.session = requests.Session()
        self.input_file = input_file
        self.output_file = output_file

        self._ensure_parent_dir(output_file)

    @staticmethod
    def _ensure_parent_dir(path: str) -> None:
        """Create the directory that will contain *path*, if it names one."""
        parent = os.path.dirname(path)
        # A bare filename has an empty dirname and os.makedirs('') raises
        # FileNotFoundError, so only create a directory when there is one.
        if parent:
            os.makedirs(parent, exist_ok=True)

    def _request_with_retry(self, url: str, retries: int = 3) -> Optional[str]:
        """GET *url*, retrying with exponential back-off (1s, 2s, 4s, ...).

        Returns:
            The response body, or ``None`` when every attempt failed.
        """
        for attempt in range(retries):
            try:
                response = self.session.get(url, headers=self.headers, timeout=10)
                response.raise_for_status()
                return response.text
            except requests.RequestException as e:
                if attempt < retries - 1:
                    wait_time = 2 ** attempt
                    logging.warning(f"Attempt {attempt + 1} failed for {url}. Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    logging.error(f"Failed to fetch {url} after {retries} attempts: {e}")
        return None

    def load_links(self) -> List[str]:
        """Read the listing URLs from ``input_file``.

        Returns:
            The first column of every non-blank row after the header, or an
            empty list when the file is missing or empty.
        """
        try:
            with open(self.input_file, 'r', newline='', encoding='utf-8') as f:
                reader = csv.reader(f)
                # Default of None: an empty file must not raise StopIteration.
                next(reader, None)  # Skip header
                return [row[0] for row in reader if row and row[0].strip()]
        except FileNotFoundError:
            logging.error(f"Input file not found: {self.input_file}")
            return []

    @staticmethod
    def _text_of(node, default: str) -> str:
        """Return the stripped text of a parsed node, or *default* if absent."""
        return node.get_text(strip=True) if node is not None else default

    @staticmethod
    def _feature_number(soup, icon_class: str) -> str:
        """Return the first integer shown next to the icon, or ``'N/A'``.

        The number is read from the first ``span`` of the ``adDetailFeature``
        block enclosing the icon.
        """
        icon = soup.find('i', class_=icon_class)
        parent = icon.find_parent('div', class_='adDetailFeature') if icon else None
        span = parent.find('span') if parent else None
        if span is None:
            return 'N/A'
        match = re.search(r'(\d+)', span.get_text(strip=True))
        return match.group(1) if match else 'N/A'

    @classmethod
    def _detect_type(cls, url: str, title: str) -> str:
        """Return the first known property type found in the URL or title."""
        lower_url = url.lower()
        lower_title = title.lower()
        return next(
            (t for t in cls._PROPERTY_TYPES if t in lower_url or t in lower_title),
            'N/A',
        )

    def _detect_amenities(self, soup, description: str) -> Dict[str, str]:
        """Return an ``'Oui'``/``'Non'`` flag for every tracked amenity.

        Flags come from the ``adFeature`` blocks (icon class or label text);
        kitchen, garden and pool additionally fall back to a keyword search
        in the listing description.
        """
        amenities = {field: 'Non' for field, _, _ in self._AMENITY_MARKERS}

        for feature_div in soup.find_all('div', class_='adFeature'):
            icon = feature_div.find('i')
            if icon is None:
                continue

            icon_classes = icon.get('class', [])
            label = feature_div.find('span', class_='fsize11')
            label_text = label.get_text(strip=True).lower() if label else ''

            for field, icon_name, keyword in self._AMENITY_MARKERS:
                if icon_name in icon_classes or (keyword and keyword in label_text):
                    amenities[field] = 'Oui'
                    break

        description = description.lower()
        if amenities['cuisine_equiped'] == 'Non' and ('cuisine équipée' in description or 'cuisine equipee' in description):
            amenities['cuisine_equiped'] = 'Oui'
        if amenities['jardin'] == 'Non' and 'jardin' in description:
            amenities['jardin'] = 'Oui'
        if amenities['piscine'] == 'Non' and 'piscine' in description:
            amenities['piscine'] = 'Oui'

        return amenities

    def _parse_listing(self, soup, url: str) -> Dict:
        """Build the output row (minus ``status``) from a parsed detail page."""
        data = {'url': url}

        prop_id = re.search(r'/fr/[pa]/(\d+)', url)
        data['id'] = prop_id.group(1) if prop_id else 'N/A'

        data['title'] = self._text_of(soup.find('h1', class_='titleListing'), 'N/A')

        price = soup.find('h3', class_='orangeTit')
        if price is not None:
            # Keep the digits only (drops currency label and separators).
            data['price'] = re.sub(r'\D', '', price.get_text(strip=True)) or 'N/A'
        else:
            data['price'] = 'N/A'

        data['location'] = self._text_of(soup.find('h2', class_='greyTit'), 'N/A')

        for field, icon_class in self._FEATURE_ICONS.items():
            data[field] = self._feature_number(soup, icon_class)

        data['description'] = self._text_of(soup.find('div', class_='blockDescription'), '')

        quartier_element = soup.find('h3', class_='greyTit')
        if quartier_element is not None:
            # Only the part before the first comma is kept.
            data['quartier'] = quartier_element.get_text(strip=True).split(',')[0].strip()
        else:
            data['quartier'] = 'N/A'

        data['type'] = self._detect_type(url, data['title'])

        data['property_state'] = 'N/A'
        etat_label = soup.find('p', class_='adMainFeatureContentLabel',
                               string=re.compile(r'Etat du bien', re.I))
        if etat_label:
            etat_value = etat_label.find_next('p', class_='adMainFeatureContentValue')
            if etat_value:
                data['property_state'] = etat_value.get_text(strip=True)

        data.update(self._detect_amenities(soup, data['description']))
        return data

    def scrape_listing_details(self, url: str) -> Optional[Dict]:
        """Fetch one listing page and extract its fields.

        Returns:
            A dict of the scraped fields, or ``None`` when the page could not
            be fetched or parsing raised.
        """
        logging.info(f"Scraping details from: {url}")
        html_content = self._request_with_retry(url)
        if not html_content:
            return None

        soup = BeautifulSoup(html_content, 'html.parser')

        try:
            return self._parse_listing(soup, url)
        except Exception as e:
            # Boundary handler: one malformed page must not abort the whole
            # run; the traceback is logged so the cause stays diagnosable.
            logging.exception(f"Error scraping {url}: {e}")
            return None

    def scrape_all_listings(self):
        """Scrape every URL from ``input_file`` and append rows to the output.

        Each row is flushed as soon as it is written so an interrupted run
        keeps what was already scraped. The header is written only when the
        output file is new or empty.
        """
        links = self.load_links()
        if not links:
            logging.error("No links found to process.")
            return

        total = len(links)
        success_count = 0
        fail_count = 0

        logging.info(f"Found {total} listings to scrape.")

        # An existing but empty file still needs its header row.
        has_rows = (os.path.isfile(self.output_file)
                    and os.path.getsize(self.output_file) > 0)

        with open(self.output_file, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=self._FIELDNAMES)
            if not has_rows:
                writer.writeheader()

            for i, link in enumerate(links, 1):
                logging.info(f"➡️ Processing ({i}/{total}): {link}")
                data = self.scrape_listing_details(link)
                if data:
                    data['status'] = 'success'
                    writer.writerow(data)
                    success_count += 1
                    logging.info(f"✅ Successfully scraped {data.get('id', 'N/A')}")
                else:
                    writer.writerow({'url': link, 'status': 'failed'})
                    fail_count += 1
                    logging.warning(f"❌ Failed to scrape: {link}")
                f.flush()

                # Polite delay between requests; pointless after the last one.
                if i < total:
                    time.sleep(random.uniform(2, 4))

        logging.info(f"📊 Scraping finished: {success_count}/{total} succeeded, {fail_count} failed.")


if __name__ == "__main__":
    # Entry point of the data-extraction step: read the listing URLs saved
    # under data/ and write one CSV row per scraped listing.
    OUTPUT_FILE = "data/mubawab_properties.csv"
    INPUT_FILE = "data/mubawab_links.csv"

    extractor = MubawabDataExtractor(input_file=INPUT_FILE, output_file=OUTPUT_FILE)
    extractor.scrape_all_listings()
    logging.info("Data extraction completed!")