In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
import re
import pandas as pd

In [2]:
def setup_driver():
    """Build a Chrome WebDriver using a driver binary from webdriver_manager.

    Returns:
        The ``webdriver.Chrome`` instance created with a ``Service`` that
        points at the path returned by ``ChromeDriverManager().install()``.
    """
    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(chromedriver_path))

def scrape_company_page(driver, url):
    """Load a company page and extract its name, description, location and phones.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Address of the company page to open.

    Returns:
        A dict with the keys ``"name"``, ``"description"``, ``"location"`` and
        ``"phone number"``; a value is ``None`` when the matching element is
        not present on the page. Returns ``None`` instead of a dict when no
        ``<h1>`` shows up within 10 seconds, or when any other error is raised
        while reading the page (the error is printed).
    """
    driver.get(url)
    try:
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "h1")))
    except TimeoutException:
        print(f"Timeout waiting for product info on {url}")
        return None

    location_text = None
    phone_text = None

    try:
        # Look each element up once: find_elements returns an empty list when
        # nothing matches, so it doubles as the existence check and avoids a
        # second round-trip to the browser.
        headings = driver.find_elements(By.TAG_NAME, "h1")
        company_name = headings[0].text if headings else None

        desc_matches = driver.find_elements(By.CLASS_NAME, "camp_full")
        desc_div = desc_matches[0] if desc_matches else None
        company_desc = desc_div.text if desc_div is not None else None

        if desc_div is not None:
            # First <p> sibling after the description block feeds "location".
            try:
                location_text = desc_div.find_element(
                    By.XPATH, "following-sibling::p[1]"
                ).text
            except NoSuchElementException:
                location_text = None

            # Second <p> sibling: the text of its <a> children is joined into
            # the "phone number" value.
            try:
                phone_paragraph = desc_div.find_element(
                    By.XPATH, "following-sibling::p[2]"
                )
                phone_numbers = [
                    anchor.text
                    for anchor in phone_paragraph.find_elements(By.TAG_NAME, "a")
                ]
                phone_text = ", ".join(phone_numbers) if phone_numbers else None
            except NoSuchElementException:
                phone_text = None

    except Exception as e:
        # Best-effort scraping: one broken page must not abort the whole run.
        print(f"Error scraping company info: {e}")
        return None

    return {
        "name": company_name,
        "description": company_desc,
        "location": location_text,
        "phone number": phone_text,
    }

def scrape_companies(driver, base_url, num_pages):
    """Walk the listing pages and scrape every company profile linked from them.

    Args:
        driver: Selenium WebDriver used for all page loads.
        base_url: URL of the first listing page. Later pages are requested as
            ``f"{base_url}?start={page}"``.
        num_pages: Number of listing pages to visit; values below 1 visit none.

    Returns:
        A list of the dicts returned by ``scrape_company_page``, in the order
        the company links were first encountered. Pages that time out and
        company pages that yield ``None`` are skipped.
    """
    all_companies = []
    url_pattern = re.compile(r'^/me/[^/]+$')
    site_prefix = 'https://insaat.az'
    # Profiles already visited on an earlier listing page; the same link can
    # appear on several pages and must not produce duplicate rows.
    seen_urls = set()

    for page in range(1, num_pages + 1):
        # NOTE(review): assumes the site's `start` parameter is a page number
        # (page 2 -> ?start=2) rather than an item offset — confirm.
        page_url = base_url if page == 1 else f"{base_url}?start={page}"

        driver.get(page_url)

        try:
            WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.TAG_NAME, "a")))
        except TimeoutException:
            print(f"Timeout waiting for company links on {page_url}")
            continue

        # A dict is used as an ordered set so links are de-duplicated while
        # keeping document order (a plain set would make the run order vary).
        page_urls = {}
        for link in driver.find_elements(By.TAG_NAME, "a"):
            try:
                href = link.get_attribute('href')
            except Exception as e:
                print(f"Error processing link: {e}")
                continue
            # Strip the origin only when it is a true prefix of the URL.
            if not href or not href.startswith(site_prefix):
                continue
            if url_pattern.match(href[len(site_prefix):]):
                page_urls[href] = None

        print(f"Found {len(page_urls)} companies on page {page}")

        for url in page_urls:
            if url in seen_urls:
                continue
            seen_urls.add(url)
            print(f"Processing company: {url}")
            company_info = scrape_company_page(driver, url)
            if company_info:
                all_companies.append(company_info)

    return all_companies

def main():
    """Scrape the insaat.az shop listings and store them in an Excel workbook.

    Starts a Chrome driver, scrapes 7 listing pages starting at
    ``https://insaat.az/shops`` and, when at least one company was collected,
    writes the rows to sheet ``Insaat`` of ``./web-scraping.xlsx``. The driver
    is always closed, even if scraping or saving raises.
    """
    import os

    driver = setup_driver()
    base_url = "https://insaat.az/shops"
    num_pages = 7
    try:
        companies = scrape_companies(driver, base_url, num_pages)

        # Process or save the scraped data
        if companies:
            df = pd.DataFrame(companies)
            df.info()
            file_path = './web-scraping.xlsx'
            # Append mode requires an existing workbook and, by default,
            # refuses to overwrite an existing sheet. Create the file on the
            # first run and replace the sheet on later runs so the script is
            # re-runnable. `if_sheet_exists` is only accepted with mode='a'.
            if os.path.exists(file_path):
                writer_options = {"mode": "a", "if_sheet_exists": "replace"}
            else:
                writer_options = {"mode": "w"}
            with pd.ExcelWriter(file_path, engine='openpyxl', **writer_options) as writer:
                df.to_excel(writer, sheet_name='Insaat', index=False)

    finally:
        driver.quit()

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

Found 79 companies on page 1
Processing company: https://insaat.az/me/central-home-18
Processing company: https://insaat.az/me/grant-doors-mmc-70
Processing company: https://insaat.az/me/allceramic-group-199
Processing company: https://insaat.az/me/jaluzler-baku-151
Processing company: https://insaat.az/me/dam-ortukleri-6
Processing company: https://insaat.az/me/champion-sement-129
Processing company: https://insaat.az/me/simal-havalandirma-164
Processing company: https://insaat.az/me/baku-container-12
Processing company: https://insaat.az/me/zaman-plastik-qapi-pencere-satisi-ve-qurasdirlmasi-83
Processing company: https://insaat.az/me/glass-house-158
Processing company: https://insaat.az/me/favorite-ceiling-126
Processing company: https://insaat.az/me/alkopan-polkorbanat-satisi-86
Processing company: https://insaat.az/me/ikma-machinery-197
Processing company: https://insaat.az/me/jaluz-perde-group-mmc-68
Processing company: https://insaat.az/me/tontech-177
Processing company: https://