In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import requests
import pandas as pd

def extract_car_brands(url):
    
    driver = webdriver.Chrome()

    try:
        driver.maximize_window()
        driver.get(url)
        WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.ID, 'onetrust-button-group-parent')))
        accept_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
        )
        accept_button.click()

        open_brands_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'span.ooa-1l6s5c6 button[data-testid="arrow"]'))
        )
        open_brands_button.click()

        WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'ul.ooa-dljx5f')))

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        elements = soup.find_all('p', class_='ooa-6y8xco er34gjf0')

        car_brands = []

        for element in elements:
            text = element.get_text().strip()
            numbers = re.findall(r'\d+', text)
            if numbers and int(numbers[0]) > 1:
                text_without_number = re.sub(r'\(\d+\)', '', text)
                text_without_number = text_without_number.strip()
                text_formatted = text_without_number.lower().replace(' ', '-').replace('ë', 'e')
                car_brands.append(text_formatted)

        return car_brands

    finally:
        driver.quit()


In [2]:
from retrying import retry
import re
import os

data_directory = os.path.join(os.getcwd(), 'data')


@retry(wait_fixed=5000)
def parse_car_links(url):
    df = pd.DataFrame(columns=['Offer Title', 'Price', 'Currency', 'Offered By', 'Brand', 'Model', 'Version', 'Year', 
                           'Mileage', 'Engine Capacity', 'Fuel Type', 'Power', 'Body Type', 'Transmission', 
                           'Number of Doors', 'Number of Seats', 'Color', 'Condition',
                           'Registration Number', 'Drive', 'City Fuel Consumption',
                           'Registered in Poland', 'Accident Free', 'Tuning', 'Car Link'])


    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        span_values = soup.select('a.ooa-g4wbjr.e1y5xfcl0 span')
        if span_values:
            last_span_value = span_values[-1].text.strip()
            pages = min(int(last_span_value), 500)
            print(f"{url} have {pages}")
        else:
            print("No span elements found")

        for page_num in range(1, pages + 1):
            print(f"Parsing page {page_num}/{pages}")
            page_url = f"{url}?page={page_num}"
            response = requests.get(page_url)
            page_soup = BeautifulSoup(response.text, 'html.parser')
            articles = page_soup.find_all('article', class_='ooa-yca59n')
            for article in articles:
                car_link = article.find('a', href=True)['href']
                response = requests.get(car_link)
                car_page_soup = BeautifulSoup(response.text, 'html.parser')
                         
                offer_title = car_page_soup.find('h3', class_='offer-title').text.strip() if car_page_soup.find('h3', class_='offer-title') else None

                price = car_page_soup.find('h3', class_='offer-price__number').text.replace(' ', '').strip() if car_page_soup.find('h3', class_='offer-price__number') else None

                currency = car_page_soup.find('p', class_='offer-price__currency').text.strip() if car_page_soup.find('p', class_='offer-price__currency') else None

                details_items = car_page_soup.find_all('div', class_='ooa-162vy3d e18eslyg3')

                car_data_map = {}

                for item in details_items:
                    key_element = item.find('p', class_='e18eslyg4 ooa-12b2ph5')
                    if key_element:
                        key = key_element.text.strip()
                        value_element = item.find('p', class_='e16lfxpc0 ooa-1pe3502 er34gjf0')
                        if not value_element:
                            value_element = item.find('a', class_='e16lfxpc1 ooa-1ftbcn2')
                        if value_element:
                            value = value_element.text.strip()
                            car_data_map[key] = value

                offered_by = car_data_map.get("Oferta od", None)
                brand = car_data_map.get("Marka pojazdu", None)
                model = car_data_map.get("Model pojazdu", None)
                version = car_data_map.get("Wersja", None)
                year = car_data_map.get("Rok produkcji", None)
                mileage = car_data_map.get("Przebieg", None)
                engine_capacity = car_data_map.get("Pojemność skokowa", None)
                fuel_type = car_data_map.get("Rodzaj paliwa", None)
                power = car_data_map.get("Moc", None)
                body_type = car_data_map.get("Typ nadwozia", None)
                transmission = car_data_map.get("Skrzynia biegów", None)
                num_doors = car_data_map.get("Liczba drzwi", None)
                num_seats = car_data_map.get("Liczba miejsc", None)
                color = car_data_map.get("Kolor", None)
                condition = car_data_map.get("Stan", None)
                registration_number = car_data_map.get("Ma numer rejestracyjny", None)
                drive = car_data_map.get("Napęd", None)
                city_fuel_consumption = car_data_map.get("Spalanie W Mieście", None)
                registered_in_poland = car_data_map.get("Zarejestrowany w Polsce", None)
                accident_free = car_data_map.get("Bezwypadkowy", None)
                tuning = car_data_map.get("Tuning", None)

                car_data = {
                    'Offer Title': offer_title,
                    'Price': price,
                    'Currency': currency,
                    'Offered By': offered_by,
                    'Brand': brand,
                    'Model': model,
                    'Version': version,
                    'Year': year,
                    'Mileage': mileage,
                    'Engine Capacity': engine_capacity,
                    'Fuel Type': fuel_type,
                    'Power': power,
                    'Body Type': body_type,
                    'Transmission': transmission,
                    'Number of Doors': num_doors,
                    'Number of Seats': num_seats,
                    'Color': color,
                    'Condition': condition,
                    'Car Link': car_link,
                    'Registration Number': registration_number,
                    'Drive': drive,
                    'City Fuel Consumption': city_fuel_consumption,
                    'Registered in Poland': registered_in_poland,
                    'Accident Free': accident_free,
                    'Tuning': tuning
                }

                
                df = pd.concat([df, pd.DataFrame([car_data])], ignore_index=True)

        return df

    except Exception as e:
        print(f"An error occurred: {e}")
        return []

URL = "https://www.otomoto.pl/osobowe"

def parse_links_for_brand(brand):
    df = parse_car_links(URL + "/" + brand)
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)
    file_path = os.path.join(data_directory, brand + ".csv")
    df.to_csv(file_path, index=False)

import concurrent.futures

with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(parse_links_for_brand, extract_car_brands(URL))


No span elements found
An error occurred: local variable 'pages' referenced before assignment
No span elements found
An error occurred: local variable 'pages' referenced before assignment
https://www.otomoto.pl/osobowe/audi have 500
Parsing page 1/500
https://www.otomoto.pl/osobowe/buick have 2
Parsing page 1/2
No span elements found
An error occurred: local variable 'pages' referenced before assignment
No span elements found
An error occurred: local variable 'pages' referenced before assignment
https://www.otomoto.pl/osobowe/aston-martin have 3
Parsing page 1/3
https://www.otomoto.pl/osobowe/chevrolet have 51
Parsing page 1/51
No span elements found
An error occurred: local variable 'pages' referenced before assignment
No span elements found
An error occurred: local variable 'pages' referenced before assignment
https://www.otomoto.pl/osobowe/cadillac have 5
Parsing page 1/5
https://www.otomoto.pl/osobowe/chrysler have 25
Parsing page 1/25
https://www.otomoto.pl/osobowe/bmw have 500
Pa