# Geizhals

In [47]:
# IMPORTS
import time
import random
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

### 1. WebDriver & Cookies + 3. Pagination navigation

In [48]:
# Path to a locally saved HTML copy of the listing page.
# NOTE(review): not referenced by any code visible in this file — presumably
# kept for offline testing; confirm before removing.
local_site = "C:/Users/dinor/Downloads/geizhals.html"


def get_products_from_plp(url, number_of_pages):
    """Scrape product rows from consecutive product-listing pages.

    Opens ``url`` in Chrome, accepts the OneTrust cookie banner, then
    collects the rows produced by ``get_product_data`` for up to
    ``number_of_pages`` pages, clicking the "next" pagination control
    between pages.

    Args:
        url: Address of the first listing page.
        number_of_pages: Maximum number of pages to scrape.

    Returns:
        A list with one entry per product, in page order. Scraping stops
        early (keeping what was collected) if the next page cannot be
        reached.

    Raises:
        Whatever selenium raises while loading the page or waiting for the
        cookie banner; the browser is closed in every case.
    """
    # Chrome only honours a custom user agent when it is passed via the
    # ``--user-agent=`` switch, and only if the options reach the driver.
    options = webdriver.ChromeOptions()
    options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
    driver = webdriver.Chrome(options=options)

    all_products = []

    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
        )

        # Random pauses are used throughout to avoid a fixed request rhythm.
        time.sleep(random.randint(2, 4))
        ActionChains(driver).click(driver.find_element(by=By.ID, value="onetrust-accept-btn-handler")).perform()

        for page in range(1, number_of_pages + 1):
            print(f"Scraping page {page}")
            time.sleep(random.randint(4, 8))
            products_data = get_product_data(driver.page_source)
            all_products.extend(products_data)

            # The last requested page has been scraped; do not load another.
            if page == number_of_pages:
                break

            try:
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, ".pagination__page--next"))
                )
                next_button.click()
            except Exception as e:
                # Best effort: keep the rows gathered so far.
                print(f"Could not navigate to next page: {e}")
                break

    finally:
        driver.quit()

    return all_products

### 2. Product data extraction

In [49]:
def get_product_data(page_content):
    """Parse listing-page HTML into one row per product.

    Args:
        page_content: HTML source of a listing page.

    Returns:
        A list of ``[stars, price, offers, pdp]`` lists, one for every
        ``<article class="listview__item">`` in the page. Each field is
        whatever ``get_element_text`` returns for it (``None`` if absent).
    """
    soup = BeautifulSoup(page_content, "html.parser")

    return [
        [
            # Rating text sits in a <span> inside the rating label.
            get_element_text(article, "stars-rating-label", "span"),
            get_element_text(article, "price"),
            get_element_text(article, "listview__offercount-link"),
            # Link to the product detail page, taken from the href attribute.
            get_element_text(article, "listview__name-link", attribute="href"),
        ]
        for article in soup.find_all("article", class_="listview__item")
    ]

def get_element_text(item, class_name, sub_element=None, attribute=None):
    """Return text or an attribute of the first descendant with a CSS class.

    Args:
        item: Parsed element to search in (anything offering
            ``find(class_=...)``, e.g. a BeautifulSoup tag).
        class_name: CSS class of the element to look up.
        sub_element: Optional tag name; when given, the first such tag
            inside the matched element is used instead.
        attribute: Optional attribute name; when given, its value is
            returned instead of the stripped text.

    Returns:
        The stripped text or the attribute value, or ``None`` when the
        element, the sub-element or the attribute does not exist.
    """
    try:
        element = item.find(class_=class_name)
        if element is not None and sub_element:
            element = element.find(sub_element)
        if element is None:
            return None
        if attribute:
            # .get() yields None for a missing attribute instead of raising.
            return element.get(attribute)
        return element.text.strip()
    except (AttributeError, TypeError) as e:
        # Only reached for unusable input (e.g. ``item`` is None); report the
        # real cause rather than hiding it.
        print(f"Could not find {class_name} element: {e}")
    return None

### 4. Save as CSV

In [50]:
def save_to_csv(data, filename):
    """Append product rows to a CSV file, creating it if necessary.

    Args:
        data: Iterable of 4-item rows in the order Stars, Price, Offers, PDP.
        filename: Path of the CSV file to write.

    Rows already stored in ``filename`` are kept and the new rows are added
    after them; the file is then rewritten in full without an index column.
    """
    df = pd.DataFrame(data, columns=["Stars", "Price", "Offers", "PDP"])
    try:
        existing_df = pd.read_csv(filename)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No file yet, or a zero-byte file: nothing to merge with.
        pass
    else:
        df = pd.concat([existing_df, df], ignore_index=True)
    df.to_csv(filename, index=False)

### 5. Test

In [54]:
# Listing page to start from (query parameters: fs=tv, hloc=at, hloc=de)
# and how many result pages to walk through.
url = "https://geizhals.de/?fs=tv&hloc=at&hloc=de"
number_of_pages = 1
csv_path = "geizhals_products.csv"

# Scrape, persist, then read the file back to show what is stored on disk.
products = get_products_from_plp(url, number_of_pages)
save_to_csv(products, csv_path)
print(pd.read_csv(csv_path))

Scraping page 1
    Stars      Price        Offers                              PDP
0     4.8   € 888,00   74 Angebote       lg-oled-c37la-v126789.html
1     4.5   € 379,00  125 Angebote  samsung-gu-cu7179u-v126378.html
2     4.9  € 1369,00   56 Angebote       lg-oled-g39la-v127135.html
3     4.6   € 859,00   68 Angebote       lg-oled-c31la-v138668.html
4     4.8   € 599,00  100 Angebote        hisense-u7kq-v133024.html
..    ...        ...           ...                              ...
85    4.3   € 929,00   29 Angebote  philips-55oled708-a2963030.html
86    4.5  € 1444,99   63 Angebote     samsung-gq-s95c-v135924.html
87    NaN  € 3299,00   12 Angebote     lg-oled65g49ls-a3164626.html
88    4.8  € 1489,00   12 Angebote     lg-oled65c37la-a2904725.html
89    4.2   € 599,00   25 Angebote     hisense-55u7kq-a2967257.html

[90 rows x 4 columns]
