In [1]:
import re
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
import selenium
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Record the library versions this notebook was run with, one per line.
for version_line in (
    f'{pd.__version__=}',
    f'{selenium.__version__=}',
    f'{re.__version__=}',
):
    print(version_line)
import logging

pd.__version__='2.0.3'
selenium.__version__='4.18.1'
re.__version__='2.2.1'


In [2]:
# Notebook-wide logger: INFO and above go to a log file, WARNING and above
# are additionally echoed to the console.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Re-running this cell would otherwise attach a second pair of handlers to the
# same logger object and every message would be emitted twice (then three
# times, ...). Detach and close whatever is already attached first.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# FileHandler does not create missing parent directories, so make sure the
# target folder exists before opening the log file.
Path('logs').mkdir(parents=True, exist_ok=True)

# mode='w' truncates the log on every (re)run of this cell.
file_handler = logging.FileHandler('logs/scrapping.log', encoding='utf-8', mode='w')
file_handler.setLevel(logging.INFO)

file_handler.setFormatter(formatter)

# Create console handler
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.WARNING)
console_handler.setFormatter(formatter)

# Add both handlers to logger
logger.addHandler(file_handler)
logger.addHandler(console_handler)

In [3]:
from selenium.webdriver.common.by import By

In [4]:
from selenium import webdriver

In [5]:
# Start the Firefox WebDriver session; `driver` is the handle the later cells
# use for every page load and element lookup.
driver = webdriver.Firefox()

In [6]:
# Shared explicit wait bound to the browser session; each `wait.until(...)`
# call gives up after 4 seconds.
wait = WebDriverWait(driver, timeout=4)
# mediamarkt_home = driver.get("https://www.mediamarkt.ch/de/category/_smartphone-680815.html?searchParams=&sort=&view=PRODUCTGRID&page=1")

In [7]:
def _parse_listing_title(info):
    """Split a product-tile title into its structured fields.

    The patterns below assume titles shaped roughly like
    ``BRAND Model - Category (6.1 ", 128 GB, Color)`` -- TODO confirm against
    the live page.

    Parameters
    ----------
    info : str
        Visible text of the tile's ``a.product-link`` element.

    Returns
    -------
    dict
        Keys ``brand``, ``model``, ``category``, ``size``, ``storage`` and
        ``color`` (in that order). ``model``, ``category``, ``size`` and
        ``storage`` are None when they cannot be extracted.
    """
    # Cut at the spaced " - " separator (the same one the category pattern
    # relies on) instead of at the first bare hyphen, so hyphenated brand or
    # model names are not truncated. partition() never raises, unlike tuple
    # unpacking a split() that found no space.
    brand_model = info.split(" - ", maxsplit=1)[0].strip()
    brand, _, model = brand_model.partition(" ")

    category_match = re.search(r' - (\w+)', info)
    size_match = re.search(r'\((\d+\.\d+)\s*"', info)
    storage_match = re.search(r'(\d+)\s+(GB|TB)', info)

    return {
        "brand": brand,
        "model": model.strip() or None,
        "category": category_match.group(1) if category_match else None,
        "size": size_match.group(1) if size_match else None,
        "storage": (storage_match.group(1) + " " + storage_match.group(2)
                    if storage_match else None),
        # Color is whatever follows the last comma, minus the closing parenthesis.
        "color": re.split(r',\s*(?=\w)', info)[-1].rstrip(')'),
    }


def _wait_for_detail(phone, css_selector):
    """Wait for the first element matching ``css_selector`` on the current page.

    Returns the element, or None when the wait fails; in that case the phone's
    URL, the selector and the exception are logged as warnings.
    """
    try:
        return wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, css_selector)))
    except (NoSuchElementException, TimeoutException) as e:
        logger.warning(f" {phone['source'] =}")
        logger.warning(f" {css_selector =}")
        logger.warning(e)
        return None


phone_list = []
i = 1  # number of the listing page currently being scraped (1-based)
while True:
    url = ("https://www.mediamarkt.ch/de/category/_smartphone-680815.html"
           f"?searchParams=&sort=&view=PRODUCTGRID&page={i}")

    try:
        driver.get(url)
    except WebDriverException as e:
        logger.error(f"Error: {e}")
        break

    smartphones = driver.find_elements(By.CSS_SELECTOR,
                                       'ul.products-grid > li')

    # An empty product grid means we ran past the last listing page.
    if not smartphones:
        break

    phone_list_page = []

    for phone in smartphones:
        # A grid tile without the expected links is skipped rather than
        # aborting the whole run.
        try:
            info = phone.find_element(By.CSS_SELECTOR, "a.product-link").text
            phone_url = phone.find_element(By.CSS_SELECTOR, 'a.photo').get_attribute('href')
        except NoSuchElementException:
            logger.warning(f"skipping grid tile without product links on page {i}")
            continue

        price = "".join(span.text for span in phone.find_elements(By.CSS_SELECTOR,
                                                                  'div.price > span'))
        price = price.strip("-")

        phone_features = {
            "page": i,
            **_parse_listing_title(info),
            "price": price,
            "source": phone_url,
            "condition": 'new',
            "date": pd.to_datetime(datetime.today().strftime('%Y-%m-%d')),
        }

        phone_list_page.append(phone_features)

    # Tiles were present but none was a product: stop instead of paging forever.
    if not phone_list_page:
        logger.warning(f"no parsable products on page {i}; stopping")
        break

    # The dicts are shared by reference, so the detail fields added below
    # also show up in phone_list.
    phone_list.extend(phone_list_page)

    ### going inside each phone page
    for phone in phone_list_page:

        try:
            driver.get(phone['source'])
        except WebDriverException as e:
            logger.warning(phone['source'])
            logger.warning(e)
            continue
        logger.info(phone['source'])

        ## catching broken pages
        try:
            driver.find_element(By.CSS_SELECTOR, "body > *")
        except NoSuchElementException:
            logger.warning(phone['source'])
            logger.warning("page broken")
            continue

        time.sleep(2)

        # Article number: taken from the element's `content` attribute, the
        # part after the first ':'.
        css_selector = 'dl.group > dd > span[itemprop]'
        element = _wait_for_detail(phone, css_selector)
        if element is not None:
            content_parts = (element.get_attribute('content') or "").split(':')
            if len(content_parts) > 1:
                article_number = content_parts[1]
                logger.info(f"\t{article_number=}")
                phone['article_number'] = article_number
            else:
                logger.warning(f" {phone['source'] =}")
                logger.warning(f" {css_selector =}")
                logger.warning("article number attribute missing or malformed")

        # Number of reviews
        element = _wait_for_detail(phone, '.bv_numReviews_text')
        if element is not None:
            n_of_reviews = element.text
            logger.info(f" \t{n_of_reviews=}")
            phone['n_of_reviews'] = n_of_reviews

        # Rating: log the text value, not the WebElement repr.
        element = _wait_for_detail(phone, 'div[itemprop="ratingValue"]')
        if element is not None:
            rating = element.text
            logger.info(f" \t{rating=}")
            phone['rating'] = rating

        # Delivery time
        element = _wait_for_detail(phone, 'div.box.infobox.availability > ul > li > p > span')
        if element is not None:
            delivery_time = element.text
            logger.info(f" \t{delivery_time=}")
            phone['delivery_time'] = delivery_time

    # Pause between listing pages.
    time.sleep(2)

    i += 1
   

Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:191:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:509:5
dom.find/</<@chrome://remote/content/shared/DOM.sys.mjs:136:16

Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:191:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:509:5
dom.find/</<@chrome://remote/content/shared/DOM.sys.mjs:136:16

Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:191:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:509:5
dom.find/</<@chrome://remote/content/shared/DOM.sys.mjs:136:16

Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chr

In [8]:
# Scraping is finished: end the WebDriver session held by `driver`.
driver.quit()

### List of all scraped phones

In [9]:
# Show only the first few records; displaying the whole list writes every
# scraped phone into the notebook output.
phone_list[:3]

[{'page': 1,
  'brand': 'APPLE',
  'model': 'iPhone 15 ',
  'category': 'Smartphone',
  'size': '6.1',
  'storage': '128 GB',
  'color': 'Black',
  'price': '716.',
  'source': 'https://www.mediamarkt.ch/de/product/_apple-iphone-15-2203371.html',
  'condition': 'new',
  'date': Timestamp('2024-04-07 00:00:00'),
  'article_number': '2203371',
  'n_of_reviews': '(25)',
  'rating': '4.8',
  'delivery_time': 'Auslieferung in 1-2 Werktagen'},
 {'page': 1,
  'brand': 'APPLE',
  'model': 'iPhone 15 Pro Max ',
  'category': 'Smartphone',
  'size': '6.7',
  'storage': '256 GB',
  'color': 'Black Titanium',
  'price': '1129.',
  'source': 'https://www.mediamarkt.ch/de/product/_apple-iphone-15-pro-max-2203426.html',
  'condition': 'new',
  'date': Timestamp('2024-04-07 00:00:00'),
  'article_number': '2203426',
  'n_of_reviews': '(13)',
  'rating': '4.9',
  'delivery_time': 'Auslieferung in 1-2 Werktagen'},
 {'page': 1,
  'brand': 'APPLE',
  'model': 'iPhone 14 ',
  'category': 'Smartphone',
  's

### Transforming the list into a DataFrame

In [10]:
# One row per scraped phone; columns come from the record keys.
df = pd.DataFrame(data=phone_list)
df.head(n=50)

Unnamed: 0,page,brand,model,category,size,storage,color,price,source,condition,date,article_number,n_of_reviews,rating,delivery_time
0,1,APPLE,iPhone 15,Smartphone,6.1,128 GB,Black,716.0,https://www.mediamarkt.ch/de/product/_apple-ip...,new,2024-04-07,2203371,(25),4.8,Auslieferung in 1-2 Werktagen
1,1,APPLE,iPhone 15 Pro Max,Smartphone,6.7,256 GB,Black Titanium,1129.0,https://www.mediamarkt.ch/de/product/_apple-ip...,new,2024-04-07,2203426,(13),4.9,Auslieferung in 1-2 Werktagen
2,1,APPLE,iPhone 14,Smartphone,6.1,128 GB,Midnight,619.0,https://www.mediamarkt.ch/de/product/_apple-ip...,new,2024-04-07,2151025,(12),4.8,"Bestellbar, Auslieferung in 1-2 Werktagen"
3,1,APPLE,iPhone 15 Pro,Smartphone,6.1,256 GB,Black Titanium,1040.0,https://www.mediamarkt.ch/de/product/_apple-ip...,new,2024-04-07,2203408,(14),4.9,"Bestellbar, Auslieferung in 1-2 Werktagen"
4,1,APPLE,iPhone 15 Pro Max,Smartphone,6.7,256 GB,Natural Titanium,1125.0,https://www.mediamarkt.ch/de/product/_apple-ip...,new,2024-04-07,2203429,(32),4.6,"Bestellbar, Auslieferung in 1-2 Werktagen"
5,1,APPLE,iPhone 15,Smartphone,6.1,128 GB,Blue,728.0,https://www.mediamarkt.ch/de/product/_apple-ip...,new,2024-04-07,2203374,(8),5.0,Auslieferung in 1-2 Werktagen
6,1,APPLE,iPhone 11 (2020),Smartphone,6.1,64 GB,Black,349.0,https://www.mediamarkt.ch/de/product/_apple-ip...,new,2024-04-07,2028715,(67),4.7,Auslieferung in 1-4 Werktagen
7,1,APPLE,iPhone 14,Smartphone,6.1,128 GB,Purple,619.0,https://www.mediamarkt.ch/de/product/_apple-ip...,new,2024-04-07,2151027,(12),4.8,Auslieferung in 1-2 Werktagen
8,1,APPLE,iPhone 15,Smartphone,6.1,256 GB,Black,816.0,https://www.mediamarkt.ch/de/product/_apple-ip...,new,2024-04-07,2203376,(4),5.0,Auslieferung in 1-2 Werktagen
9,1,XIAOMI,Redmi A2,Smartphone,6.52,32 GB,Schwarz,84.95,https://www.mediamarkt.ch/de/product/_xiaomi-r...,new,2024-04-07,2178004,(10),4.1,Auslieferung in 1-2 Werktagen


In [11]:
# Write the scraped table to a CSV file (relative path, so it lands in the
# current working directory); the DataFrame index is not written.
file_name = "scraped_mediamarkt.csv"
df.to_csv(path_or_buf=file_name, index=False)