# Grocery Store Prices Crawler

I originally did this to track beer prices near me. It can be tweaked to fetch the prices from other products aswell.

### To-do list:
- Add sections tracking (e.g. cheese, meats, flour, etc) OR NOT
- Add notifications in Hyprland (notify-send)

#### Functions

In [1]:
def procura(items:list, produto):
    """
    Args: items: list, lista de produtos a se verificar
          produtos: DataFrame, banco de dados que contem os produtos que foram obtidos e terão os preços conferidos

    Returns: NA
    """

    for i in items:
        if 'milka' in i.lower():
            searchfor = 1
        elif 'guinness' in i.lower():
            searchfor = 2
    
    
    if searchfor == 1:
        try:
            encontrado = 0
            for item in items:
                index = 0
                for stock in produto.Chocolate:
                    if item in stock.lower():
                        print(f'Em {produto['Data'][index]}, no {produto.Local[index]}: {stock}, {produto.Embalagem[index]}, por {produto['Custo de compra'][index]}€, preço por kg {produto['Preço por kg'][index]}€.')
                        encontrado = 1
                    index += 1

            if encontrado == 0:
                print('Não encontrei nenhum dos chocolates.')
        except:
            print('Algo deu errado. Algum erro não previsto na verificação dos chocolates.')


    elif searchfor == 2:
        try:
            encontrado = 0
            for item in items:
                index = 0
                for stock in produto.Cerveja:
                    if item in stock.lower():
                        print(f'Em {produto['Data'][index]}, no {produto.Local[index]}: {stock}, {produto.Embalagem[index]}, por {produto['Custo de compra'][index]}€, preço por litro {produto['Preço por litro'][index]}€.')
                        encontrado = 1
                    index += 1

            if encontrado == 0:
                print('Não encontrei nenhuma das cervejas.')
        except:
            print('Algo deu errado. Algum erro não previsto na verificação das cervejas.')


    else:
        print('Não sei o que fazer. Não encontrei nem Milka nem Guinness nas tuas listas. Te vira.')
    


def busca_continente():
    """
    Args: none

    Returns: dois DataFrames, o primeiro contendo a relação das cervejas pesquisadas no site e o segundo
             contendo os chocolates
    """
    # Imports
    import pandas as pd
    from bs4 import BeautifulSoup
    from datetime import datetime
    from selenium import webdriver
    from time import sleep
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC

    # Setting supermarket URLs
    urls_cont = ['https://www.continente.pt/bebidas-e-garrafeira/cervejas-e-sidras/cerveja-estrangeira-e-artesanal/',
                'https://www.continente.pt/bebidas-e-garrafeira/cervejas-e-sidras/cerveja-tradicional/']

    choc_cont = ['https://www.continente.pt/mercearia/chocolate-gomas-e-rebucados/chocolates/?start=0&srule=COL-Continente&pmin=0.01']

    # Creates empty DataFrames in order to properly receive the data
    print('Criando DataFrames and setting some configs...')
    cevas = pd.DataFrame()
    chocolates = pd.DataFrame()
    
    # Setting some configs
    mercado = 'Continente'
    data = datetime.now().strftime('%d-%m-%Y %H:%M')
    options = webdriver.FirefoxOptions()
    options.add_argument("--headless")

    # Get the response from the given URLs for beers in Continente.pt
    print('Gathering beer information, just a second please.')
    for url in urls_cont:
    # Using Selenium to fetch page html and deal with lazyloading
        try:
            # Driver accesses URL(s)
            driver = webdriver.Firefox(options=options)
            driver.get(url)

            # Get rid of cookies popup (or whatever the name is pop-up, poupup...)
            driver.find_element(By.ID, 'CybotCookiebotDialogBodyLevelButtonCustomize').click()
            sleep(1) # these sleep are just for it isn't given away that is a crawler 'clicking'
            driver.find_element(By.ID, 'CybotCookiebotDialogBodyButtonDecline').click()
            sleep(1)

            # Loads all the lazyloading
            while True:
                try:
                    WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, "//button[@class='button button--secondary col-button col-button--secondary-dark search-view-more-products-button js-show-more-products' and contains(., 'Ver mais produtos')]"))).click()
                except Exception as e:
                    break
            
            # With the lazyloading already loaded, registers the html code
            html = driver.page_source

            # And closes the headless browser
            driver.quit()

        except Exception as e:
            print(f'Algo ocorre: {e}')
            driver.quit()
        


        soup = BeautifulSoup(html, 'html.parser')

        # With the response, fetch all the available beers names, prices per liter, cost of purchase and packaging
        beers = soup.select('h2')
        prices_pl = soup.select('div[class="pwc-tile--price-secondary"]')
        totals = soup.select('span[class="pwc-tile--price-primary"]')
        embalagem = soup.select('p[class="pwc-tile--quantity"]')

        # Creating empty lists to hold site beer names, prices (liter and absolute) and packaging
        nomes = []
        precos_litro = []
        p_total = []
        pack = []

        # Filtering the name of the website product
        try:
            for i in beers:
                nomes.append("Cerveja "+i.get_text().split('com Álcool')[1].lstrip().rstrip())
        except:
            pass

        # Filtering the prices per liter for each product
        for price in prices_pl:
            precos_litro.append(price.get_text().split('€')[0].replace(',', '.').lstrip().rstrip())

        # Filtering the cost to buy each product
        for price in totals:
            p_total.append(price.get_text().split('€')[0].replace(',', '.').lstrip().rstrip())

        # Filtering the product packaging type
        for item in embalagem:
            pack.append(item.get_text().lstrip().rstrip()) 

        ### Creating a DataFrame with all this information
        # Setting column names
        colunas = ['Data', 'Local', 'Cerveja', 'Embalagem', 'Preço por litro', 'Custo de compra']

        # Date and time as of now
        data = datetime.now().strftime('%d-%m-%Y %H:%M')

        # Joining most lists as a DataFrame
        cervejas = pd.DataFrame([nomes, pack, precos_litro, p_total]).T

        # Concatenating the date and time (just on the first entry)
        cervejas = pd.concat([pd.Series(data), pd.Series(mercado), cervejas], ignore_index=True, axis=1)

        # Renaming the columns
        cervejas.columns = colunas

        # Converting prices to float (they're objects until now)
        cervejas['Preço por litro'] = cervejas['Preço por litro'].astype('float')
        cervejas['Custo de compra'] = cervejas['Custo de compra'].astype('float')

        # Filling the date and supermarket columns
        cervejas['Data'] = pd.to_datetime(data)
        cervejas['Local'] = mercado

        # Concat?
        cevas = pd.concat([cevas, cervejas], axis=0, ignore_index=True)
        cevas.dropna(inplace=True)
        cevas.drop_duplicates(inplace=True)

    print('Beers done. Fetching chocolates information, please wait a little bit more.')
    for url in choc_cont:
        # Using Selenium to fetch page html and deal with lazyloading
        try:
            driver = webdriver.Firefox(options=options)

            # Driver accesses URL(s)            
            driver.get(url)

            # Get rid of cookies popup (or whatever the name is pop-up, poupup...)
            driver.find_element(By.ID, 'CybotCookiebotDialogBodyLevelButtonCustomize').click()
            sleep(1) # these sleep are just for it isn't given away that is a crawler 'clicking'
            driver.find_element(By.ID, 'CybotCookiebotDialogBodyButtonDecline').click()
            sleep(1)

            # Loads all the lazyloading
            while True:
                try:
                    WebDriverWait(driver, 3).until(EC.element_to_be_clickable((By.XPATH, "//button[@class='button button--secondary col-button col-button--secondary-dark search-view-more-products-button js-show-more-products' and contains(., 'Ver mais produtos')]"))).click()
                except Exception as e:
                    break
            
            # With the lazyloading already loaded, registers the html code
            html = driver.page_source

            # And closes the headless browser
            driver.quit()
            
        except Exception as e:
            print(f'Algo ocorre: {e}')
            driver.quit()

        soup = BeautifulSoup(html, 'html.parser')

        # With the response, get all the available beers names, prices per liter, cost of purchase and packaging
        chocolate = soup.select('h2')
        prices_pl = soup.select('div[class="pwc-tile--price-secondary"]')
        totals = soup.select('span[class="pwc-tile--price-primary"]')
        embalagem = soup.select('p[class="pwc-tile--quantity"]')

        # Creating empty lists to hold site chocolate names, prices (kg and absolute) and packaging
        nomes = []
        precos_kg = []
        p_total = []
        pack = []

        # Filtering the name of the website product
        for i in chocolate:
            nomes.append(i.get_text().lstrip().rstrip())

        # Filtering the prices per kg for each product
        for price in prices_pl:
            precos_kg.append(price.get_text().split('€')[0].replace(',', '.').lstrip().rstrip())

        # Filtering the cost to buy each product
        for price in totals:
            p_total.append(price.get_text().split('€')[0].replace(',', '.').lstrip().rstrip())

        # Filtering the product packaging type
        for item in embalagem:
            pack.append(item.get_text().lstrip().rstrip())

        ### Creating a DataFrame with all this information
        # Setting column names (the date is already stated before)
        colunas = ['Data', 'Local', 'Chocolate', 'Embalagem', 'Preço por kg', 'Custo de compra']

        # Joining most lists as a DataFrame
        chocs = pd.DataFrame([nomes, pack, precos_kg, p_total]).T

        # Concatenating the date and time (just on the first entry)
        chocs = pd.concat([pd.Series(data), pd.Series(mercado), chocs], ignore_index=True, axis=1)

        # Renaming the columns
        chocs.columns = colunas

        # Cleaning some '' information (NaN value not NaN)
        mask = chocs['Preço por kg'] == ''
        idx = chocs.loc[mask].index.values
        chocs.drop(idx, inplace=True)

        mask = chocs['Custo de compra'] == ''
        idx = chocs.loc[mask].index.values
        chocs.drop(idx, inplace=True)

        # Converting prices to float (they're objects until now)
        chocs['Preço por kg'] = chocs['Preço por kg'].astype('float')
        chocs['Custo de compra'] = chocs['Custo de compra'].astype('float')

        # Filling the date and supermarket columns
        chocs['Data'] = pd.to_datetime(data)
        chocs['Local'] = mercado

        # Concat everything, cleaning duplicates, reseting index
        chocolates = pd.concat([chocolates, chocs], axis=0, ignore_index=True)
        chocolates.dropna(inplace=True)
        chocolates.drop_duplicates(inplace=True)
        chocolates.reset_index(drop=True, inplace=True)
    
    print('Done.')

    return cevas, chocolates

The code

In [2]:
# Items that I want to track
my_beers = ['coruja india pale ale', 'guinness', 'franziskaner', 'estrella', 'erdinger', 'benediktiner']
my_choc = ['milka', 'biglicious', 'biiig'] # biglicious e biiig são as barras de 300g do Continente/Pingo Doce

#cervejas_c, chocolates_c = busca_continente()

In [3]:
# Imports
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from time import sleep
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Setting supermarket URLs
urls_pingo = ['https://www.pingodoce.pt/on/demandware.store/Sites-pingo-doce-Site/default/Search-Show?cgid=ec_cervejasestrangeiras_1500_200',
            'https://www.pingodoce.pt/on/demandware.store/Sites-pingo-doce-Site/default/Search-Show?cgid=ec_cervejasnacionais_1500_100']

choc_pingo = ['https://www.pingodoce.pt/on/demandware.store/Sites-pingo-doce-Site/default/Search-Show?cgid=ec_tabletes_1200_300_100']

# Creates empty DataFrames in order to properly receive the data
print('Criando DataFrames and setting some configs...')
cevas = pd.DataFrame()
chocolates = pd.DataFrame()

# Setting some configs
mercado = 'Pingo Doce'
data = datetime.now().strftime('%d-%m-%Y %H:%M')
options = webdriver.FirefoxOptions()
options.add_argument("--headless")

Criando DataFrames and setting some configs...


In [4]:
# Get the response from the given URLs for beers in pingodoce.pt
print('Gathering beer information, just a second please.')
for url in urls_pingo:
# Using Selenium to fetch page html and deal with lazyloading
    try:
        # Driver accesses URL(s)
        driver = webdriver.Firefox(options=options)
        driver.get(url)
        sleep(3)

        # Get rid of cookies popup (or whatever the name is pop-up, poupup...)
        WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".onetrust-close-btn-ui"))).click()
        sleep(3) 

        # Loads all the lazyloading
        try:
            WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.col-sm-4"))).click()
            sleep(2)

            stopScrolling = 0
            while True:
                stopScrolling += 1
                driver.execute_script("window.scrollBy(0,1000)")
                sleep(0.5)
                if stopScrolling > 12:
                    break


        except Exception as e:
            print(f'Erro interno: {e}')
            break
   
        
        # With the lazyloading already loaded, registers the html code
        html = driver.page_source

        # And closes the headless browser
        driver.quit()

    except Exception as e:
        print(f'Algo ocorre: {e}')
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')

    # With the response, fetch all the available beers names, prices per liter, cost of purchase and packing
    marca = soup.select('div[class="product-brand-name"]')
    beers = soup.select('div[class="product-name-link"]')
    prices_pl = soup.select('div[class="product-unit"]')
    totals = soup.select('div[class="product-price"]')


    # Creating empty lists to hold site beer names, prices (liter and absolute) and packaging
    nomes_orig = []
    marcas = []
    precos_litro = []
    p_total = []
    pack = []

    # Filtering the name of the website product
    try:
        for i in beers:
            nomes_orig.append("Cerveja "+i.get_text().split('com Álcool')[1].lstrip().rstrip())
    except:
        pass

    # Filtering the name of the product brand
    for i in marca:
        marcas.append(i.get_text().lstrip().rstrip())

    # Filtering the prices per liter for each product
    for price in prices_pl:
        precos_litro.append(price.get_text().split('|')[1].lstrip().rstrip().split(' ')[0].replace(',', '.'))

    # Filtering the cost to buy each product
    for price in totals:
        p_total.append(price.get_text().split('€')[0].replace(',', '.').lstrip().rstrip())

    # Filtering the product packaging type
    for item in prices_pl:
        pack.append(item.get_text().split('|')[0].lstrip().rstrip())

    # Using Pandas to create a new list for the names + brand of each item
    temp = pd.DataFrame([nomes_orig, marcas]).T
    temp['Unidos'] = temp[0] + str(' ') + temp[1]
    nomes = temp.Unidos.to_list()

    ### Creating a DataFrame with all this information
    # Setting column names
    colunas = ['Data', 'Local', 'Cerveja', 'Embalagem', 'Preço por litro', 'Custo de compra']

    # Date and time as of now
    data = datetime.now().strftime('%d-%m-%Y %H:%M')

    # Joining most lists as a DataFrame
    cervejas = pd.DataFrame([nomes, pack, precos_litro, p_total]).T

    # Concatenating the date and time (just on the first entry)
    cervejas = pd.concat([pd.Series(data), pd.Series(mercado), cervejas], ignore_index=True, axis=1)

    # Renaming the columns
    cervejas.columns = colunas

    # Converting prices to float (they're objects until now)
    cervejas['Preço por litro'] = cervejas['Preço por litro'].astype('float')
    cervejas['Custo de compra'] = cervejas['Custo de compra'].astype('float')

    # Filling the date and supermarket columns
    cervejas['Data'] = pd.to_datetime(data)
    cervejas['Local'] = mercado

    # Concat
    cevas = pd.concat([cevas, cervejas], axis=0, ignore_index=True)
    cevas.dropna(inplace=True)
    cevas.drop_duplicates(inplace=True)

print('Beers done. Fetching chocolates information, please wait a little bit more.')

for url in choc_pingo:
    # Using Selenium to fetch page html and deal with lazyloading
    try:
        # Driver accesses URL(s)
        driver = webdriver.Firefox(options=options)
        driver.get(url)
        sleep(3)

        # Get rid of cookies popup (or whatever the name is pop-up, poupup...)
        WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".onetrust-close-btn-ui"))).click()
        sleep(3) 

        # Loads all the lazyloading
        try:
            WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.col-sm-4"))).click()
            sleep(2)
            #driver.find_element(By.ID, 'header-search-bar').send_keys(Keys.END)
            #sleep(2)
            # scrolling down slowly
            stopScrolling = 0
            while True:
                stopScrolling += 1
                driver.execute_script("window.scrollBy(0,1000)")
                sleep(0.5)
                if stopScrolling > 12:
                    break


        except Exception as e:
            print(f'Erro interno: {e}')
            break
   
        
        # With the lazyloading already loaded, registers the html code
        html = driver.page_source

        # And closes the headless browser
        driver.quit()

    except Exception as e:
        print(f'Algo ocorre: {e}')
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')

    # With the response, get all the available beers names, prices per liter, cost of purchase and packaging
    marca = soup.select('div[class="product-brand-name"]')
    chocolate = soup.select('div[class="product-name-link"]')
    prices_kg = soup.select('div[class="product-unit"]')
    totals = soup.select('div[class="product-price"]')

    # Creating empty lists to hold site chocolate names, prices (kg and absolute) and packaging
    nomes_orig = []
    marcas = []
    precos_kg = []
    p_total = []
    pack = []

    # Filtering the name of the website product
    for i in chocolate:
        nomes_orig.append(i.get_text().lstrip().rstrip())

    # Filtering the name of the product brand
    for i in marca:
        marcas.append(i.get_text().lstrip().rstrip())

    # Filtering the prices per kg for each product
    for price in prices_kg:
        precos_kg.append(price.get_text().split('|')[1].lstrip().rstrip().split(' ')[0].replace(',', '.'))

    # Filtering the cost to buy each product
    for price in totals:
        p_total.append(price.get_text().split('€')[0].replace(',', '.').lstrip().rstrip())

    # Filtering the product packaging type
    for item in prices_kg:
        pack.append(item.get_text().split('|')[0].lstrip().rstrip())

    # Using Pandas to create a new list for the names + brand of each item
    temp = pd.DataFrame([nomes_orig, marcas]).T
    temp['Unidos'] = temp[0] + str(' ') + temp[1]
    nomes = temp.Unidos.to_list()

    ### Creating a DataFrame with all this information
    # Setting column names (the date is already stated before)
    colunas = ['Data', 'Local', 'Chocolate', 'Embalagem', 'Preço por kg', 'Custo de compra']

    # Joining most lists as a DataFrame
    chocs = pd.DataFrame([nomes, pack, precos_kg, p_total]).T

    # Concatenating the date and time (just on the first entry)
    chocs = pd.concat([pd.Series(data), pd.Series(mercado), chocs], ignore_index=True, axis=1)

    # Renaming the columns
    chocs.columns = colunas

    # Cleaning some '' information (NaN value not NaN)
    mask = chocs['Preço por kg'] == ''
    idx = chocs.loc[mask].index.values
    chocs.drop(idx, inplace=True)

    mask = chocs['Custo de compra'] == ''
    idx = chocs.loc[mask].index.values
    chocs.drop(idx, inplace=True)

    # Converting prices to float (they're objects until now)
    chocs['Preço por kg'] = chocs['Preço por kg'].astype('float')
    chocs['Custo de compra'] = chocs['Custo de compra'].astype('float')

    # Filling the date and supermarket columns
    chocs['Data'] = pd.to_datetime(data)
    chocs['Local'] = mercado

    # Concat everything, cleaning duplicates, reseting index
    chocolates = pd.concat([chocolates, chocs], axis=0, ignore_index=True)
    chocolates.dropna(inplace=True)
    chocolates.drop_duplicates(inplace=True)
    chocolates.reset_index(drop=True, inplace=True)

print('Done.')

Gathering beer information, just a second please.
Beers done. Fetching chocolates information, please wait a little bit more.
Done.


In [6]:
chocolates

Unnamed: 0,Data,Local,Chocolate,Embalagem,Preço por kg,Custo de compra
0,2026-09-02 15:37:00,Pingo Doce,8 Barritas de Chocolate de Leite Kinder Kinder,0.1 Kg,17.90,1.79
1,2026-09-02 15:37:00,Pingo Doce,Tablete de Chocolate de Leite com Avelãs Intei...,0.2 Kg,14.95,2.99
2,2026-09-02 15:37:00,Pingo Doce,Tablete de Chocolate de Leite Extrafino Nestlé,0.25 Kg,21.56,5.39
3,2026-09-02 15:37:00,Pingo Doce,Tablete de Chocolate Excellence 85% Cacau Lindt,0.1 Kg,37.40,3.74
4,2026-09-02 15:37:00,Pingo Doce,Tablete de Chocolate Negro 70% Excellence Lindt,0.1 Kg,33.40,3.34
...,...,...,...,...,...,...
79,2026-09-02 15:37:00,Pingo Doce,Tablete de Chocolate Preto 72% Cacau Pingo Doce,0.2 Kg,14.95,2.99
80,2026-09-02 15:37:00,Pingo Doce,Tablete Kitkat Avelã Kit Kat,0.099 Kg,20.10,1.99
81,2026-09-02 15:37:00,Pingo Doce,Tablete Kitkat Caramelo Salgado Kit Kat,0.099 Kg,20.10,1.99
82,2026-09-02 15:37:00,Pingo Doce,Tablete Kitkat Double Chocolate Kit Kat,0.099 Kg,20.10,1.99


In [4]:
procura(my_choc, chocolates_c)

Em 2026-09-02 11:59:00, no Continente: Tablete de Chocolate de Leite Milka, emb. 250 gr, por 5.99€, preço por kg 23.96€.
Em 2026-09-02 11:59:00, no Continente: Tablete de Chocolate de Leite com Avelãs Inteiras Milka, emb. 250 gr, por 5.99€, preço por kg 23.96€.
Em 2026-09-02 11:59:00, no Continente: Tablete de Chocolate com Cheesecake de Morango Milka, emb. 300 gr, por 5.99€, preço por kg 19.97€.
Em 2026-09-02 11:59:00, no Continente: Tablete de Chocolate de Leite com Bolacha Milka, emb. 300 gr, por 3.99€, preço por kg 13.3€.
Em 2026-09-02 11:59:00, no Continente: Tablete de Chocolate com Amendoim e Caramelo Milka, emb. 276 gr, por 5.99€, preço por kg 21.7€.
Em 2026-09-02 11:59:00, no Continente: Tablete de Chocolate Bubbly Caramelo Milka, emb. 250 gr, por 3.99€, preço por kg 15.96€.
Em 2026-09-02 11:59:00, no Continente: Tablete de Chocolate com Drageias Milka, emb. 250 gr, por 3.99€, preço por kg 15.96€.
Em 2026-09-02 11:59:00, no Continente: Tablete de Chocolate com Amêndoa e Carame