In [22]:
!pip install beautifulsoup4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [23]:
!pip install requests

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
from bs4 import BeautifulSoup as bs
import requests

import time
import json
from requests.models import Response

import random
import datetime

import pandas as pd

from google.colab import drive


In [25]:
def _split_price(price_text):
  """Split a raw price label such as '€1,99' into (currency_symbol, amount).

  Newlines and surrounding whitespace are dropped first, so a padded label
  does not yield a blank currency. The decimal comma becomes a dot.
  """
  cleaned = price_text.replace('\n', '').strip()
  return cleaned[0], cleaned.replace(',', '.')[1:]


def _clean_unit(unit_text):
  """Normalize a raw unit label such as '/kg' into 'KG'."""
  return unit_text.replace('\n', '').replace('/', '').strip().upper()


def extract_products(soup_response, main_category):
  """Build one dict per product tile found in a parsed listing page.

  Args:
    soup_response: parsed page; must support the BeautifulSoup-style
      `find` / `find_all` calls used below.
    main_category: value stored under each product's 'main_category' key.

  Returns:
    The list of dicts decoded from every `data-product-tile-impression`
    attribute, each extended in place with 'name', 'amount', 'main_category',
    'full_category' (the former 'category' key), 'time_scraped', 'currency',
    'price_p_unit', 'unit' and 'quantity'.

  Raises:
    IndexError / KeyError / AttributeError when the page does not have the
    expected structure (e.g. fewer names or quantities than products, a
    product without 'category', or a product whose tile cannot be found).
  """
  secondary_price_class = "pwc-tile--price-secondary col-tile--price-secondary"

  # The list of products in the page
  products = [json.loads(item["data-product-tile-impression"])
              for item in soup_response.find_all('div')
              if "data-product-tile-impression" in item.attrs]
  # The list of quantity of each product
  products_quantity = [item.contents[0].replace('Quant. Mínima =', '').replace('Quant. Mínima=', '').replace('emb.', '').strip() if item.contents else 'none quantity' for item in soup_response.find_all("p",  class_=['pwc-tile--quantity col-tile--quantity', 'quantity-product-set quantity-product-set-grid'])]
  # The list of name of each product
  products_name = [json.loads(item.attrs['data-confirmation-image'])['title'] for item in soup_response.find_all("div", attrs= {'data-confirmation-image':True, 'class':'ct-image-container'})]

  # The list of secondary prices of each product
  price_secondary = soup_response.find_all("div", class_=secondary_price_class)

  # The list of prices_per_unit of each product (flattened across all secondary-price blocks)
  products_prices_p_unit = [span
                            for item in price_secondary
                            for span in item.find_all("span", class_="ct-price-value")]

  # The list of unities of each product (flattened across all secondary-price blocks)
  products_unities = [span
                      for item in price_secondary
                      for span in item.find_all("span", class_="pwc-m-unit")]

  # Number of products seen so far that had no secondary price; used to map a
  # product index onto the (shorter) secondary-price lists.
  decrement_count = 0

  # Log of extraction 
  print('Names =', len(products_name), '// Quantity =', len(products_quantity),
        '// Price-p-Unit =', len(products_prices_p_unit), '// Units =', len(products_unities),
        '// Products =',  len(products))

  # Concatenate all product-related information into the same object
  # Most important part of extraction.
  # NOTE: keys are inserted in a fixed order because that order becomes the
  # column order of any DataFrame built from these dicts.
  for index, product in enumerate(products):

    product['name'] = products_name[index]

    quantity_label = products_quantity[index].upper()
    quantity_parts = quantity_label.split(" ")
    product['amount'] = "{} {}".format(quantity_parts[0], quantity_parts[1] if len(quantity_parts) > 1 else '')

    product['main_category'] = main_category

    product['full_category'] = product.pop('category')

    product['time_scraped'] = datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")

    # Recalculation index based on products with no secondary price
    relative_index = index - decrement_count

    find_product = soup_response.find("div", {'data-pid': product['id']})
    has_price_p_unit = find_product.find("div", class_=secondary_price_class)

    # Check if the product has the secondary price-related block
    if len(products_prices_p_unit) != len(products) and not has_price_p_unit:
      decrement_count += 1

      # Fall back to the primary price-related information of this tile
      price = find_product.find("div", class_="prices-wrapper")
      price_text = price.find('span', class_="ct-price-formatted").contents[0]
      unit_text = price.find('span', class_="pwc-m-unit").contents[0]

    else:
      price_text = products_prices_p_unit[relative_index].contents[0]
      unit_text = products_unities[relative_index].contents[0]

    currency, price_p_unit = _split_price(price_text)
    product['currency'] = currency
    product['price_p_unit'] = price_p_unit
    product['unit'] = _clean_unit(unit_text)

    # Concat the information to pattern decided by group
    price_p_unit_concat = "{} {}/{}".format(product['price_p_unit'], product['currency'], product['unit'])
    product['quantity'] =  "{} | {}".format(quantity_label, price_p_unit_concat)

  return products

In [26]:
def extract_data(category_path): 
  """Scrape every listing page of one category and return all its products.

  Pages are requested from 'https://www.continente.pt/<category_path>' with
  the 'start' query parameter set to the number of products collected so far.
  Paging stops when:
    * the response has no products-counter element (not a products page),
    * the collected count reaches the total announced by the counter, or
    * a page yields no products (otherwise 'start' would never advance and
      the same page would be requested forever).

  Args:
    category_path: path appended to the site root; also passed to
      extract_products() as the main category.

  Returns:
    List of product dicts accumulated from extract_products().
  """
  all_scraped_values = []
  page_size = 24
  qtd_total_products = 0
  url = 'https://www.continente.pt/' + category_path
  params = {'pmin': '0.01', 'start': '0', 'sz': page_size}
  counter_class = "search-results-products-counter d-flex justify-content-center"

  while True:

    params['start'] = len(all_scraped_values)
    response = requests.get(url, params=params)
    soup = bs(response.content)

    # Log of URL and params request
    print(url, params)

    # Check if this soup response is a products page or not.
    counter = soup.find("div", counter_class)
    if not counter:
      break

    # The counter text is split on spaces and the third token is read as the total.
    products_splited = counter.contents[0].split(" ")
    qtd_total_products = int(products_splited[2])

    page_products = extract_products(soup, category_path)
    all_scraped_values += page_products

    # Guard the percentage against a category that announces zero products.
    if qtd_total_products:
      extracted_pct = (len(all_scraped_values) / qtd_total_products) * 100
    else:
      extracted_pct = 0.0

    # Log extration
    print( 'Extracted =', len(all_scraped_values),  '// All Products=', qtd_total_products, '// Extract % =', extracted_pct)

    # Stop once everything announced has been collected (>=, not >: equality
    # means there is nothing left to request) or when the page made no progress.
    if not page_products or len(all_scraped_values) >= qtd_total_products:
      break

    # Random pause between page requests; skipped after the last page.
    time.sleep(random.randint(10, 30))

  return all_scraped_values

In [27]:
def extract_continent():
  """Scrape every category in the fixed list and return all products found.

  For each category the products returned by extract_data() are also written
  to '<category>.csv' in the current working directory, so partial results
  exist on disk if a later category fails.

  Returns:
    A single list with the product dicts of every category, in list order.
  """
  list_all_request = []

  # The list of categories defined by the group for analysis.
  url_list = ['mercearia', 'padaria-e-pastelaria', 'frutas-e-legumes', 'bio-e-escolhas-alimentares', 'laticinios-e-ovos', 'charcutaria-e-queijos',
  'peixaria-e-talho', 'congelados', 'refeicoes-faceis']

  # Iterate by the url_list and pass each category to extract_data(), and concatenate the returned values
  for category in url_list:
    print(category)
    products = extract_data(category)
    df = pd.DataFrame(products)
    df.to_csv(category + '.csv')
    list_all_request += products
    # Random pause between categories.
    time.sleep(random.randint(10, 30))

  # Return all values scrapped
  return list_all_request

In [28]:
if __name__ == '__main__':
    # Mount Google Drive at ./drive (Colab only).
    drive.mount('drive')

    all_products = extract_continent()

    # The list of columns defined by the group for final dataset
    columns = ['time_scraped', 'quantity', 'main_category', 'name', 'amount', 'unit', 'price', 'price_p_unit', 'currency', 'id', 'brand', 'full_category', 'variant', 'channel']

    # reindex() instead of df[columns]: a column that never appeared in the
    # scraped data (or an empty scrape) becomes an all-NaN column instead of
    # raising KeyError before the results are saved.
    df = pd.DataFrame(all_products).reindex(columns=columns)

    df.to_csv('data.csv')

    # A bare `df` expression inside an `if` block is not rendered by the
    # notebook, so print a short preview explicitly.
    print(df.head())

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).
mercearia
https://www.continente.pt/mercearia {'pmin': '0.01', 'start': 0, 'sz': 24}
Names = 24 // Quantity = 24 // Price-p-Unit = 24 // Units = 24 // Products = 24
Extracted = 24 // All Products= 5064 // Extract % = 0.47393364928909953
https://www.continente.pt/mercearia {'pmin': '0.01', 'start': 24, 'sz': 24}
Names = 24 // Quantity = 24 // Price-p-Unit = 22 // Units = 22 // Products = 24
Extracted = 48 // All Products= 5064 // Extract % = 0.9478672985781991
https://www.continente.pt/mercearia {'pmin': '0.01', 'start': 48, 'sz': 24}
Names = 24 // Quantity = 24 // Price-p-Unit = 24 // Units = 24 // Products = 24
Extracted = 72 // All Products= 5064 // Extract % = 1.4218009478672986
https://www.continente.pt/mercearia {'pmin': '0.01', 'start': 72, 'sz': 24}
Names = 24 // Quantity = 24 // Price-p-Unit = 23 // Units = 23 // Products = 24
Extracted = 96 // All Products= 5064 // Ex