In [44]:
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Hit:2 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:4 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:7 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:9 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu

In [79]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome('chromedriver', options=chrome_options)

def get_html_soup(url: str, expected_class: str):
    print("Doing request!")
    driver.get(url)
    wait = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, expected_class))
    )
    datos = driver.execute_script("return document.body.innerHTML")
    print("Request executed!")
    return BeautifulSoup(datos, "html.parser")


In [91]:
from typing import List, Dict, Union
from tqdm import tqdm

from abc import ABC, abstractmethod
from typing import List, Dict

def style_string_to_dict(style: str) -> Dict[str, str]:
  attributes = style.strip(";").split("; ")
  style_dict = {}
  for line in attributes:
    [key, value] = line.split(": ")
    style_dict[key] = value
  return style_dict

class ScrapLicores(ABC):
  def __init__(
      self,
      types: List[str],
      links: List[str],
      sizes: List[str],
      classes: Dict[str, List[str]],
      expected_class: str
    ):
    self.links = links
    self.classes = classes
    self.types = types
    self.sizes = sizes
    self.expected_class = expected_class
    self.tags = ['div', 'span', 'h1', 'h2', 'p']
    self.columns = ["sku", "Name", "image_url", "Price"]
  
  @abstractmethod
  def get_image_url_from_style_string(style: str) -> str:
    ...

  @abstractmethod
  def get_skus(self, html_soup, info):
    ...

  @abstractmethod
  def get_brand(self, html_soup, info):
    ...
  
  @abstractmethod
  def get_names(self, html_soup, info):
    ...

  @abstractmethod
  def get_image_url(self, html_soup, info):
    ...

  @abstractmethod
  def get_price(self, html_soup, info):
    ...

  
  def get_info_from_url(self, url, product_type, size):
    html_soup = get_html_soup(url, expected_class=self.expected_class)
    info = []
    info = self.get_skus(html_soup, info)
    info = self.get_brand(html_soup, info)
    info = self.get_names(html_soup, info)
    info.append(size)
    info = self.get_image_url(html_soup, info)
    info = self.get_price(html_soup, info)
    info.append(product_type)
    return info


class ScrapLicoresJumbo(ScrapLicores):
  def __init__(
      self,
      types: List[str],
      links: List[str],
      sizes: List[str],
      classes: Dict[str, List[str]] = {
          "sku": ["product-code"], 
          "name": ["product-name"],
          "brand": ["product-brand"],
          "image_url": ["zoomed-image"],
          "price": ["price-best", "product-sigle-price-wrapper"],
      },
      expected_class: str = "zoomed-image"
    ):
    
    self.links = links
    self.classes = classes
    self.types = types
    self.sizes = sizes
    self.expected_class = expected_class
    self.tags = ['div', 'span', 'h1', 'h2', 'p', "a"]
    self.columns = ["sku", "Name", "image_url", "Price"]
  

  def get_image_url_from_style_string(self, style: str) -> str:
    style_dict = style_string_to_dict(style)
    return style_dict["background-image"].split('("')[1].split("?")[0]

  def get_skus(self, html_soup, info):
    found = False
    for tag in self.tags:
        for className in self.classes["sku"]:
            sku = (html_soup.find(tag, class_=className))
            if sku:
                found = True
                break
        if found:
            break
    if found:
        try:
            sku = sku.string.split(" ")[1]
            info.append(sku)
        except Exception as e:
            print(e)
            info.append(None)
    else:
        info.append(None)
    return info

  def get_names(self, html_soup, info):
    found = False
    for tag in self.tags:
        for className in self.classes["name"]:
            name = (html_soup.find(tag, class_=className))
            if name:
                found = True
                break
        if found:
            break
    if found:
        try:
            name = name.string
            info.append(name)
        except Exception as e:
            print(e)
            info.append(None)
    else:
        info.append(None)
    return info

  def get_brand(self, html_soup, info):
    found = False
    for tag in self.tags:
        for className in self.classes["brand"]:
            brand = (html_soup.find(tag, class_=className))
            if brand:
                found = True
                break
        if found:
            break
    if found:
        try:
            brand = brand.string
            info.append(brand)
        except Exception as e:
            print(e)
            info.append(None)
    else:
        info.append(None)
    return info

  def get_image_url(self, html_soup, info):
    found = False
    for tag in self.tags:
        for className in self.classes["image_url"]:
            style_image_url = (html_soup.find(tag, class_=className))
            if style_image_url:
                found = True
                break
        if found:
            break
    if found:
        try:
            image_url = self.get_image_url_from_style_string(style_image_url["style"])
            info.append(image_url)
        except Exception as e:
            print(e)
            info.append(None)
    else:
        info.append(None)
    return info

  def get_price(self, html_soup, info):
    found = False
    for tag in self.tags:
        for className in self.classes["price"]:
            price = (html_soup.find(tag, class_=className))
            if price:
                found = True
                break
        if found:
            break
    if found:
        try:
            price = price.string.strip("$")
            info.append(price)
        except Exception as e:
            print(e)
            info.append(None)
    else:
        info.append(None)
    return info


class ScrapLicoresLider(ScrapLicores):
  def __init__(
      self,
      types: List[str],
      links: List[str],
      sizes: List[str],
      classes: Dict[str, List[str]] = {
          "sku": ["pdp-desktop-item-number"],
          "name": ["product-detail-display-name"],
          "brand": ["prduct-detail-cart__brand-link"],
          "image_url": ["styled__FigureContainer-sc-13lpau7-2"],
          "price": ["pdp-mobile-sales-price"],
      },
      expected_class: str = "styled__FigureContainer-sc-13lpau7-2"
    ):
    
    self.links = links
    self.types = types
    self.sizes = sizes
    self.classes = classes
    self.expected_class = expected_class
    self.tags = ['div', 'span', 'h1', 'h2', 'p', 'a']
    self.columns = ["sku", "Name", "image_url", "Price"]
  

  def get_image_url_from_style_string(self, style: str) -> str:
    style_dict = style_string_to_dict(style)
    return style_dict["background-image"].split('("')[1].split(")")[0]

  def get_skus(self, html_soup, info):
    found = False
    for tag in self.tags:
        for className in self.classes["sku"]:
            sku = (html_soup.find(tag, class_=className))
            if sku:
                found = True
                break
        if found:
            break
    if found:
        try:
            sku = sku.string.split(" ")[1]
            info.append(sku)
        except Exception as e:
            print(e)
            info.append(None)
    else:
        info.append(None)
    return info

  def get_names(self, html_soup, info):
    found = False
    for tag in self.tags:
        for className in self.classes["name"]:
            name = (html_soup.find(tag, class_=className))
            if name:
                found = True
                break
        if found:
            break
    if found:
        try:
            name = name.string
            info.append(name)
        except Exception as e:
            print(e)
            info.append(None)
    else:
        info.append(None)
    return info

  def get_brand(self, html_soup, info):
    found = False
    for tag in self.tags:
        for className in self.classes["brand"]:
            brand = (html_soup.find(tag, class_=className))
            if brand:
                found = True
                break
        if found:
            break
    if found:
        try:
            brand = brand.string
            info.append(brand)
        except Exception as e:
            print(e)
            info.append(None)
    else:
        info.append(None)
    return info

  def get_image_url(self, html_soup, info):
    found = False
    for tag in self.tags:
        for className in self.classes["image_url"]:
            style_image_url = (html_soup.find(tag, class_=className)).find("figure", recursive=False)
            if style_image_url:
                found = True
                break
        if found:
            break
    if found:
        try:
            image_url = self.get_image_url_from_style_string(style_image_url["style"])
            info.append(image_url)
        except:
            info.append(None)
    else:
        info.append(None)
    return info

  def get_price(self, html_soup, info):
    found = False
    for tag in self.tags:
        for className in self.classes["price"]:
            price = (html_soup.find(tag, class_=className))
            if price:
                found = True
                break
        if found:
            break
    if found:
        try:
            price = price.string.strip("$")
            info.append(price)
        except Exception as e:
            print(e)
            info.append(None)
    else:
        info.append(None)
    return info


def search(scrapper: ScrapLicores, store) -> List[List[str]]:
    productos_filtrados = []
    for index in range(len(scrapper.links)):
        info = scrapper.get_info_from_url(
            scrapper.links[index],
            scrapper.types[index],
            scrapper.sizes[index],
        )
        productos_filtrados.append(info)
        productos_filtrados[index].append(store)
    print(productos_filtrados)
    return (productos_filtrados)


"""
{"store": "Lider",
"products_list": [
  {
      "name": "Cerveza Lager Botellin2",
      "sku": "73094077",
      "brand": "Corona",
      "size": "6 Un x 330 ml c/u",
      "image_url": "https://www.lider.cl/supermercado/product/sku/993393/corona-cerveza-lager-botellin-6-un-x-330-ml-cu",
      "price": 215990,
      "type": "c-corona"
  },
  {
      "name": "Cerveza Corona botella3",
      "sku": "38111466",
      "brand": "Corona",
      "size": "330 CC x6",
      "image_url": "https://www.liquidos.cl/productos/746/cerveza-corona-botella-330-cc-x6-liquidos-cl",
      "price": 2222215890,
      "type": "c-corona"
  }
]}
"""

def format_body(keys: List[str], values: List[str]) -> Dict[str, str]:
    body = []
    print(values)
    print(values[0])
    for i in range(len(values)):
        body.append({keys[j]: values[i][j] for j in range(len(values[i]))})
    return body


In [92]:
print("Start")

# Licores
links_lider = {
    "alto-normal-750": "https://www.lider.cl/supermercado/product/sku/1361/alto-del-carmen-pisco-especial-35-botella-750-ml",
    "alto-normal-1000": "https://www.lider.cl/supermercado/product/sku/468481/alto-del-carmen-pisco-especial-35-botella-1-l",
    "mistral-normal-750": "https://www.lider.cl/supermercado/product/sku/1375/mistral-pisco-35-especial-anejado-en-roble-botella-750-cc",
    "corona-6-330": "https://www.lider.cl/supermercado/product/sku/993393/corona-cerveza-lager-botellin-6-un-x-330-ml-cu",
    "royal-guard-12-350": "https://www.lider.cl/supermercado/product/sku/704136/royal-guard-pack-cerveza-lager-latas-12-un"
}


links_jumbo = {
    "alto-normal-750": "https://www.jumbo.cl/pisco-alto-del-carmen-750-cc-35-gl-especial-botella-verde/p",
    "alto-normal-1000": "https://www.jumbo.cl/pisco-alto-del-carmen-1-l-35/p",
    "mistral-normal-750": "https://www.jumbo.cl/pisco-mistral-750-cc-35/p",
    "corona-6-330": "https://www.jumbo.cl/cerveza-corona-botella-6x330cc-2/p",
    "royal-guard-12-350": "https://www.jumbo.cl/cerveza-royal-guard-pack-12-unid-lata-350-cc-cu/p",    
}

sizes = [
    "1 x 750 cc.",
    "1 x 1000 cc.",
    "1 x 750 cc.",
    "6 x 330 cc.",
    "12 x 350 cc."
]


scrapper_jumbo = ScrapLicoresJumbo(
    types = [i for i in links_jumbo.keys()],
    links = [i for i in links_jumbo.values()],
    sizes = sizes,
)

scrapper_lider = ScrapLicoresLider(
    types= [i for i in links_lider.keys()],
    links = [i for i in links_lider.values()],
    sizes = sizes,
)
productos_lider = search(scrapper_lider, "Lider")
productos_jumbo = search(scrapper_jumbo, "Jumbo")

keys = [
    "sku",
    "name",
    "brand",
    "size",
    "image_url",
    "price",
    "type",
    "store"
]

body_lider = format_body(keys, productos_lider)
print(body_lider)

body_jumbo = format_body(keys, productos_jumbo)
print(body_jumbo)

Start
Doing request!
Request executed!
Doing request!
Request executed!
Doing request!
Request executed!
Doing request!
Request executed!
Doing request!
Request executed!
[['791958', 'Alto del Carmen', 'Pisco especial 35° Botella, 750 ml', '1 x 750 cc.', 'https://images.lider.cl/wmtcl?source=url[file:/productos/1361a.jpg]&sink"', '5.690', 'alto-normal-750', 'Lider'], ['381114', 'Alto del Carmen', 'Pisco especial 35° Botella, 1 L', '1 x 1000 cc.', 'https://images.lider.cl/wmtcl?source=url[file:/productos/468481a.jpg]&sink"', '9.190', 'alto-normal-1000', 'Lider'], ['262050', 'Mistral', 'Pisco 35° Especial Añejado en Roble Botella, 750 cc', '1 x 750 cc.', 'https://images.lider.cl/wmtcl?source=url[file:/productos/1375a.jpg]&sink"', '7.390', 'mistral-normal-750', 'Lider'], ['730940', 'Corona', 'Cerveza Lager Botellin, 6 Un x 330 ml c/u ', '6 x 330 cc.', 'https://images.lider.cl/wmtcl?source=url[file:/productos/993393a.jpg]&sink"', '5.990', 'corona-6-330', 'Lider'], ['523761', 'Royal Guard',

In [89]:
import requests

headers = {"Content-Type": "application/json; charset=utf-8"}



payload_lider = { "store": "Lider", "products_list": body_lider }
payload_jumbo = { "store": "Jumbo", "products_list": body_jumbo }
print(payload_lider)
url = 'https://backend-gpti.herokuapp.com/api/v1/update-products'

Jumbo = requests.post(url, json=payload_jumbo, headers=headers)
print(Jumbo)
Lider = requests.post(url, json=payload_lider, headers=headers)

{'store': 'Lider', 'products_list': [{'sku': '791958', 'name': 'Alto del Carmen', 'brand': '1 x 750 cc.', 'size': 'Pisco especial 35° Botella, 750 ml', 'image_url': 'https://images.lider.cl/wmtcl?source=url[file:/productos/1361a.jpg]&sink"', 'price': '5.690', 'type': 'alto-normal-750', 'store': 'Lider'}, {'sku': '381114', 'name': 'Alto del Carmen', 'brand': '1 x 1000 cc.', 'size': 'Pisco especial 35° Botella, 1 L', 'image_url': 'https://images.lider.cl/wmtcl?source=url[file:/productos/468481a.jpg]&sink"', 'price': '9.190', 'type': 'alto-normal-1000', 'store': 'Lider'}, {'sku': '262050', 'name': 'Mistral', 'brand': '1 x 750 cc.', 'size': 'Pisco 35° Especial Añejado en Roble Botella, 750 cc', 'image_url': 'https://images.lider.cl/wmtcl?source=url[file:/productos/1375a.jpg]&sink"', 'price': '7.390', 'type': 'mistral-normal-750', 'store': 'Lider'}, {'sku': '730940', 'name': 'Corona', 'brand': '6 x 330 cc.', 'size': 'Cerveza Lager Botellin, 6 Un x 330 ml c/u ', 'image_url': 'https://images.