<a href="https://colab.research.google.com/github/Felipecuestas98/Bootcamp-Mintic/blob/main/web_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**WEB Scraping**

### El Web Scraping es una técnica utilizada para extraer datos de sitios web de manera automática. Este proceso implica el uso de programas de software que envían solicitudes a páginas web, recuperan su contenido HTML, y extraen la información deseada, la cual puede ser almacenada y analizada posteriormente. Es una herramienta poderosa para recolectar grandes volúmenes de datos no estructurados que se encuentran en la web y transformarlos en datos estructurados para análisis, investigación, y diversas aplicaciones comerciales.

In [1]:
#Instalación de librerías python necesarias, si es requerido
!pip install requests
!pip install beautifulsoup4
!pip install pandas
!pip install certifi



In [2]:
#Importar librerías para extracción de datos web
import ssl
import certifi
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
#Sitio web de la extracción de datos
url         = "https://zonafit.co/"
response    = requests.get(url)
soup        = BeautifulSoup(response.content, 'html.parser')

In [4]:
#recorre los datos del .parser por el div y la clase card_content para extraer los datos de proveedor, nombre y precio de los suplementos de la pagina de inicio
suplementos = soup.find_all('div',class_="card__content")
#crea una lista con la busqueda
book_list = [suplementos]
book_list

[[<div class="card__content">
  <div class="card__information">
  <h3 class="card__heading">
  <a class="full-unstyled-link" href="/collections/optimum-nutrition">Optimum Nutrition
                </a>
  </h3><p class="card__caption">Optimum Nutrition es una marca reconocida de productos deportivos y nutrición deportiva....<span class="icon-wrap"><svg class="icon icon-arrow" fill="none" viewbox="0 0 14 10" xmlns="http://www.w3.org/2000/svg"><path clip-rule="evenodd" d="M8.537.808a.5.5 0 0 1 .817-.162l4 4a.5.5 0 0 1 0 .708l-4 4a.5.5 0 1 1-.708-.708L11.793 5.5H1a.5.5 0 0 1 0-1h10.793L8.646 1.354a.5.5 0 0 1-.109-.546" fill="currentColor" fill-rule="evenodd"></path></svg>
  </span>
  </p></div>
  </div>,
  <div class="card__content">
  <div class="card__information">
  <h3 class="card__heading">
  <a class="full-unstyled-link" href="/collections/optimum-nutrition">Optimum Nutrition<span class="icon-wrap"><svg class="icon icon-arrow" fill="none" viewbox="0 0 14 10" xmlns="http://www.w3.org/

In [52]:
# prompt: Con base en el anterior script que es el patron de extraccion de los datos genera un recorrido por las paginas del sitio web para alimentar un dataframe con todos los datos.

import ssl
import certifi
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Install necessary libraries if not already installed
# !pip install requests beautifulsoup4 pandas certifi

def scrape_page(url):
    """Scrapes a single page and returns a list of dictionaries containing product information."""

    try:
        response = requests.get(url, verify=True)
        response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
        soup = BeautifulSoup(response.content, 'html.parser')
        suplementos = soup.find_all('div', class_="card__content")

        products = []
        for suplemento in suplementos:
            product = {}

            try:
                product['Proveedor'] = suplemento.find('div', class_="caption-with-letter-spacing light").text.strip()
            except AttributeError:
                product['Proveedor'] = "N/A"

            try:
                product['Nombre del Artículo'] = suplemento.find('a', class_="full-unstyled-link").text.strip()
            except AttributeError:
                product['Nombre del Artículo'] = "N/A"

            try:
                product['Precio'] = suplemento.find('span', class_="price-item price-item--regular").text.strip()
            except AttributeError:
                product['Precio'] = "N/A"

            products.append(product)
        return products

    except requests.exceptions.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return []

# Start with the initial URL
base_url = "https://zonafit.co/"
all_products = []


# Implement Pagination Logic (replace with your actual pagination logic)
# This example assumes that each page has a "next page" link
current_page_url = base_url
while True: # loop until there is no next page link
    products = scrape_page(current_page_url)
    if not products: #check that the page was successfully scrapped
        break; # Stop the loop if the scraping fails
    all_products.extend(products)

    #Find the next page link (replace with your specific pagination structure)
    response = requests.get(current_page_url, verify=True)
    soup = BeautifulSoup(response.content, 'html.parser')
    next_page_link = soup.find('a', class_="next") #Replace with the actual class name for the next page link

    if next_page_link:
        current_page_url = next_page_link.get('href')
        if current_page_url.startswith('/'):
          current_page_url = base_url + current_page_url[1:] #Handle relative URLs
    else:
      break


# Create the DataFrame after scraping all pages
df = pd.DataFrame(all_products)
df

Unnamed: 0,Proveedor,Nombre del Artículo,Precio
0,,Optimum Nutrition,
1,,Optimum Nutrition,
2,,Proton Whey Smartmuscle,
3,Smartmuscle,Proton Whey Smartmuscle,A partir de $150.000
4,,Prostar Whey Ultimate Nutrition,
5,Ultimate Nutrition,Prostar Whey Ultimate Nutrition,A partir de $195.000
6,,Gold Standard 100% Whey Naturally Flavored Opt...,
7,Optimum Nutrition,Gold Standard 100% Whey Naturally Flavored Opt...,A partir de $215.600
8,,Gold Standard 100% Whey Optimum Nutrition,
9,Optimum Nutrition,Gold Standard 100% Whey Optimum Nutrition,A partir de $169.000


In [53]:
# Eliminar filas con datos N/A para tener solo las filas con datos para analizar
df = df[df['Precio'] != 'N/A']
df

Unnamed: 0,Proveedor,Nombre del Artículo,Precio
3,Smartmuscle,Proton Whey Smartmuscle,A partir de $150.000
5,Ultimate Nutrition,Prostar Whey Ultimate Nutrition,A partir de $195.000
7,Optimum Nutrition,Gold Standard 100% Whey Naturally Flavored Opt...,A partir de $215.600
9,Optimum Nutrition,Gold Standard 100% Whey Optimum Nutrition,A partir de $169.000
11,GMN,Be One GMN,A partir de $149.900
13,Proscience,La Wey Fitmafia,A partir de $24.900
15,Isopure Company,Isopure Zero - Low Carb Isopure Company,A partir de $125.000
17,Ultimate Nutrition,ISO Sensation 93,A partir de $209.000
19,Zona FIT,Burner Stack 360 gr + Protein Pancake 1.69 lb,$165.000
21,Zona FIT,Gold Standard 5 lb + Amino X 30 serv,$400.000


# Otros ejemplos de páginas para hacer web scraping

In [None]:
url = 'https://zonafit.co/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
headlines = soup.find_all(class_='ssrcss-1mhwnz8-Promo e1vyq2e80')
news_list = [headline.get_text() for headline in headlines]
print(news_list)

In [None]:
news_df = pd.DataFrame(news_list, columns=['Headline'])
news_df.head()