### Install requirements

### ✅ Virtual env

A virtual environment lets you manage this project's dependencies separately from the system-wide Python installation.

In [None]:
py -m venv scrap

#or

python -m venv scrap

### ✅ Beautiful Soup: Scraping Tools

In [None]:
pip install BeautifulSoup4 requests

### ✅ Pandas - Matplotlib - seaborn

In [None]:
pip install pandas
pip install matplotlib
pip install seaborn

### ✅ Install openpyxl: required to read and write Excel (.xlsx) files.

In [None]:
pip install openpyxl

### Code

In [None]:
from bs4 import BeautifulSoup
import requests
import time, os
import json
import pandas as pd
from typing import List, Dict

### Parameters

In [None]:
# Module-level accumulator: Scrap.get_items appends one dictionary per
# product here, and the export helpers are later called with this list.
products_list: List[Dict] = []

# HTTP headers sent with every page request: a desktop Chrome User-Agent
# string and an Accept-Language preference for Mexican Spanish (es-MX).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept-Language': 'es-MX,es;q=0.9',
}

### Items to search: Laptops

In [None]:
class Scrap():
    """
    Scraper for the laptop listings on listado.mercadolibre.com.ve.

    Instantiating the class immediately runs the whole scrape: every listing
    page is downloaded with the module-level ``headers`` and each product
    found is appended to the module-level ``products_list``.
    """

    # Seconds to wait for a page before giving up, so a stalled connection
    # cannot hang the whole run.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Initialize the scraper and start the scraping process"""
        self.get_html()

    def get_html(self):
        """
        Retrieve HTML from target pages and extract their products

        A page that cannot be downloaded (network error, timeout or HTTP
        error status) is reported and skipped so the remaining pages are
        still scraped. This method never raises; "Scraping completed." is
        always printed at the end.
        """
        try:
            for url in self.range_pages() or []:
                try:
                    response = requests.get(
                        url, headers=headers, timeout=self.REQUEST_TIMEOUT
                    )
                    # Treat 4xx/5xx answers as failures instead of parsing
                    # an error page that contains no products.
                    response.raise_for_status()
                except requests.RequestException as e:
                    print(f"get_html | Error fetching {url}: {e}")
                else:
                    html = BeautifulSoup(response.text, 'html.parser')
                    self.get_items(html)

                # Pause between requests, whether or not the last one worked.
                time.sleep(1)

        except Exception as e:
            print(f"get_html | Error: {e}")
            return None
        finally:
            print("Scraping completed.")

    def range_pages(self, total_pages: int = 41, page_size: int = 48):
        """
        Generate URLs for all pages to be scraped

        The first URL is the plain listing page; every following page is
        addressed by the 1-based position of its first item
        (``page * page_size + 1``: 49, 97, ...).

        Args:
            total_pages: Total number of URLs to generate, including the
                first page. The default was counted manually on the site.
            page_size: Offset step between two consecutive pages.

        Returns:
            List[str]: List of generated URLs
        """
        base_url: str = "https://listado.mercadolibre.com.ve/laptops"
        url_list: List[str] = [base_url]

        for page in range(1, total_pages):
            first_item = page * page_size + 1
            url_list.append(
                f"{base_url}-accesorios/laptops_Desde_{first_item}_NoIndex_True"
            )

        return url_list

    def get_text_item(self, item, html_tag: str, html_class: str, default: str = "N/A"):
        """
        Extract text content from HTML element

        Args:
            item: BeautifulSoup element to search within (may be None)
            html_tag: HTML tag name to search for
            html_class: CSS class name of the element
            default: Value returned if the element is not found or the
                lookup fails

        Returns:
            str: Extracted text (whitespace-stripped) or default value
        """
        try:
            element = item.find(html_tag, class_=html_class) if item else None
            return element.get_text(strip=True) if element else default
        except Exception as e:
            print(f" get_text_item | Error: {e}")
            return default

    def get_attribute(self, item, html_tag: str, attribute: str, default: str = "N/A"):
        """
        Extract attribute value from HTML element

        Args:
            item: BeautifulSoup element to search within (may be None)
            html_tag: CSS selector of the element to look up
            attribute: Attribute name to extract (e.g. src/href)
            default: Value returned if the element or the attribute is
                missing, or the lookup fails

        Returns:
            str: Attribute value or default
        """
        try:
            element = item.select_one(html_tag) if item else None
            if element is None:
                return default

            if attribute == 'src':
                # Image URLs: 'data-src' takes precedence over 'src'
                # (presumably because images are lazy-loaded - TODO confirm).
                return element.get('data-src') or element.get(attribute) or default

            return element.get(attribute, default)

        except Exception as e:
            print(f" get_attribute | Error: {e}")
            return default

    def get_items(self, html):
        """
        Extract all product items from HTML

        Every ``<li class="ui-search-layout__item">`` becomes one dictionary
        appended to the module-level ``products_list``. A failure on one
        item is reported and does not stop the remaining items.

        Args:
            html: BeautifulSoup parsed HTML content
        """
        if html is None:
            print("Failed to retrieve HTML.")
            return

        items = html.find_all('li', class_='ui-search-layout__item')

        # No delay here: parsing works on HTML that is already downloaded,
        # so sleeping per item would only slow the run down.
        for item in items:
            try:
                has_stars = bool(item.find_all('span', class_='poly-reviews__starts'))
                products_list.append({
                        "brand": self.get_text_item(item,'span',"poly-component__brand"),
                        "description": self.get_text_item(item, 'a','poly-component__title'),
                        "img_url": self.get_attribute(item,'img', 'src'),
                        "product_url": self.get_attribute(item,'a','href'),
                        "seller": self.get_text_item(item,'span','poly-component__seller'),
                        "price": {
                            "amount": self.get_text_item(item,'span','andes-money-amount__fraction'),
                            "discount": self.get_text_item(item,'span','andes-money-amount__discount'),
                        },
                        "reviews": {
                            "rating": self.get_text_item(item,'span','poly-reviews__rating'),
                            "stars": has_stars,
                            # The total is rendered inside parentheses; drop them.
                            'total_sales': self.get_text_item(item,'span','poly-reviews__total').strip("()")
                        },
                        "shipping": self.get_text_item(item,'div','poly-component__shipping')
                })

            except Exception as e:
                print(f"Error processing item: {e}")

# Instantiating Scrap runs the full scrape from __init__ and fills products_list.
run = Scrap()

Scraping completed.


### Converting Data --> JSON

In [None]:
def Get_JSON(file, laptops):
    """
    Write a collection of product dictionaries to ``./products/<file>.json``.

    The ``./products`` folder is created when missing and an existing file
    with the same name is overwritten. The products are written as a single
    JSON array indented by 4 spaces.

    Args:
        file: Output file name without the ``.json`` extension
        laptops: Iterable of JSON-serializable product dictionaries

    Returns:
        None
    """
    # Make folder
    folder_path = "./products"
    os.makedirs(folder_path, exist_ok=True)

    # file root
    root = os.path.join(folder_path, f"{file}.json")

    # Serialize everything in one pass instead of re-reading and re-writing
    # the whole file once per product.
    with open(root, "w", encoding="utf-8") as f:
        json.dump(list(laptops), f, indent=4)

# Export the scraped products to ./products/laptops.json.
Get_JSON("laptops",products_list)

### Getting Data --> Excel - xlsx | CSV

In [None]:
def get_CSV(file: str, products: List[Dict]):
    """
    Flatten product dictionaries into a DataFrame and export it.

    The nested ``price`` and ``reviews`` dictionaries are flattened into
    individual columns. The result is written to
    ``./products/dirty/<file>.xlsx`` (sheet ``laptops``) and
    ``./products/dirty/<file>.csv`` (``;`` separated, UTF-8 with BOM); the
    folder is created when missing.

    Args:
        file: Output file name without extension
        products: List of product dictionaries containing scraped data

    Returns:
        pd.DataFrame: Structured dataframe containing all product information
    """
    columns = [
        'brand',
        'description',
        'seller',
        'price_amount',
        'price_discount',
        'rating',
        'has_stars',
        'total_sales',
        'shipping_info',
        'image_url',
        'product_url',
    ]

    # Build one complete row per product. A malformed product is skipped as
    # a whole, so every column always ends up with the same number of values.
    rows = []
    for product in products:
        try:
            price = product.get('price') or {}
            reviews = product.get('reviews') or {}
            rows.append({
                'brand': product.get('brand', ''),
                'description': product.get('description', ''),
                'seller': product.get('seller', ''),
                'price_amount': price.get('amount', ''),
                'price_discount': price.get('discount', ''),
                'rating': reviews.get('rating', ''),
                'has_stars': reviews.get('stars', False),
                'total_sales': reviews.get('total_sales', ''),
                'shipping_info': product.get('shipping', ''),
                'image_url': product.get('img_url', ''),
                'product_url': product.get('product_url', ''),
            })
        except (AttributeError, TypeError) as e:
            print(f"Error processing product: {e}")
            continue

    df = pd.DataFrame(rows, columns=columns)

    # Neither writer creates missing parent folders, so do it here.
    output_dir = "./products/dirty"
    os.makedirs(output_dir, exist_ok=True)

    df.to_excel(f"{output_dir}/{file}.xlsx", engine='openpyxl', index=False, sheet_name='laptops')
    df.to_csv(f"{output_dir}/{file}.csv", index=False, sep=";", encoding='utf-8-sig')

    return df

# Export the scraped products to ./products/dirty/laptops.xlsx and laptops.csv.
get_CSV("laptops",products_list)