# Web Scrapping

## Scrapping for Mercado Libre Colombia

Web scraping for the MercadoLibre platform, with the aim of obtaining information to answer the following questions:

- How does the average price of specific products vary among different sellers within MercadoLibre?
- Which features (free shipping, interest-free installments, Full) are most frequently associated with top-ranked or best-selling products?
- What topics or issues (e.g., quality, shipping, description) are most frequently mentioned in negative reviews?
- How does the average response time in the public "Questions and Answers" section vary among sellers with different reputations?
- How complete is the information provided in the listings of top sellers compared to those with lower reputations?

In [35]:
# imports

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import pandas as pd

In [36]:
# initialize the Chrome WebDriver

driver = webdriver.Chrome()

In [37]:
# Function to build URLs with pagination

def get_pagination_urls(product_name, num_pages=5):
    urls = []
    formatted_product = product_name.replace(' ', '-')
    
    base_url = f"https://listado.mercadolibre.com.co/{formatted_product}"
    urls.append(base_url)
    
    for page in range(2, num_pages + 1):
        from_item = (page - 1) * 50 + 1
        pagination_url = f"https://listado.mercadolibre.com.co/{formatted_product}_Desde_{from_item}_NoIndex_True"
        urls.append(pagination_url)
    
    return urls

In [38]:
# Generate URLs for multiple pages

product = input("Which product do you want to search for? ")
product_name = product.replace(' ', '_')
# Define how many pages to scrape
num_pages = 5  # You can adjust this as needed

# Get list of URLs for all pages
urls = get_pagination_urls(product, num_pages)

print(f"{len(urls)} pages will be scraped:")
for i, url in enumerate(urls, 1):
    print(f"Page {i}: {url}")

5 pages will be scraped:
Page 1: https://listado.mercadolibre.com.co/iphone
Page 2: https://listado.mercadolibre.com.co/iphone_Desde_51_NoIndex_True
Page 3: https://listado.mercadolibre.com.co/iphone_Desde_101_NoIndex_True
Page 4: https://listado.mercadolibre.com.co/iphone_Desde_151_NoIndex_True
Page 5: https://listado.mercadolibre.com.co/iphone_Desde_201_NoIndex_True


In [39]:

data = []

for page_num, url in enumerate(urls, 1):
    print(f"Procesando página {page_num} de {len(urls)}: {url}")
    
    driver.get(url)
    
    import time
    time.sleep(2)
    
    products = driver.find_elements(By.CSS_SELECTOR, ".ui-search-layout > li")
    print(f"  Se encontraron {len(products)} productos")
    
    for i, product in enumerate(products):
        try:
            # Extract title
            title = product.find_element(By.TAG_NAME, "h3").text
            
            # Extract price
            price = product.find_element(By.CLASS_NAME, "andes-money-amount").text
            
            # Extract link
            link = product.find_element(By.CSS_SELECTOR, "h3 a, .poly-component__title-wrapper > a").get_attribute("href")

            # Extract shipping information - more resilient
            shipping = ""
            try:
                shipping = product.find_element(By.CLASS_NAME, "poly-shipping--next_day").text
            except NoSuchElementException:
                try:
                    shipping = product.find_element(By.CLASS_NAME, "poly-component__shipping").text
                except NoSuchElementException:
                    pass
            
            # Extract rating - more resilient
            rating = ""
            try:
                rating_element = product.find_element(By.CLASS_NAME, "poly-reviews__rating")
                rating = rating_element.text
            except NoSuchElementException:
                pass
            
            # Promotion?
            promotion = ""
            try:
                promotion = product.find_element(By.CLASS_NAME, "poly-component__ads-promotions").text
            except NoSuchElementException:
                pass

            # FULL?
            full = ""
            try:
                full = product.find_element(By.CSS_SELECTOR, ".poly-component__shipped-from > svg").get_attribute("aria-label")
            except NoSuchElementException:
                pass
            
            # Abrir el enlace del producto en una nueva pestaña (guarda el identificador de la ventana actual primero)
            original_window = driver.current_window_handle
            driver.switch_to.new_window('tab')
            driver.get(link)
            
            # Esperar a que cargue la página
            time.sleep(1.5)
            
            # Seller 
            seller = ""
            try:
                seller = driver.find_element(By.CLASS_NAME, "ui-seller-data-header__title").text
                seller_element = product.find_element(By.TAG_NAME, 'h2').text
                if "Vendido por " in seller_element:
                    seller = seller_element.replace("Vendido por ", "")
            except NoSuchElementException:
                pass
            
            driver.close()
            driver.switch_to.window(original_window)

            product_info = {
                "title": title,
                "price": price,
                "link": link,
                "shipping": shipping,
                "rating": rating,
                "seller": seller,
                "promotion": promotion,
                "full": full,
                "page": page_num  
            }
            
            data.append(product_info)
            print(f"  Producto {i+1}: '{seller[:30]}...' procesado")
            
        except NoSuchElementException as e:
            print(f"  Error al procesar producto {i+1}: {str(e)}")
            continue
        
    print(f"Página {page_num} completada. Total de productos hasta ahora: {len(data)}\n")
    time.sleep(2)

print(f"Proceso finalizado. Se recopilaron datos de {len(data)} productos en total.")

Procesando página 1 de 5: https://listado.mercadolibre.com.co/iphone
  Se encontraron 51 productos
  Error al procesar producto 1: Message: no such element: Unable to locate element: {"method":"tag name","selector":"h3"}
  (Session info: chrome=136.0.7103.113); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF704D8CF45+75717]
	GetHandleVerifier [0x00007FF704D8CFA0+75808]
	(No symbol) [0x00007FF704B58F9A]
	(No symbol) [0x00007FF704BAF4C6]
	(No symbol) [0x00007FF704BAF77C]
	(No symbol) [0x00007FF704BA1D7C]
	(No symbol) [0x00007FF704BD73BF]
	(No symbol) [0x00007FF704BA1C46]
	(No symbol) [0x00007FF704BD7590]
	(No symbol) [0x00007FF704BFF39C]
	(No symbol) [0x00007FF704BD7153]
	(No symbol) [0x00007FF704BA0421]
	(No symbol) [0x00007FF704BA11B3]
	GetHandleVerifier [0x00007FF70508D71D+3223453]
	GetHandleVerifier [0x00007FF705087CC2+3200322]
	GetHandleVerifier 

KeyboardInterrupt: 

In [40]:
df= pd.DataFrame(data)
df.to_csv(f"data_{product_name}.csv")