Import libraries

In [1]:
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

URL (link) - Lazada

In [2]:
# Entry point for the scraper: page 1 of a Lazada Vietnam category listing
# ("dien-thoai-di-dong"). The `spm` query parameter looks like a tracking
# token copied from the browser -- presumably not required; TODO confirm.
URL = "https://www.lazada.vn/dien-thoai-di-dong/?page=1&spm=a2o4n.searchlist.cate_1.1.5bb94c9aoYhimf"

Define extracting functions

In [3]:
# Function to extract Product Title
def get_title(soup):
    """Return the product title from a parsed product-detail page.

    Looks up the first <h1> whose class is "pdp-mod-product-badge-title" and
    returns its text with surrounding whitespace removed. If the lookup
    raises AttributeError (for example because no such tag exists and
    `find` returned None), an empty string is returned instead.
    """
    try:
        heading = soup.find("h1", attrs={"class": "pdp-mod-product-badge-title"})
        return heading.text.strip()
    except AttributeError:
        # Missing tag (or an object without the expected attributes).
        return ""

# Function to extract Product Price
def get_price(soup):
    """Return the product price string from a parsed product-detail page.

    Two <span> class combinations are tried in order:

    1. the "pdp-price_type_deleted" span (presumably the struck-through
       list price shown when a discount is active -- TODO confirm), then
    2. the "pdp-price_type_normal" span (the regular displayed price).

    The first span that exists and has a single string child wins; its text
    is returned stripped of surrounding whitespace. If neither lookup
    succeeds, an empty string is returned.

    Only AttributeError is treated as "not found" (a missing tag makes
    `find` return None, and a tag with several children has `.string` set
    to None). Other exceptions, including KeyboardInterrupt, propagate
    rather than being silently swallowed.
    """
    candidate_classes = (
        'notranslate pdp-price pdp-price_type_deleted pdp-price_color_lightgray pdp-price_size_xs',
        'notranslate pdp-price pdp-price_type_normal pdp-price_color_orange pdp-price_size_xl',
    )

    for css_class in candidate_classes:
        try:
            return soup.find("span", attrs={'class': css_class}).string.strip()
        except AttributeError:
            # This variant is absent on the page; fall through to the next.
            continue

    return ""

# Function to extract Brand Title
def get_brand(soup):
    """Return the brand name from a parsed product-detail page.

    Looks up the first <a> tag carrying the brand-link class combination and
    returns its text with surrounding whitespace removed. If the lookup
    raises AttributeError (for example because no such link exists and
    `find` returned None), an empty string is returned instead.
    """
    try:
        brand_link = soup.find(
            "a",
            attrs={"class": "pdp-link pdp-link_size_s pdp-link_theme_blue pdp-product-brand__brand-link"},
        )
        return brand_link.text.strip()
    except AttributeError:
        # No brand link on this page.
        return ""

# Function to extract Number of User Reviews
def get_review_count(soup):
    """Return the review-summary link text from a parsed product-detail page.

    Looks up the first <a> tag carrying the review-summary class combination
    and returns its `.string` with surrounding whitespace removed. If the
    lookup raises AttributeError (no such link, or the link has no single
    string child so `.string` is None), an empty string is returned instead.
    """
    review_link_class = 'pdp-link pdp-link_size_s pdp-link_theme_blue pdp-review-summary__link'
    try:
        review_link = soup.find("a", attrs={'class': review_link_class})
        return review_link.string.strip()
    except AttributeError:
        return ""

Extract product information and save it to a CSV file

In [4]:

# Scrape every product linked from the listing page at `URL` (defined in the
# configuration cell above) and collect title / brand / price / review count
# into `lazada_df`, which is also written to "lazada_data.csv".

# Column-oriented accumulator: one list per output column, appended in lockstep.
# Created before the `try` so the `finally` clause can always build a frame.
d = {"title": [], "brand": [], "price": [], "reviews": []}

# Set up WebDriver. Selenium 4 expects the driver binary to be supplied through
# a Service object rather than as a positional path argument.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

try:
    # The implicit wait is a session-wide setting, so it is configured once.
    # NOTE(review): it only applies to element lookups made through the driver;
    # it does not delay `page_source`, so content rendered late by JavaScript
    # may be missing from the parsed HTML. Consider an explicit wait if the
    # scrape comes back empty.
    driver.implicitly_wait(10)

    # Open the listing page and parse whatever the browser has rendered.
    driver.get(URL)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "html.parser")

    # Divs with class 'RfADt' wrap the link to each product's detail page.
    divs = soup.find_all("div", attrs={'class': 'RfADt'})

    # Resolve each href against the listing URL. This handles scheme-relative
    # ("//host/path"), root-relative ("/path") and already-absolute links,
    # whereas blindly prefixing "https:" would corrupt the latter two.
    links = [
        urljoin(URL, a.get('href'))
        for div in divs
        for a in div.find_all("a")
        if a.get('href')
    ]

    # Visit every product page and extract its fields.
    for link in links:
        driver.get(link)

        page_source_detail = driver.page_source
        soup_detail = BeautifulSoup(page_source_detail, "html.parser")

        d['title'].append(get_title(soup_detail))
        d['brand'].append(get_brand(soup_detail))
        d['price'].append(get_price(soup_detail))
        d['reviews'].append(get_review_count(soup_detail))
finally:
    print("Scraping done!")
    # Close the browser
    driver.quit()

    # Build the frame once from whatever was collected. Doing this in `finally`
    # keeps rows gathered before a mid-run failure, and guarantees `lazada_df`
    # exists for the next cell even when no links were found.
    lazada_df = pd.DataFrame.from_dict(d)
    # Assign the result back instead of calling replace(..., inplace=True) on
    # the column selection: that chained in-place form may not update the
    # frame (and never does under pandas Copy-on-Write).
    lazada_df['title'] = lazada_df['title'].replace('', np.nan)
    lazada_df = lazada_df.dropna(subset=['title'])  # drop products with no title

    # Only touch the CSV when at least one product page was scraped, so a
    # failed run does not overwrite a previous export with an empty file.
    if d['title']:
        lazada_df.to_csv("lazada_data.csv", header=True, index=True)


Scraping done!


In [5]:
lazada_df

Unnamed: 0,title,brand,price,reviews
0,[Sale 15.08] Điện Thoại OPPO A58 (6GB/128GB) -...,OPPO,4.990.000 ₫,1132 đánh giá
1,[GIÁ SỈ] Pin điện thoại 5 / 5S / 5C / 6 / 6S /...,Xiaomi,139.000 ₫,31 đánh giá
2,Điện thoại Tecno SPARK GO 2024 (4+4)GB/64GB - ...,TECNO,2.590.000 ₫,15 đánh giá
3,Điện Thoại Samsung Galaxy A05s (4GB/128GB,Samsung,3.999.000 ₫,5 đánh giá
4,Samsung Galaxy A54 5G 8GB/128GB Chính Hãng,Samsung,8.990.000 ₫,4 đánh giá
5,"Điện thoại ZTE Blade A52 3GB l 64GB, Pin 5000m...",ZTE,2.490.000 ₫,12 đánh giá
6,[GIỮA THÁNG SALE TO TỪ 20H 14.08 ] iPhone 15 P...,Apple,25.990.000 ₫,324 đánh giá
7,phone-vn.com Điện Thoại Người Già Good X6 4G ...,Good,305.000 ₫,1 đánh giá
8,[Trả góp 0%] Apple iPhone 15 Pro Max 256GB Chí...,Apple,35.490.000 ₫,11 đánh giá
9,Điện Thoại ZTE Blade V50 Design | 18GB(8GB+10G...,ZTE,2.990.000 ₫,17 đánh giá
