In [None]:
# install pandas, beautifulsoup4, lxml (the parser passed to BeautifulSoup) and selenium first if necessary
# imports
import re
import time
from urllib.parse import unquote

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [None]:
# I'm using chromedriver v96; if that does not match your Chrome version, download the chromedriver that does.
# Create the Chrome session. The chromedriver binary is referenced by the relative path "chromedriver.exe".
# NOTE(review): the `executable_path` keyword is presumably only accepted by older Selenium releases — confirm against the installed version.
driver = webdriver.Chrome(executable_path="chromedriver.exe")

In [None]:
# Seconds to sleep after a page has been requested and before its HTML is read,
# so that slow connections have time to finish loading the page.
pageLoadWait = 5

# Open one page of the handphone listing and return its HTML parsed with BeautifulSoup.
def getListProductPage(pageNum):
    """Load listing page ``pageNum`` in the shared Chrome session and parse it.

    Parameters:
        pageNum: page number appended to the listing URL's ``page`` query parameter.

    Returns:
        A BeautifulSoup document (lxml parser) built from the page source as it
        is ``pageLoadWait`` seconds after the END key was sent.
    """
    driver.get("https://www.tokopedia.com/p/handphone-tablet/handphone?page=" + str(pageNum))
    # find_element(By..., ...) is used instead of find_element_by_tag_name because
    # the find_element_by_* helpers are not available in newer Selenium releases.
    # Sending END jumps to the bottom of the page, presumably so that lazily
    # loaded product cards get rendered before the source is read.
    driver.find_element(By.TAG_NAME, "html").send_keys(Keys.END)

    time.sleep(pageLoadWait)
    return BeautifulSoup(driver.page_source, "lxml")

In [None]:
# Tokopedia has a promotion program: some links start with ta.tokopedia.com and wrap
# the real product URL in percent-encoded form. Those promotion links cannot be opened
# directly, so the standard www.tokopedia.com link is extracted from them.
def getLink(card):
    """Return the product URL for a listing card.

    Parameters:
        card: object whose ``"href"`` item is the card's link (e.g. a bs4 ``<a>`` tag).

    Returns:
        The href unchanged for ordinary links. For promotion links, the embedded
        ``www.tokopedia.com`` URL, percent-decoded and cut before its encoded
        ``?src`` query. If no embedded product URL can be located, the href is
        returned unchanged instead of raising.
    """
    link = card["href"]
    if not link.startswith("https://ta.tokopedia.com"):
        return link

    start = link.find("www.tokopedia.com")
    if start == -1:
        # Nothing to extract; hand back the link as found rather than crash the crawl.
        return link

    # The embedded URL normally ends where its own (encoded) "?src" query begins.
    end = link.find("%3Fsrc", start)
    if end == -1:
        # Fallback: stop at the next raw "&", which starts the following parameter
        # of the outer promotion URL.
        end = link.find("&", start)
    if end == -1:
        end = len(link)

    # unquote decodes every percent escape (any case), not only "%2F".
    return "https://" + unquote(link[start:end])

# The shop name is read from the listing card because the product page loads it asynchronously.
# Some cards show a shop image before the shop name, so two layouts are handled.
def getShopName(card):
    """Return the shop name text found inside a listing card.

    The lookup is positional: inside the "divProductWrapper" div, take the
    second child, then that child's third child. If the first child of that
    node has more than one child, the name is its second child; otherwise the
    name is the second child of the node's second child.
    """
    productWrapper = card.find("div", {"data-testid": "divProductWrapper"})
    infoBlock = list(productWrapper)[1]
    shopRows = list(list(infoBlock)[2])

    firstRow = list(shopRows[0])
    if len(firstRow) > 1:
        return firstRow[1].getText()
    return list(shopRows[1])[1].getText()

In [None]:
# Tokopedia tags elements with "data-testid" attributes for its automated testing; those tags are used as selectors here.
numberOfProduct = 100

# Maps product link -> shop name. Listing pages are read one after another
# until numberOfProduct distinct links have been collected.
data = {}
i = 1
while len(data) < numberOfProduct:
    countBefore = len(data)
    for card in getListProductPage(i).findAll("a", {"data-testid": "lnkProductContainer"}):
        data[getLink(card)] = getShopName(card)
        if len(data) == numberOfProduct:
            break
    # A page that adds nothing (no cards found, or only links already collected)
    # would otherwise keep this loop running forever, so stop with what we have.
    if len(data) == countBefore:
        print("page " + str(i) + " added no new products; stopping with "
              + str(len(data)) + " of " + str(numberOfProduct))
        break
    i+=1
data

In [None]:
# Helpers that extract single fields from a parsed product detail page.
def getName(body):
    """Return the text of the product-name heading (h1 "lblPDPDetailProductName")."""
    nameHeading = body.find("h1", {"data-testid": "lblPDPDetailProductName"})
    return nameHeading.getText()

def getDesc(body):
    """Return the product description as text, one line per string fragment.

    Every string inside the "lblPDPDescriptionProduk" div is emitted followed
    by a newline, so the result is empty when the div holds no strings.
    """
    descBlock = body.find("div", {"data-testid": "lblPDPDescriptionProduk"})
    return "".join(fragment + "\n" for fragment in descBlock.strings)

def getImg(body):
    """Return the "src" of the main product image.

    The lookup is positional inside the "PDPImageMain" div: first child, then
    that child's second child, then that node's first child, whose "src" item
    is returned.
    """
    imgContainer = body.find("div", {"data-testid": "PDPImageMain"})
    outerNode = list(imgContainer)[0]
    innerNode = list(outerNode)[1]
    imgTag = list(innerNode)[0]
    return imgTag["src"]

# A product may have no rating yet; in that case None is returned.
def getRating(body):
    """Return the product rating as a float, or None when the rating span is absent."""
    ratingText = body.find("span", {"data-testid": "lblPDPDetailProductRatingNumber"})
    if ratingText is None:
        return None
    return float(ratingText.getText())

def getPrice(body):
    """Return the product price as an int.

    Every character of the price label that is not a digit 0-9 (currency
    prefix, thousands separators, ...) is stripped before conversion.
    """
    priceNode = body.find("div", {"data-testid": "lblPDPDetailProductPrice"})
    digitsOnly = re.sub('[^0-9]', '', priceNode.getText())
    return int(digitsOnly)

In [None]:
# The description is only available on the detail page, so each collected product link is opened in turn.

columns = ["name","desc","imgUrl","price","rating","storeName"]
# Extractors for the first five columns, in column order; storeName comes from `data`.
detailGetters = (getName, getDesc, getImg, getPrice, getRating)

products = []
for link in data:
    driver.get(link)
    time.sleep(pageLoadWait)
    content = driver.page_source
    soup = BeautifulSoup(content, "lxml")
    row = [extract(soup) for extract in detailGetters]
    row.append(data[link])
    products.append(row)
products

In [None]:
# Build the result table: one row per scraped product, with the column names taken from `columns`.
df = pd.DataFrame(products, columns=columns)
df

In [None]:
# Write the table to scrapResult.csv using ';' as the field separator.
df.to_csv(r'scrapResult.csv', sep = ';')