In [1]:
# Install pandas, beautifulsoup4, lxml (the parser used below) and selenium beforehand if necessary.
# imports
import time
import re

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

from bs4 import BeautifulSoup

import pandas as pd

In [2]:
# Create the Chrome session that the scraping cells below drive through the global `driver`.
# This notebook was written against chromedriver v96; if Chrome fails to start, download the
# chromedriver build that matches your installed Chrome version and place it next to the notebook.
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a Service object —
# confirm against the installed Selenium version.
# NOTE(review): no `driver.quit()` appears in the visible cells; close the session manually when done.
driver = webdriver.Chrome(executable_path="chromedriver.exe")

In [3]:
# Seconds to wait between requesting a page and reading its HTML, to cope with slow connections.
pageLoadWait = 5

def getListProductPage(pageNum):
    """Load one page of the handphone listing and return it parsed with BeautifulSoup.

    pageNum: 1-based page number appended to the listing URL as ``?page=``.

    Returns a BeautifulSoup document built from the browser's current page source
    using the ``lxml`` parser.
    """
    driver.get("https://www.tokopedia.com/p/handphone-tablet/handphone?page=" + str(pageNum))
    # `find_element(by, value)` exists in both Selenium 3 and 4, whereas the
    # `find_element_by_*` helpers were removed in Selenium 4.3. "tag name" is the
    # value of `By.TAG_NAME`, spelled out so no extra import is needed.
    # Pressing END scrolls to the bottom of the page before the wait below.
    driver.find_element("tag name", "html").send_keys(Keys.END)

    time.sleep(pageLoadWait)
    return BeautifulSoup(driver.page_source, "lxml")

In [4]:
def getLink(card):
    """Return the direct product URL for a product card.

    Tokopedia has a promotion program: some card links start with
    ``https://ta.tokopedia.com`` and carry the real product URL percent-encoded
    inside them. Those promotion links cannot be opened directly, so the
    embedded ``www.tokopedia.com`` URL is extracted instead. Any other link is
    returned unchanged.

    card: mapping-like object whose ``"href"`` entry holds the link.

    Raises ValueError when a promotion link contains no embedded
    ``www.tokopedia.com`` URL.
    """
    link = card["href"]
    if not link.startswith("https://ta.tokopedia.com"):
        return link

    start = link.find("www.tokopedia.com")
    if start == -1:
        raise ValueError("promotion link has no embedded www.tokopedia.com URL: " + link)

    # Search only after the embedded URL begins, so an earlier occurrence of the
    # marker cannot produce an empty slice.
    end = link.find("%3Fsrc", start)
    if end == -1:
        # No "?src" marker: cut at the first encoded "?" or at the "&" that
        # starts the next parameter of the promotion link, whichever comes first.
        # NOTE(review): assumes the embedded URL is a single query-parameter value — confirm.
        boundaries = [pos for pos in (link.find("%3F", start), link.find("&", start)) if pos != -1]
        end = min(boundaries) if boundaries else len(link)

    return "https://" + link[start:end].replace('%2F', '/')

def getShopName(card):
    """Return the shop name shown on a product card of the listing page.

    The name is read from the listing page because the product page loads it
    asynchronously. The lookup is positional: inside the element tagged
    ``divProductWrapper`` it takes the second child, then that child's third
    child. Some cards show a shop image before the name, so the first row is
    used when it has more than one child, and the second row otherwise.
    """
    productWrapper = card.find("div", {"data-testid": "divProductWrapper"})
    infoBlock = list(productWrapper)[1]
    shopRows = list(list(infoBlock)[2])

    firstRow = list(shopRows[0])
    if len(firstRow) > 1:
        return firstRow[1].getText()
    return list(shopRows[1])[1].getText()

In [5]:
# Tokopedia tags elements with "data-testid" attributes for its automated testing;
# those attributes are used here as stable lookup keys.
numberOfProduct = 100

# Maps product link -> shop name; keying by link de-duplicates products across pages.
data = {}
i = 1
while len(data) < numberOfProduct:
    cards = getListProductPage(i).findAll("a", {"data-testid": "lnkProductContainer"})
    countBefore = len(data)
    for card in cards:
        data[getLink(card)] = getShopName(card)
        if len(data) == numberOfProduct:
            break
    if len(data) == countBefore:
        # The page added nothing new (no cards found, or only already-seen links).
        # Without this guard the loop would keep requesting pages forever.
        print("Stopped at page " + str(i) + ": no new products found ("
              + str(len(data)) + " of " + str(numberOfProduct) + " collected)")
        break
    i += 1
data

{'https://www.tokopedia.com/littlemaster/hp-samsung-e1272-termurah-original-jadul-samsung-lipat-dual-sim-hitam': 'Little Master.shop',
 'https://www.tokopedia.com/kkeep/nokia-jadul-105-dual-sim-2017-hitam': 'Kkeep',
 'https://www.tokopedia.com/trustphonestore/oppo-f5-4-64gb-terpercaya-garansi-1tahun-4-64gb-black': 'Trust Phonestore',
 'https://www.tokopedia.com/sunmobile22/iphone-xr-second-red-64-gb': 'sun mobile',
 'https://www.tokopedia.com/markasapple/iphone-13-pro-max-128gb-256gb-512gb-1tb-garansi-resmi-ibox-sierra-blue-128gb': 'MARKAS APPLE',
 'https://www.tokopedia.com/ptpratamasemesta/iphone-11-128gb-garansi-resmi-tam-ibox-purple-64-new-packed?whid=0': 'PT Pratama Sntra Semesta',
 'https://www.tokopedia.com/xiaomi/xiaomi-poco-f3-8-256gb-nfc-snapdragon-870-48mp-4520mah-hp-android-deep-ocean-blue?whid=0': 'Xiaomi Official Store',
 'https://www.tokopedia.com/shopyzid/samsung-galaxy-a32-5g-nfc-8gb-128gb-8-128-garansi-resmi-awesome-black?whid=0': 'Shopyz ID',
 'https://www.tokopedia.

In [6]:
# Helper functions that extract individual fields from a parsed product detail page
def getName(body):
    """Return the text of the ``h1`` tagged ``lblPDPDetailProductName`` in the parsed page."""
    nameTag = body.find("h1", {"data-testid": "lblPDPDetailProductName"})
    return nameTag.getText()

def getDesc(body):
    """Return the product description with every text fragment followed by a newline.

    The fragments come from the ``strings`` of the ``div`` tagged
    ``lblPDPDescriptionProduk``; an element with no fragments yields ``""``.
    """
    descBlock = body.find("div", {"data-testid": "lblPDPDescriptionProduk"})
    return "".join(fragment + "\n" for fragment in descBlock.strings)

def getImg(body):
    """Return the ``src`` of the main product image.

    The lookup is positional inside the ``div`` tagged ``PDPImageMain``:
    first child, then its second child, then that child's first child.
    """
    imgContainer = body.find("div", {"data-testid": "PDPImageMain"})
    outerChild = list(imgContainer)[0]
    innerChild = list(outerChild)[1]
    imgTag = list(innerChild)[0]
    return imgTag["src"]

def getRating(body):
    """Return the product rating as a float, or None when the page has no rating element."""
    ratingTag = body.find("span", {"data-testid": "lblPDPDetailProductRatingNumber"})
    if ratingTag is None:
        # Products without a rating have no such element on the page.
        return None
    return float(ratingTag.getText())
def getPrice(body):
    """Return the product price as an int built from the digits of the price label.

    Every character other than 0-9 is removed from the text of the ``div``
    tagged ``lblPDPDetailProductPrice`` before conversion.
    """
    priceTag = body.find("div", {"data-testid": "lblPDPDetailProductPrice"})
    digitsOnly = re.sub('[^0-9]', '', priceTag.getText())
    return int(digitsOnly)

In [7]:
# The description is only read from the product page, so every collected link is
# opened in turn and one row per product is built in the order given by `columns`.

columns = ["name","desc","imgUrl","price","rating","storeName"]
products = []
for link, storeName in data.items():
    driver.get(link)
    time.sleep(pageLoadWait)  # give the page time to load before reading its source
    content = driver.page_source
    soup = BeautifulSoup(content, "lxml")
    row = [getName(soup), getDesc(soup), getImg(soup), getPrice(soup), getRating(soup), storeName]
    products.append(row)
products

[['hp samsung e1272 termurah original jadul samsung lipat dual sim - Hitam',
  'Samsung dual card dual standby ponsel flip 1272\nUntuk persyaratan warna, harap sertakan dalam instruksi pembelian\nJika tidak, kami akan mengirimkannya secara acak.\nBarang yang diterima memiliki garansi toko satu bulan.\nGaransi, tidak termasuk charger, baterai, kotak\nDan kerusakan yang disebabkan oleh operasi yang salah\nKami telah memeriksa sebelum pengiriman\nPaket termasuk:\n1 * ponsel\n1 * pengisi daya\n1 * baterai\n1 * Manual\n1 kotak\nUmur simpan: 1 bulan\nteknologi jaringan\nGlobal\nDirilis pada kuartal keempat tahun 2013\nStatus yang tersedia. Dirilis pada kuartal keempat tahun 2013\nUkuran tubuh 95 x 46,5 x 18 mm (3,74 x 1,83 x 0,71 inci)\nBerat 82,9 gram (2,93 ons)\nKartu SIM ganda (STANDAR-SIM, siaga ganda)\nJenis tampilan TFT, 65 ribu warna\nUkuran 1,77 inci, 9,9 sentimeter persegi (rasio layar-ke-tubuh sekitar 22,3%)\nResolusi 128 x 160 piksel (~ kepadatan 116 ppi)\nPlatform CPU 208MHz\nSlo

In [10]:
# Assemble the scraped rows into a DataFrame, labelling each row position with the
# matching name from `columns`; the bare `df` on the last line displays the table.
df = pd.DataFrame(products, columns=columns)
df

Unnamed: 0,name,desc,imgUrl,price,rating,storeName
0,hp samsung e1272 termurah original jadul samsu...,Samsung dual card dual standby ponsel flip 127...,https://images.tokopedia.net/img/cache/500-squ...,153500,4.6,Little Master.shop
1,nokia jadul 105 dual sim 2017 - Hitam,Detail produk dari NOKIA 105 DUAL SIM (TA-1174...,https://images.tokopedia.net/img/cache/500-squ...,119000,4.8,Kkeep
2,OPPO F5 4/64GB TERPERCAYA GARANSI 1TAHUN - 4/6...,**BUDAYAKAN MEMBACA DESKRIPSI SEBELUM MEMBELI*...,https://images.tokopedia.net/img/cache/500-squ...,999000,4.9,Trust Phonestore
3,"IPHONE XR SECOND - Red, 64 gb",SELAMAT DATANG DI Sun MOBILE\nHarap Menanyaka...,https://images.tokopedia.net/img/cache/500-squ...,4750000,4.9,sun mobile
4,iPhone 13 Pro Max 128GB 256GB 512GB 1TB Garans...,MARKAS APPLE\nMarkas apple memberikan solusi u...,https://images.tokopedia.net/img/cache/500-squ...,21499000,4.8,MARKAS APPLE
...,...,...,...,...,...,...
95,SAMSUNG Galaxy A32 [8/128GB] - Garansi Resmi S...,SAMSUNG Galaxy A32 [8/128GB] - Garansi Resmi S...,https://images.tokopedia.net/img/cache/500-squ...,3625000,5.0,Rejodadi Surabaya
96,INFINIX SMART 5 RAM 2 32 GARANSI RESMI - NON GIFT,"GARANSI RESMI INFINIX INDONESIA,IMEI TERDAFTAR...",https://images.tokopedia.net/img/cache/500-squ...,1299000,5.0,Bintang Terang Cell
97,REDMI NOTE 10S RAM 8 128 NFC RAM 6 64 NFC GARA...,"GARANSI RESMI XIAOMI INDONESIA,IMEI TERDAFTAR\...",https://images.tokopedia.net/img/cache/500-squ...,2675000,4.9,Bintang Terang Cell
98,Xiaomi Poco X3 Pro Ram 8 Rom 256 GB Garansi Re...,NETWORK Technology\nGSM / HSPA / LTE\nLAUNCH A...,https://images.tokopedia.net/img/cache/500-squ...,3833000,4.9,Best Phone Cell


In [12]:
# Save the table as a semicolon-separated CSV in the working directory.
# The DataFrame index is written as the first, unnamed column (pandas default).
df.to_csv("scrapResult.csv", sep=";")