# Tools Scrape Tokped

Cara kerja:
1. Sistem membuka chrome lalu mengakses url tokped berdasarkan keyword yang dicari
2. Sistem akan mengambil data produk sebanyak 80 data
3. Data disimpan kedalam file csv
4. Chromedriver dimatikan

In [5]:
#selenium digunakan untuk melakukan crawling dan scrap di website tokopedia nantinya menggunakan chrome
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import urllib.parse
import re
import time
import csv
import uuid

In [6]:
class Tokopedia:
    #fungsi init digunakan untuk melakukan setup chromedriver yang digunakan untuk melakukan crawl dan scrap
    def __init__(self, chromedriver, headless=True) -> None:
        self.driver = self.setup(chromedriver, headless)
        self.data = [] #initiate list kosong untuk menampung data json hasil scrap

    #fungsi yang digunakan untuk melakuakn 
    def setup(self, chromedriver, headless):
        #atur opsi drive dan menonaktifkan logging
        opt = webdriver.ChromeOptions()
        opt.add_experimental_option('excludeSwitches', ['enable-logging'])
        #akan aktif jika headless == True
        if headless:
            opt.add_argument("--headless") #Menjalankan Chrome dalam mode headless, tidak akan ada jendela GUI yang ditampilkan
            opt.add_argument("--window-size=2560,1440") #ukuran windows
            opt.add_argument('--ignore-certificate-errors') #Mengabaikan kesalahan sertifikat SSL yang invalid
            opt.add_argument('--allow-running-insecure-content') #Mengizinkan pengisian konten yang tidak aman
            opt.add_argument("--disable-extensions") #Menonaktifkan ekstensi pada Chrome
            opt.add_argument("--proxy-server='direct://'") #server proksi 
            opt.add_argument("--proxy-bypass-list=*") #daftar alamat yang dilewatkan oleh proksi
            opt.add_argument("--start-maximized") #Memulai browser dengan jendela yang dimaksimalka
            opt.add_argument('--disable-gpu') #nonaktifkan penggunaan GPU oleh Chrome.
            opt.add_argument('--disable-dev-shm-usage') #nonaktifkan penggunaan memori bersama /dev/shm oleh Chrome
            opt.add_argument('--no-sandbox') #menjalankan Chrome dalam mode tanpa sandbox.
            user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36' #initiate user agent
            opt.add_argument(f'user-agent={user_agent}') #atur user agent

        return webdriver.Chrome(executable_path=chromedriver, options=opt)

    #fungsi yang melakukan scraping nama produk, harga, lokasi, rating dan jumlah terjual
    def get_details(self, detail_container, category):
        # Scrape to get all parameters
        detail = dict() #initate dictionary untuk menyimpan produk yang discrape
        detail['category'] = category
        # scraping nama produk
        try:
            name = detail_container.find_element(
                By.XPATH, ".//div[@data-testid='spnSRPProdName']").get_attribute("innerHTML") #mencari elemen html dengan class yang sama
            detail['name'] = name
        except Exception as e:
            detail['name'] = None

        # scraping harga 
        try:
            price = detail_container.find_element(
                By.XPATH, ".//div[@data-testid='spnSRPProdPrice']").get_attribute("innerHTML") #mencari elemen html dengan class yang sama
            price = float(re.sub('[^0-9]', '', price))
            detail['price'] = price
        except Exception as e:
            detail['price'] = None

        # scraping lokasi penjual
        try:
            location = detail_container.find_element(
                By.XPATH, ".//span[@data-testid='spnSRPProdTabShopLoc']").get_attribute("innerHTML") #mencari elemen html dengan class yang sama
            detail['location'] = location
        except Exception as e:
            detail['location'] = None

        # scraping rating produk
        try:
            rating = detail_container.find_element(By.XPATH, ".//*[contains(text(),'Terjual')]").find_element(
                By.XPATH, "preceding-sibling::span[2]").get_attribute("innerHTML") #mencari elemen html dengan class yang sama
            rating = float(rating) #diubah ke float
            detail['rating'] = rating
        except Exception as e:
            detail['rating'] = None

        # Scraping jumlah produk terjual, kalau tidak maka NULL
        try:
            sold = detail_container.find_element(
                By.XPATH, ".//span[contains(text(),'Terjual')]").get_attribute("innerHTML") #mencari elemen html dengan class yang sama
            if ("rb" in sold):
                sold = int(re.sub('[^0-9]', '', sold)) 
                sold = sold * 1000
            else:
                sold = int(re.sub('[^0-9]', '', sold))
            detail['sold'] = sold
        except Exception as e:
            detail['sold'] = None

        return detail

    #fungsi yang digunakan untuk melakukan scraping data dari website tokopedia
    def search(self, cat):
        self.data = []
        #cat = category atau keyword yang dicari
        url_safe_cat = urllib.parse.quote(cat)
        url = f"https://www.tokopedia.com/search?st=product&q={url_safe_cat}" #initiate url tokped
        print(f'Scraping for category {cat}..')
        self.driver.get(url) #get url menggunakan chrome

        for i in range(3):
            time.sleep(1)
            containers = WebDriverWait(self.driver, 100).until(EC.presence_of_all_elements_located(
                (By.XPATH, "//div[@data-testid='master-product-card']")))
            print(f"Found {len(containers)} items..")

            for index, container in enumerate(containers):
                detail_container = container.find_element(By.TAG_NAME, "div").find_element(
                    By.TAG_NAME, "div").find_elements(By.XPATH, "./div")[1].find_element(By.TAG_NAME, "a")
                details = self.get_details(detail_container, cat)

                #scrap url produk
                try:
                    links = container.find_element(
                        By.XPATH, './/a[contains(@href, "tokopedia.com")]')
                    url = links.get_attribute("href")
                    decoded_uri = urllib.parse.unquote(
                        url).split("?")[0]
                    details['url'] = decoded_uri
                    self.data.append(details)
                except Exception:
                    details['url'] = None
                    
                #scrap image produk
                try:
                    image = container.find_element(
                        By.XPATH, './/img[contains(@src, "images.tokopedia")]')
                    details['image'] = image.get_attribute("src")
                except:
                    details['image'] = None
                    print("here")
                self.driver.execute_script("window.scrollTo(0, 1000);")

        #masukkan hasil scrape ke list di def init
        self.data = [dict(t) for t in {tuple(d.items())
                                       for d in self.data} if 'name' in dict(t)]

        return self.data

    #buat file csv untuk hasil scrape
    def to_csv(self, filename):
        if len(self.data) == 0:
            print("No data available to export.")
            return

        # Menulis data ke dalam file CSV
        try:
            with open(filename, 'w', newline='', encoding='utf-8') as file:
                fieldnames = self.data[0].keys()
                writer = csv.DictWriter(file, fieldnames=fieldnames)

                writer.writeheader()
                writer.writerows(self.data)

            print(f"Data telah berhasil disimpan dalam file {filename}.")
        except Exception as e:
            print(f"Error: {str(e)}")

    def close_connection(self):
        self.driver.close()

In [7]:
if __name__ == '__main__':
    tokopedia = Tokopedia(
        "resources/chromedriver/chromedriver.exe", headless=False)

    items = tokopedia.search("PS4 Games")
    print(items)

    tokopedia.to_csv("data2.csv")

    tokopedia.close_connection()

  return webdriver.Chrome(executable_path=chromedriver, options=opt)


Scraping for category PS4 Games..
Found 10 items..
Found 80 items..
Found 80 items..
[{'category': 'PS4 Games', 'name': 'Kaset BD Games PS4 Second', 'price': 250000.0, 'location': 'Depok', 'rating': 4.8, 'sold': 22, 'url': 'https://www.tokopedia.com/minifigure/kaset-bd-games-ps4-second', 'image': 'https://images.tokopedia.net/img/cache/200-square/VqbcmM/2023/3/28/d7698181-2929-4a62-9075-166217d60597.jpg'}, {'category': 'PS4 Games', 'name': 'Dying Light Game PS 4', 'price': 160000.0, 'location': 'Kab. Bandung Barat', 'rating': 4.9, 'sold': 30, 'url': 'https://www.tokopedia.com/gamedepot-1/dying-light-game-ps-4', 'image': 'https://images.tokopedia.net/img/cache/200-square/VqbcmM/2020/5/15/d445ec11-3471-4d1d-bded-d82749496436.jpg'}, {'category': 'PS4 Games', 'name': "BD Kaset Game PS4 Crash Bandicoot 4 Its It's About Time PS 4", 'price': 379000.0, 'location': 'Bekasi', 'rating': 5.0, 'sold': 7, 'url': 'https://www.tokopedia.com/hariahh/bd-kaset-game-ps4-crash-bandicoot-4-its-it-s-about-ti