## Tokopedia Scraper

### Dependencies

In [25]:
from datetime import datetime

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC


import time
import csv
import chromedriver_autoinstaller
from webdriver_manager.chrome import ChromeDriverManager

### Global Variables

In [26]:
# Desktop Chrome User-Agent strings that scrape() passes to Chrome's
# "Network.setUserAgentOverride" DevTools command.
useragentarray = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
]

## Scraper

In [None]:
def get_url(search_term):
    """Build the Tokopedia search URL template for a search term.

    The term is URL-encoded with ``quote_plus`` (spaces become ``+``, reserved
    characters such as ``&``, ``#``, ``{`` and ``}`` are percent-encoded), so
    it cannot break the query string or the page placeholder.

    Args:
        search_term: Free-text keyword(s) to search for.

    Returns:
        A URL string that still contains one literal ``{}`` placeholder for
        the page number; callers fill it in with ``url.format(page)``.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote_plus

    template = "https://www.tokopedia.com/search?q={}"
    url = template.format(quote_plus(search_term))

    # Left unformatted on purpose: the page number is substituted per request.
    url += '&page={}'

    return url

def _build_chrome_options():
    """Return the Chrome options that scrape() launches the browser with."""
    import os

    options = Options()

    # Reuse a saved Chrome profile so an existing login session is picked up
    # (the path is the "Profile Path" parent shown on chrome://version).
    # Only applied when the directory exists on this machine.
    user_data_dir = "C:/Users/Administrator/AppData/Local/Google/Chrome/User Data"
    if os.path.isdir(user_data_dir):
        options.add_argument(f"--user-data-dir={user_data_dir}")
        options.add_argument("--profile-directory=Default")

    options.add_argument('--remote-debugging-port=9222')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('disable-notifications')
    options.add_argument('--disable-infobars')

    # Reduce the automation hints Chrome exposes to the page.
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_experimental_option("prefs", {"profile.default_content_setting_values.notifications": 2})

    return options


def _extract_row(item):
    """Extract one product's fields from a result-card element.

    Args:
        item: A BeautifulSoup tag for one ``div.css-5wh65g`` card.

    Returns:
        ``[name, price, asset_url, link, store_id, location, store_name,
        rating]``, or ``None`` when any field other than rating is missing.
        ``rating`` falls back to ``0`` when the card shows none.
    """
    # NOTE(review): the class names below look auto-generated by the site and
    # will presumably change over time - confirm against the live markup.
    link_tag = item.find('a', href=True)
    link = link_tag['href'] if link_tag else None

    # The first path segment after the domain is used as the store identifier.
    store_link = link.replace('https://www.tokopedia.com/', '') if link else None
    store_id = store_link.split('/')[0] if store_link else None

    store_name_tag = item.find('span', class_='T0rpy-LEwYNQifsgB-3SQw== pC8DMVkBZGW7-egObcWMFQ== flip')
    store_name = store_name_tag.text if store_name_tag else None

    rating_tag = item.find('span', class_='_9jWGz3C-GX7Myq-32zWG9w==')
    rating = rating_tag.text if rating_tag else 0

    location_tag = item.find('span', class_='pC8DMVkBZGW7-egObcWMFQ== flip')
    location = location_tag.text if location_tag else None

    name_tag = item.find('span', class_='_0T8-iGxMpV6NEsYEhwkqEg==')
    name = name_tag.text if name_tag else None

    price_tag = item.find('div', class_='_67d6E1xDKIzw+i2D2L0tjw==')
    price = price_tag.text if price_tag else None

    # .get() so an <img> without a src skips this card instead of raising
    # and aborting the rest of the page.
    img_tag = item.find('img', alt='product-image')
    asset_url = img_tag.get('src') if img_tag else None

    if not (name and price and asset_url and link and store_id and location and store_name):
        return None

    return [name, price, asset_url, link, store_id, location, store_name, rating]


def scrape(keyword, start_page=1, end_page=2):
    """Scrape Tokopedia search results for a keyword and save them as CSV.

    Opens Chrome through Selenium, loads each search-result page from
    ``start_page`` to ``end_page`` (inclusive), parses the product cards with
    BeautifulSoup and writes every complete row to
    ``output/tokopedia_items__<keyword>_<timestamp>.csv``.

    A failure on one page is printed and the loop moves on to the next page.
    Scraping stops early when the "mature content" or "product not found"
    marker is present. The browser is always closed, even if writing the
    file fails.

    Args:
        keyword: Search term to look up.
        start_page: First result page to load (1-based).
        end_page: Last result page to load, inclusive.
    """
    import os

    request_url = get_url(keyword)

    # The configured options must reach the driver; creating a fresh
    # ChromeOptions() here would silently drop every setting above.
    options = _build_chrome_options()

    chromedriver_autoinstaller.install()
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    rows = []
    idx = 1

    try:
        for page in range(start_page, end_page + 1):
            try:
                # Pick one user agent per page and apply it BEFORE navigating,
                # so the override is in effect for this page's requests.
                if useragentarray:
                    user_agent = useragentarray[(page - start_page) % len(useragentarray)]
                    driver.execute_cdp_cmd(
                        "Network.setUserAgentOverride", {"userAgent": user_agent}
                    )

                driver.get(request_url.format(page))
                driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

                # Wait for the first product cards, then scroll in steps so the
                # remaining cards get a chance to render.
                WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "css-5wh65g")))
                driver.execute_script("""
                var scroll = document.body.scrollHeight / 3;
                var i = 0;
                function scrollit(i) {
                window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
                i++;
                if (i < scroll) {
                    setTimeout(scrollit, 500, i);
                    }
                }
                scrollit(i);
                """)

                # Give the timed scroll steps time to run before snapshotting.
                time.sleep(10)

                soup = BeautifulSoup(driver.page_source, "html.parser")

                mature_content_detected = soup.find('div', class_='css-mejlb3')
                product_not_found = soup.find('div', class_='css-z5s5at')
                if mature_content_detected or product_not_found:
                    print("Mature content or product not found detected. Exiting...")
                    break

                for item in soup.find_all('div', {'class': 'css-5wh65g'}):
                    fields = _extract_row(item)
                    if fields is None:
                        continue

                    rows.append([idx, *fields])
                    idx += 1
            except Exception as e:
                # Best effort: report the failing page and keep going.
                print(f"An error occurred on page {page}: {e}")

        # Make sure the target directory exists before opening the file.
        os.makedirs('output', exist_ok=True)
        output_file = f'output/tokopedia_items__{keyword}_{datetime.now().timestamp()}.csv'
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['no', 'name', 'price', 'asset_url', 'link', 'store_id', 'location', 'store_name', 'rating'])
            writer.writerows(rows)

        print(f"Scraping completed for {keyword}. Data saved to {output_file}")
    finally:
        # Always release the browser, including when the CSV write fails.
        driver.quit()

# Runner: this call executes as soon as the cell/module is run.
# Smaller example invocation, kept commented out:
# scrape('onitsuka', end_page=2)
# Scrape search-result pages 1 through 100 for "air jordan"
# (start_page is left at its default of 1).
scrape('air jordan', end_page=100)


An error occurred on page 50: Message: 

Scraping completed for air jordan. Data saved to output/tokopedia_items__air jordan_1745997993.602141.csv
