**JLCPCB Component Code Scraper**
=====================================

### Step 1: Load Existing Component Codes

* Reads the component codes already stored in `scraped/ComponentList.csv` so that codes found in earlier runs are not added again.

### Step 2: API Request Loop

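* Sends a POST request to JLCPCB's API for each page, starting at page 1 and continuing until a page comes back with no component codes (pages 1-47 when this was written).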
* Extracts component codes from the response using regular expressions.

### Step 3: Save New Component Codes

* Adds new component codes to the list.
* Sorts the list and, if any new codes were found, saves it back to `scraped/ComponentList.csv`.

### Notes

* The API request loop pauses for 3 seconds between requests to avoid rate limiting.
* The script uses the `requests` library for API requests and `csv` library for file I/O.
* The regular expression `r'"componentCode":"C(\d+)"'` pulls the numeric part of each component code straight out of the raw response text rather than parsing the JSON (crude and inefficient, but it works).

In [5]:
import requests
import re
import csv
import time
import random
# Endpoint the script POSTs its paginated component-list queries to.
url = "https://jlcpcb.com/api/overseas-pcb-order/v1/shoppingCart/smtGood/selectSmtComponentList/v2"

# Browser-like request headers sent with every POST.
# NOTE(review): "Accept-Encoding" advertises br/zstd; presumably the HTTP
# client can only decode those when the matching optional packages are
# installed -- confirm if responses ever come back unreadable.
headers = {
    "Host": "jlcpcb.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0",
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "en-GB,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Content-Type": "application/json",
    "Origin": "https://jlcpcb.com",
    "DNT": "1",
    "Connection": "keep-alive",
    "Referer": "https://jlcpcb.com/parts/basic_parts",
    # The original literal listed "Sec-Fetch-Dest" twice; a dict keeps only
    # one entry per key, so the duplicate line was dead and has been removed.
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "Priority": "u=0",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
}

# Load the component codes saved by a previous run so that only codes not
# already on disk are counted as new. Each data row holds one integer in its
# first column; the first row of the file is a header and is skipped.
component_codes: list[int] = []

try:
    with open("scraped/ComponentList.csv", "r", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip the header row; the default avoids StopIteration on an empty file
        for row in reader:
            if not row or not row[0].strip():
                continue  # csv.reader yields [] for blank lines; nothing to parse
            component_codes.append(int(row[0]))
except FileNotFoundError:
    # First run: nothing has been scraped yet, so start from an empty list
    # instead of crashing before the scrape can begin.
    print("scraped/ComponentList.csv not found; starting with an empty component list")

# Walk the paginated listing from page 1 until a successful response contains
# no component codes, appending every code not already known to
# component_codes. A failed request (network error or non-200 status) is
# retried on the same page; after MAX_CONSECUTIVE_FAILURES failures in a row
# the loop gives up and keeps whatever was collected so far.
COMPONENT_CODE_PATTERN = re.compile(r'"componentCode":"C(\d+)"')
REQUEST_TIMEOUT_SECONDS = 30  # never hang forever on a stalled connection
MAX_CONSECUTIVE_FAILURES = 3

known_codes = set(component_codes)  # O(1) duplicate checks instead of scanning the list
empty_page = False
page = 1
total_unseen_components = 0
consecutive_failures = 0

while not empty_page:
    data = {
        "currentPage": page,
        "pageSize": 100,
        "keyword": None,
        "componentLibraryType": "base",
        "preferredComponentFlag": True,
        "stockFlag": None,
        "stockSort": None,
        "firstSortName": None,
        "secondSortName": None,
        "componentBrand": None,
        "componentSpecification": None,
        "componentAttributes": [],
        "searchSource": "search",
    }

    try:
        response = requests.post(
            url, headers=headers, json=data, timeout=REQUEST_TIMEOUT_SECONDS
        )
    except requests.RequestException as exc:
        print(f"Page {page}: request failed ({exc})")
        request_ok = False
    else:
        print(f"Page {page}: {response.status_code} {response.headers}")
        request_ok = response.status_code == 200

    if request_ok:
        consecutive_failures = 0
        unseen_components = 0

        # Only the digits after the "C" prefix are captured and stored.
        page_components = COMPONENT_CODE_PATTERN.findall(response.text)
        for component in page_components:
            component_num = int(component)
            if component_num not in known_codes:
                known_codes.add(component_num)
                component_codes.append(component_num)
                unseen_components += 1
        total_unseen_components += unseen_components

        print(f"\tFound {len(page_components)} Components, {unseen_components} Unseen Components")
        if not page_components:
            empty_page = True
        page += 1
    else:
        # Previously a failed request either raised NameError (first page) or
        # silently reused the previous page's matches, which skipped the page
        # and could loop forever on a persistent error.
        consecutive_failures += 1
        if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
            print(f"\tGiving up on page {page} after {consecutive_failures} consecutive failures")
            break
        print(f"\tRetrying page {page} (failure {consecutive_failures} of {MAX_CONSECUTIVE_FAILURES})")

    time.sleep(3)  # Pause (try to not get rate limited)
    
# Put the merged list into ascending order, then rewrite the CSV only when
# this run actually discovered codes that were not already known.
component_codes.sort()

if total_unseen_components > 0:
    with open("scraped/ComponentList.csv", "w", newline="") as csv_file:
        out = csv.writer(csv_file)
        out.writerow(["lcsc"])  # header
        for code in component_codes:
            out.writerow([code])  # one code per row, single column

Page 1: 200 {'Content-Type': 'application/json;charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Server': 'nginx/1.22.0', 'Date': 'Tue, 05 Nov 2024 22:45:08 GMT', 'Cache-Control': 'no-cache,no-store,must-revalidate', 'Expires': 'Thu, 01 Jan 1970 00:00:00 GMT', 'J-Trace-UA': 'ua:jQrM5Q78lW9CPN', 'J-Trace-ID': '40b5376b902641d99087a9dcd0bd4d17', 'Set-Cookie': 'XSRF-TOKEN=ff9e7d77-0772-4bb2-8ae2-b78d80271aed; Max-Age=1800; Expires=Tue, 05 Nov 2024 23:15:07 GMT; Path=/, jlc_s=A; Max-Age=150196224; Expires=Fri, 10 Aug 2029 07:55:31 GMT; Domain=jlcpcb.com; Path=/, JSESSIONID=2E15FE29DCEF0388576EE69443386297; Path=/; Secure; HttpOnly, JLCPCB_SESSION_ID=ae40dc25-4b93-440e-98fa-8f2374bfae30; Path=/; Secure; SameSite=None; HttpOnly', 'Pragma': 'no-cache', 'X-Cache': 'Miss from cloudfront', 'Via': '1.1 0dc09f5a855c4ba90473817ffe2625a4.cloudfront.net (CloudFront)', 'X-Amz-Cf-Pop': 'AKL50-C1', 'X-Amz-Cf-Id': 'w54EV9CgQC9tIyi_EVbRs76IhQ6rhB6QeINKAn2Wi92CN6zJYaJxAQ==', 'x-x