**JLCPCB Component Code Scraper**
=====================================

### Step 1: Load Existing Component Codes

* Reads existing component codes from `ComponentList.csv` to avoid duplicates.

### Step 2: API Request Loop

* Sends a POST request to JLCPCB's API for each page, starting at page 1 and stopping at the first page that returns no component codes (e.g. pages 1-47).
* Extracts component codes from the response using regular expressions.

### Step 3: Save New Component Codes

* Adds new component codes to the list.
* Removes components that have not been seen in the last 14 days, then saves the updated list to `ComponentList.csv`.

### Notes

* The API request loop pauses for 3 seconds between requests to avoid rate limiting.
* The script uses the `requests` library for API requests and `csv` library for file I/O.
* The regular expression `r'"componentCode":"C(\d+)"'` extracts component codes from the API response (slow and inefficient but works).

In [48]:
import requests
import re
import csv
import time
from datetime import datetime, timedelta

# Date stamp (YYYY/MM/DD) for this run; written into the 'First Seen' and
# 'Last Seen' columns by update_component().
today_date_str = datetime.now().strftime('%Y/%m/%d')

# Endpoint that the paginated component-list POST request is sent to.
url = "https://jlcpcb.com/api/overseas-pcb-order/v1/shoppingCart/smtGood/selectSmtComponentList/v2"

# HTTP headers sent with every request. Each header name appears exactly once
# (a repeated key in a dict literal is silently overridden by the last one).
headers = {
    "Host": "jlcpcb.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0",
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "en-GB,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Content-Type": "application/json",
    "Origin": "https://jlcpcb.com",
    "DNT": "1",
    "Connection": "keep-alive",
    "Referer": "https://jlcpcb.com/parts/basic_parts",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "Priority": "u=0",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
}

def update_component(components, lcsc_code):
    """Record that ``lcsc_code`` was seen today.

    If a row in ``components`` already has this code in its 'lcsc' field,
    the first such row gets its 'Last Seen' set to ``today_date_str``.
    Otherwise a new row is appended with both 'First Seen' and 'Last Seen'
    set to ``today_date_str``.

    Args:
        components: list of row dicts with an 'lcsc' key; mutated in place.
        lcsc_code: component code to look up, compared with ``==``.

    Returns:
        True if a new row was appended, False if an existing row was updated.
    """
    known = next(
        (row for row in components if row['lcsc'] == lcsc_code),
        None,
    )
    if known is not None:
        known['Last Seen'] = today_date_str
        return False

    # First sighting of this code: both dates start out as today.
    components.append({
        'lcsc': lcsc_code,
        'First Seen': today_date_str,
        'Last Seen': today_date_str,
    })
    return True

# Load every previously recorded component as one dict per CSV row,
# keyed by the column names in the file's header line.
with open("scraped/ComponentList.csv", "r", newline="") as csv_file:
    components = list(csv.DictReader(csv_file))

# Compiled once, outside the loop: captures the digits of every
# "componentCode":"C<digits>" occurrence in the raw response body.
component_code_pattern = re.compile(r'"componentCode":"C(\d+)"')

empty_page = False
page = 1
total_unseen_components = 0

# Walk the result pages in order until a page yields no component codes.
while not empty_page:
    request_json = {
        "currentPage": page,
        "pageSize": 100,
        "keyword": None,
        "componentLibraryType": "base",
        "preferredComponentFlag": True,
        "stockFlag": None,
        "stockSort": None,
        "firstSortName": None,
        "secondSortName": None,
        "componentBrand": None,
        "componentSpecification": None,
        "componentAttributes": [],
        "searchSource": "search",
    }

    # The timeout keeps a stalled connection from hanging the run forever;
    # network errors still propagate to the caller as before.
    response = requests.post(url, headers=headers, json=request_json, timeout=30)
    print(f"Page {page}: {response.status_code} {response.headers}")

    if response.status_code != 200:
        # No body to parse. Stop here instead of reusing the previous page's
        # results (or hitting an undefined name on the very first page),
        # which would otherwise loop forever while the server keeps failing.
        print(f"\tRequest failed with status {response.status_code}, stopping")
        break

    page_components = component_code_pattern.findall(response.text)

    # Count how many codes on this page were not in the list before.
    unseen_components = 0
    for lcsc_code in page_components:
        if update_component(components, lcsc_code):
            unseen_components += 1
    total_unseen_components += unseen_components

    print(f"\tFound {len(page_components)} Components, {unseen_components} Unseen Components")
    if not page_components:
        empty_page = True
    page += 1

    time.sleep(3)  # Pause (try to not get rate limited)

todays_date = datetime.now()
# Components not seen in 14 days are removed from the list.
cutoff_date = todays_date - timedelta(days=14)

# 'Last Seen' parses to midnight of that day, while cutoff_date carries the
# current time of day.
components = [
    c for c in components
    if datetime.strptime(c['Last Seen'], '%Y/%m/%d') >= cutoff_date
]

# Take the column names from the first row when there is one; fall back to the
# known columns so an empty list still writes a valid header-only file instead
# of raising IndexError.
fieldnames = (
    list(components[0].keys())
    if components
    else ['lcsc', 'First Seen', 'Last Seen']
)

with open("scraped/ComponentList.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(components)

Page 1: 200 {'Content-Type': 'application/json;charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Server': 'nginx/1.22.0', 'Date': 'Mon, 11 Nov 2024 00:34:03 GMT', 'Expires': 'Thu, 01 Jan 1970 00:00:00 GMT', 'Content-Encoding': 'br', 'Vary': 'Accept-Encoding', 'Set-Cookie': 'acw_tc=3ca694444cf51814e5630993929e9744b7d7f69d9625bcb15280016730d7f246;path=/;HttpOnly;Max-Age=1800, XSRF-TOKEN=8f788a12-fd7b-4624-8103-405929b84bc3; Max-Age=1800; Expires=Mon, 11 Nov 2024 01:04:03 GMT; Path=/, jlc_s=A; Max-Age=150196224; Expires=Wed, 15 Aug 2029 09:44:27 GMT; Domain=jlcpcb.com; Path=/, JSESSIONID=EC8A542CD55E825713DC0CC117A0676C; Path=/; Secure; HttpOnly, JLCPCB_SESSION_ID=cb80b3aa-2b0e-45be-9b3e-7a06befce31e; Path=/; Secure; SameSite=None; HttpOnly', 'J-Trace-UA': 'ua:jQrM5Q78lW9CPN', 'J-Trace-ID': 'b5990c4ca9074956a1c70bfd3dad1273', 'Pragma': 'no-cache', 'Cache-Control': 'no-cache,no-store,must-revalidate', 'X-Cache': 'Miss from cloudfront', 'Via': '1.1 8f0eabccda03f9d