**JLCPCB Component Code Scraper**
=====================================

### Step 1: Load Existing Component Codes

* Reads existing component codes from `ComponentList.csv` to avoid duplicates.

### Step 2: API Request Loop

* Sends a POST request to JLCPCB's API for each page, starting at page 1 and continuing until a page returns no components (pages 1-47 at the time of writing).
* Extracts component codes from the response using regular expressions.

### Step 3: Save New Component Codes

* Adds new component codes to the list.
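* Removes components that have not been seen in the last 14 days, then saves the updated list to `ComponentList.csv` (the list is written in its existing order; no sorting is performed).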

### Notes

* The API request loop pauses for 3 seconds between requests to avoid rate limiting.
* The script uses the `requests` library for API requests and the `csv` library for file I/O.
* The regular expression `r'"componentCode":"C(\d+)"'` extracts component codes from the API response (slow and inefficient but works).

In [None]:
import os
import requests
import re
import csv
import time
from datetime import datetime, timedelta

# Date stamp (YYYY/MM/DD) written into the 'First Seen' / 'Last Seen' columns.
today_date_str = datetime.now().strftime('%Y/%m/%d')

# JLCPCB endpoint that is POSTed to for each page of the component list.
url = "https://jlcpcb.com/api/overseas-pcb-order/v1/shoppingCart/smtGood/selectSmtComponentList/v2"

# Request headers that present the script as a Firefox browser session coming
# from the JLCPCB parts page. Each header appears exactly once (the original
# literal repeated "Sec-Fetch-Dest", which Python silently collapses).
headers = {
    "Host": "jlcpcb.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0",
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "en-GB,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Content-Type": "application/json",
    "Origin": "https://jlcpcb.com",
    "DNT": "1",
    "Connection": "keep-alive",
    "Referer": "https://jlcpcb.com/parts/basic_parts",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "Priority": "u=0",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
}

def update_component(components, lcsc_code):
    """Record that ``lcsc_code`` was seen today.

    If a row in ``components`` already has this ``'lcsc'`` value, its
    ``'Last Seen'`` field is set to the module-level ``today_date_str``.
    Otherwise a new row is appended with both ``'First Seen'`` and
    ``'Last Seen'`` set to ``today_date_str``.

    Args:
        components: List of component row dicts; mutated in place.
        lcsc_code: Component code to look up in the ``'lcsc'`` field.

    Returns:
        True if a new row was appended, False if an existing row was updated.
    """
    # Linear search for the first row carrying this code.
    known_row = next(
        (row for row in components if row['lcsc'] == lcsc_code),
        None,
    )

    if known_row is not None:
        known_row['Last Seen'] = today_date_str
        return False

    # Not in the list yet: first and last sighting are both today.
    components.append({
        'lcsc': lcsc_code,
        'First Seen': today_date_str,
        'Last Seen': today_date_str,
    })
    return True

print(f"Current date is: {today_date_str}")

# Location of the CSV that holds the component list between runs.
file_location = os.path.join("scraped", "ComponentList.csv")

# Report the on-disk size before loading.
size_kib = os.path.getsize(file_location) / 1024
print(f"ComponentList.csv: {size_kib:.1f}KiB")

# Each CSV row becomes a dict keyed by the header row's column names.
with open(file_location, "r", newline="") as csv_file:
    components = list(csv.DictReader(csv_file))

print(f"Loaded {len(components)} components from {file_location}")

# Walk the paged component list until a page yields no component codes.
# Every code found is passed to update_component(), which either refreshes
# the 'Last Seen' date of a known component or appends a new one.

# Captures the digits of each '"componentCode":"C<digits>"' entry in the raw
# response body; compiled once because it is applied to every page.
component_code_pattern = re.compile(r'"componentCode":"C(\d+)"')

empty_page = False
page = 1
total_unseen_components = 0

while not empty_page:
    request_json = {
        "currentPage": page,
        "pageSize": 100,
        "keyword": None,
        "componentLibraryType": "base",
        "preferredComponentFlag": True,
        "stockFlag": None,
        "stockSort": None,
        "firstSortName": None,
        "secondSortName": None,
        "componentBrand": None,
        "componentSpecification": None,
        "componentAttributes": [],
        "searchSource": "search",
    }

    # A timeout keeps an unresponsive server from hanging the script forever.
    response = requests.post(url, headers=headers, json=request_json, timeout=30)
    print(f"Page {page}: {response.status_code} {response.headers}")

    # Reset the per-page results on every iteration. Previously
    # page_components was only assigned on HTTP 200, so a failed first
    # request raised NameError and a later failure silently reused the
    # previous page's codes (and could keep the loop running indefinitely).
    page_components = []
    unseen_components = 0

    if response.status_code == 200:
        page_components = component_code_pattern.findall(response.text)
        for lcsc_code in page_components:
            if update_component(components, lcsc_code):
                unseen_components += 1
                total_unseen_components += 1
    else:
        print(f"\tRequest failed with status {response.status_code}, stopping")

    print(f"\tFound {len(page_components)} Components, {unseen_components} Unseen Components")

    # An empty page (or a failed request) ends the scrape.
    if not page_components:
        empty_page = True
    page += 1

    time.sleep(3)  # Pause (try to not get rate limited)

print(f"Found {total_unseen_components} unseen components, current total components {len(components)}")

# Components whose 'Last Seen' date is more than 14 days old are dropped.
todays_date = datetime.now()
cutoff_date = todays_date - timedelta(days=14)

pre_remove_component_count = len(components)

# Build the filtered list first and rebind only once every row has parsed.
recently_seen = []
for row in components:
    last_seen = datetime.strptime(row['Last Seen'], '%Y/%m/%d')
    if last_seen >= cutoff_date:
        recently_seen.append(row)
components = recently_seen

removed_count = pre_remove_component_count - len(components)
print(f"Removed {removed_count} components because they haven't been seen in 14 days, current total components {len(components)}")

# Write the component list back to the CSV it was loaded from.
#
# The column names are resolved BEFORE the file is opened: opening with "w"
# truncates the file immediately, so looking up components[0] afterwards
# would wipe the CSV and then raise IndexError whenever the list is empty.
# With no rows, fall back to the three columns this script writes so that a
# valid header-only file is produced.
if components:
    fieldnames = list(components[0].keys())
else:
    fieldnames = ["lcsc", "First Seen", "Last Seen"]

with open(file_location, "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(components)

print(f"ComponentList.csv: {os.path.getsize(file_location)/1024:.1f}KiB")

Page 1: 200 {'Content-Type': 'application/json;charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Server': 'nginx/1.22.0', 'Date': 'Mon, 11 Nov 2024 01:15:07 GMT', 'Cache-Control': 'no-cache,no-store,must-revalidate', 'Expires': 'Thu, 01 Jan 1970 00:00:00 GMT', 'J-Trace-UA': 'ua:jQrM5Q78lW9CPN', 'J-Trace-ID': 'e7ad8e877d484f029d2da8ee8510f8b5', 'Set-Cookie': 'XSRF-TOKEN=b562abb1-f866-4852-85db-96e1a50e0eb3; Max-Age=1800; Expires=Mon, 11 Nov 2024 01:45:07 GMT; Path=/, jlc_s=A; Max-Age=150196224; Expires=Wed, 15 Aug 2029 10:25:31 GMT; Domain=jlcpcb.com; Path=/, JSESSIONID=6A08B1F2E243E951586CC1A0A21F0888; Path=/; Secure; HttpOnly, JLCPCB_SESSION_ID=e4ead000-8e06-46ba-a82c-149a66eb1de2; Path=/; Secure; SameSite=None; HttpOnly', 'Pragma': 'no-cache', 'X-Cache': 'Miss from cloudfront', 'Via': '1.1 b4f0fd171cbe4a010ced201d7f60e126.cloudfront.net (CloudFront)', 'X-Amz-Cf-Pop': 'AKL50-C1', 'X-Amz-Cf-Id': 'gV0ZNnFHAfCdHu4OTGFwsgXqS0d5-g4YqJROmyVnf2cz2BQ3xIBhgg==', 'x-x

In [53]:
# Show how many components are currently held in the list.
component_total = len(components)
print(component_total)

2539
