We want to get the CPU model specifications by scraping the web. 

The main source I have found is: "https://www.techpowerup.com/cpu-specs/"

In [1]:
import time, random, re
from typing import Dict, Iterable, Tuple, List, Optional
from urllib.parse import quote_plus, urljoin

import dill as pickle
import pandas as pd

import requests
from bs4 import BeautifulSoup
from rapidfuzz import process, fuzz
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [26]:
# --- FULL UNDERLYING FUNCTIONS (serial, robots-aware, adaptive backoff) ---
# pip install requests beautifulsoup4 lxml rapidfuzz
from io import StringIO

import time, random, re
from typing import Dict, Iterable, Tuple, List, Optional
from urllib.parse import quote_plus, urljoin, urlparse
from email.utils import parsedate_to_datetime
from functools import lru_cache
import urllib.robotparser as robotparser

import requests
from bs4 import BeautifulSoup
from rapidfuzz import process, fuzz
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# -------- constants --------
BASE = "https://www.techpowerup.com"
CPU_SEARCH = f"{BASE}/cpu-specs/?q="
CPU_PAGE_RE = re.compile(r"^/cpu-specs/[^/]+\.c\d+/?$")
USER_AGENT = "Mozilla/5.0 (compatible; SpecScraper/2.0)"

# -------- pacing (strictly serial) --------
MIN_GAP_S = 8.0            # min pause between ANY two requests (seconds)
MAX_GAP_S = 30.0           # cap for adaptive backoff
JITTER = (30, 120)        # extra randomness to avoid cadence
ADAPTIVE_BACKOFF = True

# -------- HTTP session (no parallelism) --------
def _build_session() -> requests.Session:
    s = requests.Session()
    s.headers.update({"User-Agent": USER_AGENT})
    # Handle network flakiness; 429 handled manually
    retry = Retry(
        total=3, connect=3, read=3, backoff_factor=0.6,
        status_forcelist=[500, 502, 503, 504],
        allowed_methods=["GET", "HEAD"]
    )
    s.mount("https://", HTTPAdapter(max_retries=retry, pool_connections=1, pool_maxsize=1))
    s.mount("http://",  HTTPAdapter(max_retries=retry, pool_connections=1, pool_maxsize=1))
    return s

_SESSION = _build_session()
_last_hit = 0.0              # monotonic timestamp of last successful GET
_curr_gap = MIN_GAP_S        # adaptive min gap (AIMD behavior)

# -------- robots.txt (disallow + crawl-delay) --------
_ROBOTS = None
_ROBOTS_LAST_LOAD = 0.0
_ROBOTS_TTL = 60 * 30  # refresh robots every 30 minutes

def _load_robots(force: bool = False):
    global _ROBOTS, _ROBOTS_LAST_LOAD
    now = time.monotonic()
    if force or _ROBOTS is None or (now - _ROBOTS_LAST_LOAD) > _ROBOTS_TTL:
        rp = robotparser.RobotFileParser()
        rp.set_url(urljoin(BASE, "/robots.txt"))
        try:
            rp.read()
            _ROBOTS = rp
            _ROBOTS_LAST_LOAD = now
        except Exception:
            _ROBOTS = None  # fail open; change to fail-closed if you prefer

def _robots_crawl_delay() -> Optional[float]:
    _load_robots()
    if _ROBOTS is None:
        return None
    delay = _ROBOTS.crawl_delay(USER_AGENT)
    if delay is None:
        delay = _ROBOTS.crawl_delay("*")
    return float(delay) if delay is not None else None

def _robots_can_fetch(url: str) -> bool:
    _load_robots()
    if _ROBOTS is None:
        return True  # fail open
    return _ROBOTS.can_fetch(USER_AGENT, url)

# Bootstrap adaptive gap from robots.txt crawl-delay if present
_rd = _robots_crawl_delay()
if _rd:
    _curr_gap = max(_curr_gap, _rd)

# -------- pacing helpers --------
def _sleep_gap():
    """Wait the current adaptive gap plus jitter before issuing a request."""
    global _last_hit
    now = time.monotonic()
    # Respect robots crawl-delay as a floor each time (it may change)
    rd = _robots_crawl_delay()
    floor = max(MIN_GAP_S, rd) if rd else MIN_GAP_S
    gap_target = max(_curr_gap if ADAPTIVE_BACKOFF else MIN_GAP_S, floor)
    wait = gap_target - (now - _last_hit)
    if wait > 0:
        time.sleep(wait)
    time.sleep(random.uniform(*JITTER))  # jitter every time

def _parse_retry_after(v: Optional[str]) -> Optional[float]:
    if not v:
        return None
    v = v.strip()
    if v.isdigit():
        return float(v)
    try:
        tgt = parsedate_to_datetime(v)
        now = parsedate_to_datetime(time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime()))
        return max(0.0, (tgt - now).total_seconds())
    except Exception:
        return None

def _on_success():
    global _curr_gap
    if ADAPTIVE_BACKOFF:
        _curr_gap = max(MIN_GAP_S, _curr_gap - 0.5)  # gentle recovery

def _on_throttle(retry_after: Optional[float]):
    global _curr_gap
    if ADAPTIVE_BACKOFF:
        _curr_gap = min(MAX_GAP_S, max(MIN_GAP_S * 2, _curr_gap * 2))  # multiplicative increase
    time.sleep((retry_after if retry_after is not None else 10.0) + random.uniform(*JITTER))

def _polite_get(url: str, timeout=(5, 20)) -> requests.Response:
    """Strictly serial GET: robots-aware + min gap + jitter + 429/503 backoff."""
    global _last_hit
    # robots disallow check
    if not _robots_can_fetch(url):
        raise PermissionError(f"Blocked by robots.txt: {url}")

    _sleep_gap()
    resp = _SESSION.get(url, timeout=timeout)

    if resp.status_code in (429, 503):
        ra = _parse_retry_after(resp.headers.get("Retry-After"))
        _on_throttle(ra)
        _sleep_gap()
        resp = _SESSION.get(url, timeout=timeout)

    resp.raise_for_status()
    _last_hit = time.monotonic()
    _on_success()
    return resp

def fetch_html(url: str) -> str:
    """Download HTML with the polite session/limits."""
    return _polite_get(url).text

# -------- normalization helpers (for matching) --------
_TM_RE = re.compile(r"[™®©]")
PARENS_RE = re.compile(r"\((?:.*?)\)")
MULTISPACE_RE = re.compile(r"\s+")
NON_ALNUM_SPACE_RE = re.compile(r"[^0-9a-z]+")

_LEADING_MULTI_RE = re.compile(r"^\s*(\d+)\s*(?:x|×)\s*", re.I)
_TRAILING_CORES_THREADS_RE = re.compile(
    r"""\s*(?:\(\s*)?(?:[/,]|and)?\s*\d+\s*-?\s*(?:core|cores|c)\b
        (?:\s*(?:[/,]|and)\s*\d+\s*-?\s*(?:thread|threads|t)\b)?\s*(?:\)\s*)?\s*$""",
    re.I | re.X,
)

def _strip_multiplier(s: str) -> str:
    return _LEADING_MULTI_RE.sub("", s)

def _strip_core_trailer(s: str) -> str:
    return _TRAILING_CORES_THREADS_RE.sub("", s)

def _simplify_for_match(s: str) -> str:
    s = s.strip()
    s = _strip_multiplier(s)
    s = _strip_core_trailer(s)
    s = s.replace("×", "x").replace("-", " ").replace("_", " ")
    s = _TM_RE.sub("", s)
    s = PARENS_RE.sub("", s)
    s = s.lower()
    s = NON_ALNUM_SPACE_RE.sub(" ", s)
    s = MULTISPACE_RE.sub(" ", s).strip()
    return s

# -------- search results parsing (cached) --------
@lru_cache(maxsize=1024)
def _search_results(query: str) -> List[tuple[str, str]]:
    """Return list of (link_text, absolute_url) from CPU search results (cached in-process)."""
    url = f"{CPU_SEARCH}{quote_plus(query)}"
    soup = BeautifulSoup(_polite_get(url).text, "lxml")
    results = []
    for a in soup.select("a[href]"):
        href = a.get("href", "").strip()
        if CPU_PAGE_RE.match(href):
            abs_url = urljoin(BASE, href)
            text = a.get_text(" ", strip=True) or abs_url
            results.append((text, abs_url))
    return results

def _pick_best(query: str, candidates: List[tuple[str, str]], min_score: int) -> Optional[str]:
    """Rank candidates by fuzzy match to simplified query; return best URL or None."""
    if not candidates:
        return None
    q = _simplify_for_match(query)
    names = [_simplify_for_match(text) for text, _ in candidates]
    best = None
    for scorer in (fuzz.WRatio, fuzz.token_set_ratio, fuzz.token_sort_ratio, fuzz.QRatio, fuzz.partial_ratio):
        cand = process.extractOne(q, names, scorer=scorer)
        if cand and (best is None or cand[1] > best[1]):
            best = cand  # (matched_text, score, idx)
    if best and best[1] >= min_score:
        return candidates[best[2]][1]
    return None

# -------- PUBLIC: strictly-serial resolver via site search --------
def build_model_url_map_via_search(
    models: Iterable[str], *, min_score: int = 86
) -> Tuple[Dict[str, str], Dict[str, dict]]:
    """
    Resolve models to TechPowerUp detail URLs via /cpu-specs/?q=...
    - Serial only: one request at a time
    - robots-aware: disallow + crawl-delay floor
    - adaptive backoff on 429/503 + Retry-After
    """
    model_to_url: Dict[str, str] = {}
    unresolved: Dict[str, dict] = {}
    models = list(models)
    for i, m in enumerate(models, 1):
        start = time.time()
        print(f"[{i}/{len(models)}] {m}")
        # 1) raw query
        results = _search_results(m)
        url = _pick_best(m, results, min_score=min_score)
        # 2) simplified query fallback
        if not url:
            simple_q = _simplify_for_match(m)
            results2 = _search_results(simple_q)
            url = _pick_best(simple_q, results2, min_score=min_score)
        if url:
            model_to_url[m] = url
        else:
            unresolved[m] = {
                "simple_query": _simplify_for_match(m),
                "candidates": results[:3],
            }
        # extra politeness between models (beyond per-request gap)
        time.sleep(random.uniform(*JITTER))
        end = time.time()
        print(f"Iteration taken: {end-start}")
    return model_to_url, unresolved


In [10]:
# Load model list
with open("CPU_models.pkl", "rb") as f:
    models = pickle.load(f)
    print(models)

['AMD Ryzen 9 9950X 16-Core' 'AMD RYZEN AI MAX+ 395'
 'Intel Core i9-13900K' 'AMD Ryzen 7 7700X 8-Core'
 'AMD Ryzen 9 7950X 16-Core' 'Intel Core i9-14900K'
 'AMD EPYC 4364P 8-Core' 'AMD Ryzen 9 7900X 12-Core'
 'Intel Core i9-12900K' 'AMD EPYC 4564P 16-Core' 'AMD EPYC 4464P 12-Core'
 'AMD EPYC 4584PX 16-Core' 'AMD EPYC 4484PX 12-Core'
 'AMD Ryzen 9 5900X 12-Core' 'AMD Ryzen 5 7600X 6-Core'
 'Intel Core i9-11900K' 'AMD Ryzen 9 5950X 16-Core'
 'AMD Ryzen 7 5800X 8-Core' 'AMD Ryzen 7 5800X3D 8-Core'
 'Intel Core i9-10900K' 'Intel Core i5-13600K' 'AMD EPYC 4124P 4-Core'
 'AMD Ryzen Threadripper 3970X 32-Core'
 'AMD Ryzen Threadripper 3990X 64-Core' 'AMD Ryzen 9 3900XT 12-Core'
 'Intel Core i5-12600K' 'Intel Core i7-7900X' 'Intel Core i5-11600K'
 'Intel Core i9-7980XE' 'Intel Core i5-13400' 'AMD Ryzen 7 PRO 5850U'
 'AMD Ryzen Threadripper 2990WX 32-Core' 'AMD Ryzen 7 PRO 6850U'
 'AMD Ryzen 5 3600XT 6-Core' 'AMD EPYC 73F3 16-Core'
 'AMD EPYC 75F3 32-Core' 'AMD Ryzen Threadripper 3960X 24-Core

In [11]:
# Process model list
import re

# Matches: "2 x", "2x", "2 ×" (case-insensitive), with optional spaces
_MULTIPLIER_RE = re.compile(r'^\s*(?P<multi>\d+)\s*(?:x|×)\s*', re.IGNORECASE)

# Matches a *trailing* core/thread suffix like:
#  "32-Core", "32 Core", "32C", optionally plus "/ 64-Thread", ", 64 Threads", "and 64T"
#  optionally wrapped in parentheses, e.g. "(32-Core / 64-Thread)"
_TRAILING_CORES_THREADS_RE = re.compile(
    r"""
    \s*                              
    (?:\(\s*)?                       
    (?:[/,]|and)?\s*                 
    \d+\s*-?\s*(?:core|cores|c)\b    
    (?:                              
        \s*(?:[/,]|and)\s*
        \d+\s*-?\s*(?:thread|threads|t)\b
    )?
    \s*(?:\)\s*)?                    
    \s*$                             
    """,
    re.IGNORECASE | re.VERBOSE,
)

def normalize_model_and_multiplier(raw: str) -> tuple[str, int]:
    """
    Returns (base_model, multiplier).
      - base_model has any leading "<n> x" stripped
        and any trailing "<n>-Core[/<m>-Thread]" stripped.
      - multiplier is the leading <n> (default 1).
    """
    s = raw.strip()

    # 1) Extract leading multiplier (default 1)
    multiplier = 1
    m = _MULTIPLIER_RE.match(s)
    if m:
        multiplier = int(m.group("multi"))
        s = s[m.end():]  # chop it off

    # 2) Remove trailing core/thread decorations (only at end)
    s = _TRAILING_CORES_THREADS_RE.sub("", s)

    # 3) Collapse internal whitespace
    s = " ".join(s.split())
    return s, multiplier


In [12]:
clean_models = []
multipliers = []
for model in models:
    #print(model)
    clean, mult = normalize_model_and_multiplier(model)
    clean_models.append(clean)
    multipliers.append(mult)

model_comparisons = pd.DataFrame({"Models":models,"Cleaned":clean_models, "Multipliers":multipliers})
print(model_comparisons)

                        Models                Cleaned  Multipliers
0    AMD Ryzen 9 9950X 16-Core      AMD Ryzen 9 9950X            1
1        AMD RYZEN AI MAX+ 395  AMD RYZEN AI MAX+ 395            1
2         Intel Core i9-13900K   Intel Core i9-13900K            1
3     AMD Ryzen 7 7700X 8-Core      AMD Ryzen 7 7700X            1
4    AMD Ryzen 9 7950X 16-Core      AMD Ryzen 9 7950X            1
..                         ...                    ...          ...
540  2 x Intel Xeon E5-2687W 0  Intel Xeon E5-2687W 0            2
541      Intel Core i7-12700KF  Intel Core i7-12700KF            1
542   2 x Intel Xeon Gold 6144   Intel Xeon Gold 6144            2
543      Intel Core i7-1185G7E  Intel Core i7-1185G7E            1
544       Intel Core i3-1005G1   Intel Core i3-1005G1            1

[545 rows x 3 columns]


Now we are also going to drop the non Intel or AMD models. This is a limitation of our work.

We don't have a lot of non Intel/AMD options, and comparability is often poor without them due to them being historically dominant and only recently having alternatives (i.e. ARM desktop options).

In [13]:
subs = ["AMD", "Intel"]
pattern = "|".join(re.escape(s) for s in subs)
model_comparisons_keep = model_comparisons[model_comparisons["Models"].str.contains(pattern, case=False, na=False, regex=True)]
model_comparisons_keep.reset_index(inplace=True)
model_comparisons_keep

Unnamed: 0,index,Models,Cleaned,Multipliers
0,0,AMD Ryzen 9 9950X 16-Core,AMD Ryzen 9 9950X,1
1,1,AMD RYZEN AI MAX+ 395,AMD RYZEN AI MAX+ 395,1
2,2,Intel Core i9-13900K,Intel Core i9-13900K,1
3,3,AMD Ryzen 7 7700X 8-Core,AMD Ryzen 7 7700X,1
4,4,AMD Ryzen 9 7950X 16-Core,AMD Ryzen 9 7950X,1
...,...,...,...,...
484,540,2 x Intel Xeon E5-2687W 0,Intel Xeon E5-2687W 0,2
485,541,Intel Core i7-12700KF,Intel Core i7-12700KF,1
486,542,2 x Intel Xeon Gold 6144,Intel Xeon Gold 6144,2
487,543,Intel Core i7-1185G7E,Intel Core i7-1185G7E,1


In [14]:
list(model_comparisons_keep["Cleaned"])

['AMD Ryzen 9 9950X',
 'AMD RYZEN AI MAX+ 395',
 'Intel Core i9-13900K',
 'AMD Ryzen 7 7700X',
 'AMD Ryzen 9 7950X',
 'Intel Core i9-14900K',
 'AMD EPYC 4364P',
 'AMD Ryzen 9 7900X',
 'Intel Core i9-12900K',
 'AMD EPYC 4564P',
 'AMD EPYC 4464P',
 'AMD EPYC 4584PX',
 'AMD EPYC 4484PX',
 'AMD Ryzen 9 5900X',
 'AMD Ryzen 5 7600X',
 'Intel Core i9-11900K',
 'AMD Ryzen 9 5950X',
 'AMD Ryzen 7 5800X',
 'AMD Ryzen 7 5800X3D',
 'Intel Core i9-10900K',
 'Intel Core i5-13600K',
 'AMD EPYC 4124P',
 'AMD Ryzen Threadripper 3970X',
 'AMD Ryzen Threadripper 3990X',
 'AMD Ryzen 9 3900XT',
 'Intel Core i5-12600K',
 'Intel Core i7-7900X',
 'Intel Core i5-11600K',
 'Intel Core i9-7980XE',
 'Intel Core i5-13400',
 'AMD Ryzen 7 PRO 5850U',
 'AMD Ryzen Threadripper 2990WX',
 'AMD Ryzen 7 PRO 6850U',
 'AMD Ryzen 5 3600XT',
 'AMD EPYC 73F3',
 'AMD EPYC 75F3',
 'AMD Ryzen Threadripper 3960X',
 'AMD EPYC 74F3',
 'AMD EPYC 73F3',
 'Intel Xeon E-2278GEL',
 'AMD Ryzen 7 1700 Eight-Core',
 'AMD EPYC 7713',
 'Intel

In [15]:
"""
models = [
    "AMD EPYC 4364P 24-Core",
    "AMD Ryzen Threadripper 3970X 32-Core",
    "2 x AMD EPYC 73F3 16-Core",
    "Core Ultra 3 205",
]
"""
import time
models_to_search = list(model_comparisons_keep["Cleaned"])

start = time.time()
model_to_url, unresolved = build_model_url_map_via_search(models_to_search, min_score=86)
end = time.time()
print("\n")
print(f"Time taken {end-start}")

print("Resolved:", len(model_to_url), "Unresolved:", len(unresolved))
for m, u in model_to_url.items():
    print(m, "->", u)
if unresolved:
    print("\nUnresolved examples:")
    for m, info in list(unresolved.items())[:5]:
        print(" -", m, "simple_query=", info["simple_query"], "candidates=", info["candidates"])


[1/489] AMD Ryzen 9 9950X
Iteration taken: 92.46497225761414
[2/489] AMD RYZEN AI MAX+ 395
Iteration taken: 159.85004591941833
[3/489] Intel Core i9-13900K
Iteration taken: 119.96439981460571
[4/489] AMD Ryzen 7 7700X
Iteration taken: 177.1497790813446
[5/489] AMD Ryzen 9 7950X
Iteration taken: 118.70965123176575
[6/489] Intel Core i9-14900K
Iteration taken: 138.84054517745972
[7/489] AMD EPYC 4364P
Iteration taken: 133.9334352016449
[8/489] AMD Ryzen 9 7900X
Iteration taken: 150.58465218544006
[9/489] Intel Core i9-12900K
Iteration taken: 119.9066858291626
[10/489] AMD EPYC 4564P
Iteration taken: 111.08674573898315
[11/489] AMD EPYC 4464P
Iteration taken: 116.29876804351807
[12/489] AMD EPYC 4584PX
Iteration taken: 239.3287489414215
[13/489] AMD EPYC 4484PX
Iteration taken: 165.8674328327179
[14/489] AMD Ryzen 9 5900X
Iteration taken: 182.4122109413147
[15/489] AMD Ryzen 5 7600X
Iteration taken: 179.9790620803833
[16/489] Intel Core i9-11900K
Iteration taken: 62.650043964385986
[17/48

In [17]:
model_to_url

with open("model_url_association.pkl", "wb") as f:
    pickle.dump(model_to_url, f, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
with open("model_url_association.pkl", "rb") as f:
    model_url_association = pickle.load(f)
    print(model_url_association)

{'AMD Ryzen 9 9950X': 'https://www.techpowerup.com/cpu-specs/ryzen-9-9950x.c3649', 'AMD RYZEN AI MAX+ 395': 'https://www.techpowerup.com/cpu-specs/ryzen-ai-max-395.c3994', 'Intel Core i9-13900K': 'https://www.techpowerup.com/cpu-specs/core-i9-13900k.c2817', 'AMD Ryzen 7 7700X': 'https://www.techpowerup.com/cpu-specs/ryzen-7-7700x.c2848', 'AMD Ryzen 9 7950X': 'https://www.techpowerup.com/cpu-specs/ryzen-9-7950x.c2846', 'Intel Core i9-14900K': 'https://www.techpowerup.com/cpu-specs/core-i9-14900k.c3269', 'AMD EPYC 4364P': 'https://www.techpowerup.com/cpu-specs/epyc-4364p.c3611', 'AMD Ryzen 9 7900X': 'https://www.techpowerup.com/cpu-specs/ryzen-9-7900x.c2847', 'Intel Core i9-12900K': 'https://www.techpowerup.com/cpu-specs/core-i9-12900k.c2505', 'AMD EPYC 4564P': 'https://www.techpowerup.com/cpu-specs/epyc-4564p.c3613', 'AMD EPYC 4464P': 'https://www.techpowerup.com/cpu-specs/epyc-4464p.c3612', 'AMD EPYC 4584PX': 'https://www.techpowerup.com/cpu-specs/epyc-4584px.c3607', 'AMD EPYC 4484PX':

In [27]:
# Build {model: list_of_DataFrames} by parsing the resolved URLs
model_to_tables = {}
errors = {}

for model, url in model_url_association.items():
    start = time.time()
    try:
        html = fetch_html(url)
        print(f"Fetched {url}")
        # read all tables so you can decide later; keep as list[DataFrame]
        tables = pd.read_html(StringIO(html), flavor="lxml")
        model_to_tables[model] = tables
        print(f"OK: {model} -> {url} (tables: {len(tables)})")
        # extra politeness between detail pages
        time.sleep(random.uniform(30, 120))
    except Exception as e:
        errors[model] = {"url": url, "error": repr(e)}
        print(f"ERR: {model} -> {url} :: {e}")
        time.sleep(random.uniform(30, 120))

    end = time.time()
    print(f"Iteration time: {end-start}")

# Package payload and serialize using dill via pickle API
payload = {
    "tables": model_to_tables,   # {model: list[pd.DataFrame]}
    "errors": errors,            # {model: {url, error}}
    "resolved": model_to_url,    # {model: url}
    "unresolved": unresolved,    # {model: {best_guess, score}}
}

out_path = "cpu_specs_raw_tables.pkl"
with open(out_path, "wb") as f:
    pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)

print(f"\nSaved {len(model_to_tables)} models to {out_path}. Errors: {len(errors)}")

Fetched https://www.techpowerup.com/cpu-specs/ryzen-9-9950x.c3649
OK: AMD Ryzen 9 9950X -> https://www.techpowerup.com/cpu-specs/ryzen-9-9950x.c3649 (tables: 8)
Iteration time: 169.2925090789795
Fetched https://www.techpowerup.com/cpu-specs/ryzen-ai-max-395.c3994
OK: AMD RYZEN AI MAX+ 395 -> https://www.techpowerup.com/cpu-specs/ryzen-ai-max-395.c3994 (tables: 8)
Iteration time: 128.40356612205505
Fetched https://www.techpowerup.com/cpu-specs/core-i9-13900k.c2817
OK: Intel Core i9-13900K -> https://www.techpowerup.com/cpu-specs/core-i9-13900k.c2817 (tables: 8)
Iteration time: 78.40647411346436
Fetched https://www.techpowerup.com/cpu-specs/ryzen-7-7700x.c2848
OK: AMD Ryzen 7 7700X -> https://www.techpowerup.com/cpu-specs/ryzen-7-7700x.c2848 (tables: 7)
Iteration time: 130.9573380947113
Fetched https://www.techpowerup.com/cpu-specs/ryzen-9-7950x.c2846
OK: AMD Ryzen 9 7950X -> https://www.techpowerup.com/cpu-specs/ryzen-9-7950x.c2846 (tables: 8)
Iteration time: 122.90166401863098
Fetched 

In [28]:
with open("cpu_specs_raw_tables.pkl", "rb") as f:
    payload = pickle.load(f)

{'tables': {'AMD Ryzen 9 9950X': [                   0               1
0            Socket:  AMD Socket AM5
1           Foundry:            TSMC
2      Process Size:            4 nm
3       Transistors:  16,630 million
4          Die Size:     2x 70.6 mm²
5  I/O Process Size:            6 nm
6   I/O Transistors:   3,400 million
7      I/O Die Size:         122 mm²
8           Package:      FC-LGA1718
9             tJMax:            95°C,                     0               1
0             Market:         Desktop
1  Production Status:          Active
2       Release Date:  Aug 15th, 2024
3       Launch Price:            $649
4              Part#:   100-000001277
5     Bundled Cooler:             NaN,                       0              1
0            Frequency:        4.3 GHz
1          Turbo Clock:  up to 5.7 GHz
2           Base Clock:        100 MHz
3           Multiplier:          43.0x
4  Multiplier Unlocked:            Yes
5                  TDP:          170 W
6                 