In [4]:
from bs4 import BeautifulSoup
import requests
import re
from tqdm.notebook import tqdm


def get_path_processor():

    """
    This function scraps intel website in order to
    get the url adress of each intel processors on
    the market currently. Notice that there is 8 different
    kind of processors on the market.
    """

    main_path = "https://ark.intel.com/content/www/us/en/ark.html#@Processors"
    page = requests.get(main_path)

    test_path = []

    soup = BeautifulSoup(page.content, "html.parser")
    tags_1 = soup.find("div", {"data-parent-panel-key": "Processors"}).findAll(
        "div", {"data-wap_ref": "category|subcategory"}
    )

    for i in tqdm(tags_1):

        path, _ = main_path.split("@")
        path = path + "@" + i.attrs["data-panel-key"]
        page = requests.get(path)

        soup = BeautifulSoup(page.content, "html.parser")
        tags_2 = soup.find(
            "div", {"data-parent-panel-key": f'{i.attrs["data-panel-key"]}'}
        ).findAll("a")

        for j in tags_2:

            path = "https://ark.intel.com/" + j.attrs["href"]
            page = requests.get(path)

            soup = BeautifulSoup(page.content, "html.parser")
            tags_3 = soup.findAll(
                "td",
                {
                    "class": "ark-product-name ark-accessible-color component",
                    "data-component": "arkproductlink",
                },
            )

            for k in tags_3:

                path = "https://ark.intel.com/" + k.find("a").attrs["href"]
                test_path.append(path)

    return test_path


def get_tdp(path):

    """
    Given a website page this function extract
    the TDP or Maximum Turbo Power value

    Returns
    -------
    [int]
        [TDP]
    """
    
    maxtdp = soup.find("span", {"data-key": "MaxTDP"})

    if maxtdp is not None :
        return maxtdp.text.strip()
    else:
        return " "


def get_mtp(soup):

    """
    Given a website page this function extract
    the Maximum Turbo Power value

    Returns
    -------
    [int]
        [Maximum Turbo Power]
    """
    mtp = soup.find("span", {"data-key": "MaxTurboPower"})

    if mtp is None:
        return " "
    else:
        return mtp.text.strip()


def get_pbp(soup):

    """
    Given a website page this function extract
    the Processor Base Power value

    Returns
    -------
    [int]
        [Processor Base Power value]
    """

    pbp = soup.find("span", {"data-key": "ProcessorBasePower"})

    if pbp is None:
        return " "
    else:
        return pbp.text.strip()


def get_processor_name(path):
    """
    extract from a path the name of the intel processor

    Parameters
    ----------
    path : [string]
        [https path of the intel processor]
    """
    path = path.split("/")[-1]

    empty = ""
    for i in path.split("-"):
        if i == "cache":
            break
        empty += i + " "

    return empty


path = get_path_processor()

clear_name = []
tdp = []
mtp = []
pbp = []

for i in tqdm(path):

    page = requests.get(i)
    soup = BeautifulSoup(page.content, "html.parser")

    clear_name.append(get_processor_name(i))
    tdp.append(get_tdp(soup))
    mtp.append(get_mtp(soup))
    pbp.append(get_pbp(soup))



  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/2517 [00:00<?, ?it/s]

In [29]:
import pandas as pd 
import numpy as np

df=pd.DataFrame({'url':path,
                'name':clear_name,
                'tdp':tdp,
                'mtp':mtp,
                'pbp':pbp})

df['tdp'] = df['tdp'].apply(lambda x : float(re.findall("\d+\.\d+",x) if x != " " else 0))
#df['mtp'] = df['mtp'].apply(lambda x : float(re.findall('[0-9]+', x)[0] if x != " " else 0))
#df['pbp'] = df['pbp'].apply(lambda x : float(re.findall('[0-9]+', x)[0] if x != " " else 0))

TypeError: float() argument must be a string or a number, not 'list'

In [32]:
df.tail()

Unnamed: 0,url,name,tdp,mtp,pbp
2512,https://ark.intel.com//content/www/us/en/ark/p...,intel quark soc x1020d 16k,2.2 W,,
2513,https://ark.intel.com//content/www/us/en/ark/p...,intel quark soc x1000 16k,2.2 W,,
2514,https://ark.intel.com//content/www/us/en/ark/p...,intel quark microcontroller d1000.html,0.025 W,,
2515,https://ark.intel.com//content/www/us/en/ark/p...,intel quark microcontroller d2000.html,,,
2516,https://ark.intel.com//content/www/us/en/ark/p...,intel quark se c1000 microcontroller.html,,,


In [30]:
re.findall("\d+\.\d+","165 W")

[]