In [23]:
from bs4 import BeautifulSoup
import requests
import re
from tqdm.notebook import tqdm
import pandas as pd 


def get_path_processor():

    """
    This function scraps intel website in order to
    get the url address of each intel processors on
    the market currently. Notice that there is 8 different
    kind of processors on the market.
    """

    main_path = "https://ark.intel.com/content/www/us/en/ark.html#@Processors"
    page = requests.get(main_path)

    processor_urls = []

    soup = BeautifulSoup(page.content, "html.parser")
    processors_range = soup.find("div", {"data-parent-panel-key": "Processors"}).findAll(
        "div", {"data-wap_ref": "category|subcategory"}
    )

    for processor in tqdm(processors_range):

        path, _ = main_path.split("@")
        path = path + "@" + processor.attrs["data-panel-key"]

        #path looks like : https://ark.intel.com/content/www/us/en/ark.html#@PanelLabel122139

        page = requests.get(path)

        soup = BeautifulSoup(page.content, "html.parser")
        refs = soup.find(
            "div", {"data-parent-panel-key": f'{processor.attrs["data-panel-key"]}'}
        ).findAll("a")

        for ref in refs:

            path = "https://ark.intel.com/" + ref.attrs["href"]

            #path looks like : https://ark.intel.com/content/www/us/en/ark/products/series/123588/intel-core-x-series-processors.html

            page = requests.get(path)

            soup = BeautifulSoup(page.content, "html.parser")
            processor_list = soup.findAll(
                "td",
                {
                    "class": "ark-product-name ark-accessible-color component",
                    "data-component": "arkproductlink",
                },
            )

            for processor_name in processor_list:

                #path looks like : https://ark.intel.com/content/www/us/en/ark/products/198019/intel-core-i910900x-xseries-processor-19-25m-cache-3-70-ghz.html
                
                path = "https://ark.intel.com/" + processor_name.find("a").attrs["href"]
                processor_urls.append(path)

    return processor_urls


def get_tdp(soup):

    """
    Given a website page this function extract
    the TDP value

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        HTML document as a nested data structure

    Returns
    -------
    int
        TDP
    """
    
    maxtdp = soup.find("span", {"data-key": "MaxTDP"})

    if maxtdp is not None :
        return maxtdp.text.strip()
    else:
        return " "


def get_mtp(soup):

    """
    Given a website page this function extract
    the Maximum Turbo Power value

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        HTML document as a nested data structure

    Returns
    -------
    int
        Maximum Turbo Power
    """
    mtp = soup.find("span", {"data-key": "MaxTurboPower"})

    if mtp is None:
        return " "
    else:
        return mtp.text.strip()


def get_pbp(soup):

    """
    Given a website page this function extract
    the Processor Base Power value

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        HTML document as a nested data structure

    Returns
    -------
    int
        Processor Base Power value
    """

    pbp = soup.find("span", {"data-key": "ProcessorBasePower"})

    if pbp is None:
        return " "
    else:
        return pbp.text.strip()


def get_processor_name(path):
    """
    extract from a path the name of the intel processor

    Parameters
    ----------
    path : string
        https path of the intel processor

    Returns
    -------
    string
        name extracted from the path
    """
    path = path.split("/")[-1]

    empty = ""
    for i in path.split("-"):
        if i == "cache":
            break
        empty += i + " "

    return empty


path = get_path_processor()

clear_name = []
tdp = []
mtp = []
pbp = []

for i in tqdm(path):

    page = requests.get(i)
    soup = BeautifulSoup(page.content, "html.parser")

    clear_name.append(get_processor_name(i))
    tdp.append(get_tdp(soup))
    mtp.append(get_mtp(soup))
    pbp.append(get_pbp(soup))



  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/2517 [00:00<?, ?it/s]

In [29]:
df=pd.DataFrame({'url':path,
                'name':clear_name,
                'tdp':tdp,
                'mtp':mtp,
                'pbp':pbp})

df['tdp'] = df['tdp'].str.extract("([0-9\.]+)").astype(float)
df['mtp'] = df['mtp'].str.extract("([0-9\.]+)").astype(float)
df['pbp'] = df['pbp'].str.extract("([0-9\.]+)").astype(float)

df.head()

Unnamed: 0,url,name,tdp,mtp,pbp
0,https://ark.intel.com//content/www/us/en/ark/p...,intel core i910900x xseries processor 19 25m,165.0,,
1,https://ark.intel.com//content/www/us/en/ark/p...,intel core i910920x xseries processor 19 25m,165.0,,
2,https://ark.intel.com//content/www/us/en/ark/p...,intel core i910940x xseries processor 19 25m,165.0,,
3,https://ark.intel.com//content/www/us/en/ark/p...,intel core i910980xe extreme edition processor...,165.0,,
4,https://ark.intel.com//content/www/us/en/ark/p...,intel core i79800x xseries processor 16 5m,165.0,,


In [30]:
df.to_csv('intel_processor_power_specifications.csv', index=False)