In [55]:
from bs4 import BeautifulSoup
import requests
from tqdm.notebook import tqdm
import pandas as pd 
import re 

def get_max_power(path):

    """
    Given a website page this function extract
    the TDP or Maximum Turbo Power value 

    Returns
    -------
    [int]
        [TDP or Maximum Turbo Power value]
    """
    page = requests.get(path)
    soup = BeautifulSoup(page.content, 'html.parser')
    try : 
        maxtdp = soup.findAll('span', {"data-key" : "MaxTDP"})
        return maxtdp[0].text.strip()
    except :
        mtp = soup.findAll('span', {"data-key" : "MaxTurboPower"})
        if mtp != []:
            return mtp[0].text.strip()
        else :
            "See with Martin"


def get_intel_path():
    """
    This function scraps intel website in order to 
    get the url adress of each intel processors on
    the market currently. Notice that there is 8 different
    kind of processors on the market.
    """

    main_path = "https://ark.intel.com/content/www/us/en/ark.html#@Processors"
    page = requests.get(main_path)

    test_path = []
    path_prob = []

    soup = BeautifulSoup(page.content, 'html.parser')
    tags = soup.find('div', {"data-parent-panel-key" : "Processors"}).findAll('div',{"data-wap_ref" : "category|subcategory"})

    for i in tqdm(tags) :

        path, _ = main_path.split("@")
        path = path + "@" + i.attrs["data-panel-key"]
        page =  requests.get(path)

        soup = BeautifulSoup(page.content, 'html.parser')
        tags_2 = soup.find('div', {'data-parent-panel-key':f'{i.attrs["data-panel-key"]}'}).findAll('a')

        for j in tags_2 : 

            path = "https://ark.intel.com/" + j.attrs["href"]
            page =  requests.get(path)

            soup = BeautifulSoup(page.content, 'html.parser')

            try :
                test_path.append("https://ark.intel.com/"+soup.find('tbody').find('td').find('a').attrs['href'])
            except :
                path_prob.append(path)

    return test_path, path_prob

def get_processor_name(path):
    """
    extract from a path the name of the intel processor

    Parameters
    ----------
    path : [string]
        [https path of the intel processor]
    """
    path = path.split('/')[-1]

    empty = ""
    for i in path.split('-') :
        if i == 'cache':
            break
        empty += i + ' '

    return empty
    
    
path, _ = get_intel_path()

results = {}
for i in tqdm(path) :
    name = get_processor_name(i)
    try :
        val = get_max_power(i)
        val = int(re.sub("[^0-9]", "", val))
        results[name]= val
    except :
        continue

df = pd.DataFrame({'name':results.keys(),
                'maximum_power':results.values()})

df.head()

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/102 [00:00<?, ?it/s]

dict_keys(['intel core i910900x xseries processor 19 25m ', 'intel core i912900 processor 30m ', 'intel core i910850k processor 20m ', 'intel core i99900ks processor 16m ', 'intel core i98950hk processor 12m ', 'intel core i712650h processor 24m ', 'intel core i710870h processor 16m ', 'intel core i79700e processor 12m ', 'intel core i78557u processor 8m ', 'intel core i77560u processor 4m ', 'intel core i76785r processor 8m ', 'intel core i75700eq processor 6m ', 'intel core i74720hq processor 6m ', 'intel core i512400 processor 18m ', 'intel core i510505 processor 12m ', 'intel core i59500e processor 9m ', 'intel core i58260u processor 6m ', 'intel core i57260u processor 4m ', 'intel core i56585r processor 6m ', 'intel core i55350h processor 4m ', 'intel core i54210h processor 3m ', 'intel core i312100 processor 12m ', 'intel core i310105 processor 6m ', 'intel core i39100e processor 6m ', 'intel core i38140u processor 4m ', 'intel core i37020u processor 3m ', 'intel core i36006u pro