In [None]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup as soup
import urllib.request
import time
import string as st
import pandas as pd
import json
import os

In [None]:
def simple_get(url, timeout=30):
    """
    Download ``url`` and return the body of a good HTML response.

    url: address to request with an HTTP GET.
    timeout: seconds handed to requests' ``get`` as its timeout, so a
        stalled server cannot block the scrape indefinitely.

    Returns the response content when is_good_response accepts the
    response. Returns None when the response is rejected (its body is
    printed for inspection) or when the request raises a RequestException
    (the error text is passed to log_error).
    """
    try:
        with closing(get(url, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content

            # show what the server sent back instead of the expected page
            print(resp.content)
            return None

    except RequestException as e:
        # timeouts land here too: requests raises them as RequestException subclasses
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

def adaptive_get(url,t):
    """
    Throttled page fetch that falls back to a slow retry when nothing comes back.

    Sleeps for ``t`` seconds, then downloads ``url`` with simple_get. If that
    yields None, try_again is used for a delayed retry and ``t`` is raised
    by 10.

    Returns a ``(soup, t)`` tuple on success. Returns a bare ``None`` (not a
    tuple) when the page could not be fetched even after retrying, so
    callers must check the result before unpacking it.
    """
    time.sleep(t)
    page_html = simple_get(url)

    if page_html is None:
        # first attempt failed: let try_again wait and request the page again
        page_html = try_again(url)
        t += 10

    if page_html is not None:
        return (soup(page_html), t)

    # try_again could not get the page either
    notify(f"URL not captured:{url}")
    return None
    
def try_again(url, first_wait=2500, second_wait=5000):
    """
    Retry a download after pausing so a download limit can reset.

    url: page to request again with simple_get.
    first_wait: seconds to sleep before the first retry (default 2500).
    second_wait: seconds to sleep before the second retry, used only when
        the first retry also returns None (default 5000).

    Returns whatever simple_get returned on the last attempt made, which is
    None when both retries failed.
    """
    notify("Rate limit hit. Taking break")
    time.sleep(first_wait)

    html_text = simple_get(url)
    if html_text is None:
        notify("Rate limit hit AGAIN :( : Pausing for longer")
        time.sleep(second_wait)
        html_text = simple_get(url)

    return html_text

def notify(string):
    """
    Echo a progress message to the screen and append it, followed by a
    newline, to progress.txt in the current working directory.
    """
    print(string)
    with open("progress.txt", mode='a') as progress_log:
        progress_log.write(string + "\n")
        
def is_good_response(resp):
    """
    Report whether a response is a successful HTML page.

    resp: response object exposing ``status_code`` and a ``headers``
        mapping.

    Returns True only when the status code is 200 and the Content-Type
    header contains 'html' (case-insensitive). A response that carries no
    Content-Type header is reported as not good instead of raising KeyError.
    """
    # .get() rather than [] so a missing header becomes '' instead of an exception
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type

def log_error(e):
    """
    Report an error message by handing it to notify.
    """
    notify(e)

In [None]:
def find_processors():
    """
    Pull the cpu names and the link to each cpu from techpowerup's cpudb.

    One listing page is requested per brand ('Intel', 'AMD') and release
    year (2000 through 2019) via adaptive_get; the delay it hands back is
    reused for the next request. A listing page that adaptive_get could not
    fetch (it returns None) is skipped.

    Returns a list of dicts with the keys 'name', 'brand' and 'link'.
    Per the original note, the result was saved, so this normally does not
    need to be run again.
    """
    brands = ['Intel','AMD']
    years = list(range(2000,2020))
    base = "https://www.techpowerup.com"
    t=15
    cpu_list = []
    for brand in brands:
        for year in years:
            notify(f"Starting {brand} {year}")
            result = adaptive_get(f"{base}/cpudb/?mfgr={brand}&released={year}&sort=name",t)
            if result is None:
                # adaptive_get already reported the failed URL; move on to the next page
                continue
            page_soup,t = result

            for line in page_soup.select("table.processors a"):
                cpu_name = line.string
                cpu_link = "".join([base,line['href']])
                cpu_list.append({'name': cpu_name,"brand":brand,'link':cpu_link})
    return cpu_list

In [None]:
def cpu_details(processor_list):
    """
    Add the details scraped from each processor's page to its entry.

    processor_list: list of dicts that each carry a 'link' key (the output
        of find_processors).

    For every entry a copy is made and the entry's link is fetched through
    adaptive_get. Each table row under 'div.sectioncontainer' that has both
    a th and a td contributes one key/value pair (stripped text); rows with
    an empty key or value are ignored. When the page cannot be fetched
    (adaptive_get returns None) the entry is kept without extra details.

    Every finished entry is also appended to 'ongoing_collected_data.json',
    one JSON object per line, so partial progress survives an interruption.

    Returns the list of enriched entries, in input order.
    """
    detail_list = []
    sleep_time=15
    for i,entry in enumerate(processor_list):
        new_entry = entry.copy()
        if i%5 == 0:
            notify(f"{i/len(processor_list)*100:.1f}% Complete")

        # get html from link
        result = adaptive_get(entry['link'], sleep_time)

        if result is not None:
            card_soup, sleep_time = result

            # add available information from website to processor database
            for row in card_soup.select('div.sectioncontainer tr'):
                header_cells = row.select("th")
                data_cells = row.select("td")
                if not header_cells or not data_cells:
                    continue

                key = str(header_cells[0].text.strip())
                value = str(data_cells[0].text.strip())

                # skip rows whose label or value is blank
                if key and value:
                    new_entry[key] = value

        detail_list.append(new_entry)

        with open('ongoing_collected_data.json', 'a') as f:
            json.dump(new_entry, f)
            # newline keeps the appended records separable (one object per line)
            f.write("\n")

    return detail_list

In [None]:
"get all processor links"
# Scrape the listing pages and store the name/brand/link records in cpu_urls.json
cpu_links = find_processors()
with open('cpu_urls.json', mode='w') as f:
    f.write(json.dumps(cpu_links))

In [None]:
# Get links to individual cpus: keep cpu_list if it is already defined,
# otherwise read the saved records back from cpu_urls.json.
# NOTE(review): the cell that calls find_processors binds cpu_links, not
# cpu_list, so this guard presumably always reloads from disk - confirm intent.
if 'cpu_list' not in locals():
    with open('cpu_urls.json') as f:
        cpu_list = json.loads(f.read())

# Fill in details on cpus in list using the output of find_processors
full_details_list = cpu_details(cpu_list)

# Store the complete result
with open('final_cpu_list.json', mode='w') as f:
    f.write(json.dumps(full_details_list))