In [2]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup as soup
import urllib.request
import time
import string as st
import pandas as pd
import json
import os

In [3]:
def simple_get(url, timeout=30):
    """
    Fetch `url` with an HTTP GET and return the raw response body.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Defaults to 30 so a stalled connection cannot block the crawl forever.

    Returns
    -------
    bytes or None
        ``resp.content`` when ``is_good_response`` accepts the response.
        None when the response is rejected or when requests raises a
        ``RequestException``; both cases are reported through ``log_error``.
    """
    try:
        with closing(get(url, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content

            # Record the rejection in the progress log with a short excerpt
            # rather than dumping the whole body to stdout.
            log_error('Bad response from {0} : status {1} : {2!r}'.format(
                url, resp.status_code, resp.content[:200]))
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

def adaptive_get(url,t):
    """
    Sleep `t` seconds, then download `url`, so requests stay spaced out.

    If the first download yields nothing, `try_again` is used as a fallback
    and the delay is raised by 10 seconds.

    Returns a tuple ``(soup_of_page, t)`` on success, where `t` is the
    (possibly increased) delay to use for the next call, or None when the
    page could not be fetched even after the fallback.
    """
    time.sleep(t)
    page_bytes = simple_get(url)

    if page_bytes is None:
        # First attempt failed: back off via try_again and slow down future calls.
        page_bytes = try_again(url)
        t += 10

        # The fallback failed as well: report the URL and give up on it.
        if page_bytes is None:
            notify(f"URL not captured:{url}")
            return None

    return soup(page_bytes), t
    
def try_again(url, first_wait=2500, second_wait=5000):
    """
    Retry a download after pausing, giving a rate limit time to reset.

    Parameters
    ----------
    url : str
        Address to request again.
    first_wait : float, optional
        Seconds to sleep before the first retry (default 2500).
    second_wait : float, optional
        Seconds to sleep before the second retry, used only when the first
        retry also fails (default 5000).

    Returns
    -------
    Whatever ``simple_get`` returns for the last attempt made: the page
    content on success, or None if both retries fail.
    """
    notify("Rate limit hit. Taking break")
    time.sleep(first_wait)

    html_text = simple_get(url)
    if html_text is None:
        notify("Rate limit hit AGAIN :( : Pausing for longer")
        time.sleep(second_wait)
        html_text = simple_get(url)

    return html_text

def notify(string):
    """
    Report progress: echo the message to stdout and append it, followed by
    a newline, to progress.txt in the current working directory.
    """
    print(string)
    with open("progress.txt", mode='a') as progress_file:
        progress_file.write(string + "\n")
        
def is_good_response(resp):
    """
    Tell whether a response is a successful HTML page.

    Parameters
    ----------
    resp : object
        Response exposing ``status_code`` and a ``headers`` mapping.

    Returns
    -------
    bool
        True only when the status code is 200 and the Content-Type header
        mentions 'html' (case-insensitive). A response without a
        Content-Type header is reported as not good instead of raising
        KeyError.
    """
    # A missing (or None) header is treated as an empty content type.
    content_type = resp.headers.get('Content-Type') or ''
    return resp.status_code == 200 and 'html' in content_type.lower()

def log_error(e):
    """
    Report an error message by handing it to notify.
    """
    notify(e)

In [4]:
def find_processors():
    """
    Collect the name, brand and detail-page link of every entry listed on
    the techpowerup GPU database, one listing page per
    brand / release year / mobile-flag combination.

    Returns
    -------
    list of dict
        One ``{'name', 'brand', 'link'}`` dict per anchor found in the
        ``table.processors`` element of each listing page. Listing pages
        that could not be downloaded are skipped; ``adaptive_get`` has
        already reported them.
    """
    brands = ['Intel','AMD','ATI','Matrox','NVIDIA','XGI']
    years = [1986,1987,1988,1990,1992,1993]+list(range(1995,2021))
    mobiles = ["Yes","No"]
    base = "https://www.techpowerup.com"
    t = 75  # seconds between requests; adaptive_get may hand back a larger value
    gpu_list = []
    for brand in brands:
        for year in years:
            for mobile in mobiles:
                # Emitted once per mobile flag, so each brand/year shows up twice.
                notify(f"Starting {brand} {year}")
                url = f"{base}/gpu-specs/?mfgr={brand}&released={year}&mobile={mobile}&sort=name"

                # adaptive_get returns None when the page could not be fetched;
                # unpacking that directly would abort the whole crawl.
                result = adaptive_get(url, t)
                if result is None:
                    continue
                page_soup, t = result

                # NOTE(review): recorded output of this selector contains links
                # ending in both ".c<digits>" and ".g<digits>" (for example
                # /gpu-specs/amd-hawaii.g738). Presumably the latter are chip
                # pages rather than cards; confirm whether they should be
                # filtered out before the detail crawl.
                for line in page_soup.select("table.processors a"):
                    gpu_list.append({
                        'name': line.string,
                        "brand": brand,
                        'link': base + line['href'],
                    })
    return gpu_list

In [68]:
def gpu_details(processor_list, sleep_time=75):
    """
    Download the detail page of every entry and collect its spec table.

    Parameters
    ----------
    processor_list : list of dict
        Entries as produced by ``find_processors``; each must have a
        'link' key.
    sleep_time : float, optional
        Initial delay in seconds between requests (default 75). It is
        replaced by the value ``adaptive_get`` returns after each call.

    Returns
    -------
    list of dict
        One dict per successfully downloaded page: a copy of the input
        entry extended with the page's ``dt`` -> ``dd`` text pairs. Pages
        that could not be downloaded are skipped; ``adaptive_get`` has
        already reported them.

    Side effects
    ------------
    Appends each collected record as one JSON document per line to
    'ongoing_gpu_collected_data.json', so partial progress survives a crash
    and can be parsed line by line.
    """
    detail_list = []
    total = len(processor_list)
    for i, entry in enumerate(processor_list):
        if i % 5 == 0:
            notify(f"{i/total*100:.1f}% Complete")

        # adaptive_get returns None when the page could not be fetched;
        # skip that entry instead of crashing on the tuple unpacking.
        result = adaptive_get(entry['link'], sleep_time)
        if result is None:
            continue
        card_soup, sleep_time = result

        # Pair each spec label (dt) with its value (dd) in document order.
        # NOTE(review): a label repeated on the page keeps only its last
        # value; confirm that is acceptable.
        keys = [x.text.strip() for x in card_soup.select('div.sectioncontainer div dt')]
        values = [x.text.strip() for x in card_soup.select('div.sectioncontainer div dd')]

        # Keep name/brand/link from the input entry so each record can be
        # identified; scraped fields win on a key collision.
        new_entry = entry.copy()
        new_entry.update(zip(keys, values))
        detail_list.append(new_entry)

        with open('ongoing_gpu_collected_data.json', 'a') as f:
            json.dump(new_entry, f)
            f.write("\n")  # one JSON document per line keeps the file parseable

    return detail_list

In [6]:
# Crawl every listing page and persist the collected links as JSON.
gpu_links = find_processors()
with open('gpu_urls.json', 'w') as urls_file:
    json.dump(gpu_links, urls_file)

Starting Intel 1986
Starting Intel 1986
Starting Intel 1987
Starting Intel 1987
Starting Intel 1988
Starting Intel 1988
Starting Intel 1990
Starting Intel 1990
Starting Intel 1992
Starting Intel 1993
Starting Intel 1993
Starting Intel 1995
Starting Intel 1995
Starting Intel 1996
Starting Intel 1996
Starting Intel 1997
Starting Intel 1997
Starting Intel 1998
Starting Intel 1998
Starting Intel 1999
Starting Intel 1999
Starting Intel 2000
Starting Intel 2000
Starting Intel 2001
Starting Intel 2001
Starting Intel 2002
Starting Intel 2002
Starting Intel 2003
Starting Intel 2003
Starting Intel 2004
Starting Intel 2004
Starting Intel 2005
Starting Intel 2005
Starting Intel 2006
Starting Intel 2006
Starting Intel 2007
Starting Intel 2007
Starting Intel 2008
Starting Intel 2008
Starting Intel 2009
Starting Intel 2009
Starting Intel 2010
Starting Intel 2010
Starting Intel 2011
Starting Intel 2011
Starting Intel 2012
Starting Intel 2012
Starting Intel 2013
Starting Intel 2013
Starting Intel 2014


In [65]:
# Spot-check: show the link stored for the first collected entry.
# NOTE(review): no earlier cell assigns `gpu_list` at top level (the crawl
# result is stored as `gpu_links`), so this presumably relies on leftover
# kernel state or on loading gpu_urls.json first; confirm before re-running.
gpu_list[0]['link']

'https://www.techpowerup.com/gpu-specs/i740.c1288'

In [73]:
# Reuse the in-memory link list when there is one; otherwise reload the
# links saved by the listing crawl.
if 'gpu_list' not in locals():
    with open('gpu_urls.json', 'r') as urls_file:
        gpu_list = json.loads(urls_file.read())

# Scrape the detail page of every entry produced by find_processors.
full_details_list = gpu_details(gpu_list)

with open('final_gpu_list.json', 'w') as details_file:
    json.dump(full_details_list, details_file)

0.0% Complete
0.1% Complete
0.2% Complete
0.3% Complete
0.4% Complete
0.5% Complete
0.6% Complete
0.7% Complete
0.8% Complete
0.9% Complete
1.0% Complete
1.1% Complete
1.2% Complete
1.2% Complete
1.3% Complete
1.4% Complete
1.5% Complete
1.6% Complete
1.7% Complete
1.8% Complete
1.9% Complete
2.0% Complete
2.1% Complete
2.2% Complete
2.3% Complete
2.4% Complete
2.5% Complete
2.6% Complete
2.7% Complete
2.8% Complete
2.9% Complete
3.0% Complete
3.1% Complete
3.2% Complete
3.3% Complete
3.4% Complete
3.5% Complete
3.5% Complete
3.6% Complete
3.7% Complete
3.8% Complete
3.9% Complete
4.0% Complete
4.1% Complete
4.2% Complete
4.3% Complete
4.4% Complete
4.5% Complete
4.6% Complete
4.7% Complete
4.8% Complete
4.9% Complete
5.0% Complete
5.1% Complete
5.2% Complete
5.3% Complete
5.4% Complete
5.6% Complete
5.7% Complete
5.8% Complete
5.8% Complete
5.9% Complete
6.0% Complete
6.1% Complete
6.2% Complete
6.3% Complete
6.4% Complete
6.5% Complete
6.6% Complete
6.7% Complete
6.8% Complete
6.9% C

53.5% Complete
53.6% Complete
53.7% Complete
53.8% Complete
53.9% Complete
54.0% Complete
54.1% Complete
54.2% Complete
54.3% Complete
54.4% Complete
54.4% Complete
54.5% Complete
54.6% Complete
54.7% Complete
54.8% Complete
54.9% Complete
55.0% Complete
55.1% Complete
55.2% Complete
55.3% Complete
55.4% Complete
55.5% Complete
55.6% Complete
55.7% Complete
55.8% Complete
55.9% Complete
56.0% Complete
56.1% Complete
56.2% Complete
56.3% Complete
56.4% Complete
56.5% Complete
56.6% Complete
56.7% Complete
56.7% Complete
56.8% Complete
56.9% Complete
57.0% Complete
57.1% Complete
57.2% Complete
57.3% Complete
57.4% Complete
57.5% Complete
57.6% Complete
57.7% Complete
57.8% Complete
57.9% Complete
58.0% Complete
58.1% Complete
58.2% Complete
58.3% Complete
58.4% Complete
58.5% Complete
58.6% Complete
58.7% Complete
58.8% Complete
58.9% Complete
59.0% Complete
59.0% Complete
59.1% Complete
59.2% Complete
59.3% Complete
59.4% Complete
59.5% Complete
59.6% Complete
59.7% Complete
59.8% Comp

In [34]:
# Scratch check of the listing-page parsing: print the first link found.
base = "https://www.techpowerup.com"
resp = simple_get('https://www.techpowerup.com/gpu-specs/?mfgr=AMD&released=2015&mobile=No&sort=name')
test = soup(resp)
relevant_lines = test.select("table.processors a")
# Only the first match is examined; an empty result prints nothing.
for line in relevant_lines[:1]:
    gpu_name = line.string
    gpu_link = base + line['href']
    print(gpu_link)

https://www.techpowerup.com/gpu-specs/firepro-s9170.c2744


In [28]:
# Show every anchor found inside the `table.processors` element of the
# parsed page held in `test`.
test.select("table.processors a")

[<a href="/gpu-specs/firepro-s9170.c2744">FirePro S9170</a>,
 <a href="/gpu-specs/amd-hawaii.g738">Hawaii</a>,
 <a href="/gpu-specs/firepro-w4300.c2781">FirePro W4300</a>,
 <a href="/gpu-specs/amd-bonaire.g568">Bonaire</a>,
 <a href="/gpu-specs/radeon-r2-graphics.c2498">Radeon R2 Graphics</a>,
 <a href="/gpu-specs/amd-beema.g750">Beema</a>,
 <a href="/gpu-specs/radeon-r3-graphics.c2494">Radeon R3 Graphics</a>,
 <a href="/gpu-specs/amd-beema.g750">Beema</a>,
 <a href="/gpu-specs/radeon-r5-310-oem.c3025">Radeon R5 310 OEM</a>,
 <a href="/gpu-specs/amd-caicos.g114">Caicos</a>,
 <a href="/gpu-specs/radeon-r5-310-oem.c2680">Radeon R5 310 OEM</a>,
 <a href="/gpu-specs/amd-caicos.g114">Caicos</a>,
 <a href="/gpu-specs/radeon-r5-330-oem.c2727">Radeon R5 330 OEM</a>,
 <a href="/gpu-specs/amd-hainan.g817">Hainan</a>,
 <a href="/gpu-specs/radeon-r5-340-oem.c2683">Radeon R5 340 OEM</a>,
 <a href="/gpu-specs/amd-oland.g389">Oland</a>,
 <a href="/gpu-specs/radeon-r5-340x-oem.c2820">Radeon R5 340X OE