In [1]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup as soup
import urllib.request
import time
import string as st
import pandas as pd

In [2]:
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    time.sleep(1)
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

In [3]:
def get_pages(html):
    """returns how many pages there are for that letter
    Input: html from first page in letter
    Output: Int - how many pages there are for that letter
    """
    line = html.select('span[title ^="Page"]')
    try:
        return int(line[0]['title'].split()[-1])
    except: 
        return 1


def read_game_data(game_soup):
    """
    Converts a game page's soup object and returns a dictionary to be added to the final output.
    Input: Game page soup
    Output: a dictionary with relevant game information
    """
    "read in the title of the game"
    game_title = str(game_soup.find('div',{"class":'game_head_title'}).string)
    
    "read in the game info from the header"
    head_text = game_soup.find_all('div',{'class':'game_head_details_row'})
    if len(head_text) < 2:
        return None
    output = {}
    for entry in head_text:
        string = entry.string
        if not isinstance(string,type(None)):
            txt = string.split(':')
            if len(txt) < 2:
                if txt[0].lower() == "cancelled":
                    break
                continue
            output[("Game Info",txt[0].strip())]= txt[1].strip()
            
    "read in requirements from the middle column"
    req_block = game_soup.find_all('div',{'class':'srb_row'})
    if len(req_block) == 0:
        return None
    
    line_list = [str(y.string).strip() for x in req_block[0].children for y in x] 
    req_type_list = line_list #list of requirement types available

    "pulls in requirements line by line"
    for line in req_block[1:]:
        line_list = [str(y.string).strip() for x in line.children for y in x]
        req_type_str = req_type_list[0].split()[0]
        temp = {(req_type_str,line_list[0][:-1]):line_list[1]}
        output.update(temp)
        'if there are also recommended requirements then read those in'
        if len(req_type_list) == 2:
            if len(line_list) == 4:
                req_type_str = req_type_list[1].split()[0]
                temp = {(req_type_str,line_list[2][:-1]):line_list[3]}
                output.update(temp)
            
    return {game_title:output}
    

In [9]:
root_url = "https://gamesystemrequirements.com"
page_list = list(st.ascii_lowercase[19:])
page_list.append('0-9')

In [10]:
"""Structure"""

for letter in page_list:
    output = None
    first_game_flag = True
    print(f"Starting Letter {letter}")
    'iterate through letters in the page_list to create url from pattern'
    url = f"{root_url}/database/{letter}"
    'get the soup version of the page'
    first_page_soup =  soup(simple_get(url), "html.parser")
    num_pages = get_pages(first_page_soup)

    page_games_list = first_page_soup.select(f'a[href ^="game/{letter}"]')
    counter = 0
    for entry in page_games_list:
        if counter%10 == 0:
            print(f"Letter {letter}: Page 1 {counter/len(page_games_list)*100}%")
        counter +=1 
        if len(entry.attrs) == 1:
            game_url = f"{root_url}/{entry['href']}"
            game_soup = soup(simple_get(game_url),'html.parser')
            game_dict = read_game_data(game_soup)
            if first_game_flag == True:
                first_game_flag = False
                output = pd.DataFrame(game_dict).transpose()
            else:
                if not isinstance(game_dict,type(None)):
                    output = output.append(pd.DataFrame(game_dict).transpose())

    if num_pages > 1:
        for i in range(2,num_pages):
            url = f"{root_url}/database/{letter}/page/{i}"
            page_soup = soup(simple_get(url), "html.parser")
            page_games_list = page_soup.select(f'a[href ^="game/{letter}"]')
            counter = 0
            for entry in page_games_list:
                if counter%10 == 0:
                    print(f"Letter {letter}: Page {i} {counter/len(page_games_list)*100}%")
                counter +=1 
                if len(entry.attrs) == 1:
                    game_url = f"{root_url}/{entry['href']}"
                    game_soup = soup(simple_get(game_url),'html.parser')
                    game_dict = read_game_data(game_soup)
                    if not isinstance(game_dict,type(None)):
                        output = output.append(pd.DataFrame(game_dict).transpose())
                        
    output.to_pickle(f"./Game Data {letter}.pkl")


Starting Letter t
Letter t: Page 1 0.0%
Letter t: Page 1 9.70873786407767%
Letter t: Page 1 19.41747572815534%
Letter t: Page 1 29.126213592233007%
Letter t: Page 1 38.83495145631068%
Letter t: Page 1 48.54368932038835%
Letter t: Page 1 58.252427184466015%
Letter t: Page 1 67.96116504854369%
Letter t: Page 1 77.66990291262135%
Letter t: Page 1 87.37864077669903%
Letter t: Page 1 97.0873786407767%
Letter t: Page 2 0.0%
Letter t: Page 2 9.70873786407767%
Letter t: Page 2 19.41747572815534%
Letter t: Page 2 29.126213592233007%
Letter t: Page 2 38.83495145631068%
Letter t: Page 2 48.54368932038835%
Letter t: Page 2 58.252427184466015%
Letter t: Page 2 67.96116504854369%
Letter t: Page 2 77.66990291262135%
Letter t: Page 2 87.37864077669903%
Letter t: Page 2 97.0873786407767%
Letter t: Page 3 0.0%
Letter t: Page 3 9.70873786407767%
Letter t: Page 3 19.41747572815534%
Letter t: Page 3 29.126213592233007%
Letter t: Page 3 38.83495145631068%
Letter t: Page 3 48.54368932038835%
Letter t: Page 3

AttributeError: 'NoneType' object has no attribute 'to_pickle'

In [46]:
"""Structure for the final page with non pattern link naming"""

for letter in ['0-9']:
    output = None
    first_game_flag = True
    print(f"Starting Letter {letter}")
    'iterate through letters in the page_list to create url from pattern'
    url = f"{root_url}/database/{letter}"
    'get the soup version of the page'
    first_page_soup =  soup(simple_get(url), "html.parser")
    num_pages = get_pages(first_page_soup)

    page_games_list = first_page_soup.select(f'a[href ^="game/"]')
    
    for x in page_games_list:
        y = str(x.string)
        if y != "None":
            gl.append(f"game/{y}")
    
    counter = 0
    for entry in page_games_list:
        if counter%10 == 0:
            print(f"Letter {letter}: Page 1 {counter/len(page_games_list)*100}%")
        counter +=1 
        if len(entry.attrs) == 1:
            game_url = f"{root_url}/{entry['href']}"
            game_soup = soup(simple_get(game_url),'html.parser')
            game_dict = read_game_data(game_soup)
            if first_game_flag == True:
                first_game_flag = False
                output = pd.DataFrame(game_dict).transpose()
            else:
                if not isinstance(game_dict,type(None)):
                    output = output.append(pd.DataFrame(game_dict).transpose())

    if num_pages > 1:
        for i in range(2,num_pages):
            url = f"{root_url}/database/{letter}/page/{i}"
            page_soup = soup(simple_get(url), "html.parser")
            page_games_list = page_soup.select(f'a[href ^="game/{letter}"]')
            counter = 0
            for entry in page_games_list:
                if counter%10 == 0:
                    print(f"Letter {letter}: Page {i} {counter/len(page_games_list)*100}%")
                counter +=1 
                if len(entry.attrs) == 1:
                    game_url = f"{root_url}/{entry['href']}"
                    game_soup = soup(simple_get(game_url),'html.parser')
                    game_dict = read_game_data(game_soup)
                    if not isinstance(game_dict,type(None)):
                        output = output.append(pd.DataFrame(game_dict).transpose())
                        
    output.to_pickle(f"./Game Data {letter}.pkl")


Starting Letter 0-9
Letter 0-9: Page 1 0.0%
Letter 0-9: Page 1 10.416666666666668%
Letter 0-9: Page 1 20.833333333333336%
Letter 0-9: Page 1 31.25%
Letter 0-9: Page 1 41.66666666666667%
Letter 0-9: Page 1 52.083333333333336%
Letter 0-9: Page 1 62.5%
Letter 0-9: Page 1 72.91666666666666%
Letter 0-9: Page 1 83.33333333333334%
Letter 0-9: Page 1 93.75%


In [45]:
page_games_list

[[<a href="game/wargames">#WarGames</a>,
  <a href="game/83">'83</a>,
  <a href="game/hack-gu-last-recode">.hack//G.U. Last Recode</a>,
  <a href="game/0-ad">0 A.D.</a>,
  <a href="game/007-legends">007 Legends</a>,
  <a href="game/1-screen-platformer">1 Screen Platformer</a>,
  <a href="game/10-second-ninja-x">10 Second Ninja X</a>,
  <a href="game/1000-amps">1000 Amps</a>,
  <a href="game/103">103</a>,
  <a href="game/11-11-memories-retold">11-11 Memories Retold</a>,
  <a href="game/112-operator">112 Operator</a>,
  <a href="game/140">140</a>,
  <a href="game/16bit-trader">16bit Trader</a>,
  <a href="game/1701-ad-gold-edition">1701 A.D. Gold Edition</a>,
  <a href="game/18-wheels-of-steel-across-america">18 Wheels of Steel: Across America</a>,
  <a href="game/18-wheels-of-steel-american-long-haul">18 Wheels of Steel: American Long Haul</a>,
  <a href="game/18-wheels-of-steel-convoy">18 Wheels of Steel: Convoy</a>,
  <a href="game/18-wheels-of-steel-extreme-trucker">18 Wheels of Stee

In [13]:
game_soup

"read in the game info from the header"
head_text = game_soup.find_all('div',{'class':'game_head_details_row'})
output = {}
for entry in head_text:
    string = entry.string
    if not isinstance(string,type(None)):
        txt = string.split(':')
        if len(txt) < 2:
            if txt[0].lower() == "cancelled":
                print("caught")
            continue
        output[("Game Info",txt[0].strip())]= txt[1].strip()

# req_block = game_soup.find_all('div',{'class':'srb_row'})
# if len(req_block) == 0:
#     print('caught')
# line_list = [str(y.string).strip() for x in req_block[0].children for y in x] 
# req_type_list = line_list #list of requirement types available

# "pulls in requirements line by line"
# for line in req_block[1:]:
#     line_list = [str(y.string).strip() for x in line.children for y in x]
#     req_type_str = req_type_list[0].split()[0]
#     temp = {(req_type_str,line_list[0][:-1]):line_list[1]}
#     'if there are also recommended requirements then read those in'
#     if len(req_type_list) == 2:
#         if len(line_list) == 4:
#             req_type_str = req_type_list[1].split()[0]
#             temp = {(req_type_str,line_list[2][:-1]):line_list[3]}

caught


In [13]:
output

Unnamed: 0_level_0,Game Info,Game Info,Game Info,Game Info,Game Info,Minimum,Minimum,Minimum,Minimum,Minimum,Minimum
Unnamed: 0_level_1,Developer,Genre,Popularity,Release Date,Reviews,CPU,DX,GPU,OS,RAM,Store
A Bird Story,Freebird Games,"Adventure, Role-playing game",~1000# ■,2014. November 07. (PC),Mixed (7.0),> Intel Pentium III 800 MHz,Version 9.0c,1024x768 High Color +,,2 GB RAM,200 MB available space


In [15]:
tst = read_game_data(game_soup2)

In [16]:
tst

{'A House of Many Doors': {('Game Info',
   'Release Date'): '2017. February 03. (PC)',
  ('Game Info', 'Sys. Reqs.'): 'Very low (4/7)',
  ('Game Info', 'Popularity'): '~1600# ■',
  ('Game Info', 'Genre'): 'Role-playing game',
  ('Game Info', 'Developer'): 'Pixel Trickery',
  ('Minimum', 'CPU'): '(Any)',
  ('Recommended', 'CPU'): '2GHz+',
  ('Minimum', 'RAM'): '2 GB RAM',
  ('Recommended', 'RAM'): '4 GB RAM',
  ('Minimum', 'GPU'): '(Any)',
  ('Recommended', 'GPU'): '512MB',
  ('Minimum', 'DX'): 'Version 9.0',
  ('Recommended', 'DX'): 'Version 9.0',
  ('Minimum', 'OS'): 'None',
  ('Recommended', 'OS'): 'Windows XP, Vista, 7, 10',
  ('Minimum', 'Store'): '400 MB available space',
  ('Recommended', 'Store'): '400 MB available space'}}

In [17]:
output.append(pd.DataFrame(tst).transpose())

Unnamed: 0_level_0,Game Info,Game Info,Game Info,Game Info,Game Info,Game Info,Minimum,Minimum,Minimum,Minimum,Minimum,Minimum,Recommended,Recommended,Recommended,Recommended,Recommended,Recommended
Unnamed: 0_level_1,Developer,Genre,Popularity,Release Date,Reviews,Sys. Reqs.,CPU,DX,GPU,OS,RAM,Store,CPU,DX,GPU,OS,RAM,Store
A Bird Story,Freebird Games,"Adventure, Role-playing game",~1000# ■,2014. November 07. (PC),Mixed (7.0),,> Intel Pentium III 800 MHz,Version 9.0c,1024x768 High Color +,,2 GB RAM,200 MB available space,,,,,,
A House of Many Doors,Pixel Trickery,Role-playing game,~1600# ■,2017. February 03. (PC),,Very low (4/7),(Any),Version 9.0,(Any),,2 GB RAM,400 MB available space,2GHz+,Version 9.0,512MB,"Windows XP, Vista, 7, 10",4 GB RAM,400 MB available space


In [18]:
output

Unnamed: 0_level_0,Game Info,Game Info,Game Info,Game Info,Game Info,Minimum,Minimum,Minimum,Minimum,Minimum,Minimum
Unnamed: 0_level_1,Developer,Genre,Popularity,Release Date,Reviews,CPU,DX,GPU,OS,RAM,Store
A Bird Story,Freebird Games,"Adventure, Role-playing game",~1000# ■,2014. November 07. (PC),Mixed (7.0),> Intel Pentium III 800 MHz,Version 9.0c,1024x768 High Color +,,2 GB RAM,200 MB available space


In [14]:
test_url = "https://gamesystemrequirements.com/game/a-bird-story"
test_url2 = "https://gamesystemrequirements.com/game/a-house-of-many-doors"
game_soup = soup(simple_get(test_url),'html.parser')
game_soup2 = soup(simple_get(test_url2),'html.parser')

In [None]:
a = read_game_data(game_soup)
b = read_game_data(game_soup2)      

In [None]:
game_title = game_soup.find('div',{"class":'game_head_title'}).string.strip()

In [None]:
z1 = pd.DataFrame(a).transpose()
z2 = pd.DataFrame(b).transpose()

In [None]:
z1.append(z2)

In [None]:
ind = list(a.keys())[0]
tstdf = pd.DataFrame(a[ind], index = [ind])

In [None]:
tstdf

In [None]:
ind = []
row_name = list(a.keys())[0]
info_dict = a[row_name]
for key in info_dict.keys():
    if not isinstance(info_dict[key],dict):
        ind.append(key)
    else:
        for key2 in info_dict[key]:
            ind.append((key,key2))

In [None]:
tstdf = pd.DataFrame.from_dict(a,orient = 'index')

In [None]:
z= list(tstdf['Minimum system requirements:'])

In [None]:
z2 = pd.DataFrame.from_dict(z[0], columns = pd.MultiIndex.from_arrays([['Min'],list(z[0].keys())]))

In [None]:
for x in z.columns:
    tstdf['Minimum system requirements:',x] = 

In [None]:
ds = tstdf['Minimum system requirements:']

In [None]:
"Scratch Work below this"

In [11]:
url = "https://gamesystemrequirements.com/database/0-9"
s = soup(simple_get(url),"html.parser")

In [None]:
req_types = [x.string[:-1] for x in game_html.find_all('div',{'class':'tbl6'})]

In [18]:
page_games_list = s.select(f'a[href ^="game/"]')

In [37]:
[str(x.string) if x!= None for x in page_games_list]

SyntaxError: invalid syntax (<ipython-input-37-2a15fa503fbb>, line 1)

In [42]:
gl = []
for x in page_games_list:
    y = str(x.string)
    if y != "None":
        gl.append(y)


In [43]:
gl

['#WarGames',
 "'83",
 '.hack//G.U. Last Recode',
 '0 A.D.',
 '007 Legends',
 '1 Screen Platformer',
 '10 Second Ninja X',
 '1000 Amps',
 '103',
 '11-11 Memories Retold',
 '112 Operator',
 '140',
 '16bit Trader',
 '1701 A.D. Gold Edition',
 '18 Wheels of Steel: Across America',
 '18 Wheels of Steel: American Long Haul',
 '18 Wheels of Steel: Convoy',
 '18 Wheels of Steel: Extreme Trucker',
 "18 Wheels of Steel: Haulin'",
 '18 Wheels of Steel: Pedal to the Metal',
 '1848',
 '1954 Alcatraz',
 '1979 Revolution: Black Friday',
 '1993 Space Machine',
 '1NSANE',
 '25 to Life',
 '2Dark',
 '2Moons',
 '2XL Supercross',
 '303 Squadron: Battle of Britain',
 '3SwitcheD',
 '404Sight',
 '428: Shibuya Scramble',
 '4PM',
 '4X4: Hummer ',
 '5 Minutes Rage',
 '5 Star Rio Resort',
 '60 Parsecs!',
 '60 Seconds!',
 '7 Billion Humans',
 '7 Bones and 7 Stones - The Ritual',
 '7 Days to Die',
 '7.62: High Calibre',
 '7554',
 '7th Sector',
 '8-Bit Armies',
 '8-Bit Armies: Arena',
 '8-Bit Invaders!',
 '80  Days

In [None]:
for req in req_types:
    output[req] = {}

if len(output[req]) == 1:
    

In [None]:
[y.string for x in req_block[1].children for y in x]

In [None]:
html.find_all('a')
a = html.select(f'a[href ^="game/{letter}"]')

In [None]:
read_data(f"{root_url}/{a[0]['href']}")

In [None]:
game_html = simple_get(f"{root_url}/{a[0]['href']}")
game = soup(game_html, 'html.parser') 

In [None]:
f"{root_url}/{a[0]['href']}"

In [None]:
game.find_all('div')

In [None]:
output


In [None]:
a[0]

In [None]:
tmp = simple_get(url)

In [8]:
st.ascii_lowercase[19:]

'tuvwxyz'

In [None]:
print (html.get_text())

In [None]:
""" get all pages in a list"""
"""
find root url 
add on appropriate 

"""