### Initial setup
* Libraries
* Splinter Browser config

In [1]:
from bs4 import BeautifulSoup as bs
import pandas as pd
from splinter import Browser
import time
from tqdm import tqdm

In [2]:
# Windows
# executable_path = {"executable_path": "driver/chromedriver.exe"}
# browser = Browser("chrome", **executable_path)

In [3]:
# MAC
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

### Initial Scraping
This section of code scrapes data for the first page of the website. We noted the below:
* The website shows all games by infinitely scrolling. We worked around this limitation by inspecting the HTML and finding the pagination div tag.
* The first page has 50 rows of games and the consequent pages have 25 rows each.
* However, the 50 rows in  Page 1 include the 25 rows from Page 2 i.e. Page 1 displays games of BOTH Page 1 and Page 2 combined.
* The game_name, release_date and price are scraped from the 50 rows.
* After scraping data from this page, we get the URL of Page 3 and use splinter to visit that page to continue scraping. Since Page 1 does not display page numbers, we also retrieve the last page number from Page 3's HTML to use in the for loop coming up.

In [4]:
# Visit the top sellers section on the Steam store using splinter
url="https://store.steampowered.com/search/?filter=topsellers"
browser.visit(url)
time.sleep(2)

# Get the html of the page and create a BeautifulSoup object
html = browser.html
soup = bs(html, 'html.parser')

In [5]:
# Create an empty list to append game info
top_sellers = []

# Start the result counter at zero
result_count = 0

# Get the parent div for each game row
results=soup.find_all("div",class_="responsive_search_name_combined")

# Loop over the rows and retrieve game data for the first 50 games
for result in results:
        
    # get the game title
    game_name = result.find("span",class_="title").text
        
    # get the release date
    release_date = result.find("div",class_="col search_released responsive_secondrow").text
        
    # get the price
    price = result.find("div",{"class":["col search_price responsive_secondrow","col search_price discounted responsive_secondrow"]}).text.strip()
        
    # add the key value pairs to python dictionary and append to the list
    top_sellers.append({"game_name": game_name,
                        "release_date": release_date,
                        "price": price })
    
    # increase counter once each result is processed
    result_count += 1


In [6]:
print(result_count)

50


In [7]:
# After the initial scraping, find the pagination parent div on the main page and get the url for page 3 
pagination_div = soup.find('div', class_='search_pagination_right')
page_3_url = pagination_div.find_all('a')[1]['href']

# use splinter to proceed to the url
browser.visit(page_3_url)
time.sleep(2)       
    
# Create a new soup object out of the new html
newhtml = browser.html
newsoup = bs(newhtml, 'html.parser')

# get max page number from the pagination parent div
pagination_div = newsoup.find('div', class_='search_pagination_right')
max_pages = int(pagination_div.find_all('a')[-2].text)

### Scraping
* This section of code goes into a nested for loop for scraping data and will take about **9-11 minutes** to complete.
* The scraped data is appended to the top_sellers list as a dictionary.
* We find the next page url and use splinter to visit the url after each page loop.


In [8]:
page_num = 2

# Loop over each page to scrape until we hit the total number of pages (-2 as we have already scraped p1 & p2)
# tqdm is a progress bar module that tracks progress of a loop
for i in tqdm(range(0, max_pages-2)):
    
    # Create a new soup object out of the new html
    html = browser.html
    soup = bs(html, 'html.parser')  
    
    # get the parent div for each game row
    results=soup.find_all("div",class_="responsive_search_name_combined")

    # loop over each row (25 rows in each page) and retrieve game data
    for result in results:
        
        # get the game title
        game_name = result.find("span",class_="title").text
        
        # get the release date
        release_date = result.find("div",class_="col search_released responsive_secondrow").text
        
        # get the price
        price = result.find("div",{"class":["col search_price responsive_secondrow","col search_price discounted responsive_secondrow"]}).text.strip()
        
        # add the key value pairs to python dictionary and append to the list
        top_sellers.append({"game_name": game_name,
                            "release_date": release_date,
                            "price": price })
        
        # increase counter for results once each result is processed
        result_count += 1
        
    # increase counter for page number once each page is processed
    page_num +=1
        
    if page_num != max_pages:
        # After scraping the 25 results, find the pagination parent div
        pagination_div = soup.find('div', class_='search_pagination_right')
            
        # get the next page url from the second active page link with a class of 'pagebtn' 
        next_page_url = pagination_div.find_all('a', class_='pagebtn')[1]['href']
            
        # use splinter to proceed to the next page's url
        browser.visit(next_page_url)
        time.sleep(1)     
        
    else:
        print(f"Last page (p. {page_num}) reached - no more pages to scrape.")
        break

    
print(f'Scraping Complete! {result_count} top-selling games scraped.')


100%|█████████▉| 224/225 [09:24<00:02,  2.41s/it]

Last page (p. 227) reached - no more pages to scrape.
Scraping Complete! 5675 top-selling games scraped.


In [9]:
print(max_pages)


227


In [10]:
# Check list of results
for top in range(len(top_sellers)):
    print(f"{top}. {top_sellers[top]['game_name']} | {top_sellers[top]['release_date']} |\
          {top_sellers[top]['price']}")

0. Fall Guys: Ultimate Knockout | 3 Aug, 2020 |          CDN$ 22.79
1. Fall Guys Collector's Edition |  |          CDN$ 33.99
2. Marvel's Avengers | 4 Sep, 2020 |          CDN$ 79.99
3. Factorio | 14 Aug, 2020 |          CDN$ 34.00
4. Microsoft Flight Simulator | 8/18/2020 |          CDN$ 79.99
5. Risk of Rain 2 | 11 Aug, 2020 |          CDN$ 24.99CDN$ 19.99
6. Fall Guys: Collectors Pack | 3 Aug, 2020 |          CDN$ 11.20
7. Among Us | 16 Nov, 2018 |          CDN$ 5.69
8. Horizon Zero Dawn™ Complete Edition | 7 Aug, 2020 |          CDN$ 59.99
9. Battle Brothers - Blazing Deserts | 13 Aug, 2020 |          CDN$ 17.49
10. Marvel's Avengers Deluxe Edition |  |          CDN$ 106.98
11. Endless Space® 2 Collection |  |          CDN$ 103.92CDN$ 24.62
12. Microsoft Flight Simulator | 8/18/2020 |          CDN$ 79.99
13. Battlefield ™ V | 11 Jun, 2020 |          CDN$ 64.99CDN$ 19.49
14. The Henry Stickmin Collection | 7 Aug, 2020 |          CDN$ 17.49
15. Halo: The Master Chief Collection | 3 D

1480. Borderlands 3: Retro Cosmetic Pack | 13 Mar, 2020 |          CDN$ 6.99
1481. Borderlands 3: Gold Weapon Skins Pack | 13 Mar, 2020 |          CDN$ 6.99
1482. Wildermyth | 13 Nov, 2019 |          CDN$ 22.79
1483. NARUTO TO BORUTO: SHINOBI STRIKER | 30 Aug, 2018 |          CDN$ 79.99
1484. Borderlands 3: Toy Box Weapons Pack | 13 Mar, 2020 |          CDN$ 6.99
1485. Borderlands 3: Neon Cosmetic Pack | 13 Mar, 2020 |          CDN$ 6.99
1486. The Binding of Isaac: Afterbirth | 30 Oct, 2015 |          CDN$ 11.99
1487. DOOM Eternal - Year One Pass | Coming in 2020 |          CDN$ 39.99
1488. MORDHAU Supporter Bundle |  |          CDN$ 45.11
1489. Fantasy Strike - Core Pack | 21 Jul, 2020 |          CDN$ 22.79
1490. DUSK | 10 Dec, 2018 |          CDN$ 21.99
1491. eFootball PES 2021 | 15 Sep, 2020 |          CDN$ 42.99
1492. Vampire: The Masquerade - Coteries of New York Deluxe Edition |  |          CDN$ 34.13CDN$ 18.40
1493. Saints Row IV: Game of the Century Edition | 15 Jul, 2014 |    

2980. Ball 3D: Soccer Online | 31 Mar, 2017 |          CDN$ 1.09CDN$ 0.59
2981. Ys VIII: Lacrimosa of DANA - Elixir Set 2 / 霊薬セット（２） | 16 Apr, 2018 |          CDN$ 1.49
2982. Ys VIII: Lacrimosa of DANA - Fish Bait Set 1 / 釣り餌セット（１） | 16 Apr, 2018 |          CDN$ 1.49
2983. Ys VIII: Lacrimosa of DANA - Premium Material Set / プレミアム素材セット | 16 Apr, 2018 |          CDN$ 1.49
2984. Monster Hunter: World - Sticker Set: Mega Man Set | 18 Oct, 2018 |          CDN$ 2.49
2985. Ys VIII: Lacrimosa of DANA - Economy Ingredient Set / 徳用食材セット | 16 Apr, 2018 |          CDN$ 1.49
2986. Ys VIII: Lacrimosa of DANA - Tempest Set 5 / 秘薬セット（５） | 16 Apr, 2018 |          CDN$ 1.49
2987. Ys VIII: Lacrimosa of DANA - Tempest Set 1 / 秘薬セット（１） | 16 Apr, 2018 |          CDN$ 1.49
2988. Mega Man Legacy Collection | 24 Aug, 2015 |          CDN$ 18.99
2989. Ys VIII: Lacrimosa of DANA - Elixir Set 3 / 霊薬セット（３） | 16 Apr, 2018 |          CDN$ 1.49
2990. Ys VIII: Lacrimosa of DANA - Fish Bait Set 2 / 釣り餌セット（２） | 16 Apr, 2

4357. Sepia Tears - Original Soundtrack | 7 Dec, 2018 |          CDN$ 7.99
4358. The Sorrowvirus: A Faceless Short Story | 30 May, 2020 |          CDN$ 6.69
4359. Hero Siege - Chaos Lancer (Skin) | 24 Jul, 2019 |          CDN$ 5.69
4360. Hero Siege - Demonblade (Skin) | 21 Jul, 2020 |          CDN$ 5.69
4361. Hero Siege - Gladiator Marauder (Skin) | 15 Aug, 2019 |          CDN$ 5.69
4362. Hero Siege - Reanimated Warrior (Skin) | 22 Jul, 2020 |          CDN$ 5.69
4363. House of 1000 Doors: Serpent Flame | 23 May, 2019 |          CDN$ 7.99CDN$ 2.39
4364. Hero Siege - Entombed Demon (Skin) | 21 Jul, 2020 |          CDN$ 5.69
4365. Europa Universalis IV: Rights of Man Collection |  |          CDN$ 27.34
4366. Hero Siege - Tribal Amazon (Skin) | 15 Aug, 2019 |          CDN$ 5.69
4367. WWE 2K19 - Accelerator | 9 Oct, 2018 |          CDN$ 6.99
4368. Hero Siege - Riftmancer (Skin) | 17 Oct, 2019 |          CDN$ 5.69
4369. Beat Saber - Pegboard Nerds - "Emoji VIP" | 14 Mar, 2019 |          CDN$

5460. Yu-Gi-Oh! Classic: Champion vs. Creator Part 2 | 28 Mar, 2017 |          CDN$ 2.19
5461. The Deer | 3 Dec, 2015 |          CDN$ 0.89CDN$ 0.59
5462. Yu-Gi-Oh! Classic: Ties of Friendship | 28 Mar, 2017 |          CDN$ 2.19
5463. Yu-Gi-Oh! Classic: Dungeon Dice Monsters Part 1 | 28 Mar, 2017 |          CDN$ 2.19
5464. Streets of Rage 2 | 26 Jan, 2011 |          CDN$ 1.19
5465. Yu-Gi-Oh! Classic: Everything's Relative | 28 Mar, 2017 |          CDN$ 2.19
5466. Yu-Gi-Oh! Classic: Face Off Part 2 | 28 Mar, 2017 |          CDN$ 2.19
5467. Sea Legends: Phantasmal Light Collector's Edition | 17 Sep, 2014 |          CDN$ 5.49CDN$ 1.09
5468. Yu-Gi-Oh! Classic: Yugi vs. Pegasus Match of the Millennium Part 4 | 28 Mar, 2017 |          CDN$ 2.19
5469. Yu-Gi-Oh! Classic: Into the Hornet's Nest | 28 Mar, 2017 |          CDN$ 2.19
5470. Yu-Gi-Oh! Classic: Yugi vs. Pegasus Match of the Millennium Part 1 | 28 Mar, 2017 |          CDN$ 2.19
5471. Yu-Gi-Oh! Classic: The Scars of Defeat | 28 Mar, 2017

In [11]:
# Create a dataframe with all the information
top_sellers_df = pd.DataFrame(top_sellers)
top_sellers_df

Unnamed: 0,game_name,release_date,price
0,Fall Guys: Ultimate Knockout,"3 Aug, 2020",CDN$ 22.79
1,Fall Guys Collector's Edition,,CDN$ 33.99
2,Marvel's Avengers,"4 Sep, 2020",CDN$ 79.99
3,Factorio,"14 Aug, 2020",CDN$ 34.00
4,Microsoft Flight Simulator,8/18/2020,CDN$ 79.99
...,...,...,...
5670,There's Poop In My Soup,"14 Mar, 2016",CDN$ 1.09
5671,Red Orchestra: Ostfront 41-45,"14 Mar, 2006",CDN$ 5.69
5672,Glass Masquerade 2: Illusions,"27 Feb, 2019",CDN$ 5.69
5673,Tom Clancy's Rainbow Six® Siege - Montagne Bus...,"13 Dec, 2016",CDN$ 10.99


In [12]:
# Save dataframe into csv file for reading in transform stage
top_sellers_df.to_csv("data/Steam_top_sellers.csv")