In [1]:
!pip install --no-cache-dir pandas numpy beautifulsoup4 selenium webdriver-manager




[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import concurrent.futures
import time
from pathlib import Path

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
def scrape(seasons, ranks, page_delay=10):
    """Scrape 1v1 ladder rows from sc2pulse.nephest.com for the given seasons and ranks.

    A single headless Chrome session is created and reused for every request.
    For each (season, rank) pair the ladder is paged through until a page
    yields no table rows; the lowest rating seen on a page is sent as the
    ``ratingAnchor`` of the next request.

    Parameters
    ----------
    seasons : iterable of int
        Season numbers, interpolated into the ``season`` URL parameter.
    ranks : iterable of str
        League keys used as URL flags (``'bro'``, ``'sil'``, ``'gol'``,
        ``'pla'``, ``'dia'``, ``'mas'``, ``'gra'``). Known keys are stored
        under their full league name; unknown keys are stored unchanged.
    page_delay : float, optional
        Seconds to sleep after each page request (default 10).

    Returns
    -------
    list of dict
        One dict per ladder row with keys ``Region``, ``Season``, ``MMR``,
        ``Rank`` and ``Race``.

    Raises
    ------
    ValueError
        If a rating cell contains text that cannot be parsed as an integer.
    """
    rank_names = {
        'bro': 'Bronze',
        'sil': 'Silver',
        'gol': 'Gold',
        'pla': 'Platinum',
        'dia': 'Diamond',
        'mas': 'Masters',
        'gra': 'Grandmaster'
    }

    id_anchor = 1 # remains static
    data_list = []

    # setup selenium web driver
    service = Service(ChromeDriverManager().install())
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # try/finally guarantees the browser process is shut down even if a
    # request or a parse step raises part-way through the scrape.
    try:
        for season in seasons:
            for rank in ranks:
                max_rating = 99999 # arbitrarily high max rating for new season
                page_idx = 0 # start from first page for new season

                while True:
                    print(f"Season: {season}, Fetching page: {page_idx + 1}")
                    url = f"https://sc2pulse.nephest.com/sc2/?season={season}&queue=LOTV_1V1&team-type=ARRANGED&us=true&eu=true&kr=true&cn=true&{rank}=true&page={page_idx}&type=ladder&ratingAnchor={max_rating}&idAnchor={id_anchor}&count=1#ladder-top"

                    driver.get(url)
                    time.sleep(page_delay) # change based on tolerance of website

                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    ladder_table_container = soup.find('div', id='ladder-table-container')

                    # A missing container or a missing <tbody> is treated like
                    # an empty page; otherwise the loop could never terminate.
                    tbody = None
                    if ladder_table_container is not None:
                        tbody = ladder_table_container.find('tbody')
                    rows = tbody.find_all('tr') if tbody is not None else []
                    if not rows:
                        print(f"No more data found on this page for season {season}.")
                        break

                    ratings_on_page = []
                    for row in rows:
                        rating_cell = row.find('td', class_='rating')
                        if rating_cell is None:
                            # Not a ladder entry (no rating column): skip it.
                            continue
                        rating = int(rating_cell.text.strip())
                        ratings_on_page.append(rating)

                        # Look up the span first so that a row without it
                        # falls back to 'Random' instead of raising.
                        race_entry = row.find('span', class_='race-percentage-entry')
                        race_img = race_entry.find('img', alt=True) if race_entry is not None else None
                        region_img = row.find('img', class_='table-image-long')
                        race = race_img['alt'] if race_img else 'Random'
                        region = region_img['alt'] if region_img else 'Unknown'

                        data_list.append({'Region': region.upper(), 'Season': int(season), 'MMR': rating, 'Rank': rank_names.get(rank, rank), 'Race': race.title()})

                    if not ratings_on_page:
                        # Nothing parsable means the anchor cannot advance;
                        # stop rather than request the same page again.
                        break

                    # Continue the next request from the lowest rating seen.
                    max_rating = min(ratings_on_page)
                    page_idx += 1
    finally:
        driver.quit()

    return data_list

In [4]:
total_seasons = np.arange(28, 59) # seasons 28 through 58
season_splits = np.array_split(total_seasons, 2) # split seasons into 2 parts
ranks = ['bro', 'sil', 'gol', 'pla', 'dia', 'mas', 'gra']

# parallelize web scraping: each submitted call opens its own browser session
with concurrent.futures.ThreadPoolExecutor() as executor:
    # one `scrape` call per chunk of seasons
    futures = [executor.submit(scrape, seasons, ranks) for seasons in season_splits]

    # Collect in submission order (not completion order) so the rows, and
    # therefore the CSV, come out in the same order on every run.
    all_data = []
    for future in futures:
        all_data.extend(future.result())

# convert to df + save; create the output directory first so a long scrape
# is not lost to a missing `data/` folder
output_path = Path('data/sc2_ladder.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
data = pd.DataFrame(all_data)
data.to_csv(output_path, index=False)

Season: 28, Fetching page: 1
Season: 44, Fetching page: 1
