In [3]:
import pandas as pd
import numpy as np
import shutil
import lxml
import lxml.etree
import time
import os

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

from webdriver_manager.chrome import ChromeDriverManager

import concurrent.futures

In [4]:
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm
from multiprocessing import Pool
import os
import random

def scrape_rank_season(args):
    season, rank, rank_names, id_anchor = args
    data_list = []
    
    # Since we're in a new process, set up the driver again
    service = Service(ChromeDriverManager().install())
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(service=service, options=chrome_options)
    
    max_rating = 99999 # arbitrarily high max rating for new season
    page_idx = 0 # start from first page for new season
    
    while True:
        print(f"Season: {season}, Rank: {rank}, Fetching page: {page_idx + 1}, Process ID: {os.getpid()}")
        url = f"https://sc2pulse.nephest.com/sc2/?season={season}&queue=LOTV_1V1&team-type=ARRANGED&us=true&eu=true&kr=true&cn=true&{rank}=true&page={page_idx}&type=ladder&ratingAnchor={max_rating}&idAnchor={id_anchor}&count=1#ladder-top"
        
        driver.get(url)
        delay = random.uniform(30, 60)
        time.sleep(delay) # adjust based on the tolerance of the website
        
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        ladder_table_container = soup.find('div', id='ladder-table-container')
        ratings_on_page = []
        
        if ladder_table_container:
            tbody = ladder_table_container.find('tbody')
            rows = tbody.find_all('tr')
            if not rows:
                print(f"No more data found on this page for season {season}, rank {rank}.")
                break
                
            for row in rows:
                rating = row.find('td', class_='rating').text.strip()
                race_img = row.find('span', class_='race-percentage-entry').find('img', alt=True)
                region_img = row.find('img', class_='table-image-long')
                race = race_img['alt'] if race_img else 'Random'
                region = region_img['alt'] if region_img else 'Unknown'
                
                data_list.append({'Region': region.upper(), 'Season': int(season), 'MMR': int(rating), 'Rank': rank_names[rank], 'Race': race.title()})
                
            if ratings_on_page:
                max_rating = min(ratings_on_page)
            page_idx += 1
    
    driver.quit()
    return data_list

def scrape(seasons, ranks):
    rank_names = {
        'bro': 'Bronze',
        'sil': 'Silver',
        'gol': 'Gold',
        'pla': 'Platinum',
        'dia': 'Diamond',
        'mas': 'Masters',
        'gra': 'Grandmaster'
    }
    
    id_anchor = 1 # remains static
    args_list = [(season, rank, rank_names, id_anchor) for season in seasons for rank in ranks]
    
    # Set up multiprocessing pool
    with Pool(processes=os.cpu_count()) as pool:
        results = list(tqdm(pool.imap_unordered(scrape_rank_season, args_list), total=len(args_list)))
    
    # Flatten list of lists
    data_list = [item for sublist in results for item in sublist]
    
    return data_list


In [5]:
import numpy as np
import concurrent.futures
from tqdm import tqdm

total_seasons = np.arange(28, 59)  # seasons 28 through 58
season_splits = np.array_split(total_seasons, 4)  # split seasons into 2 parts
id_anchors = np.arange(len(season_splits))

def scrape_with_progress(seasons, id_anchor):
    # Here, you should insert the modified `scrape` function I provided earlier.
    # Ensure it's adapted to accept and process `seasons` and `id_anchor` correctly.
    # For simplicity, this placeholder assumes it returns a list of data.
    return scrape(seasons, id_anchor)

def scrape_all():
    # Using ProcessPoolExecutor for multiprocessing
    with concurrent.futures.ProcessPoolExecutor() as executor:
        # Prepare for progress bar tracking
        tasks = [executor.submit(scrape_with_progress, seasons, id_anchor) for seasons, id_anchor in zip(season_splits, id_anchors)]
        all_data = []
        
        # tqdm wrapper for as_completed to track progress
        for future in tqdm(concurrent.futures.as_completed(tasks), total=len(tasks)):
            all_data.extend(future.result())
    
    return all_data

# Now calling scrape_all to do the actual scraping and tracking progress
all_data = scrape_all()

  0%|          | 0/4 [00:00<?, ?it/s]


BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.