In [3]:
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ProcessPoolExecutor
from bs4 import BeautifulSoup
import requests

# Browser-like request headers passed to the requests.get calls in this notebook.
# The User-Agent is a sequence of space-separated product tokens; adjacent
# string literals are joined by the parser, so every piece except the last
# carries its own trailing space.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/69.0.3497.100 '
        'Safari/537.36 '  # this separator was missing, fusing the last two tokens
        'OPR/56.0.3051.99'
    )
}

# Scheme and host that site-relative hrefs are appended to.
url_head = 'https://www.metacritic.com'

Список URL'ов для каждой платформы, доступной на Metacritic

In [4]:
# Platform display name -> URL of the first page of that platform's
# "available games by release date" listing. Every URL differs only in the
# platform slug, so the mapping is built from (name, slug) pairs.
platforms = {
    title: 'https://www.metacritic.com/browse/games/release-date/available/' + slug + '/date'
    for title, slug in (
        ('PlayStation 4', 'ps4'),
        ('Xbox One', 'xboxone'),
        ('Switch', 'switch'),
        ('PC', 'pc'),
        ('Wii U', 'wii-u'),
        ('3DS', '3ds'),
        ('PlayStation Vita', 'vita'),
        ('iOS', 'ios'),
        ('PlayStation 3', 'ps3'),
        ('PlayStation 2', 'ps2'),
        ('PlayStation', 'ps'),
        ('Xbox 360', 'xbox360'),
        ('Xbox', 'xbox'),
        ('Wii', 'wii'),
        ('DS', 'ds'),
        ('GameCube', 'gamecube'),
        ('Nintendo 64', 'n64'),
        ('Game Boy Advance', 'gba'),
        ('PSP', 'psp'),
        ('Dreamcast', 'dreamcast'),
    )
}

Маппер для параллельного скачивания информации

In [5]:
def pool_mapper(urls, func, max_workers=32):
    """Apply ``func`` to every item of ``urls`` on a thread pool.

    Args:
        urls: Iterable of inputs; each item is passed to ``func`` as its
            only argument.
        func: Callable taking one item.
        max_workers: Upper bound on the number of worker threads. Values
            below 1 are raised to 1, because ``ThreadPoolExecutor`` rejects
            them with ``ValueError`` (this matters when a caller sizes the
            pool from the length of a possibly empty list). ``None`` is
            passed through so the executor picks its own default.

    Returns:
        The iterator produced by ``executor.map``: results in input order.
        Leaving the ``with`` block waits for all tasks, so by the time the
        iterator is returned every call has finished; an exception raised
        by ``func`` is re-raised when its result is reached during
        iteration.
    """
    if max_workers is not None:
        max_workers = max(1, max_workers)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return executor.map(func, urls)

Функции для выкачивания информации об играх по их URL

In [15]:
def get_data(url, timeout=30):
    """Fetch one game page and extract its name, platform and release date.

    Args:
        url: URL of the game page to download.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``.
            Without a timeout a stalled connection would block its worker
            thread indefinitely.

    Returns:
        ``[name, platform, date]`` on success. On any failure (network
        error, timeout, expected element missing from the page) the URL is
        printed and an empty list is returned, so callers can filter
        failures out by length.
    """
    try:
        html = requests.get(url, headers=headers, timeout=timeout).text
        soup = BeautifulSoup(html, 'html.parser')

        name = soup.select_one('.hover_none').text.strip()
        platform = soup.find('span', class_='platform').text.strip()
        date = soup.select_one('li.summary_detail.release_data').find(class_="data").text

        return [name, platform, date]

    # Deliberately best-effort: one bad page must not abort the whole crawl.
    # Catch Exception rather than using a bare except so KeyboardInterrupt
    # and SystemExit still propagate.
    except Exception:
        print('Exception in ' + url)
        return []

    
def get_data_by_list(game_url_list, step=100, time_sleep=5):
    """Download game data for many URLs in throttled batches.

    The URLs are processed ``step`` at a time through ``pool_mapper`` with
    ``get_data``; between consecutive batches the function pauses for
    ``time_sleep`` seconds. No pause follows the final batch, since there
    is nothing left to throttle.

    Args:
        game_url_list: Sequence (sliceable, with ``len``) of game page URLs.
        step: Number of URLs per batch.
        time_sleep: Seconds to sleep between batches.

    Returns:
        List of the non-empty results returned by ``get_data``, in input
        order. Empty results (failed pages) are dropped.
    """
    data = []
    total = len(game_url_list)

    for start in range(0, total, step):
        # Slicing already clamps at the end of the sequence.
        batch = game_url_list[start:start + step]
        data.extend(row for row in pool_mapper(batch, get_data) if len(row) > 0)

        # Only pause when another batch is still to come.
        if start + step < total:
            time.sleep(time_sleep)

    return data

Функции для выкачивания списка URL'ов игр

In [None]:
def get_url_from_pages(page_url, timeout=30):
    """Collect the game page URLs linked from one listing page.

    Args:
        page_url: URL of a listing page.
        timeout: Seconds handed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot hang the crawl.

    Returns:
        List of URLs, each formed as ``url_head`` plus the ``href`` of the
        first link inside a ``div`` whose class attribute is
        ``"basic_stat product_title"``. Title blocks without such a link
        are skipped.
    """
    html = requests.get(page_url, headers=headers, timeout=timeout).text
    soup = BeautifulSoup(html, 'html.parser')

    url_list = []
    # find_all is the current spelling; findAll is a legacy alias.
    for title in soup.find_all('div', class_="basic_stat product_title"):
        link = title.find('a', href=True)
        if link is None:
            # No anchor with an href: nothing to record for this entry.
            continue
        url_list.append(url_head + link['href'])

    return url_list

def get_url_lists(start_urt, timeout=30):
    """Follow "next page" links and return the URL of every listing page.

    Starting from ``start_urt``, each page is downloaded and searched for a
    ``span`` with class ``flipper next`` that contains a link; that link's
    ``href`` (appended to ``url_head``) becomes the next page. The walk
    stops when no such link exists or when it points at a page that has
    already been visited, which prevents an endless loop on cyclic
    pagination.

    Args:
        start_urt: URL of the first listing page. (The name is kept as-is,
            misspelling included, so keyword callers keep working.)
        timeout: Seconds handed to ``requests.get`` as its ``timeout``.

    Returns:
        List of listing page URLs in visiting order, always beginning with
        ``start_urt``.
    """
    list_url_lists = []
    seen = set()

    url = start_urt
    while url not in seen:
        seen.add(url)
        list_url_lists.append(url)

        html = requests.get(url, headers=headers, timeout=timeout).text
        soup = BeautifulSoup(html, 'html.parser')

        res_flipper = soup.find('span', class_='flipper next')
        if res_flipper is None:
            break
        # href=True: an anchor without an href is treated as "no next page".
        res_url = res_flipper.find('a', href=True)
        if res_url is None:
            break
        url = url_head + res_url['href']

    return list_url_lists

Получить всю библиотеку платформы

In [16]:
def analyze(xlsx_name, start_url):
    """Scrape a whole platform library and save it as a spreadsheet.

    Steps: enumerate the listing pages reachable from ``start_url``,
    collect game URLs from those pages in parallel, download each game's
    data, de-duplicate the rows and write them to ``<xlsx_name>.xlsx``.

    Args:
        xlsx_name: Output file name without the ``.xlsx`` extension.
        start_url: URL of the platform's first listing page.

    Returns:
        ``pandas.DataFrame`` with columns ``Name``, ``Platform`` and
        ``Date_of_Release`` and a string-labelled index. The columns are
        present even when nothing could be scraped.
    """
    list_url_lists = get_url_lists(start_url)

    # One worker per listing page, so all pages are fetched at once.
    page_results = pool_mapper(list_url_lists, get_url_from_pages,
                               max_workers=len(list_url_lists))
    game_url_list = [item for url_list in page_results for item in url_list]

    data = get_data_by_list(game_url_list)

    # Name the columns at construction time: a positional rename silently
    # does nothing when `data` is empty, leaving a frame without columns.
    df = pd.DataFrame(data, columns=["Name", "Platform", "Date_of_Release"])
    df = df.drop_duplicates()
    df = df.rename(index=str)

    df.to_excel(xlsx_name + ".xlsx")

    return df

In [8]:
%%time

# Scrape the GameCube listing; analyze() writes GameCube.xlsx and returns the
# DataFrame, which is displayed as the cell's last expression.
analyze('GameCube', platforms['GameCube'])

CPU times: user 51.8 s, sys: 2.15 s, total: 53.9 s
Wall time: 1min 56s


Unnamed: 0,Name,Platform,Date_of_Release
0,Madden NFL 08,GameCube,"Aug 14, 2007"
1,Ratatouille,GameCube,"Jun 26, 2007"
2,Surf's Up,GameCube,"Jun 1, 2007"
3,Backyard Sports Baseball 2007,GameCube,"Apr 3, 2007"
4,Disney's Meet the Robinsons,GameCube,"Mar 27, 2007"
5,TMNT,GameCube,"Mar 20, 2007"
6,Zatch Bell! Mamodo Fury,GameCube,"Dec 12, 2006"
7,The Legend of Zelda: Twilight Princess,GameCube,"Dec 11, 2006"
8,Shrek Smash n' Crash Racing,GameCube,"Nov 21, 2006"
9,Tomb Raider: Legend,GameCube,"Nov 14, 2006"
