In [1]:
import os
import sys

# Folder appended to the import path, presumably so that the `dataprocessing`
# package imported further down resolves (verify against the local layout).
# The DATAPROCESSING_DIR environment variable overrides the author's local
# default, so the notebook can run on another machine without editing this cell.
DATAPROCESSING_DIR = os.environ.get(
    'DATAPROCESSING_DIR',
    r'C:\Users\damod\OneDrive\Área de Trabalho\Projetos Python\dataprocessing',
)

# Guarded so that re-running the cell does not append duplicate entries.
if DATAPROCESSING_DIR not in sys.path:
    sys.path.append(DATAPROCESSING_DIR)

Importação das bibliotecas

In [2]:
from concurrent.futures import ThreadPoolExecutor
import dataprocessing.datacolect as dc
import re
import pandas as pd
import numpy as np

# 1. Sobre a coleta no ***Metacritic***

# 2. Rotina de ***scraping***

## 2.1. Coleta dos ***hrefs*** para acessar as ***urls*** dos jogos

É necessário coletar o *link* para cada jogo em cada uma das páginas do ***Metacritic*** referentes ao ***Xbox Series X***. O número de páginas dessa plataforma não é fixado no código: ele é obtido dinamicamente pela função `get_last_page`.

A ***url*** abaixo será usada em cada iteração, adicionando-se apenas o número correto da página.

In [108]:
# Listing URL for the platform; it ends with an empty `page=` parameter so that
# the page number can be appended directly to the string.
url = (
    'https://www.metacritic.com/browse/game/xbox-series-x/all/all-time/new/'
    '?platform=xbox-series-x'
    '&releaseYearMin=1910'
    '&releaseYearMax=2023'
    '&page='
)

In [109]:
def get_last_page(url: str):
    """Return the page number shown in the second-to-last pagination label.

    The page at ``url`` is loaded through ``dc.Soup`` and its pagination
    labels are collected with ``dc.Scrape.find_elements``. The text of the
    second-to-last label is stripped of surrounding whitespace and converted
    to ``int``.

    Parameters
    ----------
    url : str
        Address of the listing page to inspect.

    Returns
    -------
    int
        Number parsed from the second-to-last pagination label.

    Raises
    ------
    IndexError
        If fewer than two pagination labels are found.
    ValueError
        If the selected label does not contain an integer.
    """
    pagination_class = (
        'c-navigationPagination_itemButtonContent u-flexbox u-flexbox-alignCenter u-flexbox-justifyCenter'
    )

    page_soup = dc.Soup(url).get_soup()
    pagination_labels = dc.Scrape().find_elements(
        tag='span', timeout=0, soup=page_soup, tag_class=pagination_class
    )

    # NOTE(review): index -2 presumably skips a trailing non-numeric button
    # (e.g. "next") - confirm against the page markup.
    label_text = pagination_labels[-2].get_text()

    # Drop the whitespace found at the start and at the end of each line.
    cleaned_text = re.sub(r'^\s+|\s+$', '', label_text, flags=re.MULTILINE)
    return int(cleaned_text)

In [110]:
# Number of the last listing page, as parsed by get_last_page from the listing URL.
last_page = get_last_page(url)

In [112]:
def get_hrefs(url: str, last_page: int, init_page: int=1):
    """Collect the ``href`` of every game card on a range of listing pages.

    Each page from ``init_page`` to ``last_page`` (both inclusive) is loaded
    from ``url + str(page)``. The game cards on that page are located, and the
    ``href`` attribute of each card's link element is collected.

    Parameters
    ----------
    url : str
        Listing URL to which the page number is appended.
    last_page : int
        Last page to visit (inclusive).
    init_page : int, optional
        First page to visit; defaults to 1.

    Returns
    -------
    list
        The collected ``href`` values, in page order and then card order.
        Empty when ``init_page`` is greater than ``last_page``.
    """
    card_class = 'c-finderProductCard c-finderProductCard-game'
    link_class = 'c-finderProductCard_container g-color-gray80 u-grid'
    collected_hrefs = []

    # The upper bound is last_page + 1 because range() excludes its stop value.
    for page_number in range(init_page, last_page + 1):
        # Parsed document for the current listing page.
        page_soup = dc.Soup(url + str(page_number)).get_soup()

        # Helper object used to query the parsed page.
        scraper = dc.Scrape()

        # One element per game card displayed on the page.
        cards = scraper.find_elements(
            tag='div', tag_class=card_class, timeout=0.05, soup=page_soup
        )

        # The link element inside each card carries the game's href.
        collected_hrefs.extend(
            scraper.find_element(
                tag='a', tag_class=link_class, timeout=0, soup=card
            )['href']
            for card in cards
        )
    return collected_hrefs

In [113]:
# NOTE(review): get_hrefs is submitted as a single task, so the pages are still
# fetched one after another on one worker thread; max_workers=3 brings no
# parallelism here.
with ThreadPoolExecutor(max_workers=3) as executor:
    # .result() blocks until the task finishes and re-raises any exception it
    # raised, so `games_hrefs` is only ever bound to the final list of hrefs
    # and never to the intermediate Future object.
    games_hrefs = executor.submit(get_hrefs, url, last_page).result()

## 2.2. Coleta das informações sobre os jogos

In [114]:
# Site root that get_data_games prepends to each collected href to build a game's page URL.
metacritic_url = 'https://www.metacritic.com'

In [115]:
def get_data_games(hrefs: list, metacritic_url: str):
    """Scrape the name and metascore of each game page.

    For every entry of ``hrefs`` the page ``metacritic_url + str(href)`` is
    loaded through ``dc.Soup``. The game name is read from the title element,
    and the metascore from the ``span`` inside the score box.

    Parameters
    ----------
    hrefs : list
        Relative links of the game pages.
    metacritic_url : str
        Prefix concatenated with each href.

    Returns
    -------
    list of dict
        One ``{'name': str, 'metascore': int or NaN}`` record per href, in
        input order. ``metascore`` is ``numpy.nan`` when reading the score
        raises ``AttributeError``.

    Raises
    ------
    AttributeError
        If the title element lookup returns an object without ``get_text``;
        only the score lookup is guarded.
    """
    title_class = 'c-productHero_title g-inner-spacing-bottom-medium g-outer-spacing-top-medium'
    # NOTE(review): this selector includes `c-siteReviewScore_green`. The output
    # recorded in this notebook contains no metascore below 75, which suggests
    # that score boxes carrying another colour class are not matched and end up
    # as NaN - confirm against the page markup.
    score_box_class = ("c-siteReviewScore u-flexbox-column u-flexbox-alignCenter"
                       " u-flexbox-justifyCenter g-text-bold c-siteReviewScore_green"
                       " g-color-gray90 c-siteReviewScore_medium")
    data_games = []

    for href in hrefs:
        soup = dc.Soup(metacritic_url + str(href)).get_soup()
        scrape = dc.Scrape()

        name_soup = scrape.find_element(
            tag='div', timeout=0.1, soup=soup, tag_class=title_class
        )
        # Drop the whitespace found at the start and at the end of each line.
        name = re.sub(r'^\s+|\s+$', '', name_soup.get_text(), flags=re.MULTILINE)

        try:
            score_box_soup = scrape.find_element(
                tag='div', timeout=0, soup=soup, tag_class=score_box_class
            )
            score_soup = scrape.find_element(
                tag='span', timeout=0, soup=score_box_soup
            )
            score = int(score_soup.get_text())
        except AttributeError:
            # `np.nan` is the canonical spelling; the `np.NaN` alias was removed
            # in NumPy 2.0 and would itself raise AttributeError here.
            score = np.nan

        # A fresh dict per game: no shared mutable record to copy.
        data_games.append({'name': name, 'metascore': score})
    return data_games

In [116]:
# Called directly: no task is submitted to an executor in this cell, so the
# scrape runs sequentially in the notebook's main thread.
metacritic = get_data_games(games_hrefs, metacritic_url)

In [117]:
# Turn the list of {'name', 'metascore'} records into a DataFrame (one row per game).
metacritic = pd.DataFrame(metacritic)

In [118]:
# Display the scraped table.
metacritic

Unnamed: 0,name,metascore
0,Forza Motorsport,85.0
1,Lil Gator Game,84.0
2,Wild Card Football,85.0
3,Long Gone Days,85.0
4,Pirates on Target,94.0
...,...,...
1924,WWE 2K Battlegrounds,85.0
1925,Bounty Battle,94.0
1926,WRC 9 FIA World Rally Championship,81.0
1927,EA SPORTS UFC 4,78.0


# 3. ***Export*** do arquivo *csv*

In [119]:
# Distinct metascores collected, as a quick check before exporting.
# NOTE(review): the recorded output shows no value below 75; get_data_games
# selects the score box with the `c-siteReviewScore_green` class, so other
# scores may be ending up as NaN - confirm.
metacritic['metascore'].unique()

array([ 85.,  84.,  94.,  80.,  95.,  76.,  89.,  90.,  91.,  77.,  88.,
        97.,  96.,  75.,  78.,  93.,  79.,  83.,  81.,  87.,  92.,  82.,
        86., 100.,  99.,  98.,  nan])

In [121]:
# Write the table to CSV; index=False leaves the row index out of the file.
metacritic.to_csv('xbox_series_metacritic.csv', index=False)