Importação das bibliotecas

In [None]:
import dataprocessing.datacolect as dc
import re
import pandas as pd
import numpy as np

# 1. Sobre a coleta no ***Metacritic***

# 2. Rotina de ***scraping***

## 2.1. Coleta dos ***hrefs*** para acessar as ***urls*** dos jogos

É necessário coletar o *link* para cada jogo em cada uma das páginas do ***Metacritic*** referentes ao ***Xbox One***. Há, para essa plataforma, 319 páginas.

A ***url*** abaixo será usada em cada iteração, adicionando-se apenas o número correto da página.

In [None]:
# Listing address for Xbox One games on Metacritic. It ends with an open
# "page=" parameter: the page number is appended for each request.
url = "".join([
    "https://www.metacritic.com/browse/game/xbox-one/all/all-time/new/",
    "?platform=xbox-one",
    "&releaseYearMin=1910",
    "&releaseYearMax=2023",
    "&page=",
])

In [3]:
# Number of Xbox One listing pages on Metacritic when the data was collected.
N_PAGES = 319

games_hrefs = []  # the href of every game found is appended here

# Scraping helper. It takes no page-specific argument, so a single instance
# is created once instead of once per page.
scrape = dc.Scrape()

for page in range(1, N_PAGES + 1):
    # Build the BeautifulSoup object for the current listing page.
    soup = dc.Soup(url + str(page)).get_soup()

    # Game cards present on the page.
    game_cards = scrape.find_elements(
        tag='div', tag_class='c-finderProductCard c-finderProductCard-game',
        timeout=0.05, soup=soup
    )
    # Visit each card to extract the link to the game's own page.
    for game in game_cards:
        link = scrape.find_element(
            tag='a', tag_class='c-finderProductCard_container g-color-gray80 u-grid',
            timeout=0, soup=game
        )
        # NOTE(review): presumably find_element returns None when nothing
        # matches; a card without a link is skipped rather than aborting the
        # whole crawl on None['href'] — confirm against dc.Scrape.
        if link is None:
            continue
        games_hrefs.append(link['href'])

## 2.2. Coleta das informações sobre os jogos

In [None]:
# Site root; each collected game href is appended to it to form the full
# address of that game's page.
metacritic_url = "https://www.metacritic.com"

In [None]:
# Accumulator with one dict per scraped game.
data_games = []
# Scratch dict holding the fields of the game currently being processed.
data_game = {}

In [None]:
# Dedicated scraper instance, so this cell does not rely on a variable left
# behind by the body of an earlier loop.
scrape = dc.Scrape()

# Removes leading and trailing whitespace on every line of a text block.
edge_whitespace = re.compile(r'^\s+|\s+$', flags=re.MULTILINE)

for href in games_hrefs:
    # Download and parse the page of the current game.
    soup = dc.Soup(metacritic_url + str(href)).get_soup()

    # Game title. Unlike the optional fields below there is no fallback:
    # a page on which the title cannot be read is left to raise.
    name = scrape.find_element(
        tag='div', timeout=0.1, soup=soup,
        tag_class='c-productHero_title g-inner-spacing-bottom-medium g-outer-spacing-top-medium'
    )
    name = edge_whitespace.sub('', name.get_text())

    # Every optional field falls back to NaN when the lookup ends in an
    # AttributeError. `np.nan` is used because the `np.NaN` alias no longer
    # exists in NumPy 2.0.
    try:
        release_date = scrape.find_element(
            tag='span', timeout=0, soup=soup,
            tag_class='g-outer-spacing-left-medium-fluid g-color-gray70 u-block'
        )
        release_date = release_date.get_text()
    except AttributeError:
        release_date = np.nan

    try:
        # The developer name sits in a list item inside its own container.
        developer_div = scrape.find_element(
            tag='div', timeout=0, soup=soup,
            tag_class='c-gameDetails_Developer u-flexbox u-flexbox-row'
        )
        developer_soup = scrape.find_element(
            tag='li', timeout=0, soup=developer_div,
            tag_class='c-gameDetails_listItem g-color-gray70 u-inline-block'
        )
        developer = edge_whitespace.sub('', developer_soup.get_text())
    except AttributeError:
        developer = np.nan

    try:
        # The publisher name sits in a span inside its own container.
        publisher_div = scrape.find_element(
            tag='div', timeout=0, soup=soup,
            tag_class='c-gameDetails_Distributor u-flexbox u-flexbox-row'
        )
        publisher_soup = scrape.find_element(
            tag='span', timeout=0, soup=publisher_div,
            tag_class='g-outer-spacing-left-medium-fluid g-color-gray70 u-block'
        )
        publisher = edge_whitespace.sub('', publisher_soup.get_text())
    except AttributeError:
        publisher = np.nan

    try:
        genre = scrape.find_element(
            tag='div', timeout=0, soup=soup,
            tag_class='c-globalButton c-globalButton-small c-globalButton-primary'
        )
        genre = edge_whitespace.sub('', genre.get_text())
    except AttributeError:
        genre = np.nan

    # The score gets the same NaN fallback as the other optional fields, so
    # one page without it does not abort the whole collection.
    # NOTE(review): the class list contains "_user"/"_tbdUser" markers while
    # the value is stored under 'metascore' — confirm the intended element.
    try:
        score = scrape.find_element(
            tag='div', timeout=0, soup=soup,
            tag_class=('c-siteReviewScore u-flexbox-column u-flexbox-alignCenter u-flexbox-justifyCenter g-text-bold c-siteRevie'
                       'wScore_tbdUser g-bg-white g-border-gray60 c-siteReviewScore_user g-color-gray90 c-siteReviewScore_medium')
        )
        score = score.get_text()
    except AttributeError:
        score = np.nan

    # A fresh dict per game: rows never share state, so no copy is needed.
    data_game = {
        'name': name,
        'release_date': release_date,
        'developer': developer,
        'publisher': publisher,
        'genre': genre,
        'metascore': score,
    }
    data_games.append(data_game)

In [None]:
# Tabulate the collected records: one row per game dict in data_games.
metacritic = pd.DataFrame(data=data_games)

In [None]:
# Save the table as CSV, leaving out the DataFrame index column.
metacritic.to_csv(path_or_buf='xbox_one_metacritic.csv', index=False)