In [22]:
import dataprocessing.datacolect as dc
import pandas as pd
import numpy as np

# 1. Sobre a coleta dos jogos

O objetivo deste *notebook* é obter o nome dos jogos que estão presentes no serviço ***Xbox Game Pass***. Portanto, é necessário extrair essas informações de uma página que as mantenha constantemente atualizadas. Isso se deve à natureza do serviço.

O ***Xbox Game Pass*** recebe novos jogos e tem outros retirados periodicamente. Os dados serão coletados do portal ***Windows Central***.

![Alt text](image.png)

Ademais, é importante obter os jogos para os diferentes segmentos do serviço que hoje se divide em:

> *Game Pass Core.*
>
> *Game Pass Standard.*
>
> *Game Pass Ultimate.*
>
> *PC Game Pass.*

Os três primeiros são destinados aos consoles e o último, apenas aos computadores.

# 2. Rotina de *scraping*

In [2]:
# URL of the Windows Central page with the list of games:

url = 'https://www.windowscentral.com/xbox-game-pass-list'

In [3]:
# Build the page wrapper for `url` and keep only the parsed soup it returns,
# so `soup` is never bound to the intermediate wrapper object.
soup = dc.Soup(url).get_soup()

# Helper object used by the following cells to look up elements.
scrape = dc.Scrape()

In [4]:
# Sanity check: show the type of the object returned by get_soup().
type(soup)

bs4.BeautifulSoup

In [5]:
# Ask the scraper for the 'ul' elements of the page.
# NOTE(review): later cells read positions 2, 3 and 4 of this result, so the
# page layout presumably puts the console, PC and cloud lists there — confirm
# against the live page, since a layout change would silently shift them.
games_soup_list = scrape.find_elements(tag='ul', soup=soup)

In [6]:
def get_games_names(soup, index: int=2):
    """Return the upper-cased game names found in one entry of ``soup``.

    Parameters
    ----------
    soup
        Indexable collection of parsed elements; ``soup[index]`` is handed to
        ``scrape.find_elements`` to look up its ``li`` items.
    index : int, default 2
        Which entry of ``soup`` to read. Only 2, 3 and 4 are accepted.

    Returns
    -------
    list
        One upper-cased string per item, with the ``' (👇🏻)'`` marker removed
        when present.

    Raises
    ------
    ValueError
        If ``index`` is not 2, 3 or 4.
    """
    # Guard clause: reject unsupported positions before touching ``soup``.
    if index not in (2, 3, 4):
        raise ValueError('Parâmetro index fora do intervalo. As opções são 2 (console), 3 (PC) e 4 (Cloud).')

    list_items = scrape.find_elements(tag='li', soup=soup[index])

    # str.replace leaves the text untouched when the marker is absent, so one
    # expression covers both the marked and the unmarked items.
    return [
        item.get_text().replace(' (👇🏻)', '').upper()
        for item in list_items
    ]


def games_dataframe(games: list, column: str='name'):
    """Wrap a list of game names in a single-column DataFrame.

    Parameters
    ----------
    games : list
        Values that become the rows of the frame, in order.
    column : str, default 'name'
        Label of the only column.

    Returns
    -------
    pandas.DataFrame
        Frame with one column named ``column`` holding ``games``.
    """
    return pd.DataFrame({column: games})

In [7]:
# Extract the names for each platform. The positions follow the mapping in
# get_games_names' error message: 2 = console, 3 = PC, 4 = cloud.
console_list, pc_list, cloud_list = (
    get_games_names(games_soup_list, index=position)
    for position in (2, 3, 4)
)

In [8]:
# Single-column ('name') table with the console games; the bare name on the
# last line makes the notebook display it.
console_games = games_dataframe(console_list)
console_games

Unnamed: 0,name
0,7 DAYS TO DIE
1,A PLAGUE TALE: REQUIEM
2,A WAY OUT
3,AGE OF EMPIRES II: DEFINITIVE EDITION
4,AGE OF EMPIRE IV
...,...
460,YOU SUCK AT PARKING
461,ZOMBIE ARMY 4: DEAD WAR
462,ZOO TYCOON: ULTIMATE ANIMAL COLLECTION
463,ZUMA


In [9]:
# Single-column ('name') table with the PC games; the bare name on the last
# line makes the notebook display it.
pc_games = games_dataframe(pc_list)
pc_games

Unnamed: 0,name
0,7 DAYS TO DIE
1,A PLAGUE TALE: REQUIEM
2,A WAY OUT
3,AGE OF EMPIRES: DEFINITIVE EDITION
4,AGE OF EMPIRES II: DEFINITIVE EDITION
...,...
441,YAKUZA KIWAMI 2
442,YAKUZA: LIKE A DRAGON
443,YOU SUCK AT PARKING
444,ZOMBIE ARMY 4: DEAD WAR


In [10]:
# Single-column ('name') table with the cloud games (not displayed here).
cloud_games = games_dataframe(cloud_list)

In [11]:
# Placeholder for the combined table of games; it is assigned its real
# content in a later cell.
df = pd.DataFrame()

In [12]:
# Per-platform frames to be combined, in this order: console, PC, cloud.
list_dataframes = [console_games, pc_games, cloud_games]

In [13]:
# Stack the per-platform frames in a single concat and keep the first
# occurrence of each game name. This yields the same rows, order and row
# labels as concatenating and de-duplicating one frame at a time, without
# re-copying the accumulated frame on every iteration.
# The original row labels are kept (no index reset), so labels coming from
# different platform frames may repeat in the combined table.
df = pd.concat(list_dataframes).drop_duplicates(subset='name')

In [19]:
# Display the combined table of unique game names.
df

Unnamed: 0,name
0,7 DAYS TO DIE
1,A PLAGUE TALE: REQUIEM
2,A WAY OUT
3,AGE OF EMPIRES II: DEFINITIVE EDITION
4,AGE OF EMPIRE IV
...,...
297,THE GOOD SLICE
304,THE SIMS 2
306,THE WALKING DEAD: A NEW FRONTIER — THE COMPLET...
309,THE WALKING DEAD: THE COMPLETE SEASON


In [18]:
# Preview the PC list under a 'pc' column; a frame built the same way is the
# right-hand side of the merge in a later cell.
games_dataframe(pc_list, column='pc')

Unnamed: 0,pc
0,7 DAYS TO DIE
1,A PLAGUE TALE: REQUIEM
2,A WAY OUT
3,AGE OF EMPIRES: DEFINITIVE EDITION
4,AGE OF EMPIRES II: DEFINITIVE EDITION
...,...
441,YAKUZA KIWAMI 2
442,YAKUZA: LIKE A DRAGON
443,YOU SUCK AT PARKING
444,ZOMBIE ARMY 4: DEAD WAR


In [25]:
# Left-join the combined names against the PC list: every row of `df` is
# kept, and names absent from the PC list get NaN in the 'pc' column.
new_df = df.merge(
    games_dataframe(pc_list, column='pc'),
    how='left',
    left_on='name',
    right_on='pc',
)

In [32]:
# Print every row that found no match in the PC list (NaN in 'pc').
# Each item yielded by iterrows() is an (index, row) tuple and is printed as-is.
for row in new_df[new_df['pc'].isna()].iterrows():
    print(row)

(4, name    AGE OF EMPIRE IV
pc                   NaN
Name: 4, dtype: object)
(19, name    ASSASSIN’S CREED ODYSSEY
pc                           NaN
Name: 19, dtype: object)
(23, name    BANJO-KAZOOIE: NUTS & BOLTS
pc                              NaN
Name: 23, dtype: object)
(24, name    BANJO-KAZOOIE
pc                NaN
Name: 24, dtype: object)
(25, name    BANJO-TOOIE
pc              NaN
Name: 25, dtype: object)
(26, name    BATMAN: ARKHAM KNIGHT
pc                        NaN
Name: 26, dtype: object)
(27, name    BATTLEFIELD 1943
pc                   NaN
Name: 27, dtype: object)
(28, name    BATTLEFIELD 1
pc                NaN
Name: 28, dtype: object)
(29, name    BATTLEFIELD 3
pc                NaN
Name: 29, dtype: object)
(30, name    BATTLEFIELD 4
pc                NaN
Name: 30, dtype: object)
(31, name    BATTLEFIELD: BAD COMPANY
pc                           NaN
Name: 31, dtype: object)
(33, name    BATTLEFIELD HARDLINE
pc                       NaN
Name: 33, dtype: object)
(34,