<a href="https://colab.research.google.com/github/DamodaraBarbosa/infos_xbox_game_pass/blob/main/Web_Scraping_Xbox_Series_Wikipedia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web Scraping de informações de jogos publicados para Xbox Series

## Importando as bibliotecas

In [None]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup

## Criando a rotina de scraping

In [None]:
# Wikipedia page that lists the Xbox Series X/S games.
url = 'https://en.wikipedia.org/wiki/List_of_Xbox_Series_X_and_Series_S_games'

# Open the response as a context manager so the HTTP connection is closed
# as soon as the body has been read, even if reading or decoding fails.
with urlopen(url) as response:
  html = response.read().decode('utf-8')

# Parse the downloaded markup with the parser bundled with the standard library.
soup = BeautifulSoup(html, 'html.parser')

In [None]:
def game_genres(genres):
  """Extract the genre names from a genre table cell.

  Args:
    genres: Parsed HTML element for the cell. It must provide ``find``,
      ``find_all`` and ``get_text`` (as a BeautifulSoup tag does).

  Returns:
    A list of strings: the text of every ``<li>`` when the cell contains a
    list; otherwise the text of the first ``<a>``; otherwise the cell's own
    stripped text. The list is empty when the cell holds no text at all.
  """
  list_items = genres.find_all('li')
  if list_items:
    return [item.get_text() for item in list_items]

  link = genres.find('a')
  if link is not None:
    return [link.get_text()]

  # Neither a list nor a link: use the plain text of the cell. Calling
  # get_text() on the missing link used to raise AttributeError here.
  plain_text = genres.get_text().strip()
  return [plain_text] if plain_text else []

In [None]:
# Find the games in the HTML: every table row (<tr>) is a candidate entry.

games = soup.find_all('tr')

jogos = list()
skipped_rows = 0  # rows that did not match the expected game-row layout

for game in games:
  try:
    game_infos = game.find_all('td')
    # Build a fresh dict per row so no value can leak from a previous row.
    # Cell indices 3 and 5 are not read.
    jogo = {
        'name': game.find('i').get_text(),
        'genre(s)': game_genres(game_infos[0]),
        'developer(s)': game_infos[1].get_text(),
        'publisher(s)': game_infos[2].get_text(),
        'releasedate': game_infos[4].get_text(),
        'addons': game_infos[6].get_text(),
    }
  except (AttributeError, IndexError):
    # The row has no <i> element or too few <td> cells, so it is not a game
    # entry (presumably a header or a row from another table). Skip it, but
    # keep a count so the number of dropped rows can be inspected.
    skipped_rows += 1
    continue
  jogos.append(jogo)


## Gerando dataframe com todos os jogos Xbox Series

In [None]:
# One dataframe row per scraped game; the dict keys become the columns.
xbox_series_games = pd.DataFrame(data=jogos)

In [None]:
# Select the games whose add-ons text contains the '˄' marker wrapped in
# non-breaking spaces.
has_caret_marker = xbox_series_games['addons'].str.contains('\xa0˄\xa0')
xbox_series_games[has_caret_marker]

Unnamed: 0,name,genre(s),developer(s),publisher(s),releasedate,addons
185,Just Dance 2021,[Music],Ubisoft Paris\n,Ubisoft\n,"Nov 24, 2020",˄ \n
186,Just Dance 2022,[Music],Ubisoft Paris\n,Ubisoft\n,"Nov 4, 2021",˄ \n
209,Little Nightmares II,"[Puzzle-platformer, Survival horror]",Tarsier Studios\n,Bandai Namco Entertainment\n,"Feb 11, 2021",˄ \n


## Primeiros tratamentos

In [None]:
# Remove the line breaks ('\n') that appear in the dataframe's cells.

xbox_series_games = xbox_series_games.replace(
    to_replace=r'\n',
    value='',
    regex=True,
)

In [None]:
# Preview the first five rows after the line-break cleanup.
xbox_series_games.head(n=5)

Unnamed: 0,name,genre(s),developer(s),publisher(s),releasedate,addons
0,3 out of 10: Season One,"[Action-adventure, Puzzle]","Terrible Posture Games, Inc.","Terrible Posture Games, Inc.","Mar 3, 2021",SD OP
1,Aeterna Noctis,[Metroidvania],Aeternum Game Studios,Aeternum Game Studios,"Dec 15, 2021",
2,AEW Fight Forever,[Sports],Yuke's,AEW Games,TBA,
3,Ailment & Endurance Bundle,[Action-adventure],Ivan Panasenko,EpiXR Games,"Feb 17, 2021",SD OP
4,Alan Wake Remastered,[Action-adventure],Remedy Entertainment,Epic Games Publishing,"Oct 5, 2021",


In [None]:
# Translate the abbreviations used on the site into a list of descriptive labels.

def format_addons(series_addons):
  """Convert add-on abbreviations into descriptive labels.

  Args:
    series_addons: Iterable of string tokens, e.g. the result of splitting
      an add-ons cell on regular spaces. Tokens may be padded with
      non-breaking spaces ('\\xa0SD\\xa0') or bare ('SD').

  Returns:
    A list with one label per recognised code, in input order. Codes that
    are not in the mapping are kept unchanged. Empty tokens and the '˄'
    marker contribute nothing, so a game without add-ons yields [].
  """
  labels = {
      'SD': 'Smart Delivery',
      'OP': 'Optimized for Xbox Series X/S',
      'XP': 'Cross-play',
      'PA': 'Play Anywhere',
      'EN': 'Enhanced for Xbox Series X/S',
  }
  # Marker that carries no add-on information and is therefore dropped.
  ignored = {'˄'}

  addons_formated = []
  for token in series_addons:
    # split() without a separator splits on any Unicode whitespace,
    # including the non-breaking spaces around each code. It also yields
    # nothing for empty or whitespace-only tokens.
    for code in token.split():
      if code in ignored:
        continue
      addons_formated.append(labels.get(code, code))
  return addons_formated



In [None]:
def _addons_cell_to_labels(cell):
  """Return the list of add-on labels for one raw 'addons' cell."""
  if isinstance(cell, str):
    return format_addons(cell.split(' '))
  # Not a string: the cell was already converted (for example when this
  # notebook cell is run a second time), so keep the value untouched.
  return cell

# Convert the whole column in one index-aligned assignment instead of writing
# cell by cell with positions from enumerate(), which only matched the labels
# while the dataframe kept its default 0..n-1 index.
xbox_series_games['addons'] = xbox_series_games['addons'].map(_addons_cell_to_labels)

In [None]:
# Display the dataframe after the 'addons' column has been converted to lists.
xbox_series_games

Unnamed: 0,name,genre(s),developer(s),publisher(s),releasedate,addons
0,3 out of 10: Season One,"[Action-adventure, Puzzle]","Terrible Posture Games, Inc.","Terrible Posture Games, Inc.","Mar 3, 2021","[Smart Delivery, Optimized for Xbox Series X/S]"
1,Aeterna Noctis,[Metroidvania],Aeternum Game Studios,Aeternum Game Studios,"Dec 15, 2021",[]
2,AEW Fight Forever,[Sports],Yuke's,AEW Games,TBA,[]
3,Ailment & Endurance Bundle,[Action-adventure],Ivan Panasenko,EpiXR Games,"Feb 17, 2021","[Smart Delivery, Optimized for Xbox Series X/S]"
4,Alan Wake Remastered,[Action-adventure],Remedy Entertainment,Epic Games Publishing,"Oct 5, 2021",[]
...,...,...,...,...,...,...
368,WWE 2K22,[Sports],Visual Concepts,2K,Mar 2022,[]
369,Yakuza: Like a Dragon,[Role-playing],Ryu Ga Gotoku Studio,Sega,"Nov 10, 2020","[Smart Delivery, Optimized for Xbox Series X/S]"
370,"Yes, Your Grace","[Role-playing, strategy]",Brave at Night,No More Robots,"Nov 10, 2020",[Smart Delivery]
371,Yu-Gi-Oh! Master Duel,[Card battle],Konami,Konami,"Jan 18, 2022",[]


In [None]:
# NOTE(review): this repeats the display from the previous cell with no
# change to the dataframe in between; it looks redundant and could be removed.
xbox_series_games

Unnamed: 0,name,genre(s),developer(s),publisher(s),releasedate,addons
0,3 out of 10: Season One,"[Action-adventure, Puzzle]","Terrible Posture Games, Inc.","Terrible Posture Games, Inc.","Mar 3, 2021","[Smart Delivery, Optimized for Xbox Series X/S]"
1,Aeterna Noctis,[Metroidvania],Aeternum Game Studios,Aeternum Game Studios,"Dec 15, 2021",[]
2,AEW Fight Forever,[Sports],Yuke's,AEW Games,TBA,[]
3,Ailment & Endurance Bundle,[Action-adventure],Ivan Panasenko,EpiXR Games,"Feb 17, 2021","[Smart Delivery, Optimized for Xbox Series X/S]"
4,Alan Wake Remastered,[Action-adventure],Remedy Entertainment,Epic Games Publishing,"Oct 5, 2021",[]
...,...,...,...,...,...,...
368,WWE 2K22,[Sports],Visual Concepts,2K,Mar 2022,[]
369,Yakuza: Like a Dragon,[Role-playing],Ryu Ga Gotoku Studio,Sega,"Nov 10, 2020","[Smart Delivery, Optimized for Xbox Series X/S]"
370,"Yes, Your Grace","[Role-playing, strategy]",Brave at Night,No More Robots,"Nov 10, 2020",[Smart Delivery]
371,Yu-Gi-Oh! Master Duel,[Card battle],Konami,Konami,"Jan 18, 2022",[]


## Exportando o dataframe

In [None]:
# Write the final dataframe to a CSV file, without the index column.
output_csv = 'df_xbox_series_games.csv'
xbox_series_games.to_csv(path_or_buf=output_csv, index=False)