In [70]:
import pandas as pd

all_games_dataset = pd.read_csv('all_games.csv')

all_games_dataset['release_date'] = pd.to_datetime(all_games_dataset['release_date'])
all_games_dataset.rename(columns={'name': 'Title'}, inplace=True)
all_games_dataset['platform'] = all_games_dataset['platform'].str.strip()

unique_titles_sample = all_games_dataset['Title'].drop_duplicates().sample(n=1400)
games_sample = unique_titles_sample.to_frame().merge(all_games_dataset, on= 'Title')
games_sample.to_csv('games_sample.csv')
print(str(len(games_sample)))

6


In [71]:
import requests

def get_wikipedia_extract(game):
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'format': 'json',
        'titles': game,
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
    }
 
    response = requests.get(url, params=params)
    data = response.json()
    if int(next(iter(data['query']['pages'].keys()))) < 0:
        return ''
    page_extract = next(iter(data['query']['pages'].values()))['extract']
    return page_extract

In [72]:
from bs4 import BeautifulSoup

def get_genre(raw_html): 
    if raw_html == '':
        return ''
    soup = BeautifulSoup(raw_html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody == None:
        return ''
    genreElement = tbody.find('a', {'title': 'Video game genre'})
    if genreElement == None:
        return ''
    genre = genreElement.find_parent().find_parent().find('td').find('a')
    if genre == None:
        return ''
    return genre.text

def get_modes(raw_html):
    if raw_html == '':
        return []
    soup = BeautifulSoup(raw_html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody == None:
        return []
    modesElement = tbody.find(text='Mode(s)')
    if modesElement == None:
        return []
    modesElement = modesElement.find_parent()
    if modesElement == None:
        return []
    modes = modesElement.find_parent().find('td').findAll('a', recursive=False)
    if modes == None:
        return []
    return list(map(lambda mode: mode.text, modes))

def get_from_infobox(game): 
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'parse',
        'page': game,
        'format': 'json',
        'prop': 'text',
        'redirects': ''
    }
 
    response = requests.get(url, params=params)
    data = response.json()
    if 'error' in data.keys():
        return {'genre': '', 'modes': []}
    raw_html = data['parse']['text']['*']
    info = {'genre': get_genre(raw_html), 'modes': get_modes(raw_html)}
    return info

In [73]:
wikipedia_extracts = []
genres = []
modes = []

for game in games_sample['Title']:
    wikipedia_extracts.append(get_wikipedia_extract(game))
    info = get_from_infobox(game)
    genres.append(info['genre'])
    modes.append(info['modes'])

games_sample['Wikipedia'] = wikipedia_extracts
games_sample['Genre'] = genres
games_sample['Modes'] = modes
games_sample.head()

Unnamed: 0,Title,platform,release_date,summary,meta_score,user_review,Wikipedia,Genre,Modes
0,The Last Federation,PC,2014-04-18,From the creators of AI War: Fleet Command com...,72,6.9,The Last Federation is an indie strategy video...,,[]
1,Battlefield 3: Armored Kill,PC,2012-09-11,Battlefield 3: Armored Kill ups the ante for v...,87,7.1,,First-person shooter,"[Single-player, multiplayer]"
2,Battlefield 3: Armored Kill,PlayStation 3,2012-09-04,Battlefield 3: Armored Kill ups the ante for v...,79,4.3,,First-person shooter,"[Single-player, multiplayer]"
3,Time Ace,Xbox One,2007-06-12,"In 1914, eccentric scientist Dr. Hugo Clock cr...",52,tbd,Time Ace is an aerial combat game developed fo...,Combat flight simulator,"[Single-player, multiplayer]"
4,Mortal Kombat: Unchained,PSP,2006-11-13,This sixth episode of the ulraviolent and visc...,70,8.4,,Fighting,"[Single-player, multiplayer]"


In [74]:
games_sample.to_csv('games_clean.csv')