In [167]:
import pandas as pd

# Load the raw games table and normalise the columns used further down.
all_games_dataset = pd.read_csv('all_games.csv')

all_games_dataset['release_date'] = pd.to_datetime(all_games_dataset['release_date'])
# Reassign instead of renaming in place so re-running the cell stays predictable.
all_games_dataset = all_games_dataset.rename(columns={'name': 'Title'})
all_games_dataset['platform'] = all_games_dataset['platform'].str.strip()

# Fixed seed: without it every "Restart & Run All" draws a different set of
# titles, so the downstream CSVs and results cannot be reproduced.
SAMPLE_SEED = 42

# Sample distinct titles, then merge back so that every row of a sampled title
# is kept (which is why the result can have more rows than titles sampled).
unique_titles_sample = (
    all_games_dataset['Title']
    .drop_duplicates()
    .sample(n=1400, random_state=SAMPLE_SEED)
)
games_sample = unique_titles_sample.to_frame().merge(all_games_dataset, on='Title')
games_sample.to_csv('games_sample.csv')
print(len(games_sample))

2141


In [168]:
import requests

def get_wikipedia_extract(game, timeout=30):
    """Return the plain-text intro extract of the English Wikipedia page `game`.

    Sends an ``action=query`` / ``prop=extracts`` request to the MediaWiki API.
    Redirects are followed so that a title which is only a redirect resolves
    to its target page, the same way ``get_from_infobox`` resolves titles.

    Args:
        game: Page title to look up.
        timeout: Seconds to wait for the HTTP response; without a timeout a
            stalled connection would block the notebook indefinitely.

    Returns:
        The intro text, or ``''`` when the response has no page list, the page
        id is negative (the API's marker for a page it could not find), or the
        page entry carries no extract.
    """
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'format': 'json',
        'titles': game,
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
        'redirects': '',
    }

    response = requests.get(url, params=params, timeout=timeout)
    data = response.json()

    # Error responses have no 'query' section; treat them as "nothing found"
    # instead of failing with a KeyError.
    pages = data.get('query', {}).get('pages', {})
    if not pages:
        return ''

    # Only one title is requested, so only the first page entry matters.
    page_id, page = next(iter(pages.items()))
    if int(page_id) < 0:
        return ''
    return page.get('extract', '')

In [169]:
from bs4 import BeautifulSoup

def get_from_infobox(game, timeout=30):
    """Fetch the rendered HTML of the English Wikipedia page titled `game`.

    Sends an ``action=parse`` request (with redirects enabled) to the
    MediaWiki API. Despite the function's name, the full parsed page HTML is
    returned; callers pick the infobox out of it themselves.

    Args:
        game: Page title to look up.
        timeout: Seconds to wait for the HTTP response; without a timeout a
            stalled connection would block the notebook indefinitely.

    Returns:
        The page HTML as a string, or ``''`` when the API response contains
        an ``'error'`` entry.
    """
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'parse',
        'page': game,
        'format': 'json',
        'prop': 'text',
        'redirects': ''
    }

    response = requests.get(url, params=params, timeout=timeout)
    data = response.json()
    if 'error' in data:
        return ''
    # With format=json the rendered HTML sits under parse -> text -> '*'.
    return data['parse']['text']['*']

def get_genre(game):
    """Return the first linked genre listed for `game` on its Wikipedia page.

    The page HTML is fetched with ``get_from_infobox``. Inside the first
    ``<tbody>`` of that HTML the function looks for the link titled
    "Video game genre", walks two levels up from it, and reads the text of
    the first link inside that element's first ``<td>``.

    Args:
        game: Page title to look up.

    Returns:
        The genre link text, or ``''`` when the page could not be fetched or
        any of the expected elements is missing.
    """
    raw_html = get_from_infobox(game)
    if raw_html == '':
        return ''

    soup = BeautifulSoup(raw_html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        return ''

    genre_label = tbody.find('a', {'title': 'Video game genre'})
    if genre_label is None:
        return ''

    # Two levels up from the label link; presumably the table row holding
    # both the label and the value cell - verify against real page markup.
    row = genre_label.find_parent().find_parent()
    value_cell = row.find('td') if row is not None else None
    if value_cell is None:
        # No value cell next to the label: report "no genre" rather than
        # failing with AttributeError on None.
        return ''

    genre = value_cell.find('a')
    if genre is None:
        return ''
    return genre.text

In [170]:
wikipedia_extracts = []
genres = []

# The sample was built by merging sampled titles back onto the full table, so
# one title can occupy several rows. Remember what was already fetched so each
# distinct title costs two Wikipedia requests instead of two per row.
extract_by_title = {}
genre_by_title = {}

for game in games_sample['Title']:
    if game not in extract_by_title:
        extract_by_title[game] = get_wikipedia_extract(game)
        genre_by_title[game] = get_genre(game)
    wikipedia_extracts.append(extract_by_title[game])
    genres.append(genre_by_title[game])

games_sample['Wikipedia'] = wikipedia_extracts
games_sample['Genre'] = genres
games_sample.head()

Unnamed: 0,Title,platform,release_date,summary,meta_score,user_review,Wikipedia,Genre
0,Winter Sports 2: The Next Challenge,Xbox 360,2008-11-18,WINTER SPORTS 2 is an arcade-oriented 3D simul...,41,tbd,"Winter Sports 2: The Next Challenge, known in ...",Sports
1,Dark Rose Valkyrie,PlayStation 4,2017-06-06,1929 - It was the year of the outbreak. In mas...,58,6.0,"Dark Rose Valkyrie (クロバラノワルキューレ, Kurobara no W...",Role-playing
2,Jeanne d'Arc,PSP,2007-08-21,"The Jeanne d'Arc story begins far in the past,...",87,8.3,,
3,Blazing Angels: Squadrons of WWII,Xbox,2006-03-23,You can take to the skies and reenact some of ...,69,7.0,Blazing Angels: Squadrons of WWII is a flight ...,Flight combat
4,Blazing Angels: Squadrons of WWII,PlayStation 3,2006-12-12,"In Blazing Angels: Squadrons of WWII, players ...",67,7.0,Blazing Angels: Squadrons of WWII is a flight ...,Flight combat


In [171]:
# Persist the sample, now carrying the 'Wikipedia' and 'Genre' columns, to disk.
clean_output_path = 'games_clean.csv'
games_sample.to_csv(clean_output_path)