In [1]:
import pandas as pd
import os
import subprocess

# Load the batch of games that the cells below enrich with Wikipedia data.
games_sample = pd.read_csv(filepath_or_buffer='../datasets/processed_batch.csv')

# The 'Unnamed: 0' column is left in place; the drop below stays disabled.
#games_sample.drop(['Unnamed: 0'], axis=1, inplace=True)

# Preview the first rows as the cell's output.
games_sample.head()

Unnamed: 0,Unnamed: 0.1,name,platform,release_date,summary,meta_score,user_review,MetacriticReviews,Wikipedia,Genre,Modes
0,1941,Assassin's Creed Valhalla,PlayStation 5,"November 12, 2020","Build your own Viking Legend. Become Eivor, a ...",84,7.8,"[('DarkStation', 'Valhalla is equipped with a ...",,,
1,1942,Peggle: Dual Shot,DS,"February 27, 2009",Shoot and clear the orange pegs from 120 level...,84,7.8,"[('Modojo', ""Peggle: Dual Shot is the ideal ga...",,,
2,1943,Hitman 3,PlayStation 4,"January 20, 2021",Death Awaits HITMAN 3 is the dramatic conclusi...,84,7.9,"[('GameSpace', ""Hitman 3 is an incredible conc...",,,
3,1944,Disgaea: Hour of Darkness,PlayStation 2,"August 27, 2003",The Netherworld - a realm darker than the deep...,84,8.7,"[('Into Liquid Sky', 'Pure immature fun with a...",,,
4,1945,Kingdom Hearts HD I.5 + II.5 Remix,PlayStation 4,"March 28, 2017",Kingdom Hearts HD 1.5 + 2.5 Remix is an HD rem...,84,8.9,"[('Gaming Age', 'For fans looking to revisit t...",,,


In [2]:
import requests

def search_wikipedia(game, timeout=10):
    """Search English Wikipedia and return the title of the top hit.

    Sends a ``list=search`` query to the MediaWiki API at
    ``https://en.wikipedia.org/w/api.php``.

    Args:
        game: Search text, sent as the ``srsearch`` parameter.
        timeout: Seconds ``requests`` may wait on the server before raising.
            Without a timeout a stalled connection would block the whole
            scraping loop indefinitely.

    Returns:
        The ``title`` of the first search result, or ``''`` when the result
        list is empty.

    Raises:
        requests.exceptions.RequestException: On a network failure, a timeout
            or an HTTP error status.
    """
    search_url = 'https://en.wikipedia.org/w/api.php'
    search_params = {
        'action': 'query',
        'format': 'json',
        'list': 'search',
        'srsearch': game,
    }

    search_response = requests.get(search_url, params=search_params, timeout=timeout)
    # Fail on an HTTP error status here rather than on the JSON decoding or
    # key lookups below.
    search_response.raise_for_status()
    search_data = search_response.json()

    # Inspect the hit list itself instead of the reported "totalhits": a
    # non-zero count paired with an empty list can then no longer raise
    # IndexError.
    hits = search_data["query"]["search"]
    if hits:
        return hits[0]['title']
    return ''

In [3]:
def get_wikipedia_extract(game, timeout=10):
    """Return the plain-text intro section of an English Wikipedia page.

    Sends a ``prop=extracts`` query (``exintro`` and ``explaintext`` set) to
    the MediaWiki API and reads the first entry of ``query.pages``.

    Args:
        game: Page title, sent as the ``titles`` parameter.
        timeout: Seconds ``requests`` may wait on the server before raising,
            so a stalled connection cannot hang the caller.

    Returns:
        The page's ``extract`` text. ``''`` is returned when the first page
        key is negative (how the original code detects a missing page) or
        when the page entry carries no ``extract`` field.

    Raises:
        requests.exceptions.RequestException: On a network failure, a timeout
            or an HTTP error status.
    """
    summary_url = 'https://en.wikipedia.org/w/api.php'
    summary_params = {
        'action': 'query',
        'format': 'json',
        'titles': game,
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
    }

    response = requests.get(summary_url, params=summary_params, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    # Only the first page of the response is considered.
    page_id, page = next(iter(data['query']['pages'].items()))
    if int(page_id) < 0:
        return ''
    # A page entry without an "extract" field yields '' instead of a KeyError.
    return page.get('extract', '')


In [4]:
from bs4 import BeautifulSoup

def get_genre(raw_html):
    """Extract the first genre name from a page's rendered HTML.

    Starting from the first ``tbody`` in the document, the lookup finds the
    link titled 'Video game genre', climbs two parents up from it, then takes
    the text of the first ``a`` inside the first ``td`` below that ancestor.

    Args:
        raw_html: HTML string; ``''`` short-circuits without parsing.

    Returns:
        The link text, or ``''`` as soon as any lookup step finds nothing.
    """
    if raw_html == '':
        return ''

    # The lookups are applied in order, each on the result of the previous one.
    lookups = (
        lambda node: node.find('tbody'),
        lambda node: node.find('a', {'title': 'Video game genre'}),
        lambda node: node.find_parent(),  # element wrapping the label link
        lambda node: node.find_parent(),  # its parent (presumably the table row)
        lambda node: node.find('td'),
        lambda node: node.find('a'),
    )

    current = BeautifulSoup(raw_html, 'html.parser')
    for lookup in lookups:
        current = lookup(current)
        if current is None:
            return ''
    return current.text

def get_modes(raw_html):
    """Extract the list of game modes from a page's rendered HTML.

    Starting from the first ``tbody`` in the document, the lookup finds the
    text node 'Mode(s)', climbs two parents up from it, takes the first ``td``
    below that ancestor and collects the text of every ``a`` that is a direct
    child of that cell.

    Args:
        raw_html: HTML string; ``''`` short-circuits without parsing.

    Returns:
        A list of link texts (possibly empty). ``[]`` is returned as soon as
        any lookup step finds nothing.
    """
    if raw_html == '':
        return []
    soup = BeautifulSoup(raw_html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        return []
    # `string=` is the current name of the argument formerly called `text=`.
    label = tbody.find(string='Mode(s)')
    if label is None:
        return []
    label_parent = label.find_parent()
    if label_parent is None:
        return []
    container = label_parent.find_parent()
    if container is None:
        return []
    cell = container.find('td')
    if cell is None:
        return []
    # find_all always returns a (possibly empty) list-like, never None, so no
    # further guard is needed; recursive=False keeps only direct children.
    return [link.text for link in cell.find_all('a', recursive=False)]

def get_from_infobox(game, timeout=10):
    """Fetch a page's rendered HTML and extract its genre and modes.

    Calls the MediaWiki ``action=parse`` API (following redirects) for the
    given page and passes the returned HTML to ``get_genre`` and
    ``get_modes``.

    Args:
        game: Page title, sent as the ``page`` parameter.
        timeout: Seconds ``requests`` may wait on the server before raising,
            so a stalled connection cannot hang the caller.

    Returns:
        A dict ``{'genre': <str>, 'modes': <list>}``. When the API response
        contains an ``error`` entry the result is
        ``{'genre': '', 'modes': []}``.

    Raises:
        requests.exceptions.RequestException: On a network failure, a timeout
            or an HTTP error status.
    """
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'parse',
        'page': game,
        'format': 'json',
        'prop': 'text',
        'redirects': ''
    }

    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()
    if 'error' in data:
        return {'genre': '', 'modes': []}
    raw_html = data['parse']['text']['*']
    return {'genre': get_genre(raw_html), 'modes': get_modes(raw_html)}

In [5]:
# Per-game results; position i in each list belongs to the i-th row of
# games_sample.
wikipedia_extracts = []
genres = []
modes = []

CHECKPOINT_EVERY = 20  # write an intermediate CSV every this many games
OUTPUT_CSV = '../datasets/first_batch_2_try.csv'
ERROR_LOG = 'errors.txt'
ENRICHED_COLUMNS = ('Wikipedia', 'Genre', 'Modes')


def _log_error(message):
    """Append one line to ERROR_LOG."""
    with open(ERROR_LOG, 'a') as file:
        # The trailing newline keeps successive entries on separate lines.
        file.write(message + "\n")


def _write_progress():
    """Copy the results gathered so far into games_sample and save it to OUTPUT_CSV."""
    done_rows = games_sample.index[:len(wikipedia_extracts)]
    for column, values in zip(ENRICHED_COLUMNS, (wikipedia_extracts, genres, modes)):
        # An object Series stores each game's list of modes as a single cell.
        # Passing the raw list of unequal-length lists to .loc makes NumPy
        # build a ragged array, which is a warning on older NumPy releases
        # and a ValueError from NumPy 1.24 on.
        games_sample.loc[done_rows, column] = pd.Series(values, index=done_rows, dtype=object)
    games_sample.to_csv(OUTPUT_CSV, index=False)


try:
    # Make the target columns object-typed up front so strings and lists can
    # be stored regardless of the dtype read_csv inferred for them.
    for column in ENRICHED_COLUMNS:
        games_sample[column] = games_sample[column].astype(object)

    for index, game in enumerate(games_sample['name'], start=0):
        print("Game no. ", index)

        # These defaults are what gets stored when the search finds no page.
        # They also stand in for a game whose lookup fails, so exactly one
        # entry is appended per game and the three lists stay aligned with
        # the rows of games_sample.
        extract, genre, game_modes = '', '', ''
        try:
            page_title = search_wikipedia(game + "(video game)")
            if page_title != '':
                extract = get_wikipedia_extract(page_title)
                info = get_from_infobox(page_title)
                genre, game_modes = info['genre'], info['modes']
        except Exception as e:
            # Best effort: record the failure and carry on with the next game.
            _log_error(f"Error processing game {game}: {e}")

        wikipedia_extracts.append(extract)
        genres.append(genre)
        modes.append(game_modes)

        if index % CHECKPOINT_EVERY == 0:
            # Periodic checkpoint so a crash does not lose everything fetched
            # so far; a failed checkpoint is logged and the loop continues.
            try:
                _write_progress()
                print("Wrote to file")
            except Exception as e:
                _log_error(f"Error writing checkpoint after game {game}: {e}")

    # After the loop, save the complete result set.
    _write_progress()

except Exception as e:
    print(f"Unexpected error: {e}")




Game no.  0
Wrote to file
Game no.  1
Game no.  2
Game no.  3
Game no.  4
Game no.  5
Game no.  6
Game no.  7
Game no.  8
Game no.  9
Game no.  10
Game no.  11
Game no.  12
Game no.  13
Game no.  14
Game no.  15
Game no.  16
Game no.  17
Game no.  18
Game no.  19
Game no.  20


  return asarray(a).ndim


Wrote to file
Game no.  21
Game no.  22
Game no.  23
Game no.  24
Game no.  25
Game no.  26
Game no.  27
Game no.  28
Game no.  29
Game no.  30
Game no.  31
Game no.  32
Game no.  33
Game no.  34
Game no.  35
Game no.  36
Game no.  37
Game no.  38
Game no.  39
Game no.  40
Wrote to file
Game no.  41
Game no.  42
Game no.  43
Game no.  44
Game no.  45
Game no.  46
Game no.  47
Game no.  48
Game no.  49
Game no.  50
Game no.  51
Game no.  52
Game no.  53
Game no.  54
Game no.  55
Game no.  56
Game no.  57
Game no.  58
Game no.  59
Game no.  60
Wrote to file
Game no.  61
Game no.  62
Game no.  63
Game no.  64
Game no.  65
Game no.  66
Game no.  67
Game no.  68
Game no.  69
Game no.  70
Game no.  71
Game no.  72
Game no.  73
Game no.  74
Game no.  75
Game no.  76
Game no.  77
Game no.  78
Game no.  79
Game no.  80
Wrote to file
Game no.  81
Game no.  82
Game no.  83
Game no.  84
Game no.  85
Game no.  86
Game no.  87
Game no.  88
Game no.  89
Game no.  90
Game no.  91
Game no.  92
Game no.