# Game Recommender — Compile Data

All data is extracted from [Backloggd](https://backloggd.com/games/lib/popular/) and [IGDB](https://www.igdb.com/).

In [1]:
# General
import re
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# Web scraping
import requests
from bs4 import BeautifulSoup

# Utility
import time
import unidecode

import warnings
warnings.simplefilter('ignore')

Extract trending game titles, dev / genre tags and game summaries from Backloggd:

In [2]:
BASE_URL = 'https://backloggd.com/'
SEARCH_URL = "https://backloggd.com/games/lib/popular?page="

In [3]:
game_data = []

for page_number in tqdm(range(1, 41), total=40):
    response = requests.get(f"{SEARCH_URL}{page_number}")
    soup = BeautifulSoup(response.content, 'html.parser')

    try:
        # Fetch game titles along with href link to the game's info page
        for div in soup.find_all('div', {'class': 'col-2 my-2 px-1 px-md-2'}):
            title, game_info_url = div.div.text.strip(), div.a['href']
            game_data.append([title, f"{BASE_URL}{game_info_url}"])
    except Exception as e:
        print(f"Error at page number: {page_number} — {e}")

HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))




In [4]:
print(game_data[:5])

[['Elden Ring', 'https://backloggd.com//games/elden-ring/'], ['The Legend of Zelda: Tears of the Kingdom', 'https://backloggd.com//games/the-legend-of-zelda-tears-of-the-kingdom/'], ["Baldur's Gate 3", 'https://backloggd.com//games/baldurs-gate-3/'], ['The Legend of Zelda: Breath of the Wild', 'https://backloggd.com//games/the-legend-of-zelda-breath-of-the-wild/'], ['Hades', 'https://backloggd.com//games/hades--1/']]


In [5]:
for i in tqdm(range(len(game_data)), total=len(game_data)):
    game_title, game_info_url = game_data[i][0], game_data[i][1]
    
    response = requests.get(game_info_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    try:
        # Split dev tags and remove additional whitespace from each string
        developers = [
            re.sub('\s+', ' ', dev_name).strip() for dev_name in\
            soup.find('div', {'class': 'col-auto pl-lg-1 sub-title'})\
            .text.replace('by\n', '').strip().split(',')
        ]
        
        genre_tags = [tag.text for tag in soup.find_all('p', {'class': 'genre-tag'})]
        game_summary = soup.find('div', {'id': 'collapseSummary'}).text.strip()
    except Exception as e:
        print(f"Error at index pos; {i}, {game_title} – {e}")
        continue

    # Store data in new_games_df
    game_data[i].append(developers)
    game_data[i].append(genre_tags)
    game_data[i].append(game_summary)

HBox(children=(FloatProgress(value=0.0, max=1440.0), HTML(value='')))

Error at index pos; 1017, Silent Hill 2: Enhanced Edition – 'NoneType' object has no attribute 'text'



In [12]:
# Sample row of data
game_data[0]

['Elden Ring',
 'https://backloggd.com//games/elden-ring/',
 ['FromSoftware', 'Bandai Namco Entertainment'],
 ['Adventure', 'RPG'],
 'Elden Ring is an action RPG developed by FromSoftware and published by Bandai Namco Entertainment, released in February 2022. Directed by Hidetaka Miyazaki, with world-building contributions from novelist George R. R. Martin, the game features an expansive open world called the Lands Between. Players assume the role of a customisable character known as the Tarnished, who must explore this world, battle formidable enemies, and seek to restore the Elden Ring to become the Elden Lord.\n\nThe game builds on the challenging gameplay mechanics familiar from the Dark Souls series but introduces a more open-ended structure with vast exploration, dynamic weather, and a day-night cycle. It offers deep lore, complex characters, and an interconnected world filled with secrets, dungeons, and powerful bosses.']

In [11]:
def reset_data(data: list) -> list:
    return [entry[:5] for entry in data]

game_data = reset_data(game_data)

Extract more tags from IGDB (platform, themes, storyline, game modes, keywords):

In [8]:
# API info for IGDB
API_URL = 'https://api.igdb.com/v4/games/'
CLIENT_ID = '7w2gvan50xfhdyjrjnshmx6o82tmtc'
CLIENT_SECRET = '4avly364nhovv2faj4d5l8pypeonmv'

# Get access token
post_url = 'https://id.twitch.tv/oauth2/token?'
params = {'client_id': CLIENT_ID, 'client_secret': CLIENT_SECRET, 'grant_type': 'client_credentials'}
response = requests.post(post_url, params=params)
access_token = response.json()['access_token']

In [13]:
for i in tqdm(range(len(game_data)), total=len(game_data)):
    game_title = unidecode.unidecode(game_data[i][0])
    game_query = f'search "{game_title}"; fields themes.*,game_modes.*,storyline,platforms.*,keywords.*,player_perspectives.*;'
    api_headers = {'headers': {'Client-ID': CLIENT_ID, 'Authorization': f'Bearer {access_token}'}, 'data': game_query}

    # Exception for pasring errors with API
    try:
        response = requests.post(API_URL, **api_headers)
        query_data = response.json()[0]
    except Exception as e:
        print(f"Error at index pos: {i}, {game_title} — {e}")

    for column_key in ['storyline', 'platforms', 'game_modes', 'themes', 'keywords', 'player_perspectives']:
        # Exception for missing data in query data
        try:
            if column_key == 'storyline':
                game_data[i].append(query_data[column_key])
            else:
                game_data[i].append([data['name'] for data in query_data[column_key]])
        except Exception as e:
            game_data[i].append([])

    time.sleep(0.5)

HBox(children=(FloatProgress(value=0.0, max=1440.0), HTML(value='')))

Error at index pos: 705, God of War Ragnarok: Valhalla — list index out of range
Error at index pos: 939, Off — list index out of range
Error at index pos: 1199, Sid Meier's Civilization V — 0
Error at index pos: 1201, Until Then — list index out of range
Error at index pos: 1202, Shin Megami Tensei IV: Apocalypse — 0
Error at index pos: 1205, Wordle — 0
Error at index pos: 1207, Fate/Samurai Remnant — 0
Error at index pos: 1210, Doom 3 — 0
Error at index pos: 1211, Captain Toad: Treasure Tracker — 0
Error at index pos: 1231, Castlevania: Circle of the Moon — 0
Error at index pos: 1239, Danganronpa Another Episode: Ultra Despair Girls — 0
Error at index pos: 1240, Final Fantasy VIII Remastered — 0
Error at index pos: 1267, Pokemon Pearl Version — 0
Error at index pos: 1292, Chained Together — 0
Error at index pos: 1365, Rhythm Heaven Fever — 0
Error at index pos: 1399, Chrono Trigger — 0
Error at index pos: 1406, Batman: Arkham City - Game of the Year Edition — 0
Error at index pos: 14

In [14]:
# Store game data in dataframe
df_columns = [
    'game_title', 'game_info_url', 'dev_team',
    'genre_tags', 'game_summary', 'storyline', 'platforms',
    'game_modes', 'themes', 'keywords', 'perspectives'
]
df = pd.DataFrame(data=game_data, columns=df_columns).drop(columns=['game_info_url'])
df.sample(3)

Unnamed: 0,game_title,dev_team,genre_tags,game_summary,storyline,platforms,game_modes,themes,keywords,perspectives
97,Resident Evil 3,"[K2, Capcom]","[Adventure, Shooter]",Resident Evil 3 is a remake of the original Re...,A series of strange disappearances have been o...,"[PC (Microsoft Windows), PlayStation 4, Xbox O...","[Single player, Multiplayer]","[Action, Horror, Survival]","[zombies, survival horror]",[Third person]
617,Black Mesa,[Crowbar Collective],"[Adventure, Indie, Platform, Shooter]",Black Mesa is a re-envisioning of Valve Softwa...,The plot of Black Mesa is almost identical to ...,"[Linux, PC (Microsoft Windows)]","[Single player, Multiplayer]","[Action, Science fiction, Warfare]","[aliens, assassin, bloody, first person shoote...",[First person]
709,Triangle Strategy,"[Square Enix Creative Business Unit II, Nintendo]","[RPG, Strategy, Tactical, Turn Based Strategy]",Three nations battle for control of the dwindl...,"Command a group of warriors as Serenoa, heir o...","[PC (Microsoft Windows), Nintendo Switch]",[Single player],[Fantasy],"[2.5d, turn-based tactics, game with chapters]",[Bird view / Isometric]


Save dataset:

In [15]:
df.to_csv('datasets/new/new_games_addon.csv', index=False)