# Game Recommender — Data Modelling

In [41]:
# General 
import re
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# ML
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Utility
import pickle
import unidecode
from ast import literal_eval
import random

import warnings
warnings.simplefilter('ignore')

Load in data:

In [42]:
# Read the pre-processed games table and preview three random rows
games_csv_path = 'datasets/processed/games_PROCESSED.csv'
df = pd.read_csv(games_csv_path)
df.sample(3)

Unnamed: 0,game_title,dev_team,platforms,primary_tags,keywords,game_summary,rake_summary,lemma_summary
488,Abzu,"['505games', 'giantsquid']","['pc', 'playstation4', 'xboxone', 'nintendoswi...","['educational', 'puzzle', 'singleplayer', 'sim...","['exploration', 'dinosaurs', 'underwater', 'sh...",Abzu is an exploration game where you are a lo...,,abzu exploration game lone diver lush ocean di...
1285,JoJo’s Bizarre Adventure: All-Star Battle R,"['bandainamcoentertainment', 'cyberconnect2']","['pc', 'playstation4', 'xboxone', 'nintendoswi...","['multiplayer', 'singleplayer', 'cooperative',...",[],"A remaster of the fighting game ""JoJo's Bizarr...",,remaster fighting game jojo bizarre adventure ...
198,Silent Hill 3,"['teamsilent', 'konami']","['playstation3', 'xbox360']","['puzzle', 'horror', 'singleplayer', 'survival...",[],Silent Hill 3 is the third installment in the ...,survival horror combine action,silent hill installment silent hill series pre...


In [43]:
# Convert list-like columns (stored in the CSV as text such as "['pc', 'xboxone']") to Python lists
def _as_list(value):
    """Parse a stringified list; return `value` unchanged when it is already a list."""
    # Re-running this cell would hand literal_eval real lists, which it rejects with a
    # ValueError - passing them through keeps the cell safe to execute more than once.
    if isinstance(value, list):
        return value
    return literal_eval(value)


for list_column in ['dev_team', 'platforms', 'primary_tags']:
    df[list_column] = df[list_column].apply(_as_list)

Build corpus:

In [44]:
# Assemble one text document per game by joining its primary tags
corpus_columns = ['game_title', 'doc', 'game_summary', 'tags']
doc_data = []

for _, game in tqdm(df.iterrows(), total=len(df)):
    tag_list = game['primary_tags']
    tag_text = " ".join(tag_list)
    # Games whose tags produce a blank document are left out of the corpus
    if not tag_text.strip():
        continue
    doc_data.append([game['game_title'], tag_text, game['game_summary'], tag_list])

# Collect the documents in a new dataframe
corpus_df = pd.DataFrame(doc_data, columns=corpus_columns)

# Print one randomly chosen document, then preview two random rows
sample_position = random.randint(0, len(corpus_df) - 1)
print('DOC SAMPLE:\n', corpus_df['doc'].iloc[sample_position])
corpus_df.sample(2)

HBox(children=(FloatProgress(value=0.0, max=1351.0), HTML(value='')))


DOC SAMPLE:
 platform singleplayer multiplayer action 2d game character feature adventure include original release sonic nintendo element multiplayer introduce title


Unnamed: 0,game_title,doc,game_summary,tags
1229,Fire Emblem Fates: Birthright,drama turnbasedstrategy singleplayer multiplay...,Fire Emblem Fates is split into three story pa...,"[drama, turnbasedstrategy, singleplayer, multi..."
666,Mario & Luigi: Brothership,comedy platform singleplayer adventure rpg coo...,The brothers return for a brand new adventure ...,"[comedy, platform, singleplayer, adventure, rp..."


In [45]:
# Transliterate titles to ASCII with unidecode, so lookups by title can use plain ASCII text
corpus_df['game_title'] = corpus_df['game_title'].map(unidecode.unidecode)

Save corpus dataset:

In [46]:
# Write the corpus to disk; index=False leaves the row index out of the file
corpus_csv_path = 'datasets/processed/game_corpus.csv'
corpus_df.to_csv(corpus_csv_path, index=False)

TF-IDF Modelling Demo:

In [47]:
# Pull out the corpus columns used by the cells below
game_titles = corpus_df['game_title']
game_summary = corpus_df['game_summary']
tags = corpus_df['tags']
corpus = corpus_df['doc']

# Fit TF-IDF on the documents; row i of the resulting matrix corresponds to the i-th document in `corpus`
vectorizer = TfidfVectorizer(strip_accents='ascii')
tdm = vectorizer.fit_transform(corpus)
tdm

<1351x261 sparse matrix of type '<class 'numpy.float64'>'
	with 33413 stored elements in Compressed Sparse Row format>

In [54]:
# Number of recommendations to print
TOP_N = 10

# Title -> row label lookup. `corpus_df` was built with a default 0..n-1 index, so each label
# is also the position of that game's row in `tdm`.
game_index_map = pd.Series(corpus_df.index, index=corpus_df['game_title'])

query_title = input("Showing recommendations for:").strip()

if query_title not in game_index_map.index:
    # Lookup is an exact match on the stored title; report a miss instead of a bare KeyError
    print(f"{query_title!r} was not found in the corpus (titles must match exactly).")
else:
    # A title that occurs more than once makes the lookup return several rows;
    # keep the first so the query stays a single vector.
    query_index = np.atleast_1d(game_index_map[query_title])[0]
    query_vector = tdm[query_index]

    # Cosine similarity of the query against every game, most similar first
    cos_sim_series = pd.Series(cosine_similarity(query_vector, tdm).flatten(), name='cos_sim')
    rec_df = pd.concat([game_titles, game_summary, cos_sim_series, tags], axis=1).sort_values(['cos_sim'], ascending=False)

    # Show the query's own tags up front rather than wherever its row lands among tied scores
    print('TAGS:', tags.iloc[query_index], '\n')

    counter = 0
    for index, row in rec_df.iterrows():
        # Skip the queried game itself (and any other entry sharing its title)
        if row['game_title'] == query_title:
            continue
        print(f"{row['game_title']}")
        counter += 1
        if counter == TOP_N:
            break

Showing recommendations for: Neon White


TAGS: ['shooter', 'platform', 'singleplayer', 'indie', 'visualnovel', 'action', 'assassin', 'fantasy', 'game', 'action', 'life', 'live', 'demon', 'firstperson'] 

Postal 2
Until Then
Bayonetta 2
Devil May Cry 3: Dante's Awakening
Half-Life 2: Episode Two
Coffee Talk
Doom
Devil May Cry 5
Tsukihime: A Piece of Blue Glass Moon
The Finals


Save model:

In [55]:
def save_model(model, path='model/game_rec_model.pkl'):
    """Pickle `model` to `path`, creating the parent directory when it is missing.

    Args:
        model: Any picklable object.
        path: Destination file. Defaults to the location this function
            originally had hard-coded, so existing calls are unaffected.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    import os  # local import keeps this cell self-contained

    parent_dir = os.path.dirname(path)
    if parent_dir:
        # open() does not create directories; without this a fresh checkout
        # lacking the target folder fails with FileNotFoundError.
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, 'wb') as file:
        pickle.dump(model, file)

# Only the TF-IDF matrix is pickled here; the fitted vectorizer and the title lookup are not part of the saved file
save_model(tdm)