# Game Recommender — Text Preprocessing

In [27]:
# General 
import re
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# NLP
import string
import nltk
from rake_nltk import Rake
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet

# Utility
import unidecode
from ast import literal_eval
from collections import Counter


# POS TAG helper function ——
def get_wordnet_tag(treebank_tag: str) -> str:
    """
    In order for the WordNetLemmatizer to work correctly, 
    this function will convert the treebank tag to a wordnet POS tag.
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

Load in data:

In [28]:
df = pd.read_csv('datasets/cleaned/new_games_CLEANED.csv')
df.sample(3)

Unnamed: 0,game_title,dev_team,platforms,primary_tags,keywords,game_summary
911,Yakuza,"['Ryu ga Gotoku Studios', 'Sega']",['PlayStation Portable'],"['Historical', 'Turn Based Strategy', 'Simulat...","['hand-to-hand combat', 'brawler', 'action-adv...","Just as Kazuma, a former rising star in the Ya..."
692,Venba,['Visai Games'],"['PC (Microsoft Windows)', 'Xbox One', 'Ninten...","['Adventure', 'Indie', 'Puzzle', 'Simulator', ...","['2d', 'crafting', 'cooking', 'cinematic', 'fe...","Venba is a narrative cooking game, where you p..."
76,Resident Evil 7: Biohazard,['Capcom'],"['PC (Microsoft Windows)', 'Mac', 'iOS', 'Play...","['Shooter', 'multiple endings', 'Puzzle', 'Spl...","['zombies', 'psychological horror', 'crafting'...",Resident Evil 7: Biohazard is the first game o...


In [29]:
# Convert list-like columns to Python list
df['dev_team'] = df['dev_team'].apply(literal_eval)
df['platforms'] = df['platforms'].apply(literal_eval)
df['primary_tags'] = df['primary_tags'].apply(literal_eval)
df['keywords'] = df['keywords'].apply(literal_eval)

Before preprocessing use Rake to indentify keywords in each `game_summary` document:

In [30]:
r = Rake()
for index, row in tqdm(df.iterrows(), total=len(df)):
    r.extract_keywords_from_text(row['game_summary'])
    extracted_keywords = r.get_ranked_phrases_with_scores()

    rake_keywords = []
    for score, keyword in extracted_keywords:
        if score > 12:
            rake_keywords += keyword.split(' ')

    rake_keywords_string = " ".join(rake_keywords)
    
    # Find primary keyword matches in rake summary
    new_primary_tags = [tag for tag in row['primary_tags'] if tag.lower() in rake_keywords_string.lower()]
    row['primary_tags'] += new_primary_tags

    df.loc[index, 'rake_summary'] = rake_keywords_string

HBox(children=(FloatProgress(value=0.0, max=1351.0), HTML(value='')))




In [31]:
# Remove duplicate tags
df['primary_tags'] = df['primary_tags'].apply(lambda x: list(set(x)))

`game_summary` text preprocessing:

In [32]:
def decontract(doc: str) -> str:
    """
    Decontract phrases in doc.
    """
    # Specific
    doc = re.sub(r"won\'t", "will not", doc)
    doc = re.sub(r"can\'t", "can not", doc)
    # General
    doc = re.sub(r"n\'t", " not", doc)
    doc = re.sub(r"\'re", " are", doc)
    doc = re.sub(r"\'s", " is", doc)
    doc = re.sub(r"\'d", " would", doc)
    doc = re.sub(r"\'ll", " will", doc)
    doc = re.sub(r"\'t", " not", doc)
    doc = re.sub(r"\'ve", " have", doc)
    doc = re.sub(r"\'m", " am", doc)
    return doc


def lemmatize(doc: str) -> str:
    """
    Lemmatize document.
    """
    wnl = WordNetLemmatizer()
    tokens = word_tokenize(doc)
    words_and_tags = pos_tag(tokens)
    return " ".join(wnl.lemmatize(word, pos=get_wordnet_tag(tag)) for word, tag in words_and_tags)


def remove_stopwords(doc: str) -> str:
    """
    Remove stopwords from document.
    """
    with open('updated_stopwords.txt') as file:
        words_to_remove = set(stopwords.words('english') + file.read().split('\n')) 
    tokens = word_tokenize(doc)
    return " ".join([word for word in tokens if word not in words_to_remove and len(word) > 1])


def remove_punctuation(doc: str) -> str:
    """
    Remove punctuation and numbers from document.
    """
    doc = doc.lower()
    doc = re.sub('[0-9]', '', doc)
    doc = decontract(doc)
    for symbol in string.punctuation + '’...':
        doc = doc.replace(symbol, '')
    doc = re.sub('\s+', ' ', doc)
    doc = lemmatize(doc)
    doc = remove_stopwords(doc)
    doc = unidecode.unidecode(doc)
    return doc

In [33]:
for index, row in tqdm(df.iterrows(), total=len(df)):
    df.loc[index, 'lemma_summary'] = remove_punctuation(row['game_summary'])
    df.loc[index, 'rake_summary'] = remove_punctuation(row['rake_summary'])

df['rake_summary'].iloc[0]

HBox(children=(FloatProgress(value=0.0, max=1351.0), HTML(value='')))




'challenge gameplay mechanic familiar demigod child fight elden'

Identify keywords from lemmatized summaries:

In [34]:
# Most common words in lemmatized corpus
lemma_corpus = " ".join(df['lemma_summary'].tolist()).strip()
lemma_tokens = word_tokenize(lemma_corpus)
token_counts = Counter(lemma_tokens)
print(token_counts.most_common(50))

[('game', 1703), ('player', 869), ('character', 483), ('story', 455), ('time', 442), ('series', 391), ('battle', 383), ('set', 379), ('power', 359), ('feature', 358), ('adventure', 332), ('fight', 322), ('play', 308), ('city', 300), ('enemy', 277), ('experience', 264), ('dark', 261), ('war', 261), ('gameplay', 250), ('weapon', 250), ('pokemon', 250), ('force', 248), ('friend', 245), ('mario', 242), ('explore', 240), ('action', 238), ('land', 219), ('discover', 218), ('control', 217), ('life', 216), ('event', 211), ('return', 211), ('ability', 208), ('follow', 205), ('hero', 202), ('save', 201), ('include', 199), ('original', 197), ('attack', 196), ('create', 192), ('mode', 192), ('journey', 187), ('combat', 182), ('level', 182), ('release', 178), ('island', 176), ('team', 172), ('defeat', 171), ('kill', 171), ('evil', 167)]


In [35]:
# Add the 200 most common words to primary tags
lemma_keywords = [word for word, count in token_counts.most_common(200) if word.isalpha()]

for index, row in tqdm(df.iterrows(), total=len(df)):
    lemma_word_list = set(row['lemma_summary'].split(' '))
    # Keyword matches in lemma summary
    for word in lemma_keywords:
        if word in lemma_word_list and word not in row['primary_tags']:
            row['primary_tags'].append(word)

HBox(children=(FloatProgress(value=0.0, max=1351.0), HTML(value='')))




In [36]:
print(df['primary_tags'].iloc[0])

['Single player', 'Multiplayer', 'sword & sorcery', 'Adventure', 'RPG', 'Open world', 'Co-operative', 'Action', 'difficult', 'exploration', 'Fantasy', 'game', 'player', 'character', 'series', 'battle', 'power', 'feature', 'fight', 'enemy', 'dark', 'gameplay', 'force', 'explore', 'action', 'land', 'event', 'release', 'secret', 'develop', 'powerful', 'build', 'rpg', 'role', 'introduce', 'offer', 'deep', 'seek', 'mechanic']


Format `keyword_tags`:

In [37]:
def format_tag(tags: list) -> list:
    """
    Validate formatting of each tag.
    """
    tags = tags.copy()
    for index, tag in enumerate(tags):
        tag = tag.lower()
        if '(' in tag:
            tag = tag.split(' (')[0]

        tag = re.sub('[-\.\!\?\&]', '', tag)
        tag = tag.replace(' ', '')
        tags[index] = tag

    return tags

df['dev_team'] = df['dev_team'].apply(format_tag)
df['platforms'] = df['platforms'].apply(format_tag)
df['primary_tags'] = df['primary_tags'].apply(format_tag)
df.sample(2)

Unnamed: 0,game_title,dev_team,platforms,primary_tags,keywords,game_summary,rake_summary,lemma_summary
1342,Kirby Air Ride,"[nintendo, hallaboratory]",[nintendogamecube],"[arcade, singleplayer, splitscreen, racing, fa...","[minigames, protagonist's name in the title, m...",Kirby Air Ride is a 2003 racing game video gam...,support lan play broadband adapter race game v...,kirby air ride racing game video game develop ...
1014,Dante's Inferno,"[electronicarts, visceralgames]",[commodorec64/128/max],"[singleplayer, medieval, adventure, action, br...","[magic, based on - book, bangsian fantasy]","Dante's Inferno is an epic single player, thir...",person action adventure game inspire feature n...,dante inferno epic single player thirdperson a...


Save dataset:

In [38]:
df.to_csv('datasets/processed/games_PROCESSED.csv', index=False)