# Sentiment Analysis of Game Reviews on IGN

In [1]:
# Import dependencies
import tflearn
import pandas as pd

curses is not supported on this machine (please install/reinstall curses for an optimal experience)


In [2]:
# IGN Dataset loading
from urllib.request import urlretrieve
from os.path import isfile, isdir
from tqdm import tqdm
import zipfile

# NOTE: the helper below is wrapped in a bare string literal, so it is never
# defined or executed, and its only visible call site (in the extraction step
# at the end of this cell) is commented out too.
# NOTE(review): before re-enabling it, the `self._log` call (there is no
# `self` in a plain function) and the str pattern searched in data read in
# binary mode would both need fixing.
"""
def fixBadZipfile(zipFile):  
    f = open(zipFile, 'r+b')  
    data = f.read()
    
    pos = data.find('\x50\x4b\x05\x06') # End of central directory signature  
    if (pos > 0):  
        self._log("Trancating file at location " + str(pos + 22)+ ".")  
        f.seek(pos + 22)   # size of 'ZIP end of central directory record' 
        f.truncate()  
        f.close()
"""

# NOTE: the download step below is disabled the same way (bare string
# literal). While it stays disabled, '20-years-of-games.zip' must already be
# present in the working directory whenever 'ign.csv' is missing.
"""
class DLProgress(tqdm):
    last_block = 0
    
    def hook(self, block_num=1, block_size=1, total_size=None):
        self.total = total_size
        self.update((block_num - self.last_block) * block_size)
        self.last_block = block_num
        
if not isfile('20-years-of-games.zip'):
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc='IGN Game Review Dataset') as pbar:
        urlretrieve(
        'https://kaggle2.blob.core.windows.net/datasets/200/425/20-years-of-games.zip',
        '20-years-of-games.zip',
        pbar.hook)
"""

# Extract the archive only when the CSV has not been extracted yet.
if not isfile('ign.csv'):
    # fixBadZipfile('20-years-of-games.zip')
    if not isfile('20-years-of-games.zip'):
        # Fail with an actionable message; the automatic download above is disabled.
        raise FileNotFoundError(
            "'20-years-of-games.zip' was not found; place the archive next to "
            "this notebook (the download step is disabled) and re-run this cell.")
    # The `with` block closes the archive on exit, so no explicit close() is needed.
    with zipfile.ZipFile('20-years-of-games.zip', 'r') as the_zip:
        the_zip.extractall()

In [3]:
# IGN game review dataset loading
# Read the extracted CSV into a DataFrame and preview its first rows.
game_reviews = pd.read_csv('ign.csv')
game_reviews.head()

Unnamed: 0.1,Unnamed: 0,score_phrase,title,url,platform,score,genre,editors_choice,release_year,release_month,release_day
0,0,Amazing,LittleBigPlanet PS Vita,/games/littlebigplanet-vita/vita-98907,PlayStation Vita,9.0,Platformer,Y,2012,9,12
1,1,Amazing,LittleBigPlanet PS Vita -- Marvel Super Hero E...,/games/littlebigplanet-ps-vita-marvel-super-he...,PlayStation Vita,9.0,Platformer,Y,2012,9,12
2,2,Great,Splice: Tree of Life,/games/splice/ipad-141070,iPad,8.5,Puzzle,N,2012,9,12
3,3,Great,NHL 13,/games/nhl-13/xbox-360-128182,Xbox 360,8.5,Sports,N,2012,9,11
4,4,Great,NHL 13,/games/nhl-13/ps3-128181,PlayStation 3,8.5,Sports,N,2012,9,11


In [4]:
# (rows, columns) of the loaded frame
game_reviews.shape

(18625, 11)

In [5]:
# How many reviews carry each score_phrase label (this column is the prediction target)
game_reviews['score_phrase'].value_counts()

Great          4773
Good           4741
Okay           2945
Mediocre       1959
Amazing        1804
Bad            1269
Awful           664
Painful         340
Unbearable       72
Masterpiece      55
Disaster          3
Name: score_phrase, dtype: int64

In [6]:
# Number of missing values per column
game_reviews.isnull().sum()

Unnamed: 0         0
score_phrase       0
title              0
url                0
platform           0
score              0
genre             36
editors_choice     0
release_year       0
release_month      0
release_day        0
dtype: int64

In [7]:
# Data cleaning: replace missing values with empty strings, then discard the
# columns that are not used to build the model input.
unused_columns = ['Unnamed: 0', 'url', 'score', 'release_year', 'release_month', 'release_day']
game_reviews.fillna('', inplace=True)
game_reviews.drop(unused_columns, axis=1, inplace=True)
game_reviews.head()

Unnamed: 0,score_phrase,title,platform,genre,editors_choice
0,Amazing,LittleBigPlanet PS Vita,PlayStation Vita,Platformer,Y
1,Amazing,LittleBigPlanet PS Vita -- Marvel Super Hero E...,PlayStation Vita,Platformer,Y
2,Great,Splice: Tree of Life,iPad,Puzzle,N
3,Great,NHL 13,Xbox 360,Sports,N
4,Great,NHL 13,PlayStation 3,Sports,N


In [8]:
# Merge into one single text
import string

# Tag editor's-choice rows with an extra token. Compare by value (`==`):
# `is 'Y'` tests object identity, which only matches when the interpreter
# happens to reuse the same string object, and triggers a SyntaxWarning on
# Python 3.8+.
editors_choice_tag = game_reviews['editors_choice'].map(
    lambda flag: 'Editors_Choice' if flag == 'Y' else '')

game_reviews_text = (game_reviews['title'] + ' ' + game_reviews['platform'] + ' '
                     + game_reviews['genre'] + ' ' + editors_choice_tag)

# Remove every ASCII punctuation character with a single translation table
# instead of filtering character by character.
strip_punctuation = str.maketrans('', '', string.punctuation)
game_reviews['text'] = [s.translate(strip_punctuation) for s in game_reviews_text]
game_reviews.head()

Unnamed: 0,score_phrase,title,platform,genre,editors_choice,text
0,Amazing,LittleBigPlanet PS Vita,PlayStation Vita,Platformer,Y,LittleBigPlanet PS Vita PlayStation Vita Platf...
1,Amazing,LittleBigPlanet PS Vita -- Marvel Super Hero E...,PlayStation Vita,Platformer,Y,LittleBigPlanet PS Vita Marvel Super Hero Edi...
2,Great,Splice: Tree of Life,iPad,Puzzle,N,Splice Tree of Life iPad Puzzle
3,Great,NHL 13,Xbox 360,Sports,N,NHL 13 Xbox 360 Sports
4,Great,NHL 13,PlayStation 3,Sports,N,NHL 13 PlayStation 3 Sports


In [9]:
# Only score_phrase (label) and text (model input) are needed from here on,
# so drop the columns the text was assembled from.
merged_source_columns = ['title', 'platform', 'genre', 'editors_choice']
game_reviews.drop(labels=merged_source_columns, axis=1, inplace=True)
game_reviews.head()

Unnamed: 0,score_phrase,text
0,Amazing,LittleBigPlanet PS Vita PlayStation Vita Platf...
1,Amazing,LittleBigPlanet PS Vita Marvel Super Hero Edi...
2,Great,Splice Tree of Life iPad Puzzle
3,Great,NHL 13 Xbox 360 Sports
4,Great,NHL 13 PlayStation 3 Sports


In [10]:
# Toggles IPython's automatic pdb calling (the recorded output shows it being
# turned ON). NOTE(review): looks like a leftover debugging aid — consider
# removing it before sharing the notebook.
%pdb

Automatic pdb calling has been turned ON


In [11]:
# Data preprocessing
# Hold out 10% of the reviews as a test set; the rest is the training set.
# The fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split

review_texts = game_reviews['text']
review_labels = game_reviews['score_phrase']
X_train, X_test, y_train, y_test = train_test_split(
    review_texts, review_labels, test_size=0.1, random_state=42)

In [12]:
# Preview the raw training texts (still strings at this point)
X_train.head()

14619    World Poker Tour Texas Hold Em Nintendo DSi Card 
12414               Family Glide Hockey Wii Sports Action 
5312     Metal Arms Glitch in the System GameCube Shooter 
5259     The Lord of the Rings The Return of the King G...
3993     MechWarrior 4 Clan Mech Pak PC Action Simulation 
Name: text, dtype: object

In [71]:
# Build the word -> index vocabulary (and its inverse) from every review text.
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(ngram_range=(1, 1), token_pattern=r'\b\w{1,}\b').fit(game_reviews['text'])
vocab = vect.vocabulary_
# Reverse lookup, used later to turn index sequences back into words.
inverse_vocab = dict((index, word) for word, index in vocab.items())

def convert_word_to_idx(x, vocabulary=None):
    """Encode each text in `x` as a list of vocabulary indices.

    Every text is split on whitespace and lower-cased; tokens that are not in
    the vocabulary are silently dropped.

    Args:
        x: pandas Series of strings.
        vocabulary: optional mapping of token -> integer index. Defaults to
            the notebook-level `vocab` built from the fitted vectorizer.

    Returns:
        A pandas Series (same index as `x`) whose values are lists of ints.

    NOTE(review): the sequences are later padded with 0. If the vocabulary
    assigns index 0 to a real token, that token cannot be told apart from
    padding — confirm, and shift the indices by one if so.
    """
    lookup = vocab if vocabulary is None else vocabulary

    def encode(text):
        # str.split() already discards surrounding whitespace, so the tokens
        # only need lower-casing before the lookup.
        tokens = (token.lower() for token in text.split())
        return [lookup[token] for token in tokens if token in lookup]

    return x.apply(encode)

# Sanity check: the forward and inverse mappings must have the same size.
print(len(vocab), len(inverse_vocab), sep='\n')

8276
8276


In [64]:
# Fit an encoder that maps each distinct score_phrase to an integer id.
from sklearn import preprocessing
score_phrases = game_reviews['score_phrase'].value_counts().index.tolist()
le = preprocessing.LabelEncoder()
le.fit(score_phrases)

LabelEncoder()

In [15]:
# Replace the text of both splits with lists of vocabulary indices.
X_train, X_test = [convert_word_to_idx(split) for split in (X_train, X_test)]

In [16]:
# Preview the training texts after conversion to vocabulary indices
X_train.head()

14619    [8081, 5551, 7434, 7259, 3552, 2476, 5035, 234...
12414                  [2705, 3153, 3546, 7996, 6825, 277]
5312       [4621, 583, 3155, 3722, 7268, 7137, 3046, 6472]
5259     [7268, 4347, 5137, 7268, 6052, 7268, 6001, 513...
3993        [4579, 174, 1551, 4575, 5304, 5379, 277, 6544]
Name: text, dtype: object

In [17]:
from tflearn.data_utils import to_categorical, pad_sequences

# Sequence padding: bring every index sequence to a fixed length of 20,
# using 0 as the filler value.
X_train = pad_sequences(X_train, maxlen = 20, value = 0)
X_test = pad_sequences(X_test, maxlen = 20, value = 0)

# One-hot encode the labels. The encoder transforms a whole column in a single
# call, which avoids invoking le.transform once per row; the class count is
# taken from the fitted encoder so the one-hot width always matches it.
n_classes = len(le.classes_)
y_train = to_categorical(le.transform(y_train), nb_classes=n_classes)
y_test = to_categorical(le.transform(y_test), nb_classes=n_classes)

In [18]:
# Shapes of the padded inputs and of the one-hot labels, for both splits
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(16762, 20) (1863, 20)
(16762, 11) (1863, 11)


In [19]:
# Preview the padded index sequences as a table
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,8081,5551,7434,7259,3552,2476,5035,2340,1338,0,0,0,0,0,0,0,0,0,0,0
1,2705,3153,3546,7996,6825,277,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,4621,583,3155,3722,7268,7137,3046,6472,0,0,0,0,0,0,0,0,0,0,0,0
3,7268,4347,5137,7268,6052,7268,6001,5137,7268,4081,3046,277,2425,0,0,0,0,0,0,0
4,4579,174,1551,4575,5304,5379,277,6544,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
# Preview the one-hot encoded training labels as a table
pd.DataFrame(y_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [21]:
# Dimensions the network needs: input sequence length, vocabulary size and
# number of distinct labels.
distinct_label_size = game_reviews['score_phrase'].nunique()
vocab_size = len(vocab)
each_vector_size = X_train.shape[1]
print(each_vector_size, vocab_size, distinct_label_size)

20 8276 11


**Note**:
If an `IndexError: list index out of range` is raised during training, restarting the kernel will resolve it.
Alternatively, run
```
import tensorflow as tf
tf.reset_default_graph()
```
to reset the TensorFlow graph state.

In [None]:
# clean state
# Resets TensorFlow's default graph; the note above recommends this instead of
# a kernel restart when training fails with an IndexError.
import tensorflow as tf
tf.reset_default_graph()

In [22]:
# Network building: embedding -> LSTM -> softmax classifier, trained with Adam
# on categorical cross-entropy.
token_ids = tflearn.input_data([None, each_vector_size])
# input_dim is the vocabulary size; each token becomes a 128-d vector.
embedded = tflearn.embedding(token_ids, input_dim = vocab_size, output_dim = 128)
# NOTE(review): tflearn appears to treat `dropout` as a keep probability — confirm.
encoded = tflearn.lstm(embedded, 128, dropout = 0.8)
class_probabilities = tflearn.fully_connected(encoded, distinct_label_size, activation = 'softmax')
net = tflearn.regression(
    class_probabilities,
    optimizer = 'adam',
    learning_rate = 0.001,
    loss = 'categorical_crossentropy')

In [23]:
# Training: 30 epochs, with 10% of the training data set aside for validation.
fit_options = dict(validation_set = 0.1, n_epoch = 30, show_metric = True)
model = tflearn.DNN(net, tensorboard_verbose = 0)
model.fit(X_train, y_train, **fit_options)

Training Step: 7079  | total loss: 0.86127 | time: 10.791s
| Adam | epoch: 030 | loss: 0.86127 - acc: 0.7682 -- iter: 15040/15085
Training Step: 7080  | total loss: 0.80930 | time: 11.849s
| Adam | epoch: 030 | loss: 0.80930 - acc: 0.7789 | val_loss: 2.21130 - val_acc: 0.4383 -- iter: 15085/15085
--


In [25]:
# Evaluate
# Score the trained model on the held-out test split.
# NOTE(review): the returned value is presumably accuracy — confirm.
model.evaluate(X_test, y_test)

[0.46698872753835402]

In [107]:
# Show some predicted samples
import numpy as np

sample_count = 6
# Slicing (rather than indexing 0..5) also works when fewer rows are available.
sample_inputs = X_test[:sample_count]

# One batched forward pass instead of one predict() call per sample.
pred_classes = np.argmax(model.predict(sample_inputs), axis=1)
true_classes = np.argmax(y_test[:sample_count], axis=1)

# inverse_transform expects a 1-D array of encoded labels; recent scikit-learn
# versions reject a bare scalar, so decode the whole batch at once.
pred_labels = le.inverse_transform(pred_classes)
true_labels = le.inverse_transform(true_classes)

for token_ids, pred_label, true_label in zip(sample_inputs, pred_labels, true_labels):
    # Index 0 is the padding value, so it is skipped when rebuilding the text.
    print(' '.join(inverse_vocab[idx] for idx in token_ids if idx != 0))
    print('pred_class:', pred_label)
    print('true_class:', true_label)
    print('')

sonic the hedgehog 1991 wireless platformer editorschoice
pred_class: Great
true_class: Great

zanac wii shooter
pred_class: Mediocre
true_class: Okay

winter games wireless sports
pred_class: Okay
true_class: Okay

cars wireless racing
pred_class: Good
true_class: Good

topolon wireless puzzle editorschoice
pred_class: Great
true_class: Great

tiny brains pc puzzle
pred_class: Bad
true_class: Bad

