In [1]:
import json_lines
from langdetect import detect
from langdetect import detect_langs
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.neural_network import MLPClassifier

# Single shared WordNet lemmatizer instance; the text-filtering cell calls
# lemmatizer.lemmatize(word, pos) on every kept token.
lemmatizer = WordNetLemmatizer()
# A token containing ANY of these characters is discarded by the text filter.
# The backslash is written as "\\" on purpose: a bare "\/" is an invalid escape
# sequence that newer Python versions warn about; the resulting string is the same.
# NOTE(review): the digit run has no '0' -- confirm whether that is intentional.
REMOVEABLE_CHARS = "»<>123456789\\/&()?:!.,;'´"

# Part-of-speech tags a token must carry to survive the text filter.
# Every tag of the reference list is currently accepted; delete entries here to
# drop the corresponding parts of speech from the features.
ACCEPTABLE_TAGS = [
    'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
    'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR',
    'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
    'WDT', 'WP', 'WP$', 'WRB',
]

MAX_FEATURES = 50        # number of most-frequent words kept as features
MIN_ENGLISH = 0.9999     # minimum 'en' probability for a review to be kept
LIMIT_INPUT_ROWS = 200   # to stop run times hindering development

def get_wordnet_pos(tag):
    """Translate a POS tag into the WordNet constant used by the lemmatizer.

    The decision is made on the tag's leading letter:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV. Any other tag yields the empty string, which callers
    treat as "no usable WordNet category".
    """
    # Attribute names are resolved lazily so `wordnet` is only touched when a
    # prefix actually matches.
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr_name in prefix_to_attr:
        if tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return ''

In [2]:
# READ IN INFO
# Parallel lists, one entry per review that was kept:
#   X -> review text, y -> item['voted_up'], z -> item['early_access']
X = []
y = []
z = []
with open('reviews.json', 'rb') as f:
    for row_idx, item in enumerate(json_lines.reader(f)):
        # Stop reading once the development row limit is reached instead of
        # iterating over (and ignoring) the remainder of the file.
        if row_idx >= LIMIT_INPUT_ROWS:
            break
        try:
            langs = detect_langs(item['text'])
        except Exception:
            # Detection failed for this row: skip it. Previously the error was
            # swallowed and the loop below ran on the PREVIOUS row's result
            # (or raised NameError when the very first row failed).
            continue
        # Keep the review only when English is detected with high confidence.
        # NOTE(review): langdetect documents its results as non-deterministic
        # unless DetectorFactory.seed is set -- consider seeding for reproducibility.
        for lang in langs:
            if lang.lang == 'en' and lang.prob > MIN_ENGLISH:
                X.append(item['text'])
                y.append(item['voted_up'])
                z.append(item['early_access'])
                break

In [3]:
# Print each kept review followed by its two labels, then a blank separator.
for idx in range(len(X)):
    position = str(idx)
    for label, column in (("X", X), ("y", y), ("z", z)):
        print(label + "[", position, "]: ", str(column[idx]))
    print("\n")

X[ 0 ]:  neat little game. play if your staple fps game's server is down or sth. maps and weapon distribution system needs rework.
y[ 0 ]:  True
z[ 0 ]:  True


X[ 1 ]:  Kinda decent f2p mmorpg that has no class limitations(hooray!)(and that game where i drained too much hours...) Community: In PvE, you may encounter friendly ppl(i managed to even encounter person that i love :o), while in PvP mostly complainers(those who complain about AT&T releasing too much of 1st in battle xD) or persons that make you turn into complainer.(As always!) Main review part: Graphics: Everything looks kinda pretty, but performance is not what expected for java and openGL...(8/10) Gameplay: SK features huge choice of equipment, which adds variety to the gear and gameplay itself(like you can use guns, or bombs, or swords.. or 2 of those... or all 3 xD) But because this game features grinding in big quantities you  may get bored(or may not if you do co-op with your friend(s))(8/10) Controls: In order to rek

y[ 131 ]:  True
z[ 131 ]:  False


X[ 132 ]:  Best pirate game on the market, very addictive. Definitely would be better to play with friends.
y[ 132 ]:  True
z[ 132 ]:  False


X[ 133 ]:  10/10 amazing game, okay graphics but very fun to play with friends, it has so many modes/skins/weapons and has alot of medals/achievements.
y[ 133 ]:  True
z[ 133 ]:  False


X[ 134 ]:  This one is the most awsome game in the whole wide world!
y[ 134 ]:  True
z[ 134 ]:  False


X[ 135 ]:  This game is an extremely arcadey racing game. You can drift in any car with ease, really hard to lose control if you are half decent at racing games and the game is relatively easy. Still my favourite NFS game. I think this is because of the hyper realistic visuals and the sick drifts. One huge problem is that it is always online, so whenever EA decides to pull the plug on the servers you can't play the game anymore, but until then, be the drift king that you always wanted to be!
y[ 135 ]:  True
z[ 135 ]:  False



y[ 267 ]:  True
z[ 267 ]:  True


X[ 268 ]:  I can see the logic behind the negative reviews. This game is not for everyone. It requires patience, reflexes and somewhat grinding for achievements - therefore the items.  Now if you are not a fan of rogue-like, action genre, this game wouldn't be the best match for you. If you are, however and want to have some fun, this game suits your needs perfectly. I will start with saying that the game is really hard. This doesn't make the game frustrating though. After you get the grasp of the mechanics and familiar with the items, it will be only a matter of time to finish the game.  David Bowie (RIP) is a playable character in this game (Though I don't think he would appreciate the choice of soundtrack.).  Gameplay is fast-paced. That means there will be times you won't even understand how you died or took massive amount of damage. So never rely on your health being full. You always need to be alert and keep your reflexes up because the ways to d

In [4]:
# FILTER TEXT
def _filter_and_lemmatize(text):
    """Turn one raw review string into a list of lemmatised tokens.

    The text is lower-cased, tokenised and POS-tagged. A token is kept only if
    its tag is in ACCEPTABLE_TAGS, it contains no character from
    REMOVEABLE_CHARS, and get_wordnet_pos() maps its tag to a WordNet category
    (non-empty result). Kept tokens are lemmatised with that category.
    """
    tagged_words = nltk.pos_tag(nltk.word_tokenize(text.lower()))
    lemmas = []
    for word, tag in tagged_words:
        if tag not in ACCEPTABLE_TAGS:
            continue
        if any(char in REMOVEABLE_CHARS for char in word):
            continue
        wordnet_pos = get_wordnet_pos(tag)
        if wordnet_pos != '':
            lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))
    return lemmas


# Replace every raw review in X with its list of lemmas. Entries that are no
# longer strings were already processed, so re-running this cell is harmless
# (it used to fail on `.lower()` of a list).
for i, text in enumerate(X):
    if isinstance(text, str):
        X[i] = _filter_and_lemmatize(text)

In [5]:
# MOST FREQ WORDS AS FEATURES
# Count how often each lemma occurs across all filtered reviews.
word_counts = {}
for text in X:
    for word in text:
        word_counts[word] = word_counts.get(word, 0) + 1

# Keep at most MAX_FEATURES entries, most frequent first. sorted() is stable,
# so words with equal counts stay in first-seen order.
# Slicing also covers a vocabulary with MAX_FEATURES words or fewer; the
# previous trimming left the result empty in that case and discarded every feature.
ranked_counts = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
trimd_word_counts = dict(ranked_counts[:MAX_FEATURES])
word_counts = trimd_word_counts
print(word_counts)

{'be': 805, 'game': 591, 'i': 378, 'have': 217, 'do': 153, 'play': 122, 'get': 115, 'good': 96, 'not': 89, 'so': 86, 'make': 82, 'story': 79, 'more': 78, 'fun': 74, 'time': 74, 'just': 70, 'well': 61, 'great': 59, 'really': 57, 'best': 56, 'also': 53, 'go': 50, 'still': 50, 'much': 49, 'love': 49, 'very': 48, 'thing': 48, 'other': 47, 'character': 47, 'player': 44, 'new': 42, 'weapon': 40, 'use': 40, 'only': 39, 'need': 38, 'even': 38, 'friend': 38, 'people': 38, 'hour': 36, 'say': 36, 'want': 36, 'first': 36, 'way': 36, 'up': 35, 'think': 35, 'buy': 34, 'take': 33, 'many': 33, 'then': 32, 'mode': 32}


In [6]:
# CREATE 1-HOT ENCODINGS
# Each review becomes a vector of length MAX_FEATURES; position k holds how many
# times the k-th word of word_counts occurs in the review (so values can exceed 1:
# these are occurrence counts rather than strict one-hot flags).

# Word -> column position, built once instead of rebuilding and scanning
# list(word_counts) for every token.
feature_index = {word: position for position, word in enumerate(word_counts)}

one_hots = []
for text in X:
    one_hot = [0] * MAX_FEATURES
    for word in text:
        position = feature_index.get(word)
        if position is not None:
            one_hot[position] += 1
    one_hots.append(one_hot)

print(one_hots)

[[1, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [4, 7, 3, 2, 2, 0, 4, 1, 4, 1, 2, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 4, 1, 1, 0, 0, 0, 0, 2, 0, 1, 1, 1, 2, 2, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0], [2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [5, 4, 3, 5, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 3, 1, 1, 0, 1, 0, 1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0], [1, 1, 2, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [7]:
# Dump X for inspection; once the FILTER TEXT cell has run, each entry of X is
# the list of lemmas kept for that review rather than the raw text.
print(X)



In [8]:
# # CONVERT TO PANDAS (disabled; `pd` is not imported in the cells shown -- add `import pandas as pd` before enabling)
# d = {'Text':X, 'Voted up':y, 'Early access':z}
# df = pd.DataFrame(d)
# df