# Install and Load Packages #

In [94]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow_datasets as tfds

import gensim
from gensim.models.phrases import Phraser, Phrases
from gensim.models.word2vec import Word2Vec

import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import os
import gzip
import json
import string

# Fetch the NLTK resources used by this notebook; packages that are already installed are reported as up-to-date.
nltk.download('stopwords')  # English stop-word list read via stopwords.words('english')
nltk.download('punkt')  # tokenizer models; presumably what word_tokenize needs -- confirm for your NLTK version
nltk.download('averaged_perceptron_tagger')  # POS tagger; not used in the cells visible here -- TODO confirm it is still needed


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Devin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Devin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Devin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

Load in the review data

In [69]:
import pandas as pd
import gzip

def parse(path):
  """Lazily yield one decoded JSON value per line of a gzip-compressed file.

  Args:
    path: Location of a gzip file holding one JSON document per line.

  Yields:
    The Python object produced by ``json.loads`` for each line, in file order.

  Raises:
    OSError: If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: If a line (including a blank one) is not valid JSON.
  """
  # The context manager closes the gzip handle once the generator is exhausted,
  # closed, or garbage-collected; the previous version never closed it.
  with gzip.open(path, 'rb') as g:
    for l in g:
      yield json.loads(l)

def getDF(path):
  """Load every record yielded by ``parse(path)`` into a DataFrame, one row per record.

  Rows are labelled 0, 1, 2, ... in the order the records are read.
  """
  # Key each record by its running position; from_dict(orient='index') then
  # turns every key into a row label and every record into that row.
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Read the gzip-compressed review file (path is relative to the notebook's working directory) into a DataFrame.
df = getDF('video_game_data.gz')

In [70]:
# Preview the loaded reviews (the notebook display truncates the middle rows).
df

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5.0,True,"10 17, 2015",A1HP7NVNPFMA4N,0700026657,Ambrosia075,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,1445040000,,,
1,4.0,False,"07 27, 2015",A1JGAP0185YJI6,0700026657,travis,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",1437955200,,,
2,3.0,True,"02 23, 2015",A1YJWEXHQBWK2B,0700026657,Vincent G. Mezera,ok game.,Three Stars,1424649600,,,
3,2.0,True,"02 20, 2015",A2204E1TH211HT,0700026657,Grandma KR,"found the game a bit too complicated, not what...",Two Stars,1424390400,,,
4,5.0,True,"12 25, 2014",A2RF5B5H74JLPE,0700026657,jon,"great game, I love it and have played it since...",love this game,1419465600,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
497572,4.0,True,"07 1, 2017",AVECM71LSZLC5,B01HGPUTCA,boris teplitskiy,not OEM but good replacement parts,Four Stars,1498867200,,,
497573,3.0,True,"08 20, 2018",A1RS06313BL6WN,B01HH6JEOC,Tom Stopsign,Okay stuff.,Three Stars,1534723200,,"{'Edition:': ' Kids Room', 'Platform:': ' PC O...",
497574,3.0,True,"08 7, 2017",ACIZ77IGIX2JL,B01HH6JEOC,Era,This does add some kids room things that are v...,Only buy on sale.,1502064000,,"{'Edition:': ' Kids Room', 'Platform:': ' PC O...",
497575,4.0,False,"08 5, 2018",A34GG58TJ1A3SH,B01HIZF7XE,seamonkey10,I think I originally began playing Bioshock se...,"It's Okay, Nothing Profound",1533427200,,"{'Edition:': ' Collection', 'Platform:': ' Xbo...",


Remove any null reviews

In [72]:
# Keep only the rows that actually carry review text, then report how many
# missing values remain in each column.
has_review_text = df["reviewText"].notna()
df = df[has_review_text]
df.isna().sum()

overall                0
verified               0
reviewTime             0
reviewerID             0
asin                   0
reviewerName          76
reviewText             0
summary              103
unixReviewTime         0
vote              389635
style             208296
image             493819
dtype: int64

In [73]:
# Pull the review texts out of the frame as a plain Python list.
reviews = df['reviewText'].tolist()


# Preprocessing #

In [74]:
# Both lookups are identical for every review, so build them once up front
# rather than on every pass through the loop (stopwords.words('english') was
# previously re-evaluated for each review).
table = str.maketrans('', '', string.punctuation)  # translation table that deletes every character in string.punctuation
stop_words = set(stopwords.words('english'))  # a set, so each membership test below is a hash lookup

reviews_cleaned = list()

# NOTE(review): the frequency table further down lists 'nt' as the second most
# common token -- presumably the "n't" piece of contractions with its apostrophe
# stripped. It is not filtered here; decide whether it should be.
for review in reviews:
    tokens = word_tokenize(review)  # split the raw review text into tokens
    tokens = [word.lower() for word in tokens]  # normalise case
    stripped = [w.translate(table) for w in tokens]  # remove punctuation characters from inside each token
    # isalpha() discards numbers, mixed alphanumerics, and the empty strings
    # left behind when a token consisted only of punctuation.
    words = [word for word in stripped if word.isalpha()]
    words = [w for w in words if w not in stop_words]
    reviews_cleaned.append(words)



In [92]:
# Flatten the cleaned reviews into one stream of tokens and count how often each token occurs.
common = nltk.FreqDist(
    token
    for review_tokens in reviews_cleaned
    for token in review_tokens
)

In [93]:
# The ten highest-count tokens as (token, count) pairs, most frequent first.
mostCommon= common.most_common(10) # list the most frequent words from our reviews
mostCommon

[('game', 1009255),
 ('nt', 415683),
 ('like', 286546),
 ('one', 253622),
 ('games', 244291),
 ('get', 211107),
 ('play', 202563),
 ('great', 196300),
 ('good', 188839),
 ('really', 161192)]

### Apart from `nt` (the remnant of contractions such as "don't" once punctuation is stripped), none of these words look out of the ordinary, so we won't remove any words for now ###

# Bigram collocations: #

In [95]:
# Train gensim's phrase (collocation) detector on the tokenized reviews; no arguments are passed, so library defaults apply.
phrases = Phrases(reviews_cleaned)

In [96]:
# Build a Phraser from the trained model (per gensim docs, a reduced form of Phrases meant for applying the learned bigrams).
bigram = Phraser(phrases)

In [97]:
# Inspect the detected bigrams, shown as a mapping from joined phrase to its score.
bigram.phrasegrams

{'get_hang': 19.547502250123102,
 'looking_forward': 72.07502768596183,
 'verry_good': 14.299701738641788,
 'avid_gamer': 149.51226948868398,
 'work_properly': 19.537252317101128,
 'incredibly_frustrating': 20.6746731392426,
 'massive_amounts': 150.76867511463934,
 'sad_sad': 22.22311343917693,
 'pretty_cool': 12.607554274732152,
 'boy_wrong': 19.141104867912446,
 'takes_forever': 56.95019200544469,
 'create_account': 16.643590442858855,
 'right_bat': 64.98782254546806,
 'ever_played': 15.698332020764655,
 'sim_city': 40.552977805147584,
 'layers_complexity': 73.93344697519223,
 'previous_versions': 44.20527475887357,
 'requires_internet': 20.46934900686247,
 'internet_connection': 272.2955965165898,
 'spent_hours': 16.285828205868274,
 'trade_routes': 304.0901392521873,
 'gaming_sites': 22.5606040178109,
 'absolutely_gorgeous': 39.219495197662695,
 'dirt_dirt': 23.41782355511152,
 'logitech_wireless': 17.840256505685215,
 'evga_gtx': 1975.2839147286822,
 'tour_mode': 16.0261437533532,