In [1]:
from collections import Counter

import json_lines
import nltk
import pandas as pd
from langdetect import DetectorFactory
from langdetect import detect
from langdetect import detect_langs
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Single WordNetLemmatizer instance, reused for every token in the text-filtering cell.
lemmatizer = WordNetLemmatizer()
# A token containing any of these characters is discarded by the filtering cell.
# The backslash is escaped explicitly: a bare "\/" is an invalid escape sequence
# (a warning on current Pythons); the resulting string value is unchanged.
# NOTE(review): the digit '0' is absent while 1-9 are listed -- confirm this is intended.
REMOVEABLE_CHARS = "»<>123456789\\/&()?:!.,;'´"
# Reference copy of the tag list; currently identical to ACCEPTABLE_TAGS below.
# ALL_TAGS = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT',
#  'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP',
#  'WP$', 'WRB']
# POS tags (as returned by nltk.pos_tag) that a token must carry to be considered at all.
ACCEPTABLE_TAGS = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT',
 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP',
 'WP$', 'WRB']
MAX_FEATURES = 10  # how many of the most frequent words are kept as features
MIN_ENGLISH = 0.9999  # a review is kept only if its 'en' probability exceeds this
LIMIT_INPUT_ROWS = 100 # to stop run times hindering development

def get_wordnet_pos(tag):
    """Translate a POS tag into the WordNet part-of-speech constant used for lemmatisation.

    The decision is made on the tag's prefix: 'J' -> wordnet.ADJ, 'V' -> wordnet.VERB,
    'N' -> wordnet.NOUN, 'R' -> wordnet.ADV. Any other tag yields the empty string,
    which callers treat as "no usable part of speech".
    """
    prefix_to_constant_name = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant_name:
        if tag.startswith(prefix):
            # Looked up lazily so wordnet is only touched when a prefix matches.
            return getattr(wordnet, constant_name)
    return ''

In [2]:
# READ IN INFO
# Build three parallel lists from the first LIMIT_INPUT_ROWS rows of reviews.json,
# keeping only reviews that langdetect rates as English with probability > MIN_ENGLISH:
#   X -> review text, y -> 'voted_up' value, z -> 'early_access' value.
DetectorFactory.seed = 0  # langdetect is non-deterministic unless seeded; keeps re-runs reproducible
X = []
y = []
z = []
with open('reviews.json', 'rb') as f:
    for row_idx, item in enumerate(json_lines.reader(f)):
        if row_idx >= LIMIT_INPUT_ROWS:
            # Stop reading outright; merely skipping rows would still scan the whole file.
            break
        try:
            langs = detect_langs(item['text'])
        except LangDetectException:
            # Detection failed for this text: skip the row, so a result left over
            # from an earlier row is never applied to it.
            continue
        # Each candidate exposes its language code and probability directly,
        # so there is no need to parse them out of its string form.
        if any(lang.lang == 'en' and lang.prob > MIN_ENGLISH for lang in langs):
            X.append(item['text'])
            y.append(item['voted_up'])
            z.append(item['early_access'])

In [3]:
# for i, item in enumerate(X):
#     print( "X[", str(i), "]: ", str(X[i]) )
#     print( "y[", str(i), "]: ", str(y[i]) )
#     print( "z[", str(i), "]: ", str(z[i]) )
#     print( "\n" )

In [4]:
# FILTER TEXT
# Replace each review in X with the list of its lemmatised tokens. A token survives
# only if its POS tag is in ACCEPTABLE_TAGS, it contains none of REMOVEABLE_CHARS,
# and its tag maps to a WordNet part of speech.
# NOTE: X is overwritten entry by entry, so this cell expects X to still hold raw strings.
for review_idx, review in enumerate(X):
    tokens = nltk.word_tokenize(review.lower())
    tagged_tokens = nltk.pos_tag(tokens)
    kept_lemmas = []
    for token, (_, pos_tag) in zip(tokens, tagged_tokens):
        if pos_tag not in ACCEPTABLE_TAGS:
            continue
        if any(ch in REMOVEABLE_CHARS for ch in token):
            continue
        lemma_pos = get_wordnet_pos(pos_tag)
        if lemma_pos == '':
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token, lemma_pos))
    X[review_idx] = kept_lemmas

In [5]:
# for i, item in enumerate(X):
#     print( "X[", str(i), "]: ", str(X[i]) )
#     print( "y[", str(i), "]: ", str(y[i]) )
#     print( "z[", str(i), "]: ", str(z[i]) )
#     print( "\n" )

In [7]:
# MOST FREQ WORDS AS FEATURES
# Count every token across all filtered reviews and keep the MAX_FEATURES most
# frequent ones, highest count first (ties stay in first-seen order).
# Fix: the trimmed dict used to be filled only when the vocabulary was larger than
# MAX_FEATURES, so a smaller vocabulary was replaced by an empty dict.
trimd_word_counts = dict(
    Counter(word for text in X for word in text).most_common(MAX_FEATURES)
)
# Both names are kept bound to the same dict, as before.
word_counts = trimd_word_counts
print(word_counts)

{'be': 42, 'game': 33, 'i': 22, 'have': 13, 'not': 10, 'play': 9, 'get': 9, 'much': 8, 'do': 8, 'friend': 8}


In [None]:
# # CONVERT TO PANDAS
# d = {'Text':X, 'Voted up':y, 'Early access':z}
# df = pd.DataFrame(d)
# df