In [32]:
import csv
import json
import random
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
from nltk.stem.wordnet import WordNetLemmatizer

In [33]:
# Read the groceries CSV into a list of rows (each row is a list of strings),
# leaving the header row out.
# newline='' is what the csv module asks for when opening files, so that
# newlines embedded inside quoted fields are parsed correctly.
with open('groceries_trimmed.csv', newline='') as f:
    reader = csv.reader(f, delimiter=',')
    # Skip the header; the None default keeps an empty file from raising.
    next(reader, None)
    data = list(reader)
# No explicit f.close(): the with-block has already closed the file.

In [34]:
# English stopwords, kept in a set for the membership checks done while filtering.
stop_words = set(stopwords.words('english'))

# Tokenizer whose tokens are the matches of the \w+ pattern (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')

In [35]:
# Build two lists of labeled reviews.
# Each entry is a tuple (tokens, label): tokens is the normalized word list of
# one review, label is 'pos' when the first column equals '5' and 'neg' for
# every other value of that column.

positive = []
negative = []

regex = re.compile('[^a-zA-Z0-9]')
lemmatizer = WordNetLemmatizer()


def _normalize_review(text):
    """Return the filtered, lemmatized tokens of one review text.

    The text is lowercased and tokenized; every token then has all characters
    other than ASCII letters and digits stripped. Tokens of length <= 2 and
    stopwords are dropped, and the survivors are lemmatized.
    """
    kept = []
    for token in tokenizer.tokenize(text.lower()):
        # \w+ also matches underscores and non-ASCII word characters; strip them.
        token = regex.sub('', token)
        if len(token) > 2 and token not in stop_words:
            kept.append(lemmatizer.lemmatize(token))
    return kept


for row in data:
    # csv.reader yields short/empty lists for blank or malformed lines; skip
    # them instead of failing with IndexError on row[0] / row[1].
    if len(row) < 2:
        continue

    filtered = _normalize_review(row[1])
    # Reviews with no surviving tokens carry no signal, so they are left out.
    if not filtered:
        continue

    if row[0] == '5':
        positive.append((filtered, 'pos'))
    else:
        negative.append((filtered, 'neg'))

In [36]:
# There are many more positive than negative reviews, so build a balanced set:
# shuffle the positives, keep an equal number from each label, then shuffle the
# combined list so the two labels are interleaved.

# Seed the generator so a fresh Restart & Run All draws the same sample.
# (random.shuffle works in place, so re-running only this cell reshuffles an
# already-shuffled list and gives a different order.)
random.seed(42)
random.shuffle(positive)

# Cap both labels at the smaller count so the result stays balanced even if
# the negatives were ever to outnumber the positives.
balanced_size = min(len(positive), len(negative))
labeled_reviews = positive[:balanced_size] + negative[:balanced_size]
random.shuffle(labeled_reviews)

In [37]:
# Preview the first few labeled reviews. Echoing the bare name would render
# the entire list into the cell output.
labeled_reviews[:5]

[(['disgusting', 'taste', 'bitter', 'ironically'], 'neg'),
 (['disappointment',
   'research',
   'reading',
   'review',
   'thought',
   'knew',
   'getting',
   'joke',
   'ordered',
   'case',
   'office',
   'case',
   'loose',
   'coffee',
   'ground',
   'throughout',
   'package',
   'generic',
   'silver',
   'blue',
   'maxwell',
   'house',
   'pictured',
   'tasted',
   'extremely',
   'stale',
   'yuck',
   'filter',
   'pack',
   'knew',
   'brewed',
   'pot',
   'coffee',
   'fine',
   'weak',
   'others',
   'stated',
   'using',
   'coffee',
   'pot',
   'quality',
   'totally',
   'unacceptable',
   'poor',
   'packaging',
   'falsely',
   'individually',
   'packaged',
   'little',
   'hard',
   'open',
   'taste',
   'typical',
   'maxwell',
   'house'],
  'neg'),
 (['would', 'give', 'zero', 'star', 'option', 'horrible', 'taste'], 'neg'),
 (['bar', 'real', 'good'], 'pos'),
 (['family',
   'favorite',
   'ship',
   'quick',
   'buy',
   'coffee',
   'family',
   'cof

In [38]:
# Number of labeled reviews in the list (over 800,000 in the recorded run)
len(labeled_reviews)

809226

In [39]:
# Build a small balanced subset for export: SAMPLE_PER_CLASS reviews per label.
SAMPLE_PER_CLASS = 1000

# `positive` was shuffled in place by an earlier cell, so its leading slice is
# already a random draw. `negative` is still in file order, so draw from it at
# random rather than taking its first rows; random.sample leaves `negative`
# itself untouched. min() guards against a population smaller than the sample.
labeled_reviews = (
    positive[:SAMPLE_PER_CLASS]
    + random.sample(negative, min(SAMPLE_PER_CLASS, len(negative)))
)
random.shuffle(labeled_reviews)

In [40]:
# Collapse each token list back into one space-separated string, keeping its label.
labeled_reviews_joined = [
    (" ".join(tokens), label) for tokens, label in labeled_reviews
]

In [41]:
# Write the (text, label) pairs to disk, one CSV row per review, with no header row.
with open("2000Reviews.csv", 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(labeled_reviews_joined)