In [5]:
import csv
import json
import random
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
from nltk.stem.wordnet import WordNetLemmatizer

In [6]:
# read and store the file in a list, remove the header
data = []
with open('groceries_trimmed.csv') as f:
    reader = csv.reader(f, delimiter=',')
    for i in reader:
        data.append(i)
f.close()

data = data[1:]

In [7]:
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))

In [8]:
# Make two lists of positive and negative reviews
# Normalize to lowercase, tokenize and remove stopwords
# Append as a tuple with tags

positive = []
negative = []

regex = re.compile('[^a-zA-Z0-9]')
lemmatizer = WordNetLemmatizer()

for i in data:
    if i[0] == '5':
        filtered = []
        tokens = tokenizer.tokenize(i[1].lower())
        for j in tokens:
            j = regex.sub('', j)
            if len(j) > 2 and j not in stop_words:
                lemma = lemmatizer.lemmatize(j)
                filtered.append(lemma)

        if len(filtered) > 0:
            positive.append((filtered, 'pos'))
    else:
        filtered = []
        tokens = tokenizer.tokenize(i[1].lower())
        for j in tokens:
            j = regex.sub('', j)
            if len(j) > 2 and j not in stop_words:
                lemma = lemmatizer.lemmatize(j)
                filtered.append(lemma)
                
        if len(filtered) > 0:
            negative.append((filtered, 'neg'))

In [9]:
# There are many more positive than negative reviews, so shuffle for randomness
# make a huge list containing equal numbers of positive and negative labeled reviews
# shuffle the giant list again
random.shuffle(positive)
labeled_reviews = positive[:len(negative)] + negative
random.shuffle(labeled_reviews)

In [10]:
# Our list
labeled_reviews

[(['need',
   'use',
   'lot',
   'tea',
   'tea',
   'doesnt',
   'taste',
   'like',
   'hot',
   'water'],
  'neg'),
 (['box', 'super', 'small', 'check', 'dimension', 'well'], 'neg'),
 (['use',
   'able',
   'buy',
   'small',
   'jar',
   'local',
   'store',
   'anymore',
   'good',
   'chick',
   'grilled',
   'also',
   'excellent',
   'steak',
   'grilled'],
  'pos'),
 (['taste',
   'like',
   'chemical',
   'enjoyed',
   'variety',
   'barely',
   'drinkable',
   'product',
   'prime',
   'two',
   'day',
   'shipping',
   'either'],
  'neg'),
 (['dislike',
   'taste',
   'mix',
   'supposedly',
   'receive',
   'ounce',
   'canister',
   'purchasing',
   'year'],
  'neg'),
 (['rose', 'dead', 'time', 'got', 'rose', 'also', 'rose', 'purple', 'pink'],
  'neg'),
 (['great', 'flavor', 'give', 'energy', 'boost', 'need'], 'pos'),
 (['beware',
   'coffee',
   'tell',
   'kind',
   'coffee',
   'bean',
   'use',
   'info',
   'bag',
   'nothing',
   'website',
   'could',
   'find',
 

In [11]:
# Our list contains over 800,000 reviews
len(labeled_reviews)

809226

In [12]:
labeled_reviews = positive[:1000] + negative[:1000]
random.shuffle(labeled_reviews)

In [13]:
labeled_reviews_joined = []
for i in labeled_reviews:
    labeled_reviews_joined.append((" ".join(i[0]), i[1]))

In [14]:
with open("2000Reviews.csv", 'w', newline='') as f:
    writer = csv.writer(f)

    for review in labeled_reviews_joined:
        writer.writerow(review)

In [15]:
import sqlite3

conn = sqlite3.connect("review.db")
c = conn.cursor()

c.execute("""Create TABLE IF NOT EXISTS labeled_reviews('review', 'label')""")

def data_entry():
    for entry in labeled_reviews_joined:
        c.execute("INSERT INTO labeled_reviews('review', 'label') VALUES(?, ?)", (entry[0], entry[1]))
    conn.commit()

data_entry()

c.execute("SELECT * FROM labeled_reviews")
rows = c.fetchall()
for row in rows:
    print(row)

conn.close()

('Something', 'Another thing')
('Something else', 'Another thing else')
('Something', 'Another thing')
('Something else', 'Another thing else')
('drinking tea year variety fantastic paris time favorite tea complex mixture black tea fruity vanilla caramel flavor along hint lemony bergamot aroma die ive brewed variety extra strong morning cup tea least week past year 1lb bag great deal provided take minor storage precaution ensure leaf retain aroma freshness highly recommend taking small package amount daily use tightly sealing rest package storing cool dry place keep daily use tea little glass jar within cabinet air tight dark container', 'pos')
('product misshapen chocolate flanking taste stale sat heat long time good', 'neg')
('received peter pepper red hot pepper seed mail month ago compliment understanding intricate nature plant need first furiously buried one warm moist hole imagine surprise started grow almost immediately plant like need water survive dunked one swimming pool near

('great taste everyone talking', 'neg')
('tasty', 'pos')
('excellent', 'pos')
('item part multi item order would nice know upon ordering rather order received item back order', 'neg')
('thing mist right', 'neg')
('dry fresh waste money like', 'neg')
('give thesis peacock love', 'pos')
('dear amazon someone messed order ordered wilton color set color unfortunately happen come color needed might forgive send missing blue color adress still love mpc', 'neg')
('great coffee everyday drinking cup every morning also choice guest like storger variety good well balanced coffee regular drinking', 'pos')
('buy stuff amazon time best ever', 'pos')
('wife bakes one product work baking well coffee cereal fresh berry home made dressing wherever substitute desired used product year baby boomer remember product originally made pillsbury available glass bottle white push button top pillsbury dropped line associated brand picked today difficult find super market opinion liquid granular product come clos

('disappointed pecan meltaways shop offering customer holiday soooooo good ordered tin dry certainly meltaway mouth although burnt definitely cooked many crumb tin well willing pay price one good order', 'neg')
('problem candy amazon picture christmas box instead get regular white box', 'neg')
('much spray come time', 'neg')
('allergy season usually mean large amount sudafed particularly like side affect sudafed tried ton different decongestant first thing sudafed actually work tried sinus migraine yet moderate congestion work wonder smell bit taste fine make sure seep min work give min kick seems last hr side affect tell glad rid racing heart side affect dry mouth sore throat come sudafed work great paired sinus rinse netty pot', 'pos')
('well ordered ice cream machine exchanged machine use ice salt rock return easy ice cream maker sooo sitting box floor', 'neg')
('love', 'pos')
('glad purchase serve special occasion hard gritty tasting even close melting mouth ick', 'neg')
('great mo

('tney putting sugar product', 'neg')
('waste money ordered make gold oreo cooky patrick day result horrible oreo became soggy tasted like hair spray ended throwing away cooky spray pinterest fail', 'neg')
('first stated order gum came count box made price good last shipment received amazon one case come mean doubled price gum cheaper better buy gum wal mart cause even driving buy pack gum still cheaper buying amazon get nothing else disappointing amazon', 'neg')
('buyer beware use stuff reenact mad max fury road cosplay stuff ethanol propellant damage teeth gum make really grow tumor', 'neg')
('fresh nothing added', 'pos')
('missing important color december christmas red came packed two pink disappointing find making christmas cooky returning', 'neg')
('delicious tea arrived quickly', 'pos')
('use hundred pod every month perhaps economical way drink equivalent several pot coffee day convenience ease making personal cup latte whipping splash milk bottom glass well worth cost also great

('bottle leaking sticky', 'neg')
('shipping day order math extra shipping alone crazy talking scharffen berger chocolate charge day order care chocolate buy get taken shipping cheap product', 'neg')
('everybody love coffee mate really smooth coffee wrong huge size good price', 'pos')
('delivered fine use right away stay good quite long time went use week later container still sealed top seal opened one side container split side gel oozed everywhere one awful sticky mess flat pantry shelf kitchen nothing sitting top never buy gel anywhere', 'neg')
('love mix use make easy one pot meal time time slice favorite cajun sausage coin saute browned add ingredient pan use deep sided chicken fryer usually find take bit water bit time instruction call done rice bean make yummy super simple meal', 'pos')
('take teabags make single cup tea steeping minute overpriced expensive tea even taste good fragrance whatsoever going post review every tazo awake listing amazon com find others taken tazo hype p

('taste horrible', 'neg')
('looked everwhere change daughter birthday cake frosting black frosting white product spray say give black color followed instruction perfectly gave grayish color sprayed bunch smell awful looked awful realizing going work stopped tried scrape entire cake absorbed flavor horrible cake ended garbage', 'neg')
('fantastic rice let price stop purcahsing product use table spoon vinegar cook', 'pos')
('like taste', 'neg')
('love cream cheese cracker', 'pos')
('might worst tea ever tasted honestly know anyone could give tea positive review save money', 'neg')
('smelled like paint spray throw away cooky', 'neg')
('excellent canned chili buy', 'pos')
('look pretty horrible taste smell', 'neg')
('favorite pretzel friend gluten free love well always trying eat', 'pos')
('much sodium', 'neg')
('coconut oil good price quality product pleased use cooking baking condition hair use skin', 'pos')
('great product buy', 'pos')
('got stave alzheimer hope case really yummy made c