In [None]:
import csv
import random
import nltk
from nltk.tokenize import RegexpTokenizer

In [None]:
# Read the labeled reviews from the CSV into a list of rows, skipping the header.
# Later cells index each row as row[0] (review text) and row[1] (label).
# NOTE(review): assumes the first row of the file is a header, as the original
# comment on this cell stated - confirm against the data file.
with open('2000reviews.csv', newline='') as f:  # newline='' is what the csv module expects
    reader = csv.reader(f, delimiter=',')
    next(reader, None)  # drop the header; the default avoids StopIteration on an empty file
    # Skip blank lines (csv yields [] for them) so row[0]/row[1] lookups cannot fail later.
    data = [row for row in reader if row]
# No explicit f.close(): leaving the `with` block already closed the file.

In [None]:
# Shared tokenizer built from the pattern \w+ (runs of word characters).
# It is used both when preparing the labeled reviews and when featurizing new text.
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
# Split the reviews into a positive and a negative list.
# Each review is lowercased and tokenized, then stored as a (tokens, label) tuple.
# NOTE: stopwords are NOT removed here.

positive = []
negative = []

for row in data:
    # Lowercase before tokenizing so the training tokens match make_feature_set,
    # which lowercases new reviews before looking words up.
    tokens = tokenizer.tokenize(row[0].lower())
    if row[1] == 'pos':
        positive.append((tokens, 'pos'))
    else:
        # Any label other than 'pos' is treated as negative.
        negative.append((tokens, 'neg'))

In [None]:
# Combine both classes and shuffle them so the later slice-based train/test
# split does not depend on the input order. A dedicated, seeded generator makes
# the shuffle (and therefore the split and the reported accuracy) reproducible
# across runs without touching the global `random` state.
SHUFFLE_SEED = 42
labeled_reviews = positive + negative
random.Random(SHUFFLE_SEED).shuffle(labeled_reviews)
labeled_reviews

In [None]:
# Flatten the token lists of every labeled review into one list of words.
all_words = [token for entry in labeled_reviews for token in entry[0]]
all_words

In [None]:
# Count how often each word occurs and keep the most frequent ones as the
# feature vocabulary.
# FreqDist is a Counter subclass, so keys() comes back in first-seen order, not
# by frequency; most_common() is what actually ranks words by count.
NUM_WORD_FEATURES = 3000
all_words = nltk.FreqDist(all_words)
word_features = [word for word, _count in all_words.most_common(NUM_WORD_FEATURES)]
word_features

In [None]:
def find_features(document, vocabulary=None):
    """Build a word-presence feature dict for one review.

    Args:
        document: The review, in one of three accepted forms:
            * an iterable of tokens (how this notebook calls it),
            * a ``(tokens, label)`` pair, in which case only the tokens are used,
            * a raw string, which is lowercased and split on ``\\w+`` the same
              way make_feature_set prepares new text.
        vocabulary: Words to test for. Defaults to the module-level
            ``word_features`` list.

    Returns:
        dict mapping every vocabulary word to True if it occurs in the review,
        else False. An empty review yields all-False features.
    """
    import re  # local import: only needed for the raw-string fallback

    if vocabulary is None:
        vocabulary = word_features

    if isinstance(document, str):
        tokens = re.findall(r'\w+', document.lower())
    elif (isinstance(document, tuple) and len(document) == 2
            and isinstance(document[0], (list, tuple, set, frozenset))):
        # Legacy (tokens, label) pair: keep working for callers that pass it.
        tokens = document[0]
    else:
        tokens = document

    # Test whole-token membership against ALL tokens. Indexing document[0] on a
    # token list would compare against the first token only, as a substring.
    present = set(tokens)
    return {w: (w in present) for w in vocabulary}

# Pair each review's feature dict with its label; this list is what the
# classifier is later trained and evaluated on.
featuresets = [
    (find_features(tokens), label)
    for tokens, label in labeled_reviews
]

In [None]:
# Display the (features, label) pairs built in the previous cell.
featuresets

In [None]:
# Check what kind of container featuresets is.
type(featuresets)

In [None]:
# Hold out the last 30% of the shuffled feature sets for testing and train on
# the first 70%. Despite its name, training_percent is the index where the
# split happens (a count of items), not a percentage.
training_percent = int(.7 * len(featuresets))
training_set, testing_set = featuresets[:training_percent], featuresets[training_percent:]

In [None]:
# Train a Naive Bayes classifier on the training split, then report its
# accuracy on the held-out test split as a percentage with two decimals.
classifier = nltk.NaiveBayesClassifier.train(training_set)
accuracy_pct = round(nltk.classify.accuracy(classifier, testing_set) * 100, 2)
print("Accuracy: ", accuracy_pct, "%")

In [None]:
# Most informative features
# Print the top 100 features as ranked by the trained classifier.
classifier.show_most_informative_features(100)

In [None]:
# Keep the top-100 most informative features in a variable
# (the previous cell only printed them).
features = classifier.most_informative_features(100)

In [None]:
# Display the stored most-informative features.
features

In [None]:
# Ask the classifier for its class probabilities on the first feature set
# (the feature dict only, without its label).
sample_features = featuresets[0][0]
probability = classifier.prob_classify(sample_features)
probability

In [None]:
def make_feature_set(review):
    """Turn a raw review string into a feature dict for the classifier.

    The text is lowercased and split with the module-level ``tokenizer``. The
    result maps every word in ``word_features`` to True when that word occurs
    in the review and False otherwise.

    Args:
        review: Review text as a single string.

    Returns:
        dict mapping each word in ``word_features`` to a bool.
    """
    # A set gives constant-time membership tests; scanning the token list once
    # per vocabulary word was needlessly quadratic.
    review_words = set(tokenizer.tokenize(review.lower()))
    return {word: (word in review_words) for word in word_features}

In [None]:
# Try the classifier on a hand-written, clearly negative review.
negative_review = "This is a terrible product.  I hate it.  Never buy it.  It is the worst flavor I have ever experienced"
negative_features = make_feature_set(negative_review)
classifier.classify(negative_features)

In [None]:
# Try the classifier on a hand-written, clearly positive review.
# Raw text must go through make_feature_set, which lowercases and tokenizes it;
# find_features expects tokens, not a raw string.
positive_review = "This is a wonderful product.  Absolutely delicious.  Excellent.  I love it!"
classifier.classify(make_feature_set(positive_review))

In [None]:
# Template cell: replace the placeholder text with the review to classify.
# make_feature_set lowercases and tokenizes the text itself; calling .lower()
# on the returned feature dict would raise AttributeError.
classifier.classify(make_feature_set("DATA GOES HERE"))