In [1]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
def refine_word(words, stop_words=None):
    """Convert a sequence of tokens into a bag-of-words feature dict.

    Tokens that appear in the stopword collection are dropped; every remaining
    token becomes a key mapped to ``True``. Duplicate tokens collapse because
    dict keys are unique. Matching is exact, i.e. case-sensitive.

    Args:
        words: iterable of string tokens.
        stop_words: optional collection of tokens to discard. When omitted,
            NLTK's English stopword list is used.

    Returns:
        dict mapping each kept token to ``True``, in first-occurrence order.
    """
    if stop_words is None:
        stop_words = stopwords.words("english")
    # Load the stopwords once per call and use a set, so each membership test is
    # O(1) instead of re-reading and scanning the whole list for every token.
    stop_set = set(stop_words)
    return {word: True for word in words if word not in stop_set}

In [3]:
# Smoke test: the stopwords ('over', 'the') and the repeated 'jumps' should not survive.
sample_tokens = ['jumps', 'over', 'the', 'lazy', 'dog', 'jumps']
refine_word(sample_tokens)

{'dog': True, 'jumps': True, 'lazy': True}

In [4]:
# One (feature_dict, 'positive') pair per file in the corpus' 'pos' category.
pos = [
    (refine_word(movie_reviews.words(review_id)), 'positive')
    for review_id in movie_reviews.fileids('pos')
]


In [5]:
# Number of positive examples collected.
pos_count = len(pos)
print(pos_count)

1000


In [6]:
# One (feature_dict, 'negative') pair per file in the corpus' 'neg' category.
neg = [
    (refine_word(movie_reviews.words(review_id)), 'negative')
    for review_id in movie_reviews.fileids('neg')
]

In [7]:
# Number of negative examples collected.
neg_count = len(neg)
print(neg_count)

1000


In [8]:
# The first `train_cutoff` examples of each class train the model;
# everything after that index in each class is held out for testing.
train_cutoff = 750
training_set = neg[:train_cutoff] + pos[:train_cutoff]
test_set = neg[train_cutoff:] + pos[train_cutoff:]

In [9]:
# Sizes of the two splits: training first, test second.
n_train, n_test = len(training_set), len(test_set)
print(n_train, n_test)

1500 500


In [10]:
# Fit NLTK's Naive Bayes classifier on the labelled (feature_dict, label) pairs in training_set.
classifier = NaiveBayesClassifier.train(training_set)

In [11]:
# Score the trained classifier on the held-out test_set; the value is a
# fraction (it is scaled by 100 when printed as a percentage).
accuracy = nltk.classify.util.accuracy(classifier, test_set)

In [12]:
# Report the accuracy as a percentage rounded to three decimals.
accuracy_pct = round(accuracy * 100, 3)
print(accuracy_pct)

72.4


In [13]:
# Now let the classifier decide whether a new, hand-entered review is positive or negative.

In [14]:
# Hand-entered, clearly negative review of "The Emoji Movie", used as an
# input to probe the trained classifier. It is echoed so the reader can see the text.
emoji_review = '''
This may come as a shock to many of you, but The Emoji Movie is not good. In fact, it's really bad. Bad in the way that it doesn't even try much of the time. Bad in the way that it is intended to be a funny children's film with a message about championing individuality and being yourself, but even that tried and true formula falls flat.
Did I say it was supposed to be funny? It's not funny. It tries, it has obvious attempts at humor, but it's not funny. Worse, it has a talented and typically hilarious group of people providing the voices for much of these humanoid expressions that exist in a world that doesn't make much sense in the first place. 
Let's start over as this would be the initial issue that only leads to more of these problems that spawn from the fact this is a movie based on emoji's. It would probably be big of me to say that this movie isn't bad simply because it is a movie based on emoji's, but it is. It represents everything wrong with the studio system from the perspective of attempting a cash grab without any measure of creativity or thought put into the actual work. 

'''
print(emoji_review)


This may come as a shock to many of you, but The Emoji Movie is not good. In fact, it's really bad. Bad in the way that it doesn't even try much of the time. Bad in the way that it is intended to be a funny children's film with a message about championing individuality and being yourself, but even that tried and true formula falls flat.
Did I say it was supposed to be funny? It's not funny. It tries, it has obvious attempts at humor, but it's not funny. Worse, it has a talented and typically hilarious group of people providing the voices for much of these humanoid expressions that exist in a world that doesn't make much sense in the first place. 
Let's start over as this would be the initial issue that only leads to more of these problems that spawn from the fact this is a movie based on emoji's. It would probably be big of me to say that this movie isn't bad simply because it is a movie based on emoji's, but it is. It represents everything wrong with the studio system from the perspe

In [15]:
# Lowercase the tokens before building features: the movie_reviews corpus the
# classifier was trained on is lowercased, and so is NLTK's stopword list, so
# tokens such as "Bad" or "The" would otherwise match neither the learned
# features nor the stopwords.
tokens = [token.lower() for token in word_tokenize(emoji_review)]
features = refine_word(tokens)
classifier.classify(features)
# NOTE(review): re-run this cell; a previously saved output may predate the lowercasing step.
# Much of the algorithm's behaviour depends on its training data.
# In this case the data has a lot of swear words and is fairly small, but it is a good lesson to learn.
# For this particular case, the review had to be very, very bad for the algorithm to detect negative.

'positive'

In [16]:
# Second hand-entered review of "The Emoji Movie", worded more harshly than the
# first, used as another classifier input. It is echoed so the reader can see the text.
emoji_review2 = '''

The Emoji Movie is the worst movie I have ever seen in my hopefully short life. The funniest thing about this "jam packed comedy" was the fact that I was that nobody in the whole audience was laughing.
The plot was predictable I felt no emotion towards a single character. The puns were 💩 quite literally. I had to apologise to my distraught family after taking them to see this film. 
I knew it would be bad but I didn't think it would be quite this bad.I am still in trouble from my wife for forcing her to put up with all 90 minutes of this predictably boring film.
'''
print(emoji_review2)



The Emoji Movie is the worst movie I have ever seen in my hopefully short life. The funniest thing about this "jam packed comedy" was the fact that I was that nobody in the whole audience was laughing.
The plot was predictable I felt no emotion towards a single character. The puns were 💩 quite literally. I had to apologise to my distraught family after taking them to see this film. 
I knew it would be bad but I didn't think it would be quite this bad.I am still in trouble from my wife for forcing her to put up with all 90 minutes of this predictably boring film.



In [17]:
# Lowercase the tokens before building features: the movie_reviews corpus the
# classifier was trained on is lowercased, and so is NLTK's stopword list, so
# capitalised tokens would otherwise match neither learned features nor stopwords.
tokens = [token.lower() for token in word_tokenize(emoji_review2)]
features = refine_word(tokens)
classifier.classify(features)

'negative'

In [18]:
# A short, clearly positive review, used to confirm that the classifier can
# also predict 'positive' on hand-entered text.
getout = '''
More than just a standard-issue thriller, this brutal, smart movie is impeccably made, as well as surprising, shocking, and funny, while also offering a compassionate, thoughtful look at race.
'''
# Testing a positive review to confirm.

In [19]:
# Lowercase the tokens before building features: the movie_reviews corpus the
# classifier was trained on is lowercased, and so is NLTK's stopword list, so
# capitalised tokens would otherwise match neither learned features nor stopwords.
tokens = [token.lower() for token in word_tokenize(getout)]
features = refine_word(tokens)
classifier.classify(features)
# Also good to note: our data set was not extremely representative of IMDb reviews (hence the disparity).

'positive'