<a href="https://colab.research.google.com/gist/1UC1F3R616/615cd755356c6791f0500ba6342e9700/moviereviewclassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Steps
- Importing Libraries
- Data Pre-Processing
- Split the dataset into Train and Test set
- Train the Classifier
- Test the Accuracy

In [0]:
import nltk
import random # To shuffle the dataset
from nltk.corpus import movie_reviews # 1K negative, 1K positive Reviews

## Data Pre-Processing Steps
- Create the List of Tuples
- Shuffle the Documents
- Normalize the Dataset
- Convert word list into nltk frequency distribution
- Limit the words
- Find Features within the documents

In [0]:
  import nltk
  nltk.download('movie_reviews')

In [0]:
documents = []

for category in movie_reviews.categories():
  for fileid in movie_reviews.fileids(category):
    documents.append((list(movie_reviews.words(fileid)), category))

In [0]:
random.shuffle(documents)
print(documents[1])

In [0]:
# for x in movie_reviews.categories():
#   for y in movie_reviews.fileids(category):
#     print(y) # individual files

In [0]:
all_words = []
for word in movie_reviews.words():
  all_words.append(word.lower())

In [0]:
all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))
print(all_words['love'])

In [0]:
# limit the words
word_features = list(all_words.keys())[:3000]

In [0]:
def find_features(document):
  words = set(document)
  features = {}
  for w in word_features:
    features[w] = (w in words)
  return features

In [23]:
print(find_features(movie_reviews.words('neg/cv000_29416.txt')))



In [0]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [0]:
# Working for Model
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

In [0]:
classifer = nltk.NaiveBayesClassifier.train(training_set)

In [37]:
## Testing the Accuracy
print('Accuracy {}'.format(nltk.classify.accuracy(classifer, testing_set)*100))

classifier.show_most_informative_features(15)

Accuracy 71.0
Most Informative Features
                  stupid = True              neg : pos    =      7.1 : 1.0
                  police = True              neg : pos    =      7.1 : 1.0
                    hand = True              pos : neg    =      6.3 : 1.0
                   waste = True              neg : pos    =      6.2 : 1.0
                       b = True              neg : pos    =      5.4 : 1.0
                 follows = True              pos : neg    =      5.4 : 1.0
                   awful = True              neg : pos    =      5.2 : 1.0
                   haven = True              neg : pos    =      5.2 : 1.0
                     van = True              neg : pos    =      5.2 : 1.0
                  boring = True              neg : pos    =      4.9 : 1.0
                   worst = True              neg : pos    =      4.9 : 1.0
                    save = True              neg : pos    =      4.9 : 1.0
                  wouldn = True              neg : pos    = 