In [None]:
import nltk

# Fetch only the resources this notebook uses, non-interactively.
# A bare nltk.download() opens the interactive downloader prompt, which stalls
# "Restart & Run All" and (when answered with "all") pulls every NLTK package.
# NOTE(review): newer NLTK releases may additionally need "punkt_tab" and
# "averaged_perceptron_tagger_eng" for word_tokenize/pos_tag — confirm against
# the installed version.
for resource in ("movie_reviews", "stopwords", "punkt", "averaged_perceptron_tagger"):
    nltk.download(resource, quiet=True)

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all


    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Unzipping corpora/abc.zip.
       | Downloading package alpino to /root/nltk_data...
       |   Unzipping corpora/alpino.zip.
       | Downloading package averaged_perceptron_tagger to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger.zip.
       | Downloading package averaged_perceptron_tagger_ru to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger_ru.zip.
       | Downloading package basque_grammars to /root/nltk_data...
       |   Unzipping grammars/basque_grammars.zip.
       | Downloading package bcp47 to /root/nltk_data...
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Unzipping corpora/biocreative_ppi.zip.
       | Downloading package bllip_wsj_no_aux to /root/nltk_data...
       |   Unzipping models/bllip_wsj_no_aux.zip.
       | Downloading package book_grammars to


---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

### Download the corpus and explore the dataset

In [None]:
#Download the corpus
from nltk.corpus import movie_reviews

In [None]:
movie_reviews

<CategorizedPlaintextCorpusReader in '/root/nltk_data/corpora/movie_reviews'>

In [None]:
# Review categories
movie_reviews.categories()

['neg', 'pos']

In [None]:
# Total review categories
len(movie_reviews.categories())

2

In [None]:
# Total reviews
len(movie_reviews.fileids())

2000

In [None]:
# Total pos reviews
len(movie_reviews.fileids("pos"))

1000

In [None]:
# Total neg reviews (count the file ids in the "neg" category, not "pos")
len(movie_reviews.fileids("neg"))

1000

### Data preprocessing

In [None]:
#Get all words from the movie review corpus
all_words= [word.lower() for word in movie_reviews.words()]
print(all_words[:20])

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an']


#### Remove stop words and punctuation

In [None]:
#Remove stop words
from nltk.corpus import stopwords
stopwords_english = stopwords.words('english')

In [None]:
# Drop English stop words from the lower-cased corpus tokens.
# The stop-word list is converted to a set once so each membership test is a
# hash lookup instead of a scan of the whole list for every corpus token.
stopwords_lookup = set(stopwords_english)
all_words_exclude_stopwords = [word for word in all_words if word not in stopwords_lookup]
print(all_words_exclude_stopwords[:15])

['plot', ':', 'two', 'teen', 'couples', 'go', 'church', 'party', ',', 'drink', 'drive', '.', 'get', 'accident', '.']


In [None]:
#Remove punctuations
import string

In [None]:
# Drop tokens that consist solely of punctuation characters.
# The previous test, `word not in string.punctuation`, was a *substring* check
# against the punctuation string, so multi-character tokens such as '--' (which
# is not a contiguous substring of string.punctuation) were kept as "words".
punctuation_chars = set(string.punctuation)
all_words_exclude_stopwords_punctuations = [
    word
    for word in all_words_exclude_stopwords
    if not all(char in punctuation_chars for char in word)
]
print(all_words_exclude_stopwords_punctuations[:15])

['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get', 'accident', 'one', 'guys', 'dies', 'girlfriend']


#### Compute the frequency of each word in the corpus after removing noise

In [None]:
from nltk import FreqDist

In [None]:
# Build a frequency distribution over the cleaned token list and print the
# 100 most common (word, count) pairs
all_clean_words_freq= FreqDist(all_words_exclude_stopwords_punctuations)
print((all_clean_words_freq).most_common(100))

[('film', 9517), ('one', 5852), ('movie', 5771), ('like', 3690), ('even', 2565), ('good', 2411), ('time', 2411), ('story', 2169), ('would', 2109), ('much', 2049), ('character', 2020), ('also', 1967), ('get', 1949), ('two', 1911), ('well', 1906), ('characters', 1859), ('first', 1836), ('--', 1815), ('see', 1749), ('way', 1693), ('make', 1642), ('life', 1586), ('really', 1558), ('films', 1536), ('plot', 1513), ('little', 1501), ('people', 1455), ('could', 1427), ('scene', 1397), ('man', 1396), ('bad', 1395), ('never', 1374), ('best', 1333), ('new', 1292), ('scenes', 1274), ('many', 1268), ('director', 1237), ('know', 1217), ('movies', 1206), ('action', 1172), ('great', 1148), ('another', 1121), ('love', 1119), ('go', 1113), ('made', 1084), ('us', 1073), ('big', 1064), ('end', 1062), ('something', 1061), ('back', 1060), ('still', 1047), ('world', 1037), ('seems', 1033), ('work', 1020), ('makes', 992), ('however', 989), ('every', 947), ('though', 940), ('better', 922), ('real', 915), ('aud

### Create features

In [None]:
# Vocabulary used as features: the 3000 highest-frequency cleaned tokens,
# taken explicitly from most_common() rather than via FreqDist iteration order.
feature_vector = [token for token, _count in all_clean_words_freq.most_common(3000)]
print(feature_vector)



In [None]:
#create a function that will extract features from each file
def document_features(document, vocabulary=None):
    """Build a bag-of-words presence feature dict for one document.

    Args:
        document: Iterable of string tokens (e.g. the words of one review).
        vocabulary: Optional iterable of words to use as feature keys. When
            omitted, the notebook-level ``feature_vector`` is used, so existing
            single-argument calls behave as before.

    Returns:
        dict mapping each vocabulary word to True if it occurs in the
        document and False otherwise.
    """
    if vocabulary is None:
        # Resolved at call time so the function can be defined before/after
        # feature_vector is (re)computed.
        vocabulary = feature_vector
    # The default vocabulary is built from lower-cased corpus tokens, so tokens
    # are lower-cased here as well; the set removes duplicates and gives
    # constant-time membership tests.
    document_words = {token.lower() for token in document}
    return {word: (word in document_words) for word in vocabulary}

In [None]:
# get the first negative movie review file
movie_review_file = movie_reviews.fileids('neg')[0]
print (movie_review_file)
print (document_features(movie_reviews.words(movie_review_file)))

neg/cv000_29416.txt


### Create a list of all the text files with their words and categories


In [None]:
# One (token sequence, category label) pair per review file, grouped by
# category in the order the corpus reader reports them.
documents = [
    (movie_reviews.words(fileid), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

In [None]:
from random import Random, shuffle

# Shuffle with a fixed seed so the train/test split, and therefore the
# accuracies reported below, are reproducible on "Restart & Run All".
# A dedicated Random instance is used so the global random state is untouched.
SHUFFLE_SEED = 42
Random(SHUFFLE_SEED).shuffle(documents)

In [None]:
documents[0]

(['five', 'years', 'after', 'his', 'directorial', ...], 'neg')

In [None]:
len(documents)

2000

### Create feature set from all the documents by iteration over list of documents

In [None]:
feature_set = [(document_features(doc), category) for (doc, category) in documents]

In [None]:
feature_set[0]

({'film': True,
  'one': True,
  'movie': True,
  'like': True,
  'even': True,
  'good': False,
  'time': False,
  'story': True,
  'would': False,
  'much': True,
  'character': False,
  'also': True,
  'get': True,
  'two': True,
  'well': True,
  'characters': True,
  'first': False,
  '--': False,
  'see': True,
  'way': False,
  'make': False,
  'life': True,
  'really': False,
  'films': False,
  'plot': True,
  'little': False,
  'people': False,
  'could': False,
  'scene': False,
  'man': True,
  'bad': True,
  'never': True,
  'best': False,
  'new': False,
  'scenes': False,
  'many': False,
  'director': True,
  'know': False,
  'movies': False,
  'action': False,
  'great': True,
  'another': True,
  'love': False,
  'go': False,
  'made': False,
  'us': False,
  'big': True,
  'end': False,
  'something': True,
  'back': False,
  'still': True,
  'world': True,
  'seems': False,
  'work': True,
  'makes': False,
  'however': False,
  'every': False,
  'though': False,
  

In [None]:
len(feature_set)

2000

#### Create train and test sets

In [None]:
# 80/20 train/test split. The split point is derived from the number of
# available feature rows rather than hard-coded to 1600, so the test set is
# not silently empty when fewer documents are present. With the full
# 2000-review corpus this yields the same 1600/400 split as before.
TRAIN_PERCENT = 80
feature_subset = feature_set[:2000]
split_index = len(feature_subset) * TRAIN_PERCENT // 100
train_set = feature_subset[:split_index]
test_set = feature_subset[split_index:]

#### Train and test the NaiveBayesClassifier

In [None]:
from nltk import NaiveBayesClassifier

classifier_NB = NaiveBayesClassifier.train(train_set)

In [None]:
from nltk import classify

accuracy_NB = classify.accuracy(classifier_NB, test_set)
print (accuracy_NB)

0.8125


### Train and Test SVC

In [None]:
import nltk.classify
from sklearn.svm import LinearSVC
classifier_SVC = nltk.classify.SklearnClassifier(LinearSVC())
classifier_SVC.train(train_set)


<SklearnClassifier(LinearSVC())>

In [None]:
from nltk import classify
accuracy_SVC = classify.accuracy(classifier_SVC, test_set)
print (accuracy_SVC)

0.795


### Train and test DecisionTree Classifier

In [None]:
classifier_DT = nltk.classify.DecisionTreeClassifier.train(train_set)

In [None]:
from nltk import classify
accuracy_DT = classify.accuracy(classifier_DT, test_set)
print (accuracy_DT)

0.59


In [None]:
import pandas as pd

In [None]:
# Results table, starting with the model-name column
results = pd.DataFrame({"Models": ["NaiveBayes", "SVC", "DecisionTree"]})

In [None]:
results["Accuracy"]= [accuracy_NB, accuracy_SVC, accuracy_DT]

In [None]:
results.head()

Unnamed: 0,Models,Accuracy
0,NaiveBayes,0.8125
1,SVC,0.795
2,DecisionTree,0.59


### Conclusion

I used the 3000 most frequent words in the cleaned corpus as the feature vector to train the models. The train/test split is 80/20. Naive Bayes obtained the highest accuracy, as seen in the table above.

In [None]:
from nltk.tokenize import word_tokenize

# Tokenize a sample sentence and tag each token with its part of speech.
# Uses the imported word_tokenize directly (the import was previously unused
# because the call went through nltk.word_tokenize instead).
wordsList = word_tokenize("I am planning to visit Tamil Nadu to attend the Tamil Nadu Orange Festival")
tagged = nltk.pos_tag(wordsList)
print(tagged)

[('I', 'PRP'), ('am', 'VBP'), ('planning', 'VBG'), ('to', 'TO'), ('visit', 'VB'), ('Tamil', 'NNP'), ('Nadu', 'NNP'), ('to', 'TO'), ('attend', 'VB'), ('the', 'DT'), ('Tamil', 'NNP'), ('Nadu', 'NNP'), ('Orange', 'NNP'), ('Festival', 'NNP')]
