
# Hands-on Exercise 02 
# Text Classification using NaiveBayesClassifier using NLTK

Source: https://www.nltk.org/book/ch06.html

adapted by Raghava Mukkamala 


In [3]:
import nltk
from nltk.corpus import movie_reviews
import random
from prettytable import PrettyTable
import textwrap 



In [4]:
nltk.download('movie_reviews')

# if you get error then you can download movie reviews by using 
# nltk.download('movie_reviews') and then unpack the downloaded zip file.

print(movie_reviews.words('pos/cv957_8737.txt'))
movie_reviews.categories()

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Fredi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


['capsule', ':', 'the', 'best', 'place', 'to', 'start', ...]


['neg', 'pos']

## Loading and Transforming movie review documents

    Load the documents from ../nltk_data/corpora/movie_reviews and
    transform them in the following format.

    [
    ([ 'gotten', 'a', 'four', 'star', 'rating', 'out', 'of', 'me', '.'], 'pos'), 
    ([ 'free', 'tickets',  'definitely', 'worth', 'checking', 'out', '.'], 'pos')
    ]


In [5]:
print ('movie_reviews.categories(): ', movie_reviews.categories())

documents = [(list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)]


random.shuffle(documents)

print('number of documents: ', len(documents))

tab = PrettyTable(['Document Features', 'Category'])

tab.horizontal_char = '-'

for (doc, cat) in documents[0:5]:
    feats = textwrap.fill(','.join(doc[:50]), width=80)
    tab.add_row([ feats, cat])
    tab.add_row([ '\n', '\n'])
#     print(cat)

print(tab)


movie_reviews.categories():  ['neg', 'pos']
number of documents:  2000
+----------------------------------------------------------------------------------+----------+
|                                Document Features                                 | Category |
+----------------------------------------------------------------------------------+----------+
| for,better,or,worse,,,the,appearance,of,basic,instinct,in,the,movie,marketplace, |   neg    |
| gave,the,greenlight,to,a,whole,slew,of,overheated,,,oversexed,,,underwritten,thr |          |
| illers,.,apparently,not,having,joe,eszterhas,as,the,writer,doesn,',t,change,a,th |          |
|                       ing,,,since,body,of,evidence,wasn,'                        |          |
|                                                                                  |          |
|                                                                                  |          |
| *,*,the,following,review,contains,spoilers,*,*,",please,someone

## Generate a Frequency distribution of words 

    Load all words from all the documents from the movie reviews to use 
    most common words as features. 

In [6]:
print('total words from movie review corpus: ', len(movie_reviews.words()))

# load all the words in freq distribution
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

most_freq_words = all_words.most_common(2000)

print('most freq words: ', most_freq_words[100:110])

word_features = [word for (word, count) in most_freq_words]

print('word_features[:25]: ', word_features[:25])



total words from movie review corpus:  1583820
most freq words:  [('off', 1581), ('too', 1577), ('any', 1574), ('does', 1568), ('really', 1558), ('had', 1546), ('while', 1539), ('films', 1536), ('how', 1517), ('plot', 1513)]
word_features[:25]:  [',', 'the', '.', 'a', 'and', 'of', 'to', "'", 'is', 'in', 's', '"', 'it', 'that', '-', ')', '(', 'as', 'with', 'for', 'his', 'this', 'film', 'i', 'he']


## Converting documents into training set containing features

    Extarcting features from a document and transforming them feature sets.

In [21]:
def get_document_features(document):
    """
        This function will convert given document into a feature set.
        
    """
    document_words = set(document)
    
    features = {}
    
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)

    return features




# test code for the above function

words_doc = movie_reviews.words('pos/cv957_8737.txt')

feat_dict = get_document_features(words_doc)

feat_dict_25 = {k: feat_dict[k] for k in list(feat_dict.keys())[:25]}

print('transformed document features, printing the first 25 features \n\n', feat_dict_25)

# print(documents[1][1])


transformed document features, printing the first 25 features 

 {'contains(,)': True, 'contains(the)': True, 'contains(.)': True, 'contains(a)': True, 'contains(and)': True, 'contains(of)': True, 'contains(to)': True, "contains(')": True, 'contains(is)': True, 'contains(in)': True, 'contains(s)': True, 'contains(")': True, 'contains(it)': True, 'contains(that)': True, 'contains(-)': True, 'contains())': True, 'contains(()': True, 'contains(as)': True, 'contains(with)': True, 'contains(for)': True, 'contains(his)': True, 'contains(this)': True, 'contains(film)': False, 'contains(i)': False, 'contains(he)': True}


## Preparing training set and training Naive Bayes Classifier

In [24]:

featuresets = [(get_document_features(d), c) for (d,c) in documents]

print(len(featuresets))


train_set, test_set = featuresets[100:], featuresets[:100]
print("test", train_set)
print("type", type(train_set))

print(len(test_set))

classifier = nltk.NaiveBayesClassifier.train(train_set)


print('accuracy: ', nltk.classify.accuracy(classifier, test_set)) 

classifier.show_most_informative_features(20)


2000
test 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



accuracy:  0.85
Most Informative Features
   contains(outstanding) = True              pos : neg    =     13.3 : 1.0
         contains(mulan) = True              pos : neg    =      7.6 : 1.0
        contains(seagal) = True              neg : pos    =      7.5 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.8 : 1.0
         contains(damon) = True              pos : neg    =      5.8 : 1.0
         contains(flynt) = True              pos : neg    =      5.6 : 1.0
          contains(lame) = True              neg : pos    =      5.5 : 1.0
        contains(wasted) = True              neg : pos    =      5.2 : 1.0
         contains(awful) = True              neg : pos    =      5.1 : 1.0
        contains(poorly) = True              neg : pos    =      5.0 : 1.0
    contains(ridiculous) = True              neg : pos    =      5.0 : 1.0
         contains(waste) = True              neg : pos    =      4.9 : 1.0
         contains(worst) = True              neg : pos    

## Testing the classifier



In [15]:

sample_review = 'outstanding'

sample_review_doc_feats = get_document_features(sample_review.split())

# print('Sample review features: \n\n',sample_review_doc_feats)

print(sample_review, ':' , classifier.classify(sample_review_doc_feats))





outstanding : neg


## <font color='red'>Handson Exercise - 02:</font>

    Use Reuters Corpus from nltk and build a Naive Bayes classifier for the categories of Reuters Corpus.
    Please refer to https://www.nltk.org/book/ch02.html for an example on how to access Reuters Corpus. 
    Use some test documents to test the accuracy of the classifier.
    



In [15]:

# make sure that NLTK and reuters corpus is accessible
import nltk

# If you get an error saying that 'Resource reuters not found.' , 
# you can download using the following code.

#nltk.download('reuters')
#nltk.download('punkt')

from nltk.corpus import reuters

# Check how many fileids in the reuters corpus to see that we have access.
print(len(reuters.fileids()))


tab = PrettyTable(['fileid', 'Category'])

index = 0

print('printing the categories for first 10 docs!')

for id in reuters.fileids():
    index += 1
    cats = textwrap.fill(','.join(reuters.categories(id)), width=40)
    tab.add_row([id, cats])
    if index == 10:
        break
    
print(tab)





10788
printing the categories for first 10 docs!
+------------+----------------------------------------+
|   fileid   |                Category                |
+------------+----------------------------------------+
| test/14826 |                 trade                  |
| test/14828 |                 grain                  |
| test/14829 |             crude,nat-gas              |
| test/14832 | corn,grain,rice,rubber,sugar,tin,trade |
| test/14833 |            palm-oil,veg-oil            |
| test/14839 |                  ship                  |
| test/14840 | coffee,lumber,palm-oil,rubber,veg-oil  |
| test/14841 |              grain,wheat               |
| test/14842 |                  gold                  |
| test/14843 |                  acq                   |
+------------+----------------------------------------+


In [16]:
print(len(reuters.words(categories=['barley', 'corn'])))

reuters.fileids('barley')


64419


['test/15618',
 'test/15649',
 'test/15676',
 'test/15728',
 'test/15871',
 'test/15875',
 'test/15952',
 'test/17767',
 'test/17769',
 'test/18024',
 'test/18263',
 'test/18908',
 'test/19275',
 'test/19668',
 'training/10175',
 'training/1067',
 'training/11208',
 'training/11316',
 'training/11885',
 'training/12428',
 'training/13099',
 'training/13744',
 'training/13795',
 'training/13852',
 'training/13856',
 'training/1652',
 'training/1970',
 'training/2044',
 'training/2171',
 'training/2172',
 'training/2191',
 'training/2217',
 'training/2232',
 'training/3132',
 'training/3324',
 'training/395',
 'training/4280',
 'training/4296',
 'training/5',
 'training/501',
 'training/5467',
 'training/5610',
 'training/5640',
 'training/6626',
 'training/7205',
 'training/7579',
 'training/8213',
 'training/8257',
 'training/8759',
 'training/9865',
 'training/9958']

In [None]:
reuters.fileids?