# Fetching the Data

In [1]:
import nltk
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
nltk.download('movie_reviews')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import string
from nltk.corpus import wordnet

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\akshay\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\akshay\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
movie_reviews.categories()                 #movie review categories: negative and positive

['neg', 'pos']

In [3]:
len(movie_reviews.fileids())               #2000 files, 1000 for neg and 1000 for pos

2000

In [4]:
movie_reviews.fileids("neg")               #this will give us negative file ids, as a list

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [5]:
movie_reviews.words(movie_reviews.fileids()[7])      #get the words of the 7th file in a list

['that', "'", 's', 'exactly', 'how', 'long', 'the', ...]

# Getting all the words in each file

In [6]:
#getting all the words from all the files:
docs=[]                                       #words will be stored in a tuple(words_list,category) for each file
for i in movie_reviews.categories():          #files are random, it is not so that first 1000 are new and next pos
    for fid in movie_reviews.fileids(i):
        docs.append((movie_reviews.words(fid),i))
docs[:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [7]:
#so that train_test_split is easy,we will shuffle this docs list so that neg and pos reviews are in random order
import random
random.shuffle(docs)
docs[0:5]

[(['the', 'obvious', 'reason', 'for', 'producing', 'a', ...], 'neg'),
 (['contact', '(', 'pg', ')', 'there', "'", 's', 'a', ...], 'pos'),
 (['saving', 'private', 'ryan', '(', 'dreamworks', ')', ...], 'pos'),
 (['george', 'little', '(', 'jonathan', 'lipnicki', ')', ...], 'pos'),
 (['john', 'sayles', "'", '"', 'men', 'with', 'guns', ...], 'pos')]

# Cleaning the Data

In [8]:
lmtz=WordNetLemmatizer()
stop=stopwords.words("english")
punc=list(string.punctuation)
stop+=punc
def getSimpleTag(tag):             
    if tag.startswith("J"):
        return wordnet.ADJ
    elif tag.startswith("N"):
        return wordnet.NOUN
    elif tag.startswith("V"):
        return wordnet.VERB
    elif tag.startswith("R"):
        return wordnet.ADV       
    else:
        return wordnet.NOUN
def clean(words):
    output_words=[]
    
    for w in words:
        if w.lower() not in stop:
            pos=pos_tag([w])                       #if we give only w, it will return pos tag for each character of the word 
            simple_pos=getSimpleTag(pos[0][1])     #pos tags are stored in a list in a tuple(word,pos_tag)
            lem_w=lmtz.lemmatize(w,simple_pos)
            output_words.append(lem_w.lower())
    return output_words

In [9]:
docs=[(clean(doc),category) for doc,category in docs]

In [10]:
print(docs[:1])

[(['obvious', 'reason', 'produce', 'sequel', 'immensely', 'popular', 'movie', 'acquire', 'continued', 'profit', 'rationale', 'sound', 'many', 'case', 'recent', 'year', 'sequel', 'shoddy', 'product', 'expose', 'cash', 'milk', 'vehicle', 'really', 'last', 'year', 'speed', '2', 'scream', '2', 'well', 'year', 'specie', 'ii', 'product', 'decisively', 'less', 'satisfactory', 'case', 'sequel', 'even', 'discredit', 'predecessor', 'late', 'neo', 'slasher', 'flick', 'still', 'know', 'last', 'summer', 'whatever', 'uniqueness', 'original', 'might', 'seem', 'trite', 'overplayed', 'pair', 'abominable', 'thriller', 'julie', 'james', 'jennifer', 'love', 'hewitt', 'ray', 'bronson', 'freddie', 'prinze', 'jr', 'back', 'original', 'star', 'still', 'know', 'reprise', 'role', 'typical', 'fashion', 'julie', 'ray', 'experience', 'rather', 'predictable', 'fall', 'begin', 'movie', 'leave', 'door', 'wide', 'open', 'newcomer', 'benson', 'matthew', 'settle', 'julie', 'hit', 'along', 'friend', 'karla', 'brandy', 'n

# Creating the Dataset (in format required by nltk naive bayes classifier)

In [11]:
trainData,testData=(docs[:1500],docs[:500])             #documents are already shuffled

In [12]:
allWords=[]
for doc in docs:
    allWords+=doc[0]                        #adding all the words in current document to allWords(with repetitions of course)
print(allWords[:10])                        #printing the first 10 words

['obvious', 'reason', 'produce', 'sequel', 'immensely', 'popular', 'movie', 'acquire', 'continued', 'profit']


In [13]:
#getting the frequency of each word in allWords:
freq=nltk.FreqDist(allWords)
freq                                        #u can use many inbuilt functions on this FreqDist object now

FreqDist({'film': 11198, 'movie': 6977, 'one': 6028, 'make': 4320, 'like': 3972, 'character': 3879, 'get': 3690, 'see': 3137, 'go': 3043, 'time': 2992, ...})

In [14]:
freq.most_common(15)                    #the 15 most frequent words, if u give no argument, then all words ordered by dec. freq

[('film', 11198),
 ('movie', 6977),
 ('one', 6028),
 ('make', 4320),
 ('like', 3972),
 ('character', 3879),
 ('get', 3690),
 ('see', 3137),
 ('go', 3043),
 ('time', 2992),
 ('well', 2843),
 ('scene', 2671),
 ('even', 2608),
 ('good', 2429),
 ('story', 2345)]

In [15]:
topwords=freq.most_common(3000)                           #selecting the top 3000 words, returns a list of tuples(words,freq)
features=[tup[0] for tup in topwords]                     #topwords has tuples, words are the 0th element of the tuple
print(features[:10])                                      #the first 10 features(as a list)     

['film', 'movie', 'one', 'make', 'like', 'character', 'get', 'see', 'go', 'time']


In [16]:
def getFeatureDict(words):
    ftrs={}
    wordSet=set(words)                                    #set of words in current document, for faster search
    for w in features:
        ftrs[w]=w in wordSet                              #if w in wordSet=>ftrs[w]:True, else False
    return ftrs
#we use boolean in this case because we only have 2 categories and it is likely that words occuring in one
#don't occur in the other, rather their opposites might occur in the other
#for a problem like we had done in text classification project, we will use numbers(counts(frequency))

In [17]:
trainData=[(getFeatureDict(doc_words),category) for doc_words,category in trainData]
testData=[(getFeatureDict(doc_words),category) for doc_words,category in testData]

In [18]:
trainData[0]                
#each entry in x_train nowis a tuple(dictionary,category_of_the document), 
#ict stores whether the words in features occur in this doc or not. Category
#can be treated as y_train(or y_test), dictionary is x_train

({'film': False,
  'movie': True,
  'one': True,
  'make': False,
  'like': True,
  'character': False,
  'get': True,
  'see': False,
  'go': True,
  'time': True,
  'well': True,
  'scene': False,
  'even': True,
  'good': False,
  'story': False,
  'take': False,
  'would': False,
  'much': True,
  'come': False,
  'also': False,
  'bad': True,
  'give': True,
  'life': False,
  'two': False,
  'look': False,
  'way': False,
  'know': True,
  'seem': True,
  'first': True,
  'end': True,
  '--': True,
  'year': True,
  'work': True,
  'thing': True,
  'plot': True,
  'say': False,
  'play': False,
  'really': True,
  'little': False,
  'show': True,
  'people': False,
  'could': True,
  'man': True,
  'star': True,
  'love': True,
  'never': False,
  'try': False,
  'great': False,
  'director': False,
  'best': False,
  'performance': True,
  'new': True,
  'big': False,
  'many': True,
  'action': False,
  'actor': False,
  'want': False,
  'u': True,
  'watch': True,
  'find': Fa

# Predicting using NLTK Naive Bayes Classifier

In [19]:
from nltk import NaiveBayesClassifier
clf=NaiveBayesClassifier.train(trainData) 

In [20]:
nltk.classify.accuracy(clf,testData)               #getting the accuracy score
#it is doing well even without any optimization 

0.872

In [21]:
clf.show_most_informative_features()                 #tells us what the most informative features are
                                                     #most Angelina Jolie movies have -ve reviews :D, Kevin Spacey are pos
                                                     #Matt Damon is pos, Jedi is positive, wrestling is negative :D

Most Informative Features
             outstanding = True              pos : neg    =     12.0 : 1.0
               stupidity = True              neg : pos    =      9.7 : 1.0
                flawless = True              pos : neg    =      8.7 : 1.0
               ludicrous = True              neg : pos    =      8.6 : 1.0
             wonderfully = True              pos : neg    =      8.2 : 1.0
                lifeless = True              neg : pos    =      8.0 : 1.0
              henstridge = True              neg : pos    =      7.6 : 1.0
                   jolie = True              neg : pos    =      7.6 : 1.0
                   badly = True              neg : pos    =      6.7 : 1.0
                 garbage = True              neg : pos    =      6.6 : 1.0


# Using SKLearn Classifiers:

In [22]:
#we know that sklearn classifiers require data in a different format than the one we have created
#either we can convert our format to that one, or let nltk do it:

In [23]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [24]:
svc=SVC()                         #svm classifier
clf=SklearnClassifier(svc)  
#this will convert our data, into the required numpy 2d array format. Now clf will classify acc. to svm
#u can obviusly use any classifer(naivebayes,logreg,knnrandomforest etc....)

In [25]:
clf.train(trainData)              #notice that we don't need to change format of data, it is changed implicitly
#caution: write train instead of fit



<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))>

In [26]:
nltk.classify.accuracy(clf,testData)

0.816

# Using Count Vectorizer to convert cleaned data in 2d array format for use in sklearn classifiers

In [27]:
#forget the last 3 sections, see the one where we cleaned the data
#lets look at what docs was like after cleaning:
docs[0]             #a tuple(list_of_words_in_doc,category)

(['obvious',
  'reason',
  'produce',
  'sequel',
  'immensely',
  'popular',
  'movie',
  'acquire',
  'continued',
  'profit',
  'rationale',
  'sound',
  'many',
  'case',
  'recent',
  'year',
  'sequel',
  'shoddy',
  'product',
  'expose',
  'cash',
  'milk',
  'vehicle',
  'really',
  'last',
  'year',
  'speed',
  '2',
  'scream',
  '2',
  'well',
  'year',
  'specie',
  'ii',
  'product',
  'decisively',
  'less',
  'satisfactory',
  'case',
  'sequel',
  'even',
  'discredit',
  'predecessor',
  'late',
  'neo',
  'slasher',
  'flick',
  'still',
  'know',
  'last',
  'summer',
  'whatever',
  'uniqueness',
  'original',
  'might',
  'seem',
  'trite',
  'overplayed',
  'pair',
  'abominable',
  'thriller',
  'julie',
  'james',
  'jennifer',
  'love',
  'hewitt',
  'ray',
  'bronson',
  'freddie',
  'prinze',
  'jr',
  'back',
  'original',
  'star',
  'still',
  'know',
  'reprise',
  'role',
  'typical',
  'fashion',
  'julie',
  'ray',
  'experience',
  'rather',
  'predi

In [28]:
y=[category for doc,category in docs]
print(y)

['neg', 'pos', 'pos', 'pos', 'pos', 'neg', 'neg', 'pos', 'neg', 'neg', 'pos', 'neg', 'pos', 'neg', 'pos', 'pos', 'neg', 'neg', 'neg', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'neg', 'pos', 'neg', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'neg', 'neg', 'neg', 'pos', 'pos', 'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'neg', 'pos', 'pos', 'pos', 'pos', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'neg', 'neg', 'pos', 'neg', 'neg', 'pos', 'pos', 'neg', 'pos', 'neg', 'neg', 'pos', 'pos', 'pos', 'neg', 'neg', 'neg', 'neg', 'pos', 'neg', 'neg', 'pos', 'pos', 'neg', 'neg', 'pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'neg', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'neg', 'neg', 'neg', 'pos', 'pos', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'pos'

In [29]:
#since text of each doc is in the form of a list of words in docs, we need to change it to a string for count vectorizer
#to work on it

In [30]:
#join example:
example=["red","hawk"]
" ".join(example)                   #converts the list given as argument to a string with delimiter space in between

'red hawk'

In [31]:
x=[" ".join(doc) for doc,category in docs]        #a list of strings,each string contains text(cleaned) of each document
x[0]                                              #text(cleaned) of the 0th document

'obvious reason produce sequel immensely popular movie acquire continued profit rationale sound many case recent year sequel shoddy product expose cash milk vehicle really last year speed 2 scream 2 well year specie ii product decisively less satisfactory case sequel even discredit predecessor late neo slasher flick still know last summer whatever uniqueness original might seem trite overplayed pair abominable thriller julie james jennifer love hewitt ray bronson freddie prinze jr back original star still know reprise role typical fashion julie ray experience rather predictable fall begin movie leave door wide open newcomer benson matthew settle julie hit along friend karla brandy norwood tyrell mekhi phifer foursome head radio station giveaway vacation tropic unfortunately thing peachy arrive regularity could set watch infamous fisherman muse watson back hook another bloody showdown complete cliffhanger end much original still know market saturate gen x thriller like one unlikely stil

In [32]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y)

In [33]:
#now we can use count vectorizer on x_train to get data in required 2d array format
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=3000,stop_words=stop,ngram_range=(1,3),max_df=0.8)
x_train=cv.fit_transform(x_train)

In [34]:
x_train.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [35]:
cv.get_feature_names()

['000',
 '10',
 '100',
 '12',
 '13',
 '15',
 '17',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '200',
 '2000',
 '2001',
 '30',
 '40',
 '50',
 '54',
 '60',
 '70',
 '80',
 '8mm',
 '90',
 '90 minute',
 'abandon',
 'ability',
 'able',
 'absolutely',
 'absurd',
 'abuse',
 'academy',
 'academy award',
 'accent',
 'accept',
 'accident',
 'accidentally',
 'accompany',
 'accomplish',
 'accord',
 'account',
 'accuse',
 'achieve',
 'achievement',
 'across',
 'act',
 'act like',
 'action',
 'action film',
 'action movie',
 'action scene',
 'action sequence',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adam',
 'adapt',
 'adaptation',
 'add',
 'addition',
 'address',
 'admire',
 'admit',
 'adult',
 'advance',
 'advantage',
 'adventure',
 'advice',
 'affair',
 'affect',
 'affection',
 'affleck',
 'aforementioned',
 'afraid',
 'african',
 'age',
 'agent',
 'ago',
 'agree',
 'agrees',
 'ahead',
 'aid',
 'aim',
 'air',
 'airplane',
 'al',
 'ala',
 'alan',
 'albeit',

In [36]:
x_test=cv.transform(x_test)

In [37]:
#Note that u don't need to convert x_train and x_test to dense to a classifier, sparse works as well
from sklearn.svm import SVC
clf=SVC()
clf.fit(x_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [38]:
clf.score(x_test,y_test)                #score imporves slighly on using ngram in this case(w/o ngram=0.77)
                                        #also try experiment with max_df and min_df

0.788

In [39]:
#See that when we used nltk for classification, the words had corresponding boolean value which denoted whether they occur 
#in a document or not. But CountVectorizer converts them to frequency, so the scores are different. 

In [40]:
#There are 2 important terms:
#TF and IDF: Term Frequency, Inverse Document Frequency
#TF: no. of times a word w occurs in a doc/total n. of words in that doc
#DF: no. of documents that contain a word w/total no. of documents
#IDF=1/DF
#Basically DF allows us to get rid of words that occur in all documents (eg. stop words)
#for example, in a classification porblem on emails, the word "email" will occur in many docs
#but we don't want it as a feature.
# => we only want words with low DF or high IDF
#so instead of storing the no. of times a word(which is a feature) occurs in a doc in our training data
#we store, TF*IDF values for that word for that doc. Higher this value, more significant is that word.
#Instead of CountVectorizers we can use           TfidfVectorizer()        to store words according to their
#TF*IDF values.
#Alternatively, Count vectorizer provides us with parameters max_df and min_df.
#eg. if max_df=0.8 => words occuring in more than 80% of the docs are not selected.