In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize #NLTK Library has word_tokenize and sent_tokenize to easily break a stream of text into a list of words or sentences, respectively.
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm #naive_bayes classifier
from sklearn.metrics import accuracy_score #Use accuracy_score function to get the accuracy


In [2]:

import nltk
nltk.download('punkt') #punkt is an unsupervised trainable model, which means it can be trained on unlabeled data
nltk.download('averaged_perceptron_tagger') #download the model that contains the pre-trained English
nltk.download('stopwords') #Downloading stop words from NLTK
nltk.download("wordnet", "nltk_data/") #download WordNetLemmatizer from NLTK
nltk.data.path.append('nltk_data/')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to nltk_data/...


In [3]:

np.random.seed(500)
#This is used to reproduce the same result every time if the script is kept consistent otherwise each run will produce different results.
#The seed can be set to any number.







In [6]:
Corpus = pd.read_csv(r"corpus_small.csv",encoding='latin-1')

In [7]:

# Step - a : Remove blank rows if any.
Corpus['text'].dropna(inplace=True)



In [8]:
# Step - b : Change all the text to lower case.
# This is required as python interprets 'dog' and 'DOG' differently
Corpus['text'] = [entry.lower() for entry in Corpus['text']]


In [9]:
# Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
Corpus['text']= [word_tokenize(entry) for entry in Corpus['text']]


In [10]:
# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(Corpus['text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # "The final processed set of words" for each "iteration" will be stored in 'text_final'
    Corpus.loc[index,'text_final'] = str(Final_words)



In [11]:
print(Corpus) ##checking initial processing

                                                  text        label  \
0    [stuning, even, for, the, non-gamer, :, this, ...  __label__2    
1    [the, best, soundtrack, ever, to, anything, .,...  __label__2    
2    [amazing, !, :, this, soundtrack, is, my, favo...  __label__2    
3    [excellent, soundtrack, :, i, truly, like, thi...  __label__2    
4    [remember, ,, pull, your, jaw, off, the, floor...  __label__2    
..                                                 ...          ...   
457  [so, far, ,, the, worst, book, of, one, of, my...  __label__1    
458  [very, dissapointing, :, from, potters, field,...  __label__1    
459  [intelligently, written, ;, a, fast, and, susp...  __label__2    
460  [what, a, disappointment, !, :, i, expected, b...  __label__1    
461  [just, awful, :, this, was, possibly, the, wor...  __label__1    

                                            text_final  
0    ['stun', 'even', 'sound', 'track', 'beautiful'...  
1    ['best', 'soundtrack', 'ever

In [12]:

##Prepare Train and Test Data sets
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['label'],test_size=0.4)




In [13]:
##Encoding. Label encode the target variable — This is done to transform Categorical data of string type in the data set into numerical values which the model can understand.
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)



In [14]:
##Word Vectorization
#It is a general process of turning a collection of text documents into numerical feature vectors.Their are many methods to convert text data to vectors which the model can understand but by far the most popular method is called TF-IDF. This is an acronym than stands for “Term Frequency — Inverse Document” Frequency which are the components of the resulting scores assigned to each word.
#Term Frequency: This summarizes how often a given word appears within a document.
#Inverse Document Frequency: This down scales words that appear a lot across documents.

Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)



In [15]:
#see the vocabulary that it has learned from the corpus
print(Tfidf_vect.vocabulary_)





In [16]:
#print the vectorized data to see how it looks like
print(Train_X_Tfidf)

  (0, 4225)	0.13800891866228707
  (0, 4173)	0.13095397609639822
  (0, 4172)	0.15286862933086437
  (0, 3854)	0.20375287836568556
  (0, 3794)	0.08136036183488445
  (0, 3254)	0.15619972441427876
  (0, 3020)	0.15619972441427876
  (0, 2971)	0.24531805025515563
  (0, 2800)	0.20375287836568556
  (0, 2564)	0.2043150587269425
  (0, 2263)	0.08136036183488445
  (0, 2255)	0.20375287836568556
  (0, 2223)	0.19093362800721525
  (0, 2222)	0.07822748784932726
  (0, 2185)	0.21205196164087006
  (0, 2055)	0.19093362800721525
  (0, 1977)	0.20375287836568556
  (0, 1728)	0.15619972441427876
  (0, 1675)	0.20375287836568556
  (0, 1588)	0.15619972441427876
  (0, 1568)	0.27601783732457413
  (0, 1562)	0.14004937897239406
  (0, 1443)	0.15619972441427876
  (0, 1336)	0.20375287836568556
  (0, 688)	0.09897103825454616
  :	:
  (276, 2159)	0.13097008342092875
  (276, 2129)	0.08871059753861107
  (276, 2106)	0.10551078681658518
  (276, 1666)	0.08521816778720864
  (276, 1586)	0.11685638971238135
  (276, 1558)	0.0458257541

In [17]:

#Use the ML Algorithms to Predict the outcome
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  76.21621621621621
