### Create a NLP Pipeline to clean reviews data
- Load input file and read Reviews
- Tokenize
- Remove Stopwords
- Preform Stemming
- Write cleaned data to output file  

In [1]:
import numpy as np

### NLTK
- Tokenisation -> Document -> Sentences -> Words
- Stopward Removal -> Removing words like do, they, there etc..
- Stemming -> to convert different form of a word into a base word like : see seen seem := see
- Building Vocab -> Vocab is list of all words
- Vectorization -> creating a list of size vocab to store word's frequency from Vocab
- Classification -> classify into category

In [2]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [3]:
sample_text = """I loved this movie <br /><br /> since I was 7 and I saw it on the opening day. It was so touching and beautiful. I strongly recommend seeing for all. It's a movie to watch with your family by far.<br /><br />My MPAA rating: PG-13 for thematic elements, prolonged scenes of disastor, nudity/sexuality and some language."""

In [4]:
tokenizer = RegexpTokenizer(r'\w+')

In [5]:
en_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

In [6]:
def getCleanReview(review):
    review = review.lower()
    review = review.replace("<br /><br />"," ")
    # Tokenize
    tokens = tokenizer.tokenize(review)
    new_tokens = [token for token in tokens if token not in en_stopwords]
    stemmed_tokens = [ps.stem(token) for token in new_tokens]
    cleaned_review = ' '.join(stemmed_tokens)
    return cleaned_review

In [7]:
getCleanReview(sample_text)

'love movi sinc 7 saw open day touch beauti strongli recommend see movi watch famili far mpaa rate pg 13 themat element prolong scene disastor nuditi sexual languag'

In [8]:
# Function that accepts input file and return clean output file of movie reviews

In [9]:
def getStemmedDoc(inputFile, outputFile):
    out = open(outputFile,'w',encoding="utf8")
    with open(inputFile, encoding ="utf8") as f:
        reviews=f.readlines()
    for review in reviews:
        cleaned_review = getStemmedReview(review)
        print((cleaned_review), file = out)
    out.close()

In [10]:
import sys

In [11]:
# # # Read command line arguments
# # inputFile  = sys.argv[1]
# # outputFile = sys.argv[2]
# inputFile ="./imdb_toy_x.txt"
# outputFile = "./imdb_toy_clean.txt"
# getStemmedDoc(inputFile,outputFile)

## Multinomial Event Model

In [12]:
x = [ "This was awsome an awsome movie",
     "Great movie! I liked it a lot",
     "happy ending! Awsome acting by the hero",
     "loved it! truly great",
     "bad not upto the mark",
     "could have been better",
     "Surely a disappointing movie"
]

y = [1,1,1,1,0,0,0] # 1 := Positive review, 0 - Negative class

In [13]:
test_x = ["I was happy & happy and I loved the acting in the movie",
          "The movie I saw was great not good yes Surely a disappointing movie bad not upto the mark happy ending! could have been better Awsome acting by the hero This was awsome an awsome movie"]

**1. Cleaning**

In [14]:
x_clean = [getCleanReview(i) for i in x]
xt_clean = [getCleanReview(i) for i in test_x]

In [15]:
x_clean

['awsom awsom movi',
 'great movi like lot',
 'happi end awsom act hero',
 'love truli great',
 'bad upto mark',
 'could better',
 'sure disappoint movi']

In [16]:
xt_clean

['happi happi love act movi',
 'movi saw great good ye sure disappoint movi bad upto mark happi end could better awsom act hero awsom awsom movi']

**2. Vectorization**

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
cv = CountVectorizer()
x_vec = cv.fit_transform(x_clean).toarray()

In [19]:
x_vec, x_vec.shape
# 2 is because awsome was 2 times 

(array([[0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0],
        [1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1],
        [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0]],
       dtype=int64),
 (7, 18))

In [20]:
cv.get_feature_names(), len(cv.get_feature_names())

(['act',
  'awsom',
  'bad',
  'better',
  'could',
  'disappoint',
  'end',
  'great',
  'happi',
  'hero',
  'like',
  'lot',
  'love',
  'mark',
  'movi',
  'sure',
  'truli',
  'upto'],
 18)

In [21]:
 x_vec[0].shape

(18,)

**Vectorization On the Test Dataset**

In [22]:
xt_vec = cv.fit_transform(xt_clean).toarray()
print(xt_vec)

[[1 0 0 0 0 0 0 0 0 2 0 1 0 1 0 0 0 0]
 [1 3 1 1 1 1 1 1 1 1 1 0 1 3 1 1 1 1]]


In [23]:
# Number of features changed with test data
# cv.fit_transform should only be called on the training data
cv.get_feature_names()

['act',
 'awsom',
 'bad',
 'better',
 'could',
 'disappoint',
 'end',
 'good',
 'great',
 'happi',
 'hero',
 'love',
 'mark',
 'movi',
 'saw',
 'sure',
 'upto',
 'ye']

In [24]:
print(xt_vec.shape)

(2, 18)


### 3. Mutlinomial Naive Bayes
https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

In [25]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

In [26]:
mnb = MultinomialNB()
print(mnb)

MultinomialNB()


In [27]:
# Training 
mnb.fit(x_vec, y)

MultinomialNB()

In [28]:
# prediction

In [29]:
mnb.predict(xt_vec)

array([1, 0])

In [38]:
mnb.score(x_vec,y)

1.0

In [30]:
mnb.predict_proba(xt_vec)
# 0.23594005 Denotes probablity for class_0 
# 0.76405995 Denotes Probablity for class_1
# 0.60880296 Denotes probablity for class_0 
# 0.39119704 Denotes Probablity for class_1

array([[0.23594005, 0.76405995],
       [0.60880296, 0.39119704]])

### 4. Multivariate Bernoulli Event Model Naive Bayes
- Bernoulli NB works for Boolean/Binary Features
https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html

In [31]:
bnb = BernoulliNB()

In [32]:
print(bnb)

BernoulliNB()


In [33]:
bnb.fit(x_vec,y)

BernoulliNB()

In [34]:
bnb.predict_proba(xt_vec)

array([[0.29280947, 0.70719053],
       [0.59662359, 0.40337641]])

In [36]:
bnb.predict(xt_vec) # same as above multinomial

array([1, 0])

In [37]:
bnb.score(x_vec,y)

1.0