In [45]:
import pandas as pd
import numpy as np
import nltk # natural language toolkit (for text processing, classification, tokenization, stemming, tagging, parsing...)
from nltk.corpus import stopwords #contains a predefined set of common words (e.g., "and," "the," "is") 
from sklearn.feature_extraction.text import TfidfVectorizer # to convert a collection of raw text documents to a matrix of TF-IDF features
from sklearn.model_selection import train_test_split #for data splitting
from sklearn import naive_bayes # for Naive Bayes classification
from sklearn.metrics import accuracy_score # for evaluation metrics
import pickle # for serializing and deserializing Python objects

In [46]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to C:\Users\amulya
[nltk_data]     shetty\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [47]:
#The sep='\t' parameter specifies that the values are separated by tabs. The names=['Reviews', 'comments'] parameter assigns column names to the DataFrame.
dataset = pd.read_csv('../datasets/reviews.txt',sep = '\t', names =['Reviews','Comments'])

In [48]:
dataset

Unnamed: 0,Reviews,Comments
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...
...,...,...
6913,0,Brokeback Mountain was boring.
6914,0,So Brokeback Mountain was really depressing.
6915,0,"As I sit here, watching the MTV Movie Awards, ..."
6916,0,Ok brokeback mountain is such a horrible movie.


In [49]:
stopset = stopwords.words('english')
stopset

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [50]:
#lets initializes a TfidfVectorizer 
#using IDF (Inverse Document Frequency)
# converting text to lowercase
#stripping accents using ASCII encoding (e.g é -> e )
#excluding stop words defined in the stopset.
vectorizer = TfidfVectorizer(use_idf = True,lowercase = True, strip_accents='ascii',stop_words=stopset)


In [51]:
X = vectorizer.fit_transform(dataset['Comments']) #It both fits the vectorizer to the data and transforms the text data into a TF-IDF matrix X.
y = dataset['Reviews']
# Let's Serialize the vectorizer and saves it to a file named 'tranform.pkl' in binary write mode ('wb').
#This allows you to later load the vectorizer and use it 
pickle.dump(vectorizer, open('../transform.pkl','wb'))

In [52]:
#split the dataset into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [53]:
#define the model
clf = naive_bayes.MultinomialNB()
#fit the model
clf.fit(X_train,y_train)

In [54]:
accuracy_score(y_test,clf.predict(X_test))*100

97.47109826589595

In [55]:
clf = naive_bayes.MultinomialNB()
clf.fit(X,y)

In [56]:
accuracy_score(y_test,clf.predict(X_test))*100

98.77167630057804

In [57]:
filename = '../nlp_model.pkl'
pickle.dump(clf, open(filename, 'wb'))