In [None]:
import numpy as np 
import pandas as pd 
import nltk

In [None]:
# Read the IMDB reviews CSV (path is relative to the working directory),
# report its (rows, columns) shape and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='IMDB Dataset.csv')
print(data.shape)
data.head(n=5)

### STEPS TO CLEAN THE REVIEWS :
 - Remove HTML tags
 - Remove special characters (punctuation) and convert to lowercase
 - Remove stopwords
 - Stemming with PorterStemmer
 - Convert words to vectors using the TF-IDF vectorizer
 - Target Encoding
 
### Train Model
 - Train-test-split
 - Modeling

In [None]:
## Remove html tags
import re

# Compiled once; matches the shortest "<...>" span on a single line.
_HTML_TAG_PATTERN = re.compile(r'<.*?>')


def remove_html_tag(text):
    """Replace every HTML tag in ``text`` with a single space.

    A space (rather than an empty string) is substituted so that the words on
    either side of a tag such as ``<br />`` are not fused into one token when
    the text is later split on whitespace.

    Parameters
    ----------
    text : str
        Raw text that may contain HTML markup.

    Returns
    -------
    str
        ``text`` with each ``<...>`` span replaced by one space. Surrounding
        whitespace is not collapsed.
    """
    return _HTML_TAG_PATTERN.sub(' ', text)

# Strip HTML markup from every review, element by element.
data['review'] = data['review'].map(remove_html_tag)

In [None]:
# Remove punctuations
import string
def remove_punctuation(sentence):
    """Drop ASCII punctuation from ``sentence`` and lowercase what remains.

    Every character found in ``string.punctuation`` is removed (not replaced),
    and each surviving character is lowercased individually.

    Parameters
    ----------
    sentence : str
        Text to clean.

    Returns
    -------
    str
        The lowercased text without punctuation characters.
    """
    kept_chars = filter(lambda char: char not in string.punctuation, sentence)
    return ''.join(map(str.lower, kept_chars))


# Remove punctuation and lowercase every review, element by element.
data['review'] = data['review'].map(remove_punctuation)

In [None]:
## Remove Stopwords
from nltk.corpus import stopwords
# Cache for the NLTK stop-word set so the corpus file is read only once,
# not once per review.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def remove_stopwords(sentence, stop_words=None):
    """Remove stop words from a whitespace-separated sentence.

    Parameters
    ----------
    sentence : str
        Text to filter; it is split on whitespace and matched case-sensitively.
    stop_words : collection of str, optional
        Words to drop. Defaults to NLTK's English stop-word list, which is
        loaded once and reused across calls.

    Returns
    -------
    str
        The remaining words joined by single spaces (empty string if none remain).
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    return ' '.join(word for word in sentence.split() if word not in stop_words)


# Drop stop words from every review, element by element.
data['review'] = data['review'].map(remove_stopwords)

In [None]:
## Stemming
from nltk import PorterStemmer 
ps = PorterStemmer()


def stem_text(sentence):
    """Stem each whitespace-separated word of ``sentence`` with the Porter stemmer.

    ``PorterStemmer.stem`` operates on a single word; passing a whole review
    would treat the entire string as one word, so the text is split first and
    the stemmed words are re-joined with single spaces.
    """
    return ' '.join(ps.stem(word) for word in sentence.split())


data['review'] = data['review'].apply(stem_text)

In [None]:
## tf-Idf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
# Learn the TF-IDF vocabulary from the cleaned reviews and build the
# document-term matrix.
tdidf = TfidfVectorizer()
review_tfidf = tdidf.fit_transform(data['review'])
# x is a dense array; use review_tfidf directly (skip .toarray()) to keep the
# sparse representation instead.
x = review_tfidf.toarray()

In [None]:
## Encoding Sentiment Column
# Map the text labels to integers and assign the result back to the frame.
# Calling replace(..., inplace=True) on a selected column mutates a temporary
# object under pandas Copy-on-Write and leaves `data` unchanged.
sentiment_codes = {'positive': 1, 'negative': 0}
# Values missing from the mapping are passed through untouched, so re-running
# this cell on already-encoded labels is harmless.
data['sentiment'] = data['sentiment'].map(lambda label: sentiment_codes.get(label, label))

y = data['sentiment']

In [None]:
## split data 
from sklearn.model_selection import train_test_split 
# Hold out 20% of the rows for testing. The fixed seed makes the split, and
# every metric computed from it, reproducible across kernel restarts.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

### Modeling:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score


# Three Naive Bayes variants to compare on the same TF-IDF features.
gnb = GaussianNB()
mnb = MultinomialNB(alpha=1.0, fit_prior=True)
bnb = BernoulliNB(alpha=1.0, fit_prior=True)

# Fit each model on the training split; the final fit call is left as the
# cell's last expression.
gnb.fit(x_train, y_train)
mnb.fit(x_train, y_train)
bnb.fit(x_train, y_train)

In [None]:
# Predict the held-out reviews with each fitted model.
ypg = gnb.predict(x_test)
ypm = mnb.predict(x_test)
ypb = bnb.predict(x_test)

# Score against y_test, the held-out labels produced by train_test_split.
print("Gaussian = ",accuracy_score(y_test,ypg))
print("Multinomial = ",accuracy_score(y_test,ypm))
print("Bernoulli = ",accuracy_score(y_test,ypb))

In [None]:
import pickle

# Serialize the fitted Bernoulli Naive Bayes model. The context manager closes
# the file handle even if pickling raises.
with open('model1.pkl', 'wb') as model_file:
    pickle.dump(bnb, model_file)


# Extras (future scope)

In [None]:
# Sample review text used to try the pipeline on unseen input.
rev =  """Terrible. Complete trash. Brainless tripe. Insulting to anyone who isn't an 8 year old fan boy. Im actually pretty disgusted that this movie is making the money it is - what does it say about the people who brainlessly hand over the hard earned cash to be 'entertained' in this fashion and then come here to leave a positive 8.8 review?? Oh yes, they are morons. Its the only sensible conclusion to draw. How anyone can rate this movie amongst the pantheon of great titles is beyond me.

So trying to find something constructive to say about this title is hard...I enjoyed Iron Man? Tony Stark is an inspirational character in his own movies but here he is a pale shadow of that...About the only 'hook' this movie had into me was wondering when and if Iron Man would knock Captain America out...Oh how I wished he had :( What were these other characters anyways? Useless, bickering idiots who really couldn't organise happy times in a brewery. The film was a chaotic mish mash of action elements and failed 'set pieces'...

I found the villain to be quite amusing.

And now I give up. This movie is not robbing any more of my time but I felt I ought to contribute to restoring the obvious fake rating and reviews this movie has been getting on IMDb."""
# Clean the review with the functions defined in the earlier cells of this
# notebook. The f1..f5 names are kept for the cells that follow.
f1 = remove_html_tag(rev)
f2 = remove_punctuation(f1)  # removes punctuation and lowercases in one pass
f3 = f2                      # lowercasing already happened in the step above
f4 = remove_stopwords(f3)
# PorterStemmer.stem works on one word at a time, so stem token by token.
f5 = ' '.join(ps.stem(token) for token in f4.split())

# Occurrence count of each token within the cleaned review, in token order.
# Punctuation is already gone, so a whitespace split is enough to tokenize.
words = f5.split()
bow = [words.count(word) for word in words]

# Persist the fitted vectorizer's term -> column-index mapping.
word_dict = tdidf.vocabulary_
with open('bow.pkl', 'wb') as vocab_file:
    pickle.dump(word_dict, vocab_file)

In [None]:
# Vectorize the cleaned review with the fitted TF-IDF vectorizer so the feature
# columns line up with the ones the classifier was trained on. transform()
# expects an iterable of documents, hence the one-element list.
inp = tdidf.transform([f5]).toarray()
y_pred = bnb.predict(inp)