###### Watch the Naive_Bayes_Movie_Reviews videos (all parts) to understand this notebook

In [17]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB

In [2]:
# Load the labelled training reviews; "Train.csv" is resolved relative to the working directory.
df=pd.read_csv("Train.csv")
# Preview the first 10 rows.
df.head(10)

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos
5,Steve Carell comes into his own in his first s...,pos
6,I'm only going to write more because it's requ...,neg
7,"OK, it was a ""risky"" move to rent this flick, ...",neg
8,"Cannibalism, a pair of cinematic references to...",pos
9,This is one of the great modern kung fu films....,pos


In [3]:
# Convert the frame to a 2-D NumPy array, then split it by column position.
train=df.values
train_text=train[:,0]     # first column: raw review text
y_train=train[:,-1]       # last column: sentiment label ('pos' / 'neg' in the preview output)
# Sanity check: both arrays should have one entry per review.
train_text.shape,y_train.shape

((40000,), (40000,))

In [4]:
# Read the unlabelled test reviews and flatten the values to a 1-D array.
# NOTE(review): flatten() only yields one entry per review if Test.csv has a single column — confirm.
test_text=pd.read_csv("Test.csv").values.flatten()
test_text.shape

(10000,)

In [5]:
def clean(text):
    """Tokenization + Stopword Removal + Stemming.

    Each review is lower-cased, has its HTML ``<br />`` tags replaced by
    spaces, is split into purely alphabetic tokens, filtered against the NLTK
    English stopword list, Porter-stemmed, and re-joined into a single
    space-separated string.

    Parameters
    ----------
    text : iterable of str
        Raw review strings.

    Returns
    -------
    numpy.ndarray
        One cleaned string per input review, in input order.
    """

    sw=set(stopwords.words("english"))
    ps=PorterStemmer()
    # The tokenizer does not depend on the review, so build it once rather
    # than once per loop iteration.
    tokenizer=RegexpTokenizer(r"[a-zA-Z]+")         #select all words containing only alphabets

    cleaned_text=[]
    for rev in tqdm(text):

        rev=rev.lower()
        # Strip every <br /> tag, not only doubled ones: a lone "<br />" would
        # otherwise be tokenized into a spurious "br" feature. Doubled tags
        # become two spaces, which the tokenizer ignores.
        rev=rev.replace("<br />"," ")

        #Tokenization
        tokenized_rev=tokenizer.tokenize(rev)

        #Stopword removal and stemming in a single pass
        tokenized_rev_stemmed=[ps.stem(w) for w in tokenized_rev if w not in sw]

        #converting cleaned data in the form of a list back to string
        cleaned_text.append(" ".join(tokenized_rev_stemmed))

    return np.array(cleaned_text)

In [6]:
# Clean every training review (slow: the recorded run took about 3 minutes for 40,000 reviews).
train_text_cleaned=clean(train_text)
train_text_cleaned

100%|███████████████████████████████████████████████████████████████████████████| 40000/40000 [02:52<00:00, 232.13it/s]


array(['matur intellig highli charg melodrama unbelivebl film china wei wei stun perform catylast love triangl simpli stun oppurun see magnific film take',
       'http video googl com videoplay docid hl en distribut tri opt mass appeal want best possibl view rang forgo profit continu manual labor job gladli entertain work view texa tale pleas write like like alex like stuie texa texa tale write opinion rule',
       'titl opera director dario argento cast cristina masillach ian charleson urbano barberini daria nicolodi review argento movi seen suspiria one blew away style color spooki stori line next decid go opera told one best man think discov ultim one favorit horror director opera young opera singer get big break main star creepi modern opera take mc beth get hit car betti understudi get part bad psycho make watch brutal murder friend co worker wow id heard good thing flick prepar level great film would take yeah movi shortcom ill get later part movi blew away first movi fill lot 

In [7]:
# TF-IDF features over unigrams, bigrams and trigrams.
# Per the scikit-learn docs, float min_df / max_df are document-frequency proportions:
# terms in fewer than 0.1% or in more than 70% of the training reviews are dropped.
vec=TfidfVectorizer(ngram_range=(1,3),min_df=0.001,max_df=0.7)      #features will be of sizes 1 word,2 words and 3 words
# Learn the vocabulary and IDF weights from the training text and vectorize it.
x_train=vec.fit_transform(train_text_cleaned)
x_train

<40000x15103 sparse matrix of type '<class 'numpy.float64'>'
	with 4187360 stored elements in Compressed Sparse Row format>

In [8]:
# Apply the same cleaning to the test reviews so their tokens match the training vocabulary.
test_text_cleaned=clean(test_text)
test_text_cleaned

100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:32<00:00, 310.16it/s]


array(['rememb old kung fu movi use watch friday saturday late night babysitt thought charg well movi play exactli like one movi patsi kensit biggest claim fame love interest mel gibson charact lethal weapon perform one reason never made big terribl actress lethal weapon thought cute cute enough check movi includ love music love danc anoth big let obvious impress either attract eye soul scream turn play anoth cheap predict role done badli movi kensit star comedienn good one either work club franc cut homeland make ear bleed luck even wors french govern want throw expir visa mayb caught act get marri casanova freiss luck predict begin terribl way give movi neg rate star rate',
       'movi anoth one list movi bother saw year ago adolesc stay late annoy find romanc everyth els histori call bait switch movi one interest titl actual movi scam subject deserv good cinemat treatment movi almost insult serv actual member lafayett escadril run law product abus home realiti idealist want someth 

In [9]:
# transform() only (no re-fit): reuse the vectorizer fitted on the training text,
# so x_test has the same feature columns as x_train.
x_test=vec.transform(test_text_cleaned)
x_test

<10000x15103 sparse matrix of type '<class 'numpy.float64'>'
	with 1057675 stored elements in Compressed Sparse Row format>

In [10]:
# Multinomial Naive Bayes classifier with default hyper-parameters.
clf=MultinomialNB()
clf

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
# Fit the classifier on the TF-IDF training matrix and the string labels.
clf.fit(x_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
# Predict a label for every test review.
predictions=clf.predict(x_test)
predictions

array(['neg', 'pos', 'neg', ..., 'pos', 'pos', 'neg'], dtype='<U3')

In [13]:
# Build the submission table: one row per test review, with Id equal to the row position.
# A dedicated name is used instead of rebinding `df`, so the training DataFrame
# stays intact and the earlier cells remain safe to re-run.
submission=pd.DataFrame({"Id":np.arange(0,predictions.shape[0]),"label":predictions})
# index=False keeps the pandas row index out of the file.
submission.to_csv("submission.csv",index=False)

In [23]:
# IPython-only help lookup: shows the TfidfVectorizer docstring (not valid plain Python).
# NOTE(review): looks like a development leftover — consider removing before sharing.
TfidfVectorizer?