In [1]:
from nltk.corpus.reader import reviews
import numpy as np
import matplotlib.pyplot as plt

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import re

In [2]:
tokenizer = RegexpTokenizer(r'\w+')
on_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

## Loading Data

In [3]:
import pandas as pd

In [4]:
train_df = pd.read_csv('Train.csv')
test_df = pd.read_csv("Test.csv")

In [5]:
train_df.head(n=20)

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos
5,Steve Carell comes into his own in his first s...,pos
6,I'm only going to write more because it's requ...,neg
7,"OK, it was a ""risky"" move to rent this flick, ...",neg
8,"Cannibalism, a pair of cinematic references to...",pos
9,This is one of the great modern kung fu films....,pos


In [6]:
X_train = train_df['review']
Y_train = train_df['label']

In [7]:
print(X_train.shape)
print(Y_train.shape)

(40000,)
(40000,)


In [8]:
def getCleanReview(review):
    
    review = review.lower()
    review = review.replace("<br /><br />","").replace("http","").replace("https","")
    
    #Tokenize 
    tokens = tokenizer.tokenize(review)
    new_tokens = [token for token in tokens if token not in on_stopwords]
    stemmed_tokens = [ps.stem(token) for token in new_tokens]
    
    cleaned_review = ' '.join(stemmed_tokens)
    return cleaned_review

In [9]:
X_train_cleaned = [getCleanReview(X) for X in X_train]
X_train_cleaned = np.array(X_train_cleaned)

In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
Y_train = le.fit_transform(Y_train)

In [11]:
print(f"corpus_len = {X_train_cleaned.shape} | class_len = {Y_train.shape}")

corpus_len = (40000,) | class_len = (40000,)


In [12]:
tokenizer = RegexpTokenizer('[a-zA-Z]+')
def myTokenizer(document):
    words = tokenizer.tokenize(document.lower())
    return words

## Vectorisation

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
cv = CountVectorizer(ngram_range=(1,2),tokenizer=myTokenizer)

In [15]:
vectorized_corpus = cv.fit_transform(X_train_cleaned,Y_train)

In [16]:
test_df.head()

Unnamed: 0,review
0,Remember those old kung fu movies we used to w...
1,This movie is another one on my List of Movies...
2,How in the world does a thing like this get in...
3,"""Queen of the Damned"" is one of the best vampi..."
4,The Caprica episode (S01E01) is well done as a...


In [18]:
test_corpus = test_df['review']
X_test_cleaned = [getCleanReview(X) for X in test_corpus]
X_test_cleaned = np.array(X_test_cleaned)

In [19]:
test_vectorized_corpus = cv.transform(X_test_cleaned)

In [20]:
test_vectorized_corpus.shape,vectorized_corpus.shape #should have same columns

((10000, 2248742), (40000, 2248742))

### 3.Multinomial Naive Bayes

In [21]:
from sklearn.naive_bayes import MultinomialNB

In [22]:
mnb = MultinomialNB()

In [23]:
mnb.fit(vectorized_corpus,Y_train)

MultinomialNB()

In [43]:
i = 0
results = []
for sentiment in test_vectorized_corpus:
    c = X_test_cleaned[i],mnb.predict(sentiment)
    results.append(c)
    i+=1
    if i==10:
        break

In [45]:
import pandas as pd

In [66]:
 pd.options.display.max_colwidth = 2000  

In [67]:
result_df = pd.DataFrame(results)

result_df.columns = ["review","prediction(0-negative,1-positve)"]

In [68]:
result_df

Unnamed: 0,review,"prediction(0-negative,1-positve)"
0,rememb old kung fu movi use watch friday saturday late night babysitt thought charg well movi play exactli like one movi patsi kensit biggest claim fame love interest mel gibson charact lethal weapon perform one reason never made big terribl actress lethal weapon thought cute cute enough check movi includ love music love danc anoth big let obvious impress either attract eye soul scream turn play anoth cheap predict role done badli movi kensit star comedienn good one either work club franc cut homeland make ear bleed luck even wors french govern want throw expir visa mayb caught act get marri casanova freiss luck predict begin terribl way give movi neg rate star rate,[0]
1,movi anoth one list movi bother saw year ago adolesc stay late annoy find romanc everyth els histori call bait switch movi one interest titl actual movi scam subject deserv good cinemat treatment movi almost insult serv actual member lafayett escadril run law product abus home realiti idealist want someth help franc suspect mani came upper class background tab hunter charact fli school smart aleck know all individu one portray last two day would either stockad infantri disciplin french armi often rather fierc short anoth hollywierd version histor episod deserv proper treatment,[0]
2,world thing like get dvd player home even get packag distribut absolut zero screen movi use term loos go put video store shelf anymor diy film make come entitl get group friend rel togeth crappi camcord aw stori put togeth creat heap pile crap call movi wish peopl would quit use word indi campi describ type movi either profess would someth like consid accept someon tri sell car bad movi take back say lemon surgic procedur su doctor malpractic wish could get time money back watch shame video store stock movi like rip public want campi go get friday th movi even later one dead aliv least make want kill movi like make peopl automat equat independ garbag,[0]
3,queen damn one best vampir movi ever seen movi suspens action gore combin fierc demand attitud queen rock mood star well act stuart townsend make wonder done movi combin creat alway one give advic friend famili member movi worthi rent ask queen damn worthi tell worthi buy movi sure must horror movi lover home,[1]
4,caprica episod e well done pilot realli episod exact content dvd pilot releas said episod give substant background popular battlestar galactica seri origin remak significantli appli seri trend plot sci fi late seri explor virtual life environ top given much background adama famili line well relationship unrev battlestar galactica seri creator cylon part first episod revolv around popular topic virtual life expect earli life coloni birth cylon rather enjoy episod although new materi anyon alreadi seen dvd releas caprica pilot seri seem promis case mani pilot episod leav us cliff hanger ensur follow audienc sci fi commun definit go keep watch resolv well develop give seri star,[1]
5,usual realli enjoy steven seagal movi usual highli entertain somewhat adept aikido usual like way steven incorpor martial art techniqu fight sequenc howev film realli bad movi make effort seem obviou blame lie director produc obvious idea make action movi let alon direct someon like steven seagal take advantag knowledg compet never saw end movi walk end simpli stand watch anymor bad movi sure mani peopl also share feel,[0]
6,ji trnka made last anim short indict totalitar caus troubl nativ czechoslovakia element symbol simpl trademark ornament almost absent allow viewer concentr fabl man room dedic potteri take care plant suddenli huge hand enter room order make statu man refus persecut omin glove hand day impress realiti factor seem eras anim tri replac real world refresh watch film make techniqu part enjoy,[1]
7,bad contribut next bad movi parti go clear start steve gutenburg take role serious principl walk line think got whiff much stank earli go motion paycheck sean bean act usual spare share space screen principl till final scene like actor walk onto high school stage defin contrast actor look good scruffi actor bulk part two statement appli polic academi steve scruffi look translat bum indig bulk make look potato lumpi buff pair one worst script dialogu hollywood histori bad movi guess principl realli realli need money remodel someth agent sign script written monkey must type would love know back stori disast,[0]
8,watch hilari retro entertain career girl tale floor joan crawford first appear could think initi god face michael jackson notori book photo minut movi dian baker hope lang get cab greenwich villag walk street see part sign back stonewal bar scene epoch riot consid trigger modern gay right movement speak baker taxi laugh get one tell cabbi th sutton place pleas care bump imagin reaction driver today manhattan say cours pregnant want hurt fetu stop jump move car find bob evan want abort well find way lose babi sue carson delight mari agn movi biograph inform imdb,[1]
9,excel polit thriller play much quieter slower higher rank film genr peopl talk pacino cusack manag skip amaz career top perform stori friendship father son relationship corrupt deceit two actor gel amazingli well togeth support aiello fonda equal impress although aiello brilliant especi paper run press instead focuss complex corrupt scandal creat wonder charact show human side failur polit briberi final scene main charact wonder written act,[1]
