In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer, sent_tokenize, wordpunct_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

In [2]:
train_data = pd.read_csv("../input/movie-review-data/Train.csv")
train_data

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos
...,...,...
39995,There are similarities between Ray Lawrence's ...,neg
39996,"For starters, I once met the director when he ...",neg
39997,"Much of ""Over Her Dead Body"" is so painfully u...",neg
39998,"""Lifeforce"" is a truly bizarre adaptation of t...",pos


In [3]:
X, y = train_data.iloc[:,0], train_data.iloc[:,1]
train_data.dtypes

review    object
label     object
dtype: object

In [4]:
txt = X[1]
txt
type(X)

pandas.core.series.Series

In [5]:
tokenizer = RegexpTokenizer('[a-zA-Z]+')
ps = PorterStemmer()
sw = set(stopwords.words('english'))


In [6]:
def return_uw(review):
    review = str(review)
    review = review.lower()
    review = review.replace("<br /><br />", " ")
    words = tokenizer.tokenize(review)
    uw = [ps.stem(w) for w in words if w not in sw]
    return ' '.join(uw)

In [7]:
X = X.apply(return_uw)

In [8]:
X[1]

'http video googl com videoplay docid hl en distribut tri opt mass appeal want best possibl view rang forgo profit continu manual labor job gladli entertain work view texa tale pleas write like like alex like stuie texa texa tale write opinion rule'

In [9]:
cv = CountVectorizer()
tfv = TfidfVectorizer()

In [10]:
X_cv = cv.fit_transform(X).todense()
X_tfv = tfv.fit_transform(X).todense()

In [14]:
print(len(cv.get_feature_names()))

62561


In [16]:
len(set(cv.get_feature_names()))

62561

In [17]:
print(X_cv.min(),X_tfv.min(),X_cv.max(),X_tfv.max())

0 0.0 37 0.95949952161334


In [18]:
model = MultinomialNB()
model2 = BernoulliNB()

In [21]:
X_cv[1]

<1x62561 sparse matrix of type '<class 'numpy.int64'>'
	with 36 stored elements in Compressed Sparse Row format>

In [19]:
model.fit(X,y)
model2.fit(X,y)

ValueError: could not convert string to float: 'matur intellig highli charg melodrama unbelivebl film china wei wei stun perform catylast love triangl simpli stun oppurun see magnific film take'

In [None]:
print(model.score(X,y))
print(model2.score(X,y))

In [None]:
test_data = pd.read_csv("..//..//..//Datasets//MovieReviews//Test//Test.csv")

In [None]:
test_data = test_data.iloc[:,0]

In [None]:
test_cleaned = test_data.apply(return_uw)

In [None]:
test_cleaned.shape

In [None]:
print(test_cleaned[3])
print(test_data.iloc[3])

In [None]:
test_cleaned = cv.transform(test_cleaned)

In [None]:
ypred = model.predict(test_cleaned)
ypred2 = model2.predict(test_cleaned)

In [None]:
output = pd.DataFrame(ypred2, columns=["label"])

In [None]:
output["Id"] = np.arange(len(output))

In [None]:
output.to_csv("output_cv_bnb.csv", columns=["Id", "label"], index=False)