In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer, sent_tokenize, wordpunct_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

In [2]:
# Read the labelled training reviews from Train.csv (path is relative to the
# kernel's working directory).
train_data = pd.read_csv("Train.csv")
# Bare last expression: the frame is rendered as this cell's output.
train_data

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos
...,...,...
39995,There are similarities between Ray Lawrence's ...,neg
39996,"For starters, I once met the director when he ...",neg
39997,"Much of ""Over Her Dead Body"" is so painfully u...",neg
39998,"""Lifeforce"" is a truly bizarre adaptation of t...",pos


In [3]:
# Positional split of the training frame: first column -> model inputs,
# second column -> targets.
X = train_data.iloc[:, 0]
y = train_data.iloc[:, 1]
# Echo the per-column dtypes as the cell output.
train_data.dtypes

review    object
label     object
dtype: object

In [4]:
# Grab the entry labelled 1 from X as a sample review.
txt = X[1]
# NOTE(review): this is not the cell's final expression, so it presumably
# displays nothing; the recorded output shows only the type below — confirm.
txt
# Final expression of the cell: the container type of X.
type(X)

pandas.core.series.Series

In [5]:
# Tokenizer configured with the regex '[a-zA-Z]+' (runs of ASCII letters).
tokenizer = RegexpTokenizer('[a-zA-Z]+')
# Porter stemmer instance shared by the cleaning function.
ps = PorterStemmer()
# NLTK's English stop-word list, held as a set so membership tests are hash lookups.
sw = set(stopwords.words('english'))


In [6]:
def return_uw(review):
    """Normalise one raw review into a space-separated string of stemmed tokens.

    Steps: cast to ``str`` and lower-case, turn HTML ``<br />`` line breaks
    into spaces, tokenise with the module-level ``tokenizer``, drop tokens
    found in the stop-word set ``sw``, and stem the remainder with ``ps``.

    Parameters
    ----------
    review : object
        Raw review text; non-string values are passed through ``str()`` first.

    Returns
    -------
    str
        The surviving stems joined by single spaces (``''`` if none survive).
    """
    review = str(review).lower()
    # Strip every "<br />", not only doubled ones: a lone tag would otherwise
    # reach the letters-only tokenizer and come out as the junk token "br".
    # Doubled tags now become two spaces, which tokenises the same as one.
    review = review.replace("<br />", " ")
    words = tokenizer.tokenize(review)
    # Stop words are filtered on the un-stemmed token, then the rest is stemmed.
    uw = [ps.stem(w) for w in words if w not in sw]
    return ' '.join(uw)

In [7]:
# Clean every training review with return_uw.  The input is re-read from
# train_data (the same first column X was originally taken from) rather than
# from X itself, so re-running this cell never re-stems already-cleaned text.
X = train_data.iloc[:, 0].apply(return_uw)

In [8]:
# Spot check: show the entry labelled 1 of X after the cleaning step.
X[1]

'http video googl com videoplay docid hl en distribut tri opt mass appeal want best possibl view rang forgo profit continu manual labor job gladli entertain work view texa tale pleas write like like alex like stuie texa texa tale write opinion rule'

In [55]:
# Count-based features over n-grams of length 1 through 5.
cv = CountVectorizer(ngram_range=(1,5))
# TF-IDF features with the vectorizer's default settings.
tfv = TfidfVectorizer()

In [None]:
# Fit the count vectorizer on X and encode X with it in one step.
X_cv = cv.fit_transform(X)

In [None]:
# Fit the TF-IDF vectorizer on X and encode X with it in one step.
X_tfv = tfv.fit_transform(X)

In [None]:
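# cv.get_feature_names_out()  # optional vocabulary peek, left disabled; get_feature_names() was removed in scikit-learn 1.2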

In [None]:
# Sanity check on value ranges. Printed in order: min of X_cv, min of X_tfv,
# max of X_cv, max of X_tfv.
print(X_cv.min(),X_tfv.min(),X_cv.max(),X_tfv.max())

In [None]:
# One untrained Bernoulli and one untrained Multinomial naive Bayes model per
# feature set (suffix _cv: count features, suffix _tfv: TF-IDF features).
Bnb_cv, Mnb_cv = BernoulliNB(), MultinomialNB()
Bnb_tfv, Mnb_tfv = BernoulliNB(), MultinomialNB()

In [None]:
# Train each classifier on its matching feature matrix against the same labels y.
Bnb_cv.fit(X_cv,y)
Mnb_cv.fit(X_cv,y)
Bnb_tfv.fit(X_tfv,y)
# Kept last on purpose: as the cell's final expression, its result is what gets echoed.
Mnb_tfv.fit(X_tfv,y)

In [None]:
# Print .score() for every model on the very matrix and labels it was fitted
# with, so these are training-set figures, not held-out performance.
for score_caption, fitted_nb, train_matrix in (
    ("Bernouli Naves bayes score using countvectorize :", Bnb_cv, X_cv),
    ("Multinomial Naves bayes score using countvectorize :", Mnb_cv, X_cv),
    ("Bernouli Naves bayes score using tfidf :", Bnb_tfv, X_tfv),
    ("Multinomial Naves bayes score using tfidf :", Mnb_tfv, X_tfv),
):
    print(score_caption, fitted_nb.score(train_matrix, y))

In [None]:
# Read the test reviews from Test.csv (path is relative to the kernel's working directory).
test_data = pd.read_csv("Test.csv")

In [None]:
# Keep only the first column of the test file, mirroring how X is taken from
# the training frame.  The isinstance guard makes the cell safe to re-run:
# once test_data is already a Series, a second `.iloc[:, 0]` would raise
# (a Series has no second axis) instead of being a no-op.
if isinstance(test_data, pd.DataFrame):
    test_data = test_data.iloc[:, 0]

In [None]:
# Run the same cleaning function used on the training reviews over every test review.
test_cleaned = test_data.apply(return_uw)

In [None]:
# Echo the shape of the cleaned test data as the cell output.
test_cleaned.shape

In [None]:
# Side-by-side check of one review: cleaned text first, raw text second,
# each on its own line.
print(test_cleaned[3], test_data.iloc[3], sep="\n")

In [None]:
# Encode the cleaned test reviews with the already-fitted vectorizers
# (transform only, so the training vocabulary is reused as-is).
test_cleaned_cv, test_cleaned_tfv = (
    cv.transform(test_cleaned),
    tfv.transform(test_cleaned),
)

In [None]:
# Predict test labels with every model, pairing each model with the feature
# matrix of the vectorizer it was trained on.
ypred_bnb_cv, ypred_mnb_cv = (
    nb.predict(test_cleaned_cv) for nb in (Bnb_cv, Mnb_cv)
)
ypred_bnb_tfv, ypred_mnb_tfv = (
    nb.predict(test_cleaned_tfv) for nb in (Bnb_tfv, Mnb_tfv)
)

In [None]:
# Wrap each prediction array in a single-column frame named "label".
output_bnb_cv, output_Mnb_cv, output_bnb_tfv, output_Mnb_tfv = (
    pd.DataFrame(preds, columns=["label"])
    for preds in (ypred_bnb_cv, ypred_mnb_cv, ypred_bnb_tfv, ypred_mnb_tfv)
)

In [None]:
# Add a 0-based running "Id" column to every output frame.  Each frame's Id
# range is sized from that same frame (the second line previously borrowed the
# length of output_bnb_cv, a copy-paste slip).
output_bnb_cv["Id"] = np.arange(len(output_bnb_cv))
output_Mnb_cv["Id"] = np.arange(len(output_Mnb_cv))
output_bnb_tfv["Id"] = np.arange(len(output_bnb_tfv))
output_Mnb_tfv["Id"] = np.arange(len(output_Mnb_tfv))

In [None]:
# Write every output frame to its own CSV with columns ordered Id, label and
# without the frame index.
for out_frame, out_path in (
    (output_bnb_cv, "output_bnb_cv.csv"),
    (output_Mnb_cv, "output_Mnb_cv.csv"),
    (output_bnb_tfv, "output_bnb_tfv.csv"),
    (output_Mnb_tfv, "output_Mnb_tfv.csv"),
):
    out_frame.to_csv(out_path, columns=["Id", "label"], index=False)