In [4]:
import pandas as pd
from bs4 import BeautifulSoup
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [5]:
# Fetch the NLTK stop-word corpus that get_words reads via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91891\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Both files are tab-separated with a header row; quoting=3 (csv.QUOTE_NONE)
# keeps the double quotes that appear inside the review text as literal characters.
tsv_read_options = {'delimiter': '\t', 'quoting': 3, 'header': 0}

df = pd.read_csv("labeledTrainData.tsv", **tsv_read_options)
test = pd.read_csv("testData.tsv", **tsv_read_options)

In [7]:
# Preview the first rows of the labelled training frame.
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [8]:
# Compiled once; the pattern matches every character that is not an ASCII letter.
_NON_LETTERS = re.compile('[^a-zA-Z]')

# Lazily filled cache of the English stop-word list (see _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def get_words(text):
    """Clean one raw review and return it as a space-separated string.

    Steps, in order:
      1. strip HTML markup with BeautifulSoup (lxml parser),
      2. replace every non-letter character with a space,
      3. lower-case and split on whitespace,
      4. drop English stop words.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # removing the html tags
    text = BeautifulSoup(text, 'lxml').get_text()
    letters_only = _NON_LETTERS.sub(' ', text)
    # The stop-word set is cached: rebuilding it for every review is
    # loop-invariant work when this function is mapped over a whole corpus.
    stop_set = _english_stopwords()
    kept = [word for word in letters_only.lower().split() if word not in stop_set]
    return ' '.join(kept)

In [9]:
print(get_words(df['review'][0]))

stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate working

In [10]:
def text_to_tokens(a):
    """Lazily yield the cleaned text of every review in ``a``.

    Parameters
    ----------
    a : iterable of str
        The raw reviews, e.g. a pandas Series, a list or an array.

    Yields
    ------
    str
        ``get_words(review)`` for each review, in iteration order.
    """
    # Iterate over the values themselves instead of ``a[i]`` for i in
    # range(a.size): on a Series ``a[i]`` is a label lookup, which fails (or
    # picks the wrong row) once the index is not exactly 0..n-1, and plain
    # lists have no ``.size`` at all.
    for review in a:
        yield get_words(review)

In [11]:
# Bag-of-words settings: word-level analysis with no extra tokenizer,
# preprocessor or stop-word list (the text is cleaned beforehand); terms that
# occur in more than half of the documents are ignored and the vocabulary is
# capped at the 10000 most frequent terms.
count_vectorizer_options = {
    'analyzer': 'word',
    'tokenizer': None,
    'preprocessor': None,
    'stop_words': None,
    'max_df': 0.5,
    'max_features': 10000,
}
vectorizer = CountVectorizer(**count_vectorizer_options)

In [12]:
# Clean every review once; the cleaned lists are reused by later cells.
new_train = list(text_to_tokens(df['review']))
new_test = list(text_to_tokens(test['review']))

# Learn the vocabulary from the training reviews only ...
train_data_features = vectorizer.fit_transform(new_train)
# ... and merely apply it to the test reviews. Calling fit_transform here
# would refit the vectorizer on the test set, so the test columns would no
# longer correspond to the features the models are trained on.
test_data_features = vectorizer.transform(new_test)

In [13]:
# Hold out 20% of the labelled bag-of-words rows for validation (fixed seed).
Xtrain, Xtest, ytrain, ytest = train_test_split(
    train_data_features,
    df['sentiment'],
    test_size=0.20,
    random_state=36,
)

MODELING

Multinomial Naive Bayes

In [14]:
# Multinomial Naive Bayes with a very small smoothing term.
mnb = MultinomialNB(alpha=0.0001)
mnb.fit(Xtrain, ytrain)

# Keep only column 1 of predict_proba: the probability of the positive class.
positive_class_proba = mnb.predict_proba(Xtest)
cv_pred = positive_class_proba[:, 1]

In [15]:
# ROC AUC of the Naive Bayes probabilities (cv_pred) on the held-out split.
roc_auc_score(ytest,cv_pred)

0.9238410855634166

In [16]:
# Random forest of 300 Gini trees. random_state is fixed (same seed as the
# train/validation split) so the reported AUC is reproducible across runs;
# without it every execution grows a different forest.
rfc = RandomForestClassifier(n_estimators=300, criterion='gini', random_state=36)
rfc.fit(Xtrain, ytrain)

# Probability of the positive class for each held-out review.
cv_pred_rf = rfc.predict_proba(Xtest)[:, 1]
roc_auc_score(ytest, cv_pred_rf)

0.927588838119906

In [30]:
# TF-IDF weighting over the same cleaned reviews.
tfidf = TfidfVectorizer(analyzer = 'word')
# Vocabulary and IDF weights are learned from the training reviews only.
tfidf_df = tfidf.fit_transform(new_train)
# The test reviews are projected onto that fitted vocabulary; refitting on
# them (fit_transform) would yield columns and weights that do not match the
# features the models are trained on.
tfidf_test = tfidf.transform(new_test)

In [31]:
# Same 80/20 split and seed as before, now on the TF-IDF features.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    tfidf_df,
    df['sentiment'],
    test_size=0.20,
    random_state=36,
)

In [34]:
# Refit Multinomial Naive Bayes on the current (TF-IDF) training split.
mnb = MultinomialNB(alpha=0.0001)
mnb.fit(Xtrain, ytrain)

# Column 1 of predict_proba is the probability of the positive class.
positive_class_proba = mnb.predict_proba(Xtest)
cv_pred = positive_class_proba[:, 1]

In [35]:
# ROC AUC of the Naive Bayes probabilities (cv_pred) on the held-out TF-IDF split.
roc_auc_score(ytest,cv_pred)

0.8947946032235183

In [36]:
# Random forest of 300 Gini trees on the TF-IDF split. random_state is fixed
# (same seed as the train/validation split) so the reported AUC is
# reproducible; without it every execution grows a different forest.
rfc = RandomForestClassifier(n_estimators=300, criterion='gini', random_state=36)
rfc.fit(Xtrain, ytrain)

# Probability of the positive class for each held-out review.
cv_pred_rf = rfc.predict_proba(Xtest)[:, 1]
roc_auc_score(ytest, cv_pred_rf)

0.9332945485824604

In [41]:
# word_tokenize relies on the 'punkt' tokenizer models, so fetch them first.
nltk.download('punkt')

# Tokenize the first cleaned training review.
first_clean_review = new_train[0]
toks = nltk.word_tokenize(first_clean_review)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91891\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [46]:
# Display the repr of the bag-of-words training matrix (shape, dtype, stored elements).
train_data_features

<25000x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 2135992 stored elements in Compressed Sparse Row format>