In [14]:
import nltk
import os
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re, string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#

In [15]:
nltk.download([
'punkt',
'wordnet',
'averaged_perceptron_tagger',
'stopwords',
'omw-1.4'
])

[nltk_data] Downloading package punkt to /home/ahmed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ahmed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ahmed/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/ahmed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ahmed/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [16]:
def get_tokens(list):#list of reviews
    tokens = []
    for review in list:
        tokens.append(word_tokenize(review))
    return tokens

In [17]:
def remove_stop_words(tokens):# list of tokens
    new_list = []
    stop_words = set(stopwords.words('english'))
    for w in tokens:
        if w not in stop_words:
            new_list.append(w)
    
    return new_list

In [18]:
def stemming(tokens): #list of tokens
    new_list = []
    stemmer = PorterStemmer()
    for word in tokens:
        new_list.append(stemmer.stem(word))
    return new_list


In [19]:

def remove_noise(tokens):

    cleaned_tokens = []

    for token, tag in pos_tag(tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() and token.isalpha():
            cleaned_tokens.append(token.lower())

    return cleaned_tokens

In [20]:
def join_tokens(tokens):
    return ' '.join(tokens)

In [21]:
# read data from files
positive_reviews = []
negative_reviews = []

for path in os.scandir('./data/neg'):
    if path.is_file():
        f = open(path,'r')
        positive_reviews.append(f.read().lower())

positive_list = list()
for path in os.scandir('./data/pos'):
    if path.is_file():
        f = open(path,'r')
        negative_reviews.append(f.read().lower())

In [22]:
def pre_processing(lst):# lst of reviews
    new_lst = []

    new_lst = get_tokens(lst)

    new_lst = [remove_noise(tokens) for tokens in new_lst]

    new_lst = [remove_stop_words(tokens) for tokens in new_lst]

    new_lst = [join_tokens(tokens) for tokens in new_lst]

    return new_lst


In [23]:

positive_tokens = pre_processing(positive_reviews)
negative_tokens = pre_processing(negative_reviews)


In [24]:
print(positive_tokens[0])

walt disney studio may finally meet match lush animation twentieth century fox anastasia judge late effort bluth studio visuals thing fox brag disney recent classic occasionally stretch credibility film pocahontas hunchback notre dame less extent hercules anastasia fox go far throw fact completely window may say kid movie well young kid beware may noticeably frighten visuals rasputin zombie whose body part continually fall disconcertingly real way consider warn nevertheless animation quite stunning time bluth use computer animation extensively throughout occasionally rival photographic quality yet scene material seem tv crowd lead wonder rush market combat disney plot anyone read history know concern attempt return anastasia royal family lose overthrow romanov anastasia much concern really happen plot go rent disney candleshoe see anastasia


In [25]:
X = pd.DataFrame()
X['reviews'] = positive_tokens + negative_tokens

zeros = np.zeros(len(positive_tokens),dtype=int) 
ones = np.ones(len(positive_tokens),dtype=int)

Y = pd.DataFrame()
Y['sentiment'] = np.append(zeros,ones)



In [27]:
from sklearn.naive_bayes import MultinomialNB

from sklearn import metrics

cv = CountVectorizer(stop_words='english',ngram_range = (1,1))
text_counts= cv.fit_transform(X['reviews'])

X_train, X_test, y_train, y_test = train_test_split(
    text_counts, Y['sentiment'], test_size=0.3,shuffle=True)

clf = MultinomialNB().fit(X_train, y_train)

predicted= clf.predict(X_test)
print(clf.score(X_test,y_test))
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))



0.81
MultinomialNB Accuracy: 0.81
