# Building a Sentiment Dictionary

Data used to build a sentiment dictionary:

- 200000 samples from Amazon product data (Movies and TV category): http://jmcauley.ucsd.edu/data/amazon/ 

In [3]:
# import necessary libraries
from nltk import sent_tokenize, word_tokenize
import nltk
from nltk.corpus import stopwords
import pandas as pd
import multiprocessing
from gensim.models import Word2Vec
import json
from joblib.parallel import Parallel, delayed
import random
import pandas as pd

# download additional data for the nltk functions used
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/daniel/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/daniel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/daniel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
amazon_data = dict()
text_data = ""

# get the text of the first 200_000 reviews using all available cpus

with open("Movies_and_TV_5.json", "rt") as inf:
    text_data = \
        Parallel(n_jobs=multiprocessing.cpu_count())(delayed(\
            lambda x: (json.loads(x))['reviewText'])(line)\
            for line in inf.readlines()[:200000])

In [4]:
# a function that removes special characters from a text
# converts it to lowercase and tokenizes it
def tokenize(line):
    spec_chars = ".,?/!\'\"()[]{};:<>|_-=+“”"
    line.replace("&#34;", "")
    train_data = []
    
    for sentence in sent_tokenize(line):
        words = [word.strip().lower() \
                     for word in word_tokenize(sentence)\
                         if not (len(word) == 1 and word in spec_chars)]
        train_data.append(words)
        
    return train_data

In [5]:
# tokenize every review
train_data = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(tokenize)(line) for line in text_data)

In [6]:
# concatenate the resulting lists into one list
x = []
for l in train_data:
    x += l

In [7]:
# train a word2vec model on the amazon reviews
model = Word2Vec(x, min_count=1, size=100, window=5, workers=multiprocessing.cpu_count())

In [8]:
# define a few positive and negative words
positive = ["well", "good", "better", "best", "happy", "awesome", 
            "amazing", "great", "nice", "fantastic","funny", "happy",
            'creatively', 'intelligently','laughable']
negative = ["bad", "worse", "worst", "horrible", "poor", "poorly", 
            "weak", 'disappointed', "weakest", "unfunny"]

# define the accepted POS tags
accepted = ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']

In [9]:
# convert these lists to sets
positive = set(positive)
negative = set(negative)
temp_list = []

for _ in range(4):
    
    # find words that are similar to the defined ones
    # according to the word2vec model trained previously
    for w in positive:
        res = model.wv.most_similar(w, topn=10)
        res = [x[0] for x in res if not x[0] in negative and nltk.pos_tag([x[0]])[0][1] in accepted]
        temp_list += res
    
    # add these to the positive words set
    for x in temp_list:
        positive.add(x)
    
    temp_list = []
    
    # repeat this process with the negative words too
    for w in negative:
        res = model.wv.most_similar(w, topn=10)
        res = [x[0] for x in res if not x[0] in positive and nltk.pos_tag([x[0]])[0][1] in accepted]
        temp_list += res
    
    for x in temp_list:
        negative.add(x)
    temp_list = []

In [10]:
# print the positive word set
positive

{"'artistically",
 '1915-92',
 '1924-',
 'able',
 'accurately',
 'adequately',
 'admirable',
 'adorable',
 'adventurous',
 'amateurish',
 'amazing',
 'ambiguous',
 'anachronistic',
 'anxious',
 'appreciative',
 'artfully',
 'artistically',
 'atmospheric',
 'atypical',
 'aural',
 'authentic',
 'authentically',
 'awesome',
 'beatifully',
 'beautfully',
 'beautifully',
 'believable',
 'believably',
 'best',
 'best-known',
 'best-loved',
 'better',
 'bigger',
 'bitingly',
 'bizarrely',
 'breathtakingly',
 'brilliantly',
 'broadly',
 'buitifully',
 'capably',
 'carefully',
 'cautious',
 'charismatic',
 'charmingly',
 'choppy',
 'cleverly',
 'cliche-ish',
 'clumsily',
 'colorfully',
 'comedatic',
 'comical',
 'commendable',
 'compellingly',
 'convincingly',
 'craftily',
 'creatively',
 'credible',
 'credibly',
 'creditable',
 'dazzlingly',
 'deftly',
 'delicately',
 'deliciously',
 'delighted',
 'delightfully',
 'devilishly',
 'digitally',
 'dynamically',
 'earliest',
 'eclectic',
 'effectiv

In [58]:
# put the positive and negative words into a dictionary and 
# give them sentiment scores
dictionary = dict()
for word in positive:
    dictionary[word] = 1
    
for word in negative:
    dictionary[word] = -1

In [59]:
# save dictionary to a json file
with open("dict.json", "wt") as outf:
    json.dump(dictionary, outf)

In [60]:
len(dictionary.keys())

488

# Classification

Dataset used for classification:

- IMDB 50K dataset: https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [2]:
# import all necessary modules
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

In [3]:
# read data into a dataframe
imdb_data = pd.read_csv('IMDB Dataset.csv')
imdb_data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
# remove all "<br />" tags from the reviews
for idx, row in imdb_data.iterrows():
    txt = row.review.replace("<br />", " ")
    imdb_data.iloc[idx, 0] = txt

In [5]:
# split data into train and validation sets
X, y = imdb_data['review'], imdb_data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [6]:
# define a pipeline for the multinomial naive-bayes model
clf_model = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', MultinomialNB())
])

In [7]:
# convert labels to integers
y_train[y_train == "negative"] = -1
y_train[y_train == "positive"] = 1
y_test[y_test == "negative"] = -1
y_test[y_test == "positive"] = 1

y_train = y_train.astype('int').ravel()
y_test = y_test.astype('int').ravel()

In [8]:
# train and predict using MultinomialNB
clf_model.fit(X_train, y=y_train.ravel())
pred = clf_model.predict_proba(X_test)
predicted_bayes = np.array([-row[0] if np.max(row) == row[0] else row[1] for row in pred])
predicted_bayes

array([ 0.66084404,  0.77903229,  0.62037838, ..., -0.61963381,
       -0.65926726, -0.80904578])

In [9]:
discrete_pred = clf_model.predict(X_test)

In [10]:
# print evaluation metrics

def metrics(y_test, y_pred):
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print("Precision: ", precision_score(y_test, y_pred))
    print("Recall: ", recall_score(y_test, y_pred))
    print("F1 score: ", f1_score(y_test, y_pred))
    
metrics(y_test, discrete_pred)

Accuracy:  0.8611333333333333
Precision:  0.8859736207630123
Recall:  0.8300558065373372
F1 score:  0.8571036564450848


In [11]:
# load sentiment dictionary
with open("dict.json", "rt") as inf:
    sent_dictionary = json.load(inf)

In [12]:
# function that calculates sentiment scores for 
# each review using the sentiment dictionary
def analyse_dict(data):
    sentiments = []

    for idx, txt in data.items():
        # init sentiment for review
        sent = 0
        # iterate through sentences of review
        for sentence in sent_tokenize(txt):
            negative = 0
            # iterate through words of the sentence
            for word in word_tokenize(sentence):
                # process word
                w = word.lower().strip()
                # if word is in the list than change the polarity of 
                # the sentiment for the next 5 words
                if w in ['not', 'wasn\'t', 'isn\'t', 'weren\'t']:
                    negative = 5
                elif w in sent_dictionary.keys():
                    # else if word is in the dictionary
                    # then add sentiment score to the cumulative
                    # score
                    if negative != 0:
                        sent += -sent_dictionary[w]
                        negative -= 1
                    else:
                        sent += sent_dictionary[w]
        # append cumulative score of review to the list of sentiment scores
        sentiments.append(sent)
    
    return sentiments

In [13]:
# get sentiment scores for the train data
dictionary_train = analyse_dict(X_train)

In [14]:
# a function that finds the threshold value below which a 
# sentiment score is considered to be a negative sentiment
def find_threshold(X, y):
    # list of possible threshold values
    threshold = [i for i in range(10)]
    results = []

    # find threshold value that maximizes the accuracy score of 
    # the predictions based on the sentiment dictionary
    for t in threshold:
        pred = [-1 if x < t else 1 for x in X]
        pred = np.array(pred)
        results.append(accuracy_score(y, pred))
    m = max(results)
    return results.index(m)

In [15]:
threshold = find_threshold(dictionary_train, y_train)
threshold

2

In [16]:
# get predictions for the test dataset
dictionary_test = analyse_dict(X_test)
dictionary_predict = [-1 if x < threshold else 1 for x in dictionary_test]
dictionary_predict = np.array(dictionary_predict)

In [17]:
# combine predictions of the two methods presented above
predict_combined = 0.6 * predicted_bayes + 0.4 * dictionary_predict
predict_combined = np.array([1 if x >= 0 else -1 for x in predict_combined])

In [18]:
metrics(y_test, predict_combined)

Accuracy:  0.8270666666666666
Precision:  0.82109375
Recall:  0.8378952963061387
F1 score:  0.8294094436406682


In [19]:
predict_combined = 0.2 * predicted_bayes + 0.8 * dictionary_predict
predict_combined = np.array([1 if x >= 0 else -1 for x in predict_combined])
metrics(y_test, predict_combined)

Accuracy:  0.6484
Precision:  0.6241181657848325
Recall:  0.7523252723890513
F1 score:  0.6822508735992289
