# Opinion lexicon

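Test: Load the opinion lexicon and print an overview (word counts and shortest/longest word lengths), then try word extraction and compare the runtimes of two extraction strategies.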

In [1]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from project root
import sys; sys.path.insert(0, '../')

from access.file_storage import FileStorage
from amore.opinion_lexicon import OpinionLexicon

from gensim.utils import simple_preprocess
from collections import Counter
import timeit
import os

In [2]:
# Resolve the location that FileStorage maps to 'opinion-words' once,
# then show the path and whether it exists on disk.
file_storage = FileStorage()
opinion_words_path = file_storage.get_filepath('opinion-words')
print(opinion_words_path)
print(os.path.exists(opinion_words_path))

/home/eml4u/EML4U/notebooks/BenchTest/data/opinion-words/opinion-lexicon-English.rar
True


In [3]:
# Build the lexicon from the file that FileStorage maps to 'opinion-words'
lexicon_path = file_storage.get_filepath('opinion-words')
opinion_lexicon = OpinionLexicon(lexicon_path)

In [4]:
# Count the positive words by consuming the iterable returned by
# get_positive_word(). sum() over a generator replaces the hand-rolled counter
# and does not leave a stray loop variable behind in the notebook namespace.
p = sum(1 for _word in opinion_lexicon.get_positive_word())
p

2006

In [5]:
# Count the negative words by consuming the iterable returned by
# get_negative_word(). sum() over a generator replaces the hand-rolled counter
# and does not leave a stray loop variable behind in the notebook namespace.
n = sum(1 for _word in opinion_lexicon.get_negative_word())
n

4783

In [6]:
# Fetch the positive-word collection in one call and report how many entries it has
pos_words = opinion_lexicon.get_positive_set()
pos_total = len(pos_words)
print(pos_total)

2006


In [7]:
# Fetch the negative-word collection in one call and report how many entries it has
neg_words = opinion_lexicon.get_negative_set()
neg_total = len(neg_words)
print(neg_total)

4783


In [8]:
# Print the extremum word lengths in the order:
# positive min, positive max, negative min, negative max.
for positive in (True, False):
    for maximum in (False, True):
        print(opinion_lexicon.get_extremum_length(maximum=maximum, positive=positive))

2
20
2
24


## Extraction

In [9]:
# Pass a small mixed sample through the positive-word extraction
sample_words = {'good', 'bad', 'great', 'boring'}
opinion_lexicon.extract_positive_words(sample_words)

['good', 'great']

In [10]:
# Pass a small mixed sample through the negative-word extraction
sample_words = {'good', 'bad', 'great', 'boring'}
opinion_lexicon.extract_negative_words(sample_words)

['bad', 'boring']

## Compare extraction runtimes

In [11]:
# Base sentence containing two positive and two negative opinion words
text = 'This is a very good text with good and great words. It it also boring. That is bad, very bad.'
print(text)

# Six inputs of growing size: the base text repeated 2, 4, 8, 16, 32 and 64 times
long_texts = [text * 2 ** exponent for exponent in range(1, 7)]
long_text = long_texts[-1]  # the largest input (64 repetitions)
print(len(text), len(long_text), len(long_texts))

This is a very good text with good and great words. It it also boring. That is bad, very bad.
93 5952 6


In [15]:
# Variant A: tokenise each text once, using a single length window wide enough
# for both polarities (overall shortest .. overall longest lexicon word) -> faster
min_len = min(
    opinion_lexicon.get_extremum_length(maximum=False, positive=True),
    opinion_lexicon.get_extremum_length(maximum=False, positive=False),
)
max_len = max(
    opinion_lexicon.get_extremum_length(maximum=True, positive=True),
    opinion_lexicon.get_extremum_length(maximum=True, positive=False),
)

started_at = timeit.default_timer()
for doc in long_texts:
    tokens = simple_preprocess(doc, min_len=min_len, max_len=max_len)
    counter = Counter(tokens)
    # Same counter feeds both extractions; negative counts are printed first
    for extract_counts in (opinion_lexicon.extract_negative_counts,
                           opinion_lexicon.extract_positive_counts):
        print(extract_counts(counter))
elapsed = timeit.default_timer() - started_at
print('Runtime:', elapsed)

[('boring', 2), ('bad', 4)]
[('good', 4), ('great', 2)]
[('boring', 4), ('bad', 8)]
[('good', 8), ('great', 4)]
[('boring', 8), ('bad', 16)]
[('good', 16), ('great', 8)]
[('boring', 16), ('bad', 32)]
[('good', 32), ('great', 16)]
[('boring', 32), ('bad', 64)]
[('good', 64), ('great', 32)]
[('boring', 64), ('bad', 128)]
[('good', 128), ('great', 64)]
Runtime: 0.004742762073874474


In [16]:
# Variant B: tokenise each text twice, once per polarity, each pass restricted
# to that polarity's own shortest/longest word length -> slower
minpos, maxpos = (
    opinion_lexicon.get_extremum_length(maximum=flag, positive=True)
    for flag in (False, True)
)
minneg, maxneg = (
    opinion_lexicon.get_extremum_length(maximum=flag, positive=False)
    for flag in (False, True)
)

started_at = timeit.default_timer()
for doc in long_texts:
    # Separate token counts per polarity; the negative pass runs first
    neg_tokens = simple_preprocess(doc, min_len=minneg, max_len=maxneg)
    counter_neg = Counter(neg_tokens)
    pos_tokens = simple_preprocess(doc, min_len=minpos, max_len=maxpos)
    counter_pos = Counter(pos_tokens)
    print(opinion_lexicon.extract_negative_counts(counter_neg))
    print(opinion_lexicon.extract_positive_counts(counter_pos))
elapsed = timeit.default_timer() - started_at
print('Runtime:', elapsed)

[('boring', 2), ('bad', 4)]
[('good', 4), ('great', 2)]
[('boring', 4), ('bad', 8)]
[('good', 8), ('great', 4)]
[('boring', 8), ('bad', 16)]
[('good', 16), ('great', 8)]
[('boring', 16), ('bad', 32)]
[('good', 32), ('great', 16)]
[('boring', 32), ('bad', 64)]
[('good', 64), ('great', 32)]
[('boring', 64), ('bad', 128)]
[('good', 128), ('great', 64)]
Runtime: 0.006688169902190566
