# Filtering: Opinion lexicon

Reads `original.pickle.bz2` and writes `opinion-filtered.pickle.bz2`.

Uses the opinion lexicon, applied to both the review summary and the review text.

Data format: `{year: {star: [(number, year, star), ...]}}` — a dictionary keyed by year, then by star rating, whose values are lists of review tuples.

In [1]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from project root
import sys; sys.path.insert(0, '../')

import timeit

from access.file_storage import FileStorage
from access.interim_storage import InterimStorage
from amore.printer import Printer
from readers.opinion_counts import OpinionCounts

# Positions of the fields inside a review tuple `(number, year, star)`.
KEY_NUMBER = 0
KEY_YEAR   = 1
KEY_STAR   = 2

In [2]:
# Shared helpers used by the cells below:
# `file_storage` resolves the path of the opinion-counts file,
# `printer` builds the (optional) per-year/per-star summary tables.
file_storage = FileStorage()
printer = Printer()

In [3]:
def count_ysl(ysl):
    """Return the total number of review tuples in a year/star dictionary.

    ysl -- nested mapping ``{year: {star: [tuple, ...]}}``.

    An empty mapping (or one whose years hold no star lists) yields 0.
    """
    return sum(
        len(reviews)
        for stars_of_year in ysl.values()
        for reviews in stars_of_year.values()
    )

In [4]:
def add_tuple(dict_, tup):
    """Append a review tuple to ``dict_[year][star]``, creating levels on demand.

    dict_ -- nested mapping ``{year: {star: [tuple, ...]}}``; modified in place.
    tup   -- review tuple; its year and star are read via KEY_YEAR / KEY_STAR.
    """
    # The year level is created first, then the star list inside it.
    stars_of_year = dict_.setdefault(tup[KEY_YEAR], {})
    stars_of_year.setdefault(tup[KEY_STAR], []).append(tup)

## Read data

In [5]:
# Read cache: load the unfiltered reviews from the 'original' interim storage
ys_lists = InterimStorage('original').read()
print('Reviews in ys_lists:', count_ysl(ys_lists))
# Reviews in ys_lists: 7,911,684

Reviews in ys_lists: 7911684


In [6]:
# Print table of review counts per year and star (unfiltered data).
# Both renderings are switched off; change `False` to `True` to enable one.
if False:
    # Display the table in the notebook
    printer.ipython_display(printer.get_dataframe_with_sums(ys_lists))
if False:
    # Print the table as markdown text
    print(printer.get_dataframe_markdown(printer.get_dataframe_with_sums(ys_lists), float_as_integer=True))
print('Reviews in ys_lists:', count_ysl(ys_lists))
# Reviews in ys_lists: 7,911,684

Reviews in ys_lists: 7911684


Original

|     |   1997 |   1998 |   1999 |   2000 |   2001 |   2002 |   2003 |   2004 |   2005 |   2006 |   2007 |   2008 |   2009 |   2010 |   2011 |   2012 |     Sum |
|:----|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|
| 1   |      6 |    191 |   4844 |  19944 |  24221 |  25311 |  25734 |  41016 |  54744 |  49049 |  49521 |  56076 |  59099 |  65343 |  72957 |  81276 |  629332 |
| 2   |      1 |    262 |   3631 |  17808 |  20320 |  22641 |  24183 |  33117 |  40868 |  37992 |  40205 |  40138 |  39680 |  41430 |  45767 |  47356 |  455399 |
| 3   |      8 |    442 |   6458 |  30907 |  35395 |  37798 |  43323 |  60489 |  71012 |  66128 |  75239 |  74057 |  73178 |  70279 |  72055 |  74826 |  791594 |
| 4   |     29 |    797 |  14178 |  73314 |  79152 |  84276 |  90527 | 119160 | 138000 | 135581 | 167632 | 161693 | 149771 | 142000 | 148457 | 150248 | 1654815 |
| 5   |     64 |   3313 |  49866 | 192002 | 189638 | 198712 | 205916 | 257603 | 308080 | 311252 | 452009 | 412870 | 422403 | 426248 | 465918 | 484650 | 4380544 |
| Sum |    108 |   5005 |  78977 | 333975 | 348726 | 368738 | 389683 | 511385 | 612704 | 600002 | 784606 | 744834 | 744131 | 745300 | 805154 | 838356 | 7911684 |

## Filter by opinion words

In [7]:
def print_examples(ysl, posstar_max=10, negstar_max=10):
    """Print sample reviews together with their opinion scores.

    For each printed review one line is emitted: star rating, the values of
    ``opinion_counts.get_existent`` and ``opinion_counts.get_occurences`` for
    the review number, and the review number itself.

    ysl         -- nested mapping ``{year: {star: [tuple, ...]}}``.
    posstar_max -- maximum number of 4- and 5-star reviews to print.
    negstar_max -- maximum number of 1- and 2-star reviews to print.

    3-star reviews are never printed. Reviews are taken in the iteration
    order of ``ysl``. The function reads the notebook-level ``opinion_counts``
    object, which therefore has to exist before the function is called.
    """
    print('Star   existent occurences   review-number')
    # Remaining quota per sentiment group.
    remaining = {'neg': negstar_max, 'pos': posstar_max}
    for stars_of_year in ysl.values():
        for star, reviews in stars_of_year.items():
            if star in [1, 2]:
                group = 'neg'
            elif star in [4, 5]:
                group = 'pos'
            else:
                # Neutral (or unknown) ratings are not part of the samples.
                continue
            for tup in reviews:
                if remaining[group] <= 0:
                    # Quota used up: no need to walk the rest of this list.
                    break
                number = tup[KEY_NUMBER]
                print(star, ' ',
                      opinion_counts.get_existent(number),
                      opinion_counts.get_occurences(number),
                      ' ', number)
                remaining[group] -= 1
            if remaining['neg'] <= 0 and remaining['pos'] <= 0:
                # Both quotas are exhausted; nothing more can be printed.
                return

In [8]:
# Opinion scores per review, backed by the file registered as 'AMORE-OpinionCounts'
opinion_counts = OpinionCounts(file_storage.get_filepath('AMORE-OpinionCounts'))

In [9]:
# Samples before filtering: the output still contains 5-star reviews with
# zero or negative scores and 1-star reviews with zero or positive scores.
print_examples(ys_lists, posstar_max=23, negstar_max=4)

Star   existent occurences   review-number
5   4 4   18
5   2 2   27
5   4 4   93
5   4 4   94
5   8 8   95
5   4 4   96
5   5 5   98
5   14 10   100
5   1 1   101
5   3 3   103
5   5 5   104
5   2 1   212
5   0 0   213
5   9 9   214
5   2 1   215
5   3 3   339
5   1 1   362
5   2 2   363
5   1 1   364
5   6 5   366
5   8 10   382
5   4 2   384
5   -11 -8   385
1   1 3   281
1   -2 -2   1130
1   9 7   1960
1   1 0   2275


In [10]:
# Keep only reviews whose opinion scores agree with their star rating:
#   1-2 stars: both scores negative
#   3 stars:   both scores within [-1, 1]
#   4-5 stars: both scores positive
# Reviews with any other rating are dropped.
# NOTE(review): assumes get_existent/get_occurences are side-effect-free
# lookups, so evaluating each of them once per review is equivalent — confirm.
time_begin = timeit.default_timer()
ys_opinion_lists = {}
for year, stars in ys_lists.items():
    for star, tuples in stars.items():
        for tup in tuples:
            number = tup[KEY_NUMBER]
            rating = tup[KEY_STAR]
            if rating in [1, 2]:
                keep = (opinion_counts.get_existent(number) < 0 and
                        opinion_counts.get_occurences(number) < 0)
            elif rating == 3:
                # Chained comparisons evaluate each lookup only once.
                keep = (-1 <= opinion_counts.get_existent(number) <= 1 and
                        -1 <= opinion_counts.get_occurences(number) <= 1)
            elif rating in [4, 5]:
                keep = (opinion_counts.get_existent(number) > 0 and
                        opinion_counts.get_occurences(number) > 0)
            else:
                keep = False
            if keep:
                add_tuple(ys_opinion_lists, tup)
print('Runtime:', timeit.default_timer() - time_begin)

Runtime: 19.94827593397349


In [11]:
# Samples after filtering: positive-star rows now show only positive scores,
# negative-star rows only negative scores.
print_examples(ys_opinion_lists, posstar_max=23, negstar_max=4)

Star   existent occurences   review-number
5   4 4   18
5   2 2   27
5   4 4   93
5   4 4   94
5   8 8   95
5   4 4   96
5   5 5   98
5   14 10   100
5   1 1   101
5   3 3   103
5   5 5   104
5   2 1   212
5   9 9   214
5   2 1   215
5   3 3   339
5   1 1   362
5   2 2   363
5   1 1   364
5   6 5   366
5   8 10   382
5   4 2   384
5   1 4   395
5   4 2   421
1   -2 -2   1130
1   -4 -2   3590
1   -1 -1   4368
1   -2 -2   5422


In [12]:
# Print table of review counts per year and star (opinion-filtered data).
# Both renderings are switched off; change `False` to `True` to enable one.
if False:
    # Display the table in the notebook
    printer.ipython_display(printer.get_dataframe_with_sums(ys_opinion_lists))
if False:
    # Print the table as markdown text
    print(printer.get_dataframe_markdown(printer.get_dataframe_with_sums(ys_opinion_lists), float_as_integer=True))
print('Reviews in ys_opinion_lists:', count_ysl(ys_opinion_lists))
# Reviews in ys_opinion_lists: 5,483,175

Reviews in ys_opinion_lists: 5483175


Filtered by opinion words

|     |   1997 |   1998 |   1999 |   2000 |   2001 |   2002 |   2003 |   2004 |   2005 |   2006 |   2007 |   2008 |   2009 |   2010 |   2011 |   2012 |     Sum |
|:----|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|
| 1   |      6 |    104 |   3110 |  12698 |  14751 |  15934 |  16611 |  26835 |  35093 |  29878 |  30496 |  33972 |  35420 |  39177 |  43552 |  48432 |  386069 |
| 2   |    nan |    122 |   1589 |   7820 |   8866 |  10127 |  10742 |  15439 |  19141 |  17152 |  17724 |  17271 |  16641 |  18523 |  20105 |  21036 |  202298 |
| 3   |    nan |    107 |   1535 |   6152 |   6376 |   6280 |   6917 |  10403 |  12668 |  11947 |  16364 |  16218 |  17691 |  17052 |  18243 |  19266 |  167219 |
| 4   |     13 |    599 |  10429 |  52130 |  56607 |  58688 |  62256 |  80602 |  93419 |  93912 | 121336 | 117024 | 110548 | 102739 | 109983 | 112609 | 1182894 |
| 5   |     52 |   2694 |  39621 | 150126 | 148077 | 153886 | 157562 | 194587 | 234448 | 242242 | 370047 | 341264 | 352366 | 353522 | 393641 | 410560 | 3544695 |
| Sum |     71 |   3626 |  56284 | 228926 | 234677 | 244915 | 254088 | 327866 | 394769 | 395131 | 555967 | 525749 | 532666 | 531013 | 585524 | 611903 | 5483175 |

In [13]:
# Write cache: store the filtered reviews as 'opinion-filtered' interim data
# and print the path of the written file.
print(InterimStorage('opinion-filtered').write(ys_opinion_lists).get_filepath())
print('Reviews in ys_opinion_lists:', count_ysl(ys_opinion_lists))
# Reviews in ys_opinion_lists: 5,483,175

/tmp/InterimStorage/opinion-filtered.pickle.bz2
Reviews in ys_opinion_lists: 5483175
