# Filtering

In [1]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from project root
import sys; sys.path.insert(0, '../')

import timeit

from access.file_storage import FileStorage
from access.interim_storage import InterimStorage

from amore.printer import Printer

from readers.numbers_years_stars import NumbersYearsStars
from readers.opinion_counts import OpinionCounts

In [3]:
# Shared helpers for the cells below: file_storage is asked for the file paths
# of the input data sets (get_filepath), printer turns the year/star lists
# into tables.
file_storage = FileStorage()
printer = Printer()

In [4]:
def print_examples(ysl, posstar_max=10, negstar_max=10):
    """Print the first few low-rated and high-rated entries with their counts.

    ``ysl`` is walked as ``ysl[year][star] -> iterable of entries``. Entries
    stored under star 1 or 2 are printed until ``negstar_max`` of them have
    been shown; entries under star 4 or 5 until ``posstar_max`` have been
    shown. Other stars are never printed. The walk ends as soon as both
    limits are reached.

    Each printed line holds the star, the results of
    ``opinion_counts.get_existent`` and ``opinion_counts.get_occurences`` for
    the entry's number, and the number itself.

    Note: ``opinion_counts`` is read from the notebook's global namespace, so
    it has to be defined before this function is called.
    """
    def show(star, entry):
        # One output line; the ' ' arguments widen the gaps between columns.
        number = entry[NumbersYearsStars.KEY_NUMBER]
        print(star, ' ',
              opinion_counts.get_existent(number),
              opinion_counts.get_occurences(number),
              ' ', number)

    shown_neg = 0
    shown_pos = 0
    for _, by_star in ysl.items():
        for star, entries in by_star.items():
            for entry in entries:
                if shown_neg < negstar_max and star in [1, 2]:
                    show(star, entry)
                    shown_neg += 1
                elif shown_pos < posstar_max and star in [4, 5]:
                    show(star, entry)
                    shown_pos += 1
                # Both quotas filled: nothing left to print, leave all loops.
                if shown_pos >= posstar_max and shown_neg >= negstar_max:
                    return

## Numbers overview

In [None]:
# Reader for the 'AMORE-NumbersYearsStars' file; its path comes from file_storage
nys = NumbersYearsStars(file_storage.get_filepath('AMORE-NumbersYearsStars'))

In [None]:
# Reads and also caches data from file.
# The cells below traverse the result as ys_lists[year][star] -> list of records.
ys_lists = nys.get_by_year_star()

In [None]:
# Print table
# Both renderings are switched off with `if False:`; set one to True to
# re-create the table (rich display, or markdown text for pasting).
# The markdown table that follows this cell is presumably pasted output of the
# second variant — TODO confirm.
if False:
    printer.ipython_display(printer.get_dataframe_with_sums(ys_lists))
if False:
    print(printer.get_dataframe_markdown(printer.get_dataframe_with_sums(ys_lists), float_as_integer=True))

|     |   1997 |   1998 |   1999 |   2000 |   2001 |   2002 |   2003 |   2004 |   2005 |   2006 |   2007 |   2008 |   2009 |   2010 |   2011 |   2012 |     Sum |
|:----|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|
| 1   |      6 |    191 |   4844 |  19944 |  24221 |  25311 |  25734 |  41016 |  54744 |  49049 |  49521 |  56076 |  59099 |  65343 |  72957 |  81276 |  629332 |
| 2   |      1 |    262 |   3631 |  17808 |  20320 |  22641 |  24183 |  33117 |  40868 |  37992 |  40205 |  40138 |  39680 |  41430 |  45767 |  47356 |  455399 |
| 3   |      8 |    442 |   6458 |  30907 |  35395 |  37798 |  43323 |  60489 |  71012 |  66128 |  75239 |  74057 |  73178 |  70279 |  72055 |  74826 |  791594 |
| 4   |     29 |    797 |  14178 |  73314 |  79152 |  84276 |  90527 | 119160 | 138000 | 135581 | 167632 | 161693 | 149771 | 142000 | 148457 | 150248 | 1654815 |
| 5   |     64 |   3313 |  49866 | 192002 | 189638 | 198712 | 205916 | 257603 | 308080 | 311252 | 452009 | 412870 | 422403 | 426248 | 465918 | 484650 | 4380544 |
| Sum |    108 |   5005 |  78977 | 333975 | 348726 | 368738 | 389683 | 511385 | 612704 | 600002 | 784606 | 744834 | 744131 | 745300 | 805154 | 838356 | 7911684 |

## Filter by opinion words

In [None]:
# Counts loaded from the 'AMORE-OpinionCounts' file; the cells below look them
# up by record number via get_existent() / get_occurences()
opinion_counts = OpinionCounts(file_storage.get_filepath('AMORE-OpinionCounts'))

In [None]:
# Examples with 0/neg for 5-star and with 0/pos for 1-star
# (print_examples does not select by count: it prints the first 4 entries
# with 1-2 stars and the first 23 entries with 4-5 stars of the unfiltered data)
print_examples(ys_lists, posstar_max=23, negstar_max=4)

In [None]:
# Start of the runtime measurement; the elapsed time is printed as 'Runtime:'
# after the filter loop at the end of this cell
time_begin = timeit.default_timer()

def add_tuple(dict_, tup):
    """Append ``tup`` to ``dict_[year][star]``, creating missing levels.

    The year and star are read from ``tup`` through
    ``NumbersYearsStars.KEY_YEAR`` and ``NumbersYearsStars.KEY_STAR``.
    ``dict_`` is modified in place; nothing is returned.
    """
    # Year level first, then the star list below it.
    by_star = dict_.setdefault(tup[NumbersYearsStars.KEY_YEAR], {})
    by_star.setdefault(tup[NumbersYearsStars.KEY_STAR], []).append(tup)

# Build the filtered year/star lists: an entry is kept only when both opinion
# counts have the sign that matches its rating
#   1-2 stars -> get_existent < 0 and get_occurences < 0
#   4-5 stars -> get_existent > 0 and get_occurences > 0
# Everything else (3 stars in particular) is dropped.
ys_opinion_lists = {}
for year in ys_lists:
    for star in ys_lists[year]:
        for tup in ys_lists[year][star]:
            if tup[NumbersYearsStars.KEY_STAR] in [1, 2]:
                if not (opinion_counts.get_existent(tup[NumbersYearsStars.KEY_NUMBER]) < 0 and
                        opinion_counts.get_occurences(tup[NumbersYearsStars.KEY_NUMBER]) < 0):
                    continue
            elif tup[NumbersYearsStars.KEY_STAR] == 3:
                continue
            elif tup[NumbersYearsStars.KEY_STAR] in [4, 5]:
                if not (opinion_counts.get_existent(tup[NumbersYearsStars.KEY_NUMBER]) > 0 and
                        opinion_counts.get_occurences(tup[NumbersYearsStars.KEY_NUMBER]) > 0):
                    continue
            else:
                continue
            # Only entries that passed their rating's sign check get here.
            add_tuple(ys_opinion_lists, tup)
print('Runtime:', timeit.default_timer() - time_begin)

In [None]:
# Same sample sizes as for the unfiltered data, now taken from the filtered lists
print_examples(ys_opinion_lists, posstar_max=23, negstar_max=4)

In [None]:
# Print table
# Both renderings are switched off with `if False:`; set one to True to
# re-create the table of the filtered lists (rich display, or markdown text).
# The markdown table that follows this cell is presumably pasted output of the
# second variant — TODO confirm.
if False:
    printer.ipython_display(printer.get_dataframe_with_sums(ys_opinion_lists))
if False:
    print(printer.get_dataframe_markdown(printer.get_dataframe_with_sums(ys_opinion_lists), float_as_integer=True))

|     |   1997 |   1998 |   1999 |   2000 |   2001 |   2002 |   2003 |   2004 |   2005 |   2006 |   2007 |   2008 |   2009 |   2010 |   2011 |   2012 |     Sum |
|:----|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|
| 1   |      6 |    104 |   3110 |  12698 |  14751 |  15934 |  16611 |  26835 |  35093 |  29878 |  30496 |  33972 |  35420 |  39177 |  43552 |  48432 |  386069 |
| 2   |      0 |    122 |   1589 |   7820 |   8866 |  10127 |  10742 |  15439 |  19141 |  17152 |  17724 |  17271 |  16641 |  18523 |  20105 |  21036 |  202298 |
| 4   |     13 |    599 |  10429 |  52130 |  56607 |  58688 |  62256 |  80602 |  93419 |  93912 | 121336 | 117024 | 110548 | 102739 | 109983 | 112609 | 1182894 |
| 5   |     52 |   2694 |  39621 | 150126 | 148077 | 153886 | 157562 | 194587 | 234448 | 242242 | 370047 | 341264 | 352366 | 353522 | 393641 | 410560 | 3544695 |
| Sum |     71 |   3519 |  54749 | 222774 | 228301 | 238635 | 247171 | 317463 | 382101 | 383184 | 539603 | 509531 | 514975 | 513961 | 567281 | 592637 | 5315956 |

In [None]:
# Write cache
# Stores the filtered lists under the name 'filtered-opinion-words'; the
# trailing get_filepath() is the cell's last expression, so its result
# (presumably the location of the written file) is shown as cell output.
InterimStorage('filtered-opinion-words').write(ys_opinion_lists).get_filepath()

# Dev: resume from the cached filter result

In [5]:
# Read cache
# Loads what was stored under 'filtered-opinion-words' and shows it as a
# year/star table with sums — presumably so that work can continue from here
# without re-running the filtering.
ys_opinion_lists = InterimStorage('filtered-opinion-words').read()
printer.ipython_display(printer.get_dataframe_with_sums(ys_opinion_lists))

Unnamed: 0,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,Sum
1,6.0,104,3110,12698,14751,15934,16611,26835,35093,29878,30496,33972,35420,39177,43552,48432,386069.0
2,,122,1589,7820,8866,10127,10742,15439,19141,17152,17724,17271,16641,18523,20105,21036,202298.0
4,13.0,599,10429,52130,56607,58688,62256,80602,93419,93912,121336,117024,110548,102739,109983,112609,1182894.0
5,52.0,2694,39621,150126,148077,153886,157562,194587,234448,242242,370047,341264,352366,353522,393641,410560,3544695.0
Sum,71.0,3519,54749,222774,228301,238635,247171,317463,382101,383184,539603,509531,514975,513961,567281,592637,5315956.0
