# Filtering

In [2]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from project root
import sys; sys.path.insert(0, '../')

import timeit

from access.file_storage import FileStorage
from access.interim_storage import InterimStorage

from amore.printer import Printer

from readers.numbers_years_stars import NumbersYearsStars
from readers.opinion_counts import OpinionCounts
from readers.text_duplicates import TextDuplicates

In [3]:
# Shared helpers used by the cells below:
# file_storage resolves dataset names to file paths (get_filepath),
# printer builds and renders the per-year/per-star count tables.
file_storage = FileStorage()
printer = Printer()

In [5]:
def add_tuple(dict_, tup):
    """Append ``tup`` to the list stored at ``dict_[year][star]``.

    Year and star are read from ``tup`` through ``NumbersYearsStars.KEY_YEAR``
    and ``NumbersYearsStars.KEY_STAR``. Missing levels of the nested mapping
    (the per-year dict and the per-star list) are created on demand.
    ``dict_`` is modified in place; nothing is returned.
    """
    year = tup[NumbersYearsStars.KEY_YEAR]
    star = tup[NumbersYearsStars.KEY_STAR]
    dict_.setdefault(year, {}).setdefault(star, []).append(tup)

In [12]:
# Reader for the review records (number, year, star); the file path is looked up by name via file_storage.
nys = NumbersYearsStars(file_storage.get_filepath('AMORE-NumbersYearsStars'))

## Numbers overview

In [None]:
# Reads and also caches data from file.
# The result is indexed as ys_lists[year][star]; each entry is a list of tuples
# whose fields are accessed with the NumbersYearsStars.KEY_* constants.
ys_lists = nys.get_by_year_star()

In [None]:
# Print table
# Both renderings are switched off (`if False`); change a condition to True to re-create the output.
# NOTE(review): the markdown table following this cell was presumably pasted from the second rendering.
if False:
    # Rich display of the counts in ys_lists including sums
    printer.ipython_display(printer.get_dataframe_with_sums(ys_lists))
if False:
    # Markdown source of the same table
    print(printer.get_dataframe_markdown(printer.get_dataframe_with_sums(ys_lists), float_as_integer=True))

|     |   1997 |   1998 |   1999 |   2000 |   2001 |   2002 |   2003 |   2004 |   2005 |   2006 |   2007 |   2008 |   2009 |   2010 |   2011 |   2012 |     Sum |
|:----|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|
| 1   |      6 |    191 |   4844 |  19944 |  24221 |  25311 |  25734 |  41016 |  54744 |  49049 |  49521 |  56076 |  59099 |  65343 |  72957 |  81276 |  629332 |
| 2   |      1 |    262 |   3631 |  17808 |  20320 |  22641 |  24183 |  33117 |  40868 |  37992 |  40205 |  40138 |  39680 |  41430 |  45767 |  47356 |  455399 |
| 3   |      8 |    442 |   6458 |  30907 |  35395 |  37798 |  43323 |  60489 |  71012 |  66128 |  75239 |  74057 |  73178 |  70279 |  72055 |  74826 |  791594 |
| 4   |     29 |    797 |  14178 |  73314 |  79152 |  84276 |  90527 | 119160 | 138000 | 135581 | 167632 | 161693 | 149771 | 142000 | 148457 | 150248 | 1654815 |
| 5   |     64 |   3313 |  49866 | 192002 | 189638 | 198712 | 205916 | 257603 | 308080 | 311252 | 452009 | 412870 | 422403 | 426248 | 465918 | 484650 | 4380544 |
| Sum |    108 |   5005 |  78977 | 333975 | 348726 | 368738 | 389683 | 511385 | 612704 | 600002 | 784606 | 744834 | 744131 | 745300 | 805154 | 838356 | 7911684 |

## Filter by opinion words

In [4]:
def print_examples(ysl, posstar_max=10, negstar_max=10):
    """Print example reviews together with their opinion-word counts.

    Walks ``ysl[year][star]`` in iteration order and prints one line per
    selected tuple: the star, the results of ``opinion_counts.get_existent``
    and ``opinion_counts.get_occurences`` for the review number, and the
    review number itself.

    At most ``negstar_max`` tuples with star 1 or 2 and at most
    ``posstar_max`` tuples with star 4 or 5 are printed; the walk stops as
    soon as both limits are reached. Tuples with other stars are skipped.

    Relies on the notebook-level global ``opinion_counts``, which has to be
    defined before this function is called.
    """
    shown_pos = 0
    shown_neg = 0
    for stars_of_year in ysl.values():
        for star, tuples in stars_of_year.items():
            for tup in tuples:
                # Negative examples take precedence, exactly like the if/elif chain they replace
                take_neg = shown_neg < negstar_max and star in [1, 2]
                take_pos = (not take_neg) and shown_pos < posstar_max and star in [4, 5]
                if take_neg or take_pos:
                    number = tup[NumbersYearsStars.KEY_NUMBER]
                    print(star, ' ',
                          opinion_counts.get_existent(number),
                          opinion_counts.get_occurences(number),
                          ' ', number)
                    if take_neg:
                        shown_neg += 1
                    else:
                        shown_pos += 1
                # Both quotas filled: nothing further can be printed, so leave all loops at once
                if shown_pos >= posstar_max and shown_neg >= negstar_max:
                    return

In [None]:
# Reader for the opinion-word counts per review number.
# print_examples and the opinion-word filter loop read this name as a global,
# so this cell has to run before either of them is executed.
opinion_counts = OpinionCounts(file_storage.get_filepath('AMORE-OpinionCounts'))

In [None]:
# Examples with 0/neg for 5-star and with 0/pos for 1-star
# Prints up to 23 positive (4/5-star) and up to 4 negative (1/2-star) reviews from the unfiltered data.
print_examples(ys_lists, posstar_max=23, negstar_max=4)

In [None]:
# Keep only reviews whose opinion-word counts agree with their star rating:
#   1/2-star: get_existent and get_occurences both below zero
#   4/5-star: get_existent and get_occurences both above zero
# 3-star reviews (and any other rating) are dropped.
time_begin = timeit.default_timer()

ys_opinion_lists = {}
for stars_of_year in ys_lists.values():
    for tuples in stars_of_year.values():
        for tup in tuples:
            rating = tup[NumbersYearsStars.KEY_STAR]
            if rating in [1, 2]:
                expect_negative = True
            elif rating in [4, 5]:
                expect_negative = False
            else:
                continue
            number = tup[NumbersYearsStars.KEY_NUMBER]
            # get_occurences is only consulted when get_existent already matches
            if expect_negative:
                keep = (opinion_counts.get_existent(number) < 0 and
                        opinion_counts.get_occurences(number) < 0)
            else:
                keep = (opinion_counts.get_existent(number) > 0 and
                        opinion_counts.get_occurences(number) > 0)
            if keep:
                add_tuple(ys_opinion_lists, tup)
print('Runtime:', timeit.default_timer() - time_begin)

In [None]:
# Same example listing as before the filter, now taken from the opinion-filtered lists
print_examples(ys_opinion_lists, posstar_max=23, negstar_max=4)

In [None]:
# Print table
# Both renderings are switched off (`if False`); change a condition to True to re-create the output.
if False:
    # Rich display of the counts in ys_opinion_lists including sums
    printer.ipython_display(printer.get_dataframe_with_sums(ys_opinion_lists))
if False:
    # Markdown source of the same table
    print(printer.get_dataframe_markdown(printer.get_dataframe_with_sums(ys_opinion_lists), float_as_integer=True))

|     |   1997 |   1998 |   1999 |   2000 |   2001 |   2002 |   2003 |   2004 |   2005 |   2006 |   2007 |   2008 |   2009 |   2010 |   2011 |   2012 |     Sum |
|:----|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|
| 1   |      6 |    104 |   3110 |  12698 |  14751 |  15934 |  16611 |  26835 |  35093 |  29878 |  30496 |  33972 |  35420 |  39177 |  43552 |  48432 |  386069 |
| 2   |      0 |    122 |   1589 |   7820 |   8866 |  10127 |  10742 |  15439 |  19141 |  17152 |  17724 |  17271 |  16641 |  18523 |  20105 |  21036 |  202298 |
| 4   |     13 |    599 |  10429 |  52130 |  56607 |  58688 |  62256 |  80602 |  93419 |  93912 | 121336 | 117024 | 110548 | 102739 | 109983 | 112609 | 1182894 |
| 5   |     52 |   2694 |  39621 | 150126 | 148077 | 153886 | 157562 | 194587 | 234448 | 242242 | 370047 | 341264 | 352366 | 353522 | 393641 | 410560 | 3544695 |
| Sum |     71 |   3519 |  54749 | 222774 | 228301 | 238635 | 247171 | 317463 | 382101 | 383184 | 539603 | 509531 | 514975 | 513961 | 567281 | 592637 | 5315956 |

In [None]:
# Write cache
# Stores the opinion-filtered lists under the name 'filtered-opinion-words'.
# get_filepath() is the last expression of the cell, so its result is shown as the cell output.
InterimStorage('filtered-opinion-words').write(ys_opinion_lists).get_filepath()

## Filter by duplicates

In [52]:
# Read cache
# Replaces ys_opinion_lists with the data stored under 'filtered-opinion-words'
# and displays its counts per star and year including sums.
ys_opinion_lists = InterimStorage('filtered-opinion-words').read()
printer.ipython_display(printer.get_dataframe_with_sums(ys_opinion_lists))

Unnamed: 0,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,Sum
1,6.0,104,3110,12698,14751,15934,16611,26835,35093,29878,30496,33972,35420,39177,43552,48432,386069.0
2,,122,1589,7820,8866,10127,10742,15439,19141,17152,17724,17271,16641,18523,20105,21036,202298.0
4,13.0,599,10429,52130,56607,58688,62256,80602,93419,93912,121336,117024,110548,102739,109983,112609,1182894.0
5,52.0,2694,39621,150126,148077,153886,157562,194587,234448,242242,370047,341264,352366,353522,393641,410560,3544695.0
Sum,71.0,3519,54749,222774,228301,238635,247171,317463,382101,383184,539603,509531,514975,513961,567281,592637,5315956.0


In [53]:
# Get lists of duplicate texts
# dup.get_data() is a list; each entry is itself a list of review numbers
# (presumably the reviews that share one text — confirm against TextDuplicates).
dup = TextDuplicates(file_storage.get_filepath('AMORE-TextDuplicates'))
print('Duplicate texts:', len(dup.get_data()), type(dup.get_data()))
# First entry as an example of the structure
print(dup.get_data()[0])
# Duplicate texts: 1239822

Duplicate texts: 1239822 <class 'list'>
[1, 5615911]


In [54]:
# Collect IDs (numbers) of duplicates
# Flattens all duplicate groups into one set of review numbers for fast membership tests.
dup_ids = {rev_no for text_dups in dup.get_data() for rev_no in text_dups}
print('Duplicates in reviews:', len(dup_ids))
# Duplicates in reviews: 7241411

Duplicates in reviews: 7241411


In [55]:
# Collect non-duplicates
# Copies every opinion-filtered review whose number is not part of any duplicate group.
ys_no_dups = {}
added = 0
for stars_of_year in ys_opinion_lists.values():
    for tuples in stars_of_year.values():
        for tup in tuples:
            if tup[NumbersYearsStars.KEY_NUMBER] in dup_ids:
                continue
            add_tuple(ys_no_dups, tup)
            added += 1
print('Added:', added)
# Added: 474228

Added: 474228


In [38]:
# Print non-duplicates
# Print table
# Both renderings are switched off (`if False`); change a condition to True to re-create the output.
# ys_no_dups is extended in place by later cells, so the result depends on which cells have run.
if False:
    # Rich display of the counts in ys_no_dups including sums
    printer.ipython_display(printer.get_dataframe_with_sums(ys_no_dups))
if False:
    # Markdown source of the same table
    print(printer.get_dataframe_markdown(printer.get_dataframe_with_sums(ys_no_dups), float_as_integer=True))

|     |   1997 |   1998 |   1999 |   2000 |   2001 |   2002 |   2003 |   2004 |   2005 |   2006 |   2007 |   2008 |   2009 |   2010 |   2011 |   2012 |    Sum |
|:----|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|
| 1   |      1 |      2 |     62 |    239 |    215 |    289 |    315 |    886 |   2171 |   2299 |   2898 |   3161 |   3726 |   3777 |   4154 |   4115 |  28310 |
| 2   |    nan |      1 |     24 |    115 |    125 |    146 |    181 |    387 |    834 |    999 |   1349 |   1506 |   1590 |   1625 |   1853 |   1736 |  12471 |
| 4   |    nan |     25 |    332 |   1235 |   1405 |   1330 |   1597 |   2965 |   5664 |   7134 |  11295 |  11591 |  11995 |  11735 |  12252 |  11059 |  91614 |
| 5   |      2 |    105 |   1038 |   3344 |   3463 |   3915 |   5179 |  10753 |  20128 |  25311 |  40497 |  41642 |  45820 |  44146 |  48542 |  47948 | 341833 |
| Sum |      3 |    133 |   1456 |   4933 |   5208 |   5680 |   7272 |  14991 |  28797 |  35743 |  56039 |  57900 |  63131 |  61283 |  66801 |  64858 | 474228 |

In [56]:
# If all tuples of a duplicate group have the same year and star, add the first one to the
# collected non-duplicates (ys_no_dups) unless the group is 3-star.
# Groups with differing year or star are collected in non_equal.
#
# NOTE(review): the representatives are fetched from nys (all reviews) and are added without
# consulting opinion_counts, i.e. they bypass the opinion-word filter - confirm this is intended.
# NOTE(review): ys_no_dups is extended in place; running this cell twice adds every representative twice.
non_equal = []
added = 0
for text_dups in dup.get_data():
    equal = True
    first_tup = None
    for rev_no in text_dups:
        tup = nys.get_by_number(rev_no)
        if first_tup is None:
            first_tup = tup
        elif (tup[NumbersYearsStars.KEY_YEAR] != first_tup[NumbersYearsStars.KEY_YEAR] or
              tup[NumbersYearsStars.KEY_STAR] != first_tup[NumbersYearsStars.KEY_STAR]):
            # One mismatch settles the outcome, no need to look up the remaining reviews
            equal = False
            break
    if first_tup is None:
        # Empty group: nothing to represent (previously a stale `tup` would have been inspected)
        continue
    if not equal:
        non_equal.append(text_dups)
    elif first_tup[NumbersYearsStars.KEY_STAR] != 3:
        # Decide on the tuple that is actually added, not on the last loop variable
        add_tuple(ys_no_dups, first_tup)
        added += 1
print('Added:', added)
print('Non-equal:', len(non_equal))
# Added: 1109870
# Non-equal: 1010

Added: 1109870
Non-equal: 1010


In [58]:
# Print non-duplicates
# Print table
# Both renderings are switched off (`if False`); change a condition to True to re-create the output.
# At this point ys_no_dups also contains the representatives of the equal duplicate groups.
if False:
    # Rich display of the counts in ys_no_dups including sums
    printer.ipython_display(printer.get_dataframe_with_sums(ys_no_dups))
if False:
    # Markdown source of the same table
    print(printer.get_dataframe_markdown(printer.get_dataframe_with_sums(ys_no_dups), float_as_integer=True))

|     |   1997 |   1998 |   1999 |   2000 |   2001 |   2002 |   2003 |   2004 |   2005 |   2006 |   2007 |   2008 |   2009 |   2010 |   2011 |   2012 |     Sum |
|:----|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|
| 1   |      2 |     26 |    597 |   2512 |   3015 |   3597 |   3689 |   6643 |  10413 |   9943 |  11125 |  12661 |  14150 |  15822 |  19132 |  21570 |  134897 |
| 2   |    nan |     30 |    437 |   2162 |   2541 |   3048 |   3364 |   4880 |   7053 |   7050 |   8067 |   8417 |   8846 |   9536 |  11363 |  12041 |   88835 |
| 4   |      4 |    146 |   2166 |   9832 |  11216 |  12257 |  13466 |  19364 |  25958 |  27917 |  37664 |  36838 |  37089 |  36408 |  40392 |  40528 |  351245 |
| 5   |     14 |    561 |   7266 |  25204 |  26294 |  29576 |  32416 |  46222 |  64445 |  71619 | 108952 | 104455 | 112998 | 113957 | 130571 | 134571 | 1009121 |
| Sum |     20 |    763 |  10466 |  39710 |  43066 |  48478 |  52935 |  77109 | 107869 | 116529 | 165808 | 162371 | 173083 | 175723 | 201458 | 208710 | 1584098 |

In [59]:
# For every duplicate group with differing year/star: add the first tuple whose star is not 3
# to the collected non-duplicates (ys_no_dups). Groups without such a tuple are collected in rest.
#
# NOTE(review): as in the previous step, the tuples come from nys and are not checked against
# the opinion-word filter, and ys_no_dups is extended in place (re-running adds the tuples again).

# Debug switches
print_numbers = False  # Print list of review-numbers of each group
print_stars = False    # Print stars of the inspected tuples
print_years = False    # Print years of the inspected tuples
print_eol = print_stars or print_years

rest = []
added = 0
for ne_list in non_equal:
    if print_numbers:
        print(ne_list)
    add_tup = None
    for rev_no in ne_list:
        tup = nys.get_by_number(rev_no)
        if print_stars:
            print(tup[NumbersYearsStars.KEY_STAR], end=' ')
        if print_years:
            print(tup[NumbersYearsStars.KEY_YEAR], end=' ')
        if tup[NumbersYearsStars.KEY_STAR] != 3:
            add_tup = tup
            break
    if add_tup is not None:
        # Add the selected tuple explicitly instead of relying on the loop variable
        add_tuple(ys_no_dups, add_tup)
        added += 1
    else:
        rest.append(ne_list)
    if print_eol:
        print()
print('Added:', added)
print('Rest:', len(rest))

Added: 947
Rest: 63


In [72]:
# Rest only contains 3-star items
# Sanity check: no group left in rest may contain a review with a star other than 3.
only_3 = True
for ne_list in rest:
    group_stars = [nys.get_by_number(rev_no)[NumbersYearsStars.KEY_STAR] for rev_no in ne_list]
    if False:  # Debug: print the stars of the group on one line
        for group_star in group_stars:
            print(group_star, end=' ')
        print()
    if any(group_star != 3 for group_star in group_stars):
        only_3 = False
print('Only 3-star items left:', only_3)

Only 3-star items left: True


In [63]:
# Print final table of the filtered, de-duplicated reviews.
# Both renderings are switched off (`if False`); change a condition to True to re-create the output.
if False:
    # Rich display of the counts in ys_no_dups including sums
    printer.ipython_display(printer.get_dataframe_with_sums(ys_no_dups))
if False:
    # Markdown source of the same table
    print(printer.get_dataframe_markdown(printer.get_dataframe_with_sums(ys_no_dups), float_as_integer=True))

|     |   1997 |   1998 |   1999 |   2000 |   2001 |   2002 |   2003 |   2004 |   2005 |   2006 |   2007 |   2008 |   2009 |   2010 |   2011 |   2012 |     Sum |
|:----|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|
| 1   |      2 |     26 |    597 |   2513 |   3017 |   3597 |   3692 |   6645 |  10415 |   9947 |  11130 |  12666 |  14156 |  15827 |  19142 |  21580 |  134952 |
| 2   |    nan |     30 |    437 |   2163 |   2542 |   3051 |   3367 |   4882 |   7059 |   7055 |   8072 |   8424 |   8860 |   9548 |  11378 |  12053 |   88921 |
| 4   |      4 |    146 |   2167 |   9834 |  11225 |  12262 |  13470 |  19372 |  25975 |  27939 |  37686 |  36864 |  37140 |  36444 |  40433 |  40571 |  351532 |
| 5   |     14 |    561 |   7269 |  25212 |  26305 |  29588 |  32426 |  46244 |  64479 |  71667 | 109005 | 104520 | 113073 | 114020 | 130632 | 134625 | 1009640 |
| Sum |     20 |    763 |  10470 |  39722 |  43089 |  48498 |  52955 |  77143 | 107928 | 116608 | 165893 | 162474 | 173229 | 175839 | 201585 | 208829 | 1585045 |

Original:

|     |   1997 |   1998 |   1999 |   2000 |   2001 |   2002 |   2003 |   2004 |   2005 |   2006 |   2007 |   2008 |   2009 |   2010 |   2011 |   2012 |     Sum |
|:----|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|
| 1   |      6 |    191 |   4844 |  19944 |  24221 |  25311 |  25734 |  41016 |  54744 |  49049 |  49521 |  56076 |  59099 |  65343 |  72957 |  81276 |  629332 |
| 2   |      1 |    262 |   3631 |  17808 |  20320 |  22641 |  24183 |  33117 |  40868 |  37992 |  40205 |  40138 |  39680 |  41430 |  45767 |  47356 |  455399 |
| 3   |      8 |    442 |   6458 |  30907 |  35395 |  37798 |  43323 |  60489 |  71012 |  66128 |  75239 |  74057 |  73178 |  70279 |  72055 |  74826 |  791594 |
| 4   |     29 |    797 |  14178 |  73314 |  79152 |  84276 |  90527 | 119160 | 138000 | 135581 | 167632 | 161693 | 149771 | 142000 | 148457 | 150248 | 1654815 |
| 5   |     64 |   3313 |  49866 | 192002 | 189638 | 198712 | 205916 | 257603 | 308080 | 311252 | 452009 | 412870 | 422403 | 426248 | 465918 | 484650 | 4380544 |
| Sum |    108 |   5005 |  78977 | 333975 | 348726 | 368738 | 389683 | 511385 | 612704 | 600002 | 784606 | 744834 | 744131 | 745300 | 805154 | 838356 | 7911684 |