# Filtering (deduplication first - archive version)

Note: Checking opinion words first (instead of deduplicating first, as done in this notebook) results in more final reviews.


In [1]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from project root
import sys; sys.path.insert(0, '../')

import timeit
from collections import OrderedDict

from access.file_storage import FileStorage
from access.interim_storage import InterimStorage

from amore.printer import Printer

from readers.numbers_years_stars import NumbersYearsStars
from readers.opinion_counts import OpinionCounts
from readers.text_duplicates import TextDuplicates

In [2]:
# Resolves dataset names (e.g. 'AMORE-NumbersYearsStars') to file paths via get_filepath()
file_storage = FileStorage()
# Used below to build and output the year-by-star summary tables
# (those calls are currently disabled behind `if False`)
printer = Printer()

In [3]:
def count_ysl(ysl):
    """Return the total number of entries in a year -> star -> list mapping.

    Args:
        ysl: Nested mapping of the form ``{year: {star: [entry, ...]}}``.

    Returns:
        The summed length of all innermost lists (0 for an empty mapping).
    """
    return sum(
        len(entries)
        for by_star in ysl.values()
        for entries in by_star.values()
    )

In [4]:
def add_tuple(dict_, tup):
    """Append ``tup`` to ``dict_[year][star]``, creating missing levels on the way.

    Args:
        dict_: Nested mapping ``{year: {star: [tup, ...]}}``; modified in place.
        tup: Review record; its year and star are read via
            ``NumbersYearsStars.KEY_YEAR`` and ``NumbersYearsStars.KEY_STAR``.
    """
    year = tup[NumbersYearsStars.KEY_YEAR]
    star = tup[NumbersYearsStars.KEY_STAR]
    # setdefault creates the year dict / star list only if it is not present yet
    dict_.setdefault(year, {}).setdefault(star, []).append(tup)

In [5]:
# Reader for the review metadata file; its get_by_year_star() and get_by_number()
# methods are used by the cells below
nys = NumbersYearsStars(file_storage.get_filepath('AMORE-NumbersYearsStars'))

## Numbers overview

In [6]:
# Reads and also caches data from file
# The result is iterated below as ys_lists[year][star] -> list of review tuples
ys_lists = nys.get_by_year_star()

In [7]:
# Print table
# Both table outputs are disabled; change a `False` to `True` to show the
# year-by-star table via printer.ipython_display() or to print it as markdown.
if False:
    printer.ipython_display(printer.get_dataframe_with_sums(ys_lists))
if False:
    print(printer.get_dataframe_markdown(printer.get_dataframe_with_sums(ys_lists), float_as_integer=True))
# Total number of reviews over all years and stars
print('Reviews in ys_lists:', count_ysl(ys_lists))
# Reviews in ys_lists: 7,911,684

Reviews in ys_lists: 7911684


Original

|     |   1997 |   1998 |   1999 |   2000 |   2001 |   2002 |   2003 |   2004 |   2005 |   2006 |   2007 |   2008 |   2009 |   2010 |   2011 |   2012 |     Sum |
|:----|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|
| 1   |      6 |    191 |   4844 |  19944 |  24221 |  25311 |  25734 |  41016 |  54744 |  49049 |  49521 |  56076 |  59099 |  65343 |  72957 |  81276 |  629332 |
| 2   |      1 |    262 |   3631 |  17808 |  20320 |  22641 |  24183 |  33117 |  40868 |  37992 |  40205 |  40138 |  39680 |  41430 |  45767 |  47356 |  455399 |
| 3   |      8 |    442 |   6458 |  30907 |  35395 |  37798 |  43323 |  60489 |  71012 |  66128 |  75239 |  74057 |  73178 |  70279 |  72055 |  74826 |  791594 |
| 4   |     29 |    797 |  14178 |  73314 |  79152 |  84276 |  90527 | 119160 | 138000 | 135581 | 167632 | 161693 | 149771 | 142000 | 148457 | 150248 | 1654815 |
| 5   |     64 |   3313 |  49866 | 192002 | 189638 | 198712 | 205916 | 257603 | 308080 | 311252 | 452009 | 412870 | 422403 | 426248 | 465918 | 484650 | 4380544 |
| Sum |    108 |   5005 |  78977 | 333975 | 348726 | 368738 | 389683 | 511385 | 612704 | 600002 | 784606 | 744834 | 744131 | 745300 | 805154 | 838356 | 7911684 |

## Filter by text duplicates

In [8]:
# Get lists of duplicate texts
# dup.get_data() is a list with one entry per duplicated text; each entry is
# itself a list of the review numbers sharing that text (see example below).
dup = TextDuplicates(file_storage.get_filepath('AMORE-TextDuplicates'))
# print(dup.get_data()[0]) # [1, 5615911]
print('Duplicate texts:', len(dup.get_data()), type(dup.get_data()))
# Duplicate texts: 1,239,822

Duplicate texts: 1239822 <class 'list'>


In [9]:
# Flatten all duplicate groups into one set of review numbers (= affected reviews).
# A set is used because it is only needed for membership tests later on.
dup_ids = {
    rev_no
    for text_dups in dup.get_data()
    for rev_no in text_dups
}
print('Duplicates in reviews:', len(dup_ids))
# Duplicates in reviews: 7,241,411

Duplicates in reviews: 7241411


In [10]:
# Keep every review whose number does not occur in any duplicate group.
# ys_no_dups is rebuilt from scratch here, so this cell can be re-run safely.
ys_no_dups = {}
added = 0
for stars in ys_lists.values():
    for tuples in stars.values():
        unique = [tup for tup in tuples
                  if tup[NumbersYearsStars.KEY_NUMBER] not in dup_ids]
        for tup in unique:
            add_tuple(ys_no_dups, tup)
        added += len(unique)
print('Added:', added)
# Added: 670,273

Added: 670273


In [11]:
# Print non-duplicates
# Print table
# Both table outputs are disabled; change a `False` to `True` to show the
# year-by-star table via printer.ipython_display() or to print it as markdown.
if False:
    printer.ipython_display(printer.get_dataframe_with_sums(ys_no_dups))
if False:
    print(printer.get_dataframe_markdown(printer.get_dataframe_with_sums(ys_no_dups), float_as_integer=True))
# Must match the 'Added' count of the previous cell
print('Reviews in ys_no_dups:', count_ysl(ys_no_dups))
# Reviews in ys_no_dups: 670,273

Reviews in ys_no_dups: 670273


Non-duplicates

|     |   1997 |   1998 |   1999 |   2000 |   2001 |   2002 |   2003 |   2004 |   2005 |   2006 |   2007 |   2008 |   2009 |   2010 |   2011 |   2012 |    Sum |
|:----|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|
| 1   |      1 |      3 |    114 |    393 |    439 |    495 |    614 |   1646 |   3842 |   4133 |   5339 |   6072 |   6875 |   6964 |   7699 |   7636 |  52265 |
| 2   |      1 |      8 |     77 |    305 |    384 |    410 |    515 |   1117 |   2239 |   2680 |   3751 |   4256 |   4428 |   4541 |   4908 |   4435 |  34055 |
| 3   |    nan |     21 |    183 |    735 |    788 |    812 |    912 |   1881 |   3896 |   4611 |   6835 |   7278 |   7689 |   7920 |   8209 |   7319 |  59089 |
| 4   |    nan |     42 |    428 |   1649 |   1853 |   1811 |   2196 |   4073 |   7731 |   9653 |  14739 |  15214 |  15613 |  15352 |  16112 |  14486 | 120952 |
| 5   |      2 |    122 |   1270 |   4121 |   4295 |   4805 |   6375 |  13263 |  24742 |  30601 |  47850 |  48959 |  53337 |  51538 |  56580 |  56052 | 403912 |
| Sum |      4 |    196 |   2072 |   7203 |   7759 |   8333 |  10612 |  21980 |  42450 |  51678 |  78514 |  81779 |  87942 |  86315 |  93508 |  89928 | 670273 |

### Duplicates with same year and star: Re-add one review

In [12]:
# Handle the groups of reviews that share the same text (one group per entry of dup.get_data()):
# - If all reviews of a group have the same year and star, add the first one to the
#   collected non-duplicates (ys_no_dups) as the single representative of the group.
# - Otherwise collect the whole group in non_equal; none of its reviews is re-added.
# NOTE: This cell appends to ys_no_dups in place. Re-running it without re-running the
# "Collect non-duplicates" cell first adds every representative a second time.
non_equal = []
added = 0
affected_reviews = 0
for text_dups in dup.get_data():
    if not text_dups:
        # Empty group: there is no first review that could be re-added
        continue
    first_tup = nys.get_by_number(text_dups[0])
    first_year = first_tup[NumbersYearsStars.KEY_YEAR]
    first_star = first_tup[NumbersYearsStars.KEY_STAR]
    equal = True
    for rev_no in text_dups[1:]:
        tup = nys.get_by_number(rev_no)
        if (tup[NumbersYearsStars.KEY_YEAR] != first_year or
                tup[NumbersYearsStars.KEY_STAR] != first_star):
            equal = False
            # One mismatch decides the group; skip the remaining lookups
            break
    if equal:
        add_tuple(ys_no_dups, first_tup)
        added += 1
        affected_reviews += len(text_dups)
    else:
        non_equal.append(text_dups)
print('Added:', added)
print('Affected reviews:', affected_reviews)
print('Non-equal:', len(non_equal))
# Added:            1,238,812
# Affected reviews: 7,230,115
# Non-equal:            1,010

Added: 1238812
Affected reviews: 7230115
Non-equal: 1010


In [13]:
# Print non-duplicates
# Print table
# Both table outputs are disabled; change a `False` to `True` to show the
# year-by-star table via printer.ipython_display() or to print it as markdown.
if False:
    printer.ipython_display(printer.get_dataframe_with_sums(ys_no_dups))
if False:
    print(printer.get_dataframe_markdown(printer.get_dataframe_with_sums(ys_no_dups), float_as_integer=True))
# Non-duplicates (670,273) plus one representative per same year-star group (1,238,812)
print('Reviews in ys_no_dups:', count_ysl(ys_no_dups))
# Reviews in ys_no_dups: 1,909,085

Reviews in ys_no_dups: 1909085


Non-duplicates and deduplicated same year-star reviews

|     |   1997 |   1998 |   1999 |   2000 |   2001 |   2002 |   2003 |   2004 |   2005 |   2006 |   2007 |   2008 |   2009 |   2010 |   2011 |   2012 |     Sum |
|:----|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|
| 1   |      2 |     27 |    649 |   2666 |   3239 |   3803 |   3988 |   7403 |  12084 |  11777 |  13566 |  15572 |  17299 |  19009 |  22677 |  25091 |  158852 |
| 2   |      1 |     37 |    490 |   2352 |   2800 |   3312 |   3698 |   5610 |   8458 |   8731 |  10469 |  11167 |  11684 |  12452 |  14418 |  14740 |  110419 |
| 3   |      1 |     76 |   1018 |   4495 |   5176 |   5711 |   6609 |  10100 |  14487 |  14909 |  19070 |  19463 |  20491 |  20737 |  22797 |  22891 |  188031 |
| 4   |      4 |    163 |   2262 |  10246 |  11664 |  12738 |  14065 |  20472 |  28025 |  30436 |  41108 |  40461 |  40707 |  40025 |  44252 |  43955 |  380583 |
| 5   |     14 |    578 |   7498 |  25981 |  27126 |  30466 |  33612 |  48732 |  69059 |  76909 | 116305 | 111772 | 120515 | 121349 | 138609 | 142675 | 1071200 |
| Sum |     22 |    881 |  11917 |  45740 |  50005 |  56030 |  61972 |  92317 | 132113 | 142762 | 200518 | 198435 | 210696 | 213572 | 242753 | 249352 | 1909085 |

### Overview of non-deduplicated reviews

In [14]:
# Non-equal: Print overview
# Counts the reviews contained in the groups that were NOT deduplicated because
# their members differ in year and/or star. The `if False` blocks are manual
# debug toggles; with all of them disabled only the two summary lines are printed.
different_year_star = 0
# Set by the "Print years" toggle so that a line break follows the printed years.
# It is never reset, i.e. once True it stays True for all following groups.
print_eol = False
for ne_list in non_equal:
    different_year_star += len(ne_list)
    if False: # Print list of review-numbers
        print(ne_list)
    # Stars of the current group; stays empty while the "Print stars" toggle is off
    star_print = []
    for rev_no in ne_list:
        # NOTE(review): this lookup is performed even when both toggles below are off
        tup = nys.get_by_number(rev_no)
        if False: # Print stars
            star_print.append(tup[NumbersYearsStars.KEY_STAR])
        if False: # Print years
            print(tup[NumbersYearsStars.KEY_YEAR], end=' ')
            print_eol = True
    if(print_eol):
        print()
    if(len(star_print) != 0):
        print(star_print)
print('Reviews with different year-star:', different_year_star)
print('Inside list:', len(non_equal))
# Reviews with different year-star: 11,296
# Inside list: 1,010

Reviews with different year-star: 11296
Inside list: 1010


In [None]:
# Show the first non-equal group in full as an example
if non_equal:
    print('Example:')
    print(0, '- Affected reviews of duplicate:', len(non_equal[0]), '- First review number:', non_equal[0][0], non_equal[0])
    print()

# Group the non-equal duplicate groups by their size:
# number of affected reviews -> enumeration indices into non_equal
non_equal_dup_to_rev = {}
for i, ne_list in enumerate(non_equal):
    non_equal_dup_to_rev.setdefault(len(ne_list), []).append(i)
# Order by group size for the printed overview
non_equal_dup_to_rev = OrderedDict(sorted(non_equal_dup_to_rev.items()))

print('Affected reviews of duplicate, number of duplicates, example enumeration index:')
for c, indices in non_equal_dup_to_rev.items():
    print(c, len(indices), indices[0], end=' | ')
print()

print()
print('Examples:')
# Hand-picked enumeration indices. In the recorded run they are the example
# indices of the group sizes 4, 2, 3, 155, 216 and 163 (three smallest and
# three largest sizes). They are printed in ascending index order.
example_indices = (4, 7, 16, 29, 112, 147)
for i in example_indices:
    if i < len(non_equal):
        ne_list = non_equal[i]
        print(i, '- Affected reviews of duplicate:', len(ne_list), '- First review number:', ne_list[0])

```
Example:
0 - Affected reviews of duplicate: 10 - First review number: 410 [410, 412, 2664455, 2664457, 4377432, 4377434, 5372483, 5372485, 6458827, 6458829]

Affected reviews of duplicate, number of duplicates, example enumeration index:
2 176 7 | 3 95 16 | 4 127 4 | 5 59 91 | 6 88 1 | 7 31 57 | 8 56 22 | 9 29 40 | 10 37 0 | 11 14 28 | 12 37 19 | 13 12 47 | 14 25 8 | 15 7 87 | 16 16 23 | 17 3 215 | 18 24 11 | 19 6 45 | 20 17 3 | 21 9 39 | 22 16 2 | 23 1 117 | 24 12 74 | 25 6 76 | 26 6 41 | 27 3 234 | 28 12 10 | 29 2 31 | 30 12 42 | 31 1 193 | 32 4 175 | 33 4 15 | 34 6 84 | 35 3 9 | 36 5 168 | 38 5 99 | 39 1 876 | 41 2 111 | 42 8 17 | 43 1 32 | 44 5 27 | 45 1 36 | 46 3 56 | 47 1 409 | 50 2 210 | 51 1 179 | 54 3 14 | 55 1 126 | 56 1 355 | 59 1 153 | 60 1 252 | 62 2 30 | 65 1 135 | 66 1 343 | 69 1 73 | 70 1 51 | 72 1 410 | 80 1 88 | 94 1 26 | 155 1 29 | 163 1 147 | 216 1 112 | 

Examples:
4 - Affected reviews of duplicate: 4 - First review number: 4901
7 - Affected reviews of duplicate: 2 - First review number: 7035
16 - Affected reviews of duplicate: 3 - First review number: 20359
29 - Affected reviews of duplicate: 155 - First review number: 34896
112 - Affected reviews of duplicate: 216 - First review number: 121943
147 - Affected reviews of duplicate: 163 - First review number: 179065
```

In [16]:
# Write cache
# Stores the deduplicated year -> star -> reviews mapping under the name 'deduplicated'
# NOTE(review): the value returned by get_filepath() is discarded, because the
# expression is not the last statement of the cell and is not printed either.
InterimStorage('deduplicated').write(ys_no_dups).get_filepath()
print('Reviews in ys_no_dups:', count_ysl(ys_no_dups))
# Reviews in ys_no_dups: 1,909,085

Reviews in ys_no_dups: 1909085


## Filter by opinion words

In [17]:
# Read cache
# Reloads the mapping written under the name 'deduplicated' and rebinds ys_no_dups,
# so the opinion-word filtering below can start from here.
ys_no_dups = InterimStorage('deduplicated').read()
print('Reviews in ys_no_dups:', count_ysl(ys_no_dups))
# Reviews in ys_no_dups: 1,909,085

Reviews in ys_no_dups: 1909085


In [18]:
def print_examples(ysl, posstar_max=10, negstar_max=10):
    """Print opinion-word counts for a few negative and positive reviews.

    Walks ``ysl`` (year -> star -> list of review tuples) in iteration order and
    prints one line per selected review: star, the two opinion counts and the
    review number. Reviews with 1 or 2 stars count as negative examples, reviews
    with 4 or 5 stars as positive examples; other ratings are never printed.

    The counts are read from the notebook-level ``opinion_counts`` object, which
    has to exist when this function is called.

    Args:
        ysl: Nested mapping ``{year: {star: [tup, ...]}}``.
        posstar_max: Maximum number of 4/5-star examples to print.
        negstar_max: Maximum number of 1/2-star examples to print.
    """
    print('Star   existent occurences   review-number')
    shown_pos = 0
    shown_neg = 0
    for stars in ysl.values():
        for star, tuples in stars.items():
            for tup in tuples:
                take_neg = shown_neg < negstar_max and star in [1, 2]
                take_pos = (not take_neg) and shown_pos < posstar_max and star in [4, 5]
                if take_neg or take_pos:
                    number = tup[NumbersYearsStars.KEY_NUMBER]
                    print(star, ' ',
                          opinion_counts.get_existent(number),
                          opinion_counts.get_occurences(number),
                          ' ', number)
                    if take_neg:
                        shown_neg += 1
                    else:
                        shown_pos += 1
                # Stop the whole walk as soon as both quotas are filled
                if shown_pos >= posstar_max and shown_neg >= negstar_max:
                    return

In [19]:
# Opinion-word counts per review number. print_examples() and the filter cell
# below read this notebook-level name, so it must be defined before they run.
opinion_counts = OpinionCounts(file_storage.get_filepath('AMORE-OpinionCounts'))

In [20]:
# Examples before filtering: the list still contains 5-star reviews with zero or
# negative counts and 1-star reviews with zero or positive counts.
print_examples(ys_no_dups, posstar_max=19, negstar_max=8)

Star   existent occurences   review-number
5   8 10   382
5   4 2   384
5   -11 -8   385
5   1 4   395
5   1 1   2972
5   5 4   2973
5   3 3   3357
5   2 2   4365
5   2 2   6297
5   9 9   6310
5   -1 0   6313
5   -1 -1   6314
5   2 2   6321
5   16 11   6588
5   14 11   6590
5   9 8   6593
5   4 4   6604
5   8 5   6605
5   0 0   7153
1   -1 -1   4368
1   -2 -3   6326
1   -6 -5   7371
1   -4 -4   9867
1   -5 -5   9868
1   -3 -3   12570
1   1 0   16501
1   0 0   16502


In [21]:
# Keep only reviews whose opinion-word counts agree with their star rating:
#   1 or 2 stars -> both counts negative
#   4 or 5 stars -> both counts positive
# 3-star reviews (and any other rating) are dropped.
time_begin = timeit.default_timer()
ys_opinion_lists = {}
for year, stars in ys_no_dups.items():
    for star, tuples in stars.items():
        for tup in tuples:
            rating = tup[NumbersYearsStars.KEY_STAR]
            if rating not in [1, 2, 4, 5]:
                continue
            number = tup[NumbersYearsStars.KEY_NUMBER]
            if rating in [1, 2]:
                keep = (opinion_counts.get_existent(number) < 0 and
                        opinion_counts.get_occurences(number) < 0)
            else:
                keep = (opinion_counts.get_existent(number) > 0 and
                        opinion_counts.get_occurences(number) > 0)
            if keep:
                add_tuple(ys_opinion_lists, tup)
print('Runtime:', timeit.default_timer() - time_begin)

Runtime: 4.798527679871768


In [22]:
# Same example listing as above, now on the opinion-filtered reviews
print_examples(ys_opinion_lists, posstar_max=19, negstar_max=8)

Star   existent occurences   review-number
5   8 10   382
5   4 2   384
5   1 4   395
5   1 1   2972
5   5 4   2973
5   3 3   3357
5   2 2   4365
5   2 2   6297
5   9 9   6310
5   2 2   6321
5   16 11   6588
5   14 11   6590
5   9 8   6593
5   4 4   6604
5   8 5   6605
5   1 1   7157
5   5 3   7242
5   2 2   7708
5   3 3   7743
1   -1 -1   4368
1   -2 -3   6326
1   -6 -5   7371
1   -4 -4   9867
1   -5 -5   9868
1   -3 -3   12570
1   -1 -1   25562
1   -10 -11   30240


In [28]:
# Print table
# Both table outputs are disabled; change a `False` to `True` to show the
# year-by-star table via printer.ipython_display() or to print it as markdown.
if False:
    printer.ipython_display(printer.get_dataframe_with_sums(ys_opinion_lists))
if False:
    print(printer.get_dataframe_markdown(printer.get_dataframe_with_sums(ys_opinion_lists), float_as_integer=True))
# The label names the variable that is actually counted (it used to say ys_no_dups)
print('Reviews in ys_opinion_lists:', count_ysl(ys_opinion_lists))
# Reviews in ys_opinion_lists: 1,301,417

Reviews in ys_no_dups: 1301417


Deduplicated and filtered by opinion words

|     |   1997 |   1998 |   1999 |   2000 |   2001 |   2002 |   2003 |   2004 |   2005 |   2006 |   2007 |   2008 |   2009 |   2010 |   2011 |   2012 |     Sum |
|:----|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|
| 1   |      2 |     15 |    398 |   1693 |   1902 |   2344 |   2453 |   4624 |   7431 |   7095 |   7980 |   9084 |  10031 |  11196 |  13352 |  14953 |   94553 |
| 2   |    nan |     12 |    200 |    980 |   1167 |   1425 |   1567 |   2422 |   3758 |   3787 |   4362 |   4507 |   4726 |   5273 |   6041 |   6408 |   46635 |
| 4   |      2 |    114 |   1685 |   7381 |   8386 |   9012 |   9823 |  14062 |  19369 |  21545 |  30384 |  29815 |  30389 |  29409 |  32888 |  32913 |  277177 |
| 5   |     12 |    478 |   6056 |  20592 |  21447 |  23971 |  26334 |  37813 |  54023 |  61434 |  96439 |  93463 | 101591 | 101680 | 117114 | 120605 |  883052 |
| Sum |     16 |    619 |   8339 |  30646 |  32902 |  36752 |  40177 |  58921 |  84581 |  93861 | 139165 | 136869 | 146737 | 147558 | 169395 | 174879 | 1301417 |

In [30]:
# Write cache
# Stores the deduplicated and opinion-filtered mapping under the name
# 'deduplicated_opinion-filtered'.
# NOTE(review): the value returned by get_filepath() is discarded, because the
# expression is not the last statement of the cell and is not printed either.
InterimStorage('deduplicated_opinion-filtered').write(ys_opinion_lists).get_filepath()
# The label names the variable that is actually counted (it used to say ys_no_dups)
print('Reviews in ys_opinion_lists:', count_ysl(ys_opinion_lists))
# Reviews in ys_opinion_lists: 1,301,417

Reviews in ys_no_dups: 1301417
