# Filtering: Deduplication

Reads `opinion-filtered.pickle.bz2` and writes `deduplicated.pickle.bz2` and `non-deduplicated.pickle.bz2`.

Data format: `{year: {star: [(number, year, star), ...]}}` — a dict keyed by year, then by star, whose values are lists of review tuples.

In [1]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from project root
import sys; sys.path.insert(0, '../')

from collections import OrderedDict

from access.file_storage import FileStorage
from access.interim_storage import InterimStorage
from amore.printer import Printer
from readers.numbers_years_stars import NumbersYearsStars
from readers.text_duplicates import TextDuplicates

# Index positions of the fields inside a review tuple `(number, year, star)`.
KEY_NUMBER = 0
KEY_YEAR   = 1
KEY_STAR   = 2

In [2]:
# Shared helpers for the cells below: `file_storage` resolves data file paths
# (via `get_filepath`), `printer` builds and renders the overview tables.
file_storage = FileStorage()
printer = Printer()

In [3]:
def count_ysl(ysl):
    """Return the total number of entries over all year/star lists.

    `ysl` is a two-level mapping `{year: {star: [entries]}}`; the lengths of
    all innermost lists are added up. An empty mapping yields 0.
    """
    return sum(
        len(entries)
        for by_star in ysl.values()
        for entries in by_star.values()
    )

In [4]:
def add_tuple(dict_, tup):
    """Append `tup` to `dict_[year][star]`, creating missing levels on demand.

    Year and star are taken from `tup` at positions `KEY_YEAR` and `KEY_STAR`.
    `dict_` is modified in place; nothing is returned.
    """
    # Resolve the year level first, then the star level, so the nesting is
    # built in the same order as the lookups.
    by_star = dict_.setdefault(tup[KEY_YEAR], {})
    by_star.setdefault(tup[KEY_STAR], []).append(tup)

## Read data

In [5]:
# Load the opinion-filtered reviews from the interim cache
opinion_storage = InterimStorage('opinion-filtered')
ys_opinion_lists = opinion_storage.read()
print('Reviews in ys_opinion_lists:', count_ysl(ys_opinion_lists))
# Reviews in ys_opinion_lists: 5,483,175

Reviews in ys_opinion_lists: 5483175


In [6]:
# Print table
# Both table renderings are switched off by hand; change the matching
# `if False` to `if True` to display the table in the notebook or to print the
# string returned by `get_dataframe_markdown`.
if False:
    printer.ipython_display(printer.get_dataframe_with_sums(ys_opinion_lists))
if False:
    print(printer.get_dataframe_markdown(printer.get_dataframe_with_sums(ys_opinion_lists), float_as_integer=True))
# The total is always printed as a sanity check against the cached input.
print('Reviews in ys_opinion_lists:', count_ysl(ys_opinion_lists))
# Reviews in ys_opinion_lists: 5,483,175

Reviews in ys_opinion_lists: 5483175


Opinion

|     |   1997 |   1998 |   1999 |   2000 |   2001 |   2002 |   2003 |   2004 |   2005 |   2006 |   2007 |   2008 |   2009 |   2010 |   2011 |   2012 |     Sum |
|:----|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|
| 1   |      6 |    104 |   3110 |  12698 |  14751 |  15934 |  16611 |  26835 |  35093 |  29878 |  30496 |  33972 |  35420 |  39177 |  43552 |  48432 |  386069 |
| 2   |    nan |    122 |   1589 |   7820 |   8866 |  10127 |  10742 |  15439 |  19141 |  17152 |  17724 |  17271 |  16641 |  18523 |  20105 |  21036 |  202298 |
| 3   |    nan |    107 |   1535 |   6152 |   6376 |   6280 |   6917 |  10403 |  12668 |  11947 |  16364 |  16218 |  17691 |  17052 |  18243 |  19266 |  167219 |
| 4   |     13 |    599 |  10429 |  52130 |  56607 |  58688 |  62256 |  80602 |  93419 |  93912 | 121336 | 117024 | 110548 | 102739 | 109983 | 112609 | 1182894 |
| 5   |     52 |   2694 |  39621 | 150126 | 148077 | 153886 | 157562 | 194587 | 234448 | 242242 | 370047 | 341264 | 352366 | 353522 | 393641 | 410560 | 3544695 |
| Sum |     71 |   3626 |  56284 | 228926 | 234677 | 244915 | 254088 | 327866 | 394769 | 395131 | 555967 | 525749 | 532666 | 531013 | 585524 | 611903 | 5483175 |

## Collect non-duplicates

In [7]:
# Load the lists of duplicate texts; each entry is a list of review numbers,
# e.g. dup.get_data()[0] is [1, 5615911]
text_duplicates_path = file_storage.get_filepath('AMORE-TextDuplicates')
dup = TextDuplicates(text_duplicates_path)
print('Duplicates of texts:', len(dup.get_data()), type(dup.get_data()))
# Duplicates of texts: 1,239,822

Duplicates of texts: 1239822 <class 'list'>


In [8]:
# Gather the IDs (numbers) of all reviews that occur in any duplicate list
# (= affected reviews); the set removes repeated numbers.
dup_ids = set()
for text_dups in dup.get_data():
    dup_ids.update(text_dups)
print('Affected reviews:', len(dup_ids))
# Affected reviews: 7,241,411

Affected reviews: 7241411


In [9]:
# Keep every input review whose number is not part of any duplicate list
ys_no_dups = {}
added = 0
for star_lists in ys_opinion_lists.values():
    for reviews in star_lists.values():
        for tup in reviews:
            if tup[KEY_NUMBER] in dup_ids:
                # Review text occurs more than once; handled in a later step.
                continue
            add_tuple(ys_no_dups, tup)
            added += 1
print('Added:', added)
# Added: 489,009

Added: 489009


In [10]:
# Print non-duplicates
# Print table
# Both table renderings are switched off by hand; change the matching
# `if False` to `if True` to display the table in the notebook or to print the
# string returned by `get_dataframe_markdown`.
if False:
    printer.ipython_display(printer.get_dataframe_with_sums(ys_no_dups))
if False:
    print(printer.get_dataframe_markdown(printer.get_dataframe_with_sums(ys_no_dups), float_as_integer=True))
# The total is always printed; it should equal the 'Added' count of the
# collecting step.
print('Reviews in ys_no_dups:', count_ysl(ys_no_dups))
# Reviews in ys_no_dups: 489,009

Reviews in ys_no_dups: 489009


Non-duplicates

|     |   1997 |   1998 |   1999 |   2000 |   2001 |   2002 |   2003 |   2004 |   2005 |   2006 |   2007 |   2008 |   2009 |   2010 |   2011 |   2012 |    Sum |
|:----|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|
| 1   |      1 |      2 |     62 |    239 |    215 |    289 |    315 |    886 |   2171 |   2299 |   2898 |   3161 |   3726 |   3777 |   4154 |   4115 |  28310 |
| 2   |    nan |      1 |     24 |    115 |    125 |    146 |    181 |    387 |    834 |    999 |   1349 |   1506 |   1590 |   1625 |   1853 |   1736 |  12471 |
| 3   |    nan |     10 |     45 |    172 |    174 |    165 |    163 |    373 |    829 |   1024 |   1697 |   1759 |   2033 |   2108 |   2208 |   2021 |  14781 |
| 4   |    nan |     25 |    332 |   1235 |   1405 |   1330 |   1597 |   2965 |   5664 |   7134 |  11295 |  11591 |  11995 |  11735 |  12252 |  11059 |  91614 |
| 5   |      2 |    105 |   1038 |   3344 |   3463 |   3915 |   5179 |  10753 |  20128 |  25311 |  40497 |  41642 |  45820 |  44146 |  48542 |  47948 | 341833 |
| Sum |      3 |    143 |   1501 |   5105 |   5382 |   5845 |   7435 |  15364 |  29626 |  36767 |  57736 |  59659 |  65164 |  63391 |  69009 |  66879 | 489009 |

## Duplicates with same year and star: Re-add one review

In [11]:
# For every list of duplicate texts: if all of its reviews share the same year
# and star, add the first review to the collected non-duplicates (ys_no_dups)
# as the single representative. Lists whose reviews disagree on year or star
# are collected in non_equal instead.
#
# NOTE(review): the representative tuple comes from `nys` and is never checked
# against ys_opinion_lists, so reviews that are not part of the
# opinion-filtered input can end up in ys_no_dups (the resulting table shows a
# 1997 / 3-star review although the input table has none). Confirm whether
# this is intended before relying on the counts.
nys = NumbersYearsStars(file_storage.get_filepath('AMORE-NumbersYearsStars'))
non_equal = []
added = 0
affected_reviews = 0
for text_dups in dup.get_data():
    first_tup = None
    equal = True
    for rev_no in text_dups:
        tup = nys.get_by_number(rev_no)
        if first_tup is None:
            # The first review of the list is the reference for comparison.
            first_tup = tup
        elif (tup[KEY_YEAR] != first_tup[KEY_YEAR] or
              tup[KEY_STAR] != first_tup[KEY_STAR]):
            # A single mismatch settles the list; skip the remaining lookups.
            equal = False
            break
    if not equal:
        non_equal.append(text_dups)
        continue
    add_tuple(ys_no_dups, first_tup)
    added += 1
    affected_reviews += len(text_dups)
print('Added:', added)
print('Affected reviews:', affected_reviews)
print('Non-equal:', len(non_equal))
# Added:            1,238,812
# Affected reviews: 7,230,115
# Non-equal:            1,010

Added: 1238812
Affected reviews: 7230115
Non-equal: 1010


In [12]:
# Print non-duplicates
# Print table
# Both table renderings are switched off by hand; change the matching
# `if False` to `if True` to display the table in the notebook or to print the
# string returned by `get_dataframe_markdown`.
if False:
    printer.ipython_display(printer.get_dataframe_with_sums(ys_no_dups))
if False:
    print(printer.get_dataframe_markdown(printer.get_dataframe_with_sums(ys_no_dups), float_as_integer=True))
# The total now covers the non-duplicates plus one re-added review per
# duplicate list with equal year and star.
print('Reviews in ys_no_dups:', count_ysl(ys_no_dups))
# Reviews in ys_no_dups: 1,727,821

Reviews in ys_no_dups: 1727821


Non-duplicates and deduplicated same year-star reviews

|     |   1997 |   1998 |   1999 |   2000 |   2001 |   2002 |   2003 |   2004 |   2005 |   2006 |   2007 |   2008 |   2009 |   2010 |   2011 |   2012 |     Sum |
|:----|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|
| 1   |      2 |     26 |    597 |   2512 |   3015 |   3597 |   3689 |   6643 |  10413 |   9943 |  11125 |  12661 |  14150 |  15822 |  19132 |  21570 |  134897 |
| 2   |    nan |     30 |    437 |   2162 |   2541 |   3048 |   3364 |   4880 |   7053 |   7050 |   8067 |   8417 |   8846 |   9536 |  11363 |  12041 |   88835 |
| 3   |      1 |     65 |    880 |   3932 |   4562 |   5064 |   5860 |   8592 |  11420 |  11322 |  13932 |  13944 |  14835 |  14925 |  16796 |  17593 |  143723 |
| 4   |      4 |    146 |   2166 |   9832 |  11216 |  12257 |  13466 |  19364 |  25958 |  27917 |  37664 |  36838 |  37089 |  36408 |  40392 |  40528 |  351245 |
| 5   |     14 |    561 |   7266 |  25204 |  26294 |  29576 |  32416 |  46222 |  64445 |  71619 | 108952 | 104455 | 112998 | 113957 | 130571 | 134571 | 1009121 |
| Sum |     21 |    828 |  11346 |  43642 |  47628 |  53542 |  58795 |  85701 | 119289 | 127851 | 179740 | 176315 | 187918 | 190648 | 218254 | 226303 | 1727821 |

In [13]:
# Store the deduplicated reviews in the interim cache and show the file path
written_cache = InterimStorage('deduplicated').write(ys_no_dups)
print(written_cache.get_filepath())
print('Reviews in ys_no_dups:', count_ysl(ys_no_dups))
# Reviews in ys_no_dups: 1,727,821

/tmp/InterimStorage/deduplicated.pickle.bz2
Reviews in ys_no_dups: 1727821


## Overview of non-deduplicated reviews

In [14]:
# Non-equal: Print overview
# Debug switches, all off by default; turn one on to dump details per list.
show_review_numbers = False  # print each list of review numbers
show_stars = False           # print the stars of the reviews of each list
show_years = False           # print the years of the reviews of each list

different_year_star = 0
for ne_list in non_equal:
    different_year_star += len(ne_list)
    if show_review_numbers:
        print(ne_list)
    if not (show_stars or show_years):
        # Nothing is shown per review, so the lookups can be skipped.
        continue
    star_print = []
    printed_years = False  # tracked per list so the line break matches it
    for rev_no in ne_list:
        tup = nys.get_by_number(rev_no)
        if show_stars:
            star_print.append(tup[KEY_STAR])
        if show_years:
            print(tup[KEY_YEAR], end=' ')
            printed_years = True
    if printed_years:
        # Terminate the line of years printed with end=' '.
        print()
    if star_print:
        print(star_print)
print('Reviews with different year-star:', different_year_star)
print('Inside list:', len(non_equal))
# Reviews with different year-star: 11,296
# Inside list: 1,010

Reviews with different year-star: 11296
Inside list: 1010


In [15]:
# Group the non-equal duplicate lists by their size (number of affected
# reviews): size -> enumeration indices into non_equal, sorted by size.
non_equal_dup_to_rev = {}
for i, ne_list in enumerate(non_equal):
    if i == 0:
        # Show the very first list in full as an illustration.
        print('Example:')
        print(i, '- Affected reviews of duplicate:', len(ne_list), '- First review number:', ne_list[0], ne_list)
        print()
    non_equal_dup_to_rev.setdefault(len(ne_list), []).append(i)
non_equal_dup_to_rev = OrderedDict(sorted(non_equal_dup_to_rev.items()))

print('Affected reviews of duplicate, number of duplicates, example enumeration index:')
for size, indices in non_equal_dup_to_rev.items():
    # Per size: how many lists have it, and the index of the first such list.
    print(size, len(indices), indices[0], end=' | ')
print()

print()
print('Examples:')
# Hand-picked enumeration indices: in the recorded run these are the example
# indices of the three smallest (2, 3, 4) and three largest (155, 163, 216)
# sizes. They depend on the input data and must be re-picked if it changes.
example_indices = {4, 7, 16, 29, 112, 147}
for i, ne_list in enumerate(non_equal):
    if i in example_indices:
        print(i, '- Affected reviews of duplicate:', len(ne_list), '- First review number:', ne_list[0])

Example:
0 - Affected reviews of duplicate: 10 - First review number: 410 [410, 412, 2664455, 2664457, 4377432, 4377434, 5372483, 5372485, 6458827, 6458829]

Affected reviews of duplicate, number of duplicates, example enumeration index:
2 176 7 | 3 95 16 | 4 127 4 | 5 59 91 | 6 88 1 | 7 31 57 | 8 56 22 | 9 29 40 | 10 37 0 | 11 14 28 | 12 37 19 | 13 12 47 | 14 25 8 | 15 7 87 | 16 16 23 | 17 3 215 | 18 24 11 | 19 6 45 | 20 17 3 | 21 9 39 | 22 16 2 | 23 1 117 | 24 12 74 | 25 6 76 | 26 6 41 | 27 3 234 | 28 12 10 | 29 2 31 | 30 12 42 | 31 1 193 | 32 4 175 | 33 4 15 | 34 6 84 | 35 3 9 | 36 5 168 | 38 5 99 | 39 1 876 | 41 2 111 | 42 8 17 | 43 1 32 | 44 5 27 | 45 1 36 | 46 3 56 | 47 1 409 | 50 2 210 | 51 1 179 | 54 3 14 | 55 1 126 | 56 1 355 | 59 1 153 | 60 1 252 | 62 2 30 | 65 1 135 | 66 1 343 | 69 1 73 | 70 1 51 | 72 1 410 | 80 1 88 | 94 1 26 | 155 1 29 | 163 1 147 | 216 1 112 | 

Examples:
4 - Affected reviews of duplicate: 4 - First review number: 4901
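7 - Affected reviews of duplicate: 2 - First review number: 7035
16 - Affected reviews of duplicate: 3 - First review number: 20359
29 - Affected reviews of duplicate: 155 - First review number: 34896
112 - Affected reviews of duplicate: 216 - First review number: 121943
147 - Affected reviews of duplicate: 163 - First review number: 179065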

```
Example:
0 - Affected reviews of duplicate: 10 - First review number: 410 [410, 412, 2664455, 2664457, 4377432, 4377434, 5372483, 5372485, 6458827, 6458829]

Affected reviews of duplicate, number of duplicates, example enumeration index:
2 176 7 | 3 95 16 | 4 127 4 | 5 59 91 | 6 88 1 | 7 31 57 | 8 56 22 | 9 29 40 | 10 37 0 | 11 14 28 | 12 37 19 | 13 12 47 | 14 25 8 | 15 7 87 | 16 16 23 | 17 3 215 | 18 24 11 | 19 6 45 | 20 17 3 | 21 9 39 | 22 16 2 | 23 1 117 | 24 12 74 | 25 6 76 | 26 6 41 | 27 3 234 | 28 12 10 | 29 2 31 | 30 12 42 | 31 1 193 | 32 4 175 | 33 4 15 | 34 6 84 | 35 3 9 | 36 5 168 | 38 5 99 | 39 1 876 | 41 2 111 | 42 8 17 | 43 1 32 | 44 5 27 | 45 1 36 | 46 3 56 | 47 1 409 | 50 2 210 | 51 1 179 | 54 3 14 | 55 1 126 | 56 1 355 | 59 1 153 | 60 1 252 | 62 2 30 | 65 1 135 | 66 1 343 | 69 1 73 | 70 1 51 | 72 1 410 | 80 1 88 | 94 1 26 | 155 1 29 | 163 1 147 | 216 1 112 | 

Examples:
4 - Affected reviews of duplicate: 4 - First review number: 4901
7 - Affected reviews of duplicate: 2 - First review number: 7035
16 - Affected reviews of duplicate: 3 - First review number: 20359
29 - Affected reviews of duplicate: 155 - First review number: 34896
112 - Affected reviews of duplicate: 216 - First review number: 121943
147 - Affected reviews of duplicate: 163 - First review number: 179065
```

In [16]:
# Store the non-equal duplicate lists in the interim cache and show the file path
written_cache = InterimStorage('non-deduplicated').write(non_equal)
print(written_cache.get_filepath())
print('Duplicate texts in non_equal:', len(non_equal))
# Duplicate texts in non_equal: 1,010

/tmp/InterimStorage/non-deduplicated.pickle.bz2
Duplicate texts in non_equal: 1010
