# Filtering: Read original data

Reads original data and writes `original.pickle.bz2`.

Data format: `{year: {star: [(number, year, star), ...]}}`

In [1]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from project root
import sys; sys.path.insert(0, '../')

from access.file_storage import FileStorage
from access.interim_storage import InterimStorage
from amore.printer import Printer
from readers.numbers_years_stars import NumbersYearsStars

In [2]:
# Shared helper objects for the cells below: `file_storage` supplies the
# input file path and `printer` builds/prints the overview table.
file_storage = FileStorage()
printer = Printer()

In [3]:
def count_ysl(ysl):
    """Return the total number of entries held in a two-level mapping.

    ``ysl`` is keyed by year on the outer level and by star on the inner
    level; each leaf is a sized container whose length is added to the total.
    An empty mapping (or one with only empty leaves) yields 0.
    """
    return sum(
        len(entries)
        for by_star in ysl.values()
        for entries in by_star.values()
    )

In [4]:
# Create the reader on the path that file_storage returns for the key
# 'AMORE-NumbersYearsStars'.
nys = NumbersYearsStars(file_storage.get_filepath('AMORE-NumbersYearsStars'))

## Numbers overview

In [5]:
# Reads the data from file and also caches it
# NOTE(review): the caching presumably happens inside the reader — confirm.
# The result is the nested year -> star mapping that count_ysl() walks below.
ys_lists = nys.get_by_year_star()

In [6]:
# Print table
# Both table branches are switched off with `if False:`; change one to
# `if True:` to run it manually. Only the review count is printed by default.
if False:
    # Passes the dataframe from get_dataframe_with_sums(ys_lists) to
    # printer.ipython_display.
    printer.ipython_display(printer.get_dataframe_with_sums(ys_lists))
if False:
    # Prints the same dataframe through get_dataframe_markdown with
    # float_as_integer=True (presumably the source of the table pasted below — verify).
    print(printer.get_dataframe_markdown(printer.get_dataframe_with_sums(ys_lists), float_as_integer=True))
print('Reviews in ys_lists:', count_ysl(ys_lists))
# Recorded count for reference: Reviews in ys_lists: 7,911,684
# (the print itself shows the number without thousands separators)

Reviews in ys_lists: 7911684


Original

|     |   1997 |   1998 |   1999 |   2000 |   2001 |   2002 |   2003 |   2004 |   2005 |   2006 |   2007 |   2008 |   2009 |   2010 |   2011 |   2012 |     Sum |
|:----|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|
| 1   |      6 |    191 |   4844 |  19944 |  24221 |  25311 |  25734 |  41016 |  54744 |  49049 |  49521 |  56076 |  59099 |  65343 |  72957 |  81276 |  629332 |
| 2   |      1 |    262 |   3631 |  17808 |  20320 |  22641 |  24183 |  33117 |  40868 |  37992 |  40205 |  40138 |  39680 |  41430 |  45767 |  47356 |  455399 |
| 3   |      8 |    442 |   6458 |  30907 |  35395 |  37798 |  43323 |  60489 |  71012 |  66128 |  75239 |  74057 |  73178 |  70279 |  72055 |  74826 |  791594 |
| 4   |     29 |    797 |  14178 |  73314 |  79152 |  84276 |  90527 | 119160 | 138000 | 135581 | 167632 | 161693 | 149771 | 142000 | 148457 | 150248 | 1654815 |
| 5   |     64 |   3313 |  49866 | 192002 | 189638 | 198712 | 205916 | 257603 | 308080 | 311252 | 452009 | 412870 | 422403 | 426248 | 465918 | 484650 | 4380544 |
| Sum |    108 |   5005 |  78977 | 333975 | 348726 | 368738 | 389683 | 511385 | 612704 | 600002 | 784606 | 744834 | 744131 | 745300 | 805154 | 838356 | 7911684 |

In [7]:
# Write cache
# Writes ys_lists through InterimStorage('original') and prints the path
# reported by get_filepath() on the object that write() returns.
print(InterimStorage('original').write(ys_lists).get_filepath())
# Count again after writing; the recorded value matches the one before the write.
print('Reviews in ys_lists:', count_ysl(ys_lists))
# Recorded count for reference: Reviews in ys_lists: 7,911,684

/tmp/InterimStorage/original.pickle.bz2
Reviews in ys_lists: 7911684
