# Querying tables

> Objectives:
> * Compare queries of tabular data for **in-memory** containers
> * Compare sizes and times for those

In [None]:
from ipython_memwatcher import MemWatcher
# Create a memory watcher and switch it on at the very top of the notebook.
# NOTE(review): presumably it reports memory usage after each executed cell — confirm
# against the ipython_memwatcher documentation.
mw = MemWatcher()
mw.start_watching_memory()

In [None]:
import os

# Directory holding the dataset, plus the paths to the ratings and movies files in it
dset = 'movielens-1m'
fdata, fitem = (os.path.join(dset, fname)
                for fname in ('ratings.dat.gz', 'movies.dat'))

In [None]:
import pandas as pd

# Ratings: gzip-compressed, ';'-separated file read with explicit column names
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(fdata, names=r_cols, sep=';', compression='gzip')

# Movies: ';'-separated file; the two text columns are forced to 'object' dtype
m_cols = ['movie_id', 'title', 'genres']
movies = pd.read_csv(fitem, names=m_cols, sep=';',
                     dtype=dict(title=object, genres=object))

In [None]:
# Join movies (left) with ratings (right) on the column name(s) they share
lens = movies.merge(ratings)

In [None]:
# Peek at the first ten rows, then summarise columns, dtypes and memory
print(lens.head(10))
lens.info()

In [None]:
# Size of the pandas DataFrame in MB (2**20 bytes), index included.
# `index=True` is an argument of `memory_usage()`, not of `sum()`: passing it to
# `sum()` makes recent pandas versions raise a TypeError.
# Note: this is the shallow size; for 'object' columns only the references are
# counted, not the Python strings they point to.
size_pandas = lens.memory_usage(index=True).sum() / 2**20.
size_pandas

In [None]:
# Run the query once and keep the matching user ids for the sanity check done later.
result = lens.query("(title == 'Tom and Huck (1995)') & (rating == 5)")['user_id']
# Time the very same expression; `-o` makes %timeit return its result object,
# which is stored in `t` so the timings can be read back afterwards.
t = %timeit -o lens.query("(title == 'Tom and Huck (1995)') & (rating == 5)")['user_id']
result

In [None]:
# Keep the best time measured by %timeit for the pandas query (used in the final comparison).
qtime_pandas = t.best

## Use a compressed in-memory container via `bcolz`

In [None]:
import bcolz
# Print version information, useful when comparing results across machines.
bcolz.print_versions()
# Default compression parameters for containers created from here on:
# 'lz4' as `cname` and 5 as `clevel`.
# NOTE(review): presumably `cname` is the compressor name and `clevel` the
# compression level — confirm against the bcolz documentation.
bcolz.defaults.cparams['cname'] = 'lz4'
bcolz.defaults.cparams['clevel'] = 5
# Let bcolz use 4 threads.
bcolz.set_nthreads(4)

In [None]:
# Build a bcolz ctable from the pandas DataFrame.
zlens = bcolz.ctable.fromdataframe(lens)

Sometimes the reported memory usage is heavily biased (the reason is unclear), so let's repeat the operation, this time using a different container:

In [None]:
# Build a second ctable from the same DataFrame; this only repeats the conversion
# under a different name and `zlens2` is not used by the cells shown afterwards.
zlens2 = bcolz.ctable.fromdataframe(lens)

In [None]:
# Size of the bcolz container (its `cbytes`) expressed in MB (2**20 bytes)
size_bcolz = zlens.cbytes / (1 << 20)

In [None]:
# Display the ctable's representation.
zlens

In [None]:
# Ratio of the pandas size to the bcolz size (how many times smaller bcolz is).
size_pandas / size_bcolz

We can see that the space taken by a bcolz container is around 7x smaller (!) than a pandas one.

### Exercise

Why do you think that the number of uncompressed bytes (nbytes) that the ctable reports is 3x more than pandas (153 MB vs 54 MB)?

*Hint:* Pandas stores the string columns in NumPy containers with 'object' dtype whereas bcolz uses the equivalent to NumPy's 'string' objects.

In [None]:
# Querying a bcolz dataset
# Run the query once and collect (nrow__, user_id) pairs for the sanity check done later.
# The title is compared against a bytes literal (b'...').
# NOTE(review): `nrow__` is presumably the row number of each match — confirm in the bcolz docs.
resultz = [(r.nrow__, r.user_id) for r in zlens.where("(title == b'Tom and Huck (1995)') & (rating == 5)", outcols=['nrow__', 'user_id'])]
# Time the very same expression; `-o` makes %timeit return its result object.
t = %timeit -o [(r.nrow__, r.user_id) for r in zlens.where("(title == b'Tom and Huck (1995)') & (rating == 5)", outcols=['nrow__', 'user_id'])]

In [None]:
# Keep the best time measured by %timeit for the bcolz query (used in the final comparison).
qtime_bcolz = t.best

In [None]:
# Ratio of the best pandas query time to the best bcolz query time (> 1 means bcolz is faster).
qtime_pandas / qtime_bcolz

We see that bcolz containers, besides keeping the data compressed, provide ~2x faster query times than pandas.

In [None]:
# Sanity check (always check for your results!)
# Print both query results, each preceded by the container it came from.
for label, value in (("results with pandas Dataframe:", result),
                     ("results with bcolz ctable:", resultz)):
    print(label, value)

## Using structured NumPy arrays

In [None]:
# Convert the DataFrame into a NumPy record array; the index is kept as a field
# (this is the default, spelled out here).
nalens = lens.to_records(index=True)

In [None]:
# Size of the record array buffer in MB (2**20 bytes): number of records times bytes per record
size_numpy = nalens.nbytes / 2**20

In [None]:
# Run the query once: combine two boolean masks (title match AND rating == 5)
# and use the result to select the matching records.
resultna = nalens[(nalens['title'] == 'Tom and Huck (1995)') & (nalens['rating'] == 5)]
# Time the very same expression; `-o` makes %timeit return its result object.
t = %timeit -o nalens[(nalens['title'] == 'Tom and Huck (1995)') & (nalens['rating'] == 5)]
resultna

Again, NumPy works the fastest for in-memory data containers, while memory consumption is close to pandas.

In [None]:
# Keep the best time measured by %timeit for the NumPy query (used in the final comparison).
qtime_numpy = t.best

## Performance comparison

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Query times
# One row per container, holding the best query time measured above.
labels = ["pandas", "bcolz", "numpy (recarray)"]
df = pd.DataFrame({'time (sec)': [qtime_pandas, qtime_bcolz, qtime_numpy]}, index=labels)
try:
    # Styling option that only exists in old pandas releases.
    pd.options.display.mpl_style = 'default'
except AttributeError:
    # The option was deprecated and later removed from pandas; assigning it then
    # raises OptionError (a subclass of AttributeError). The styling is purely
    # cosmetic, so fall back to matplotlib's current style instead of failing.
    pass
# Horizontal bar chart of the query times.
df.plot(kind='barh', figsize=(12,5), fontsize=16, title="Query times for MovieLens 1m (in-memory)")

## Size comparison

In [None]:
# Container sizes
# One row per container (same `labels` as the query-times plot), holding its size in MB.
df = pd.DataFrame({'size (MB)': [size_pandas, size_bcolz, size_numpy]}, index=labels)
try:
    # Styling option that only exists in old pandas releases.
    pd.options.display.mpl_style = 'default'
except AttributeError:
    # The option was deprecated and later removed from pandas; assigning it then
    # raises OptionError (a subclass of AttributeError). The styling is purely
    # cosmetic, so fall back to matplotlib's current style instead of failing.
    pass
# Horizontal bar chart of the container sizes.
df.plot(kind='barh', figsize=(12,5), fontsize=16, title="Container sizes for MovieLens 1m")

## Rules of thumb for querying in-memory tabular datasets

* Choose pure NumPy recarrays if you need the fastest speed
* Choose bcolz ctables if you need to store lots of data in limited memory and do not want to lose too much speed
* Choose pandas if what you need is rich functionality on top of your containers (at the penalty of some speed)