# Queries and Selections

## Load movielens datasets

In [1]:
import os
import numpy as np
import pandas as pd
import tables

In [2]:
! rm movielens-*norm-*.h5

In [3]:
# Import CSV files via pandas
# Both input files live in the dataset directory; ratings are gzip-compressed.
dset = 'movielens-1m'
fdata = os.path.join(dset, 'ratings.dat.gz')
fitem = os.path.join(dset, 'movies.dat')

# pass in column names for each CSV
# (fields are ';'-separated and the column names are supplied here via `names`)
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(fdata, sep=';', names=r_cols, compression='gzip')

# title and genres are explicitly read as object (string) columns
m_cols = ['movie_id', 'title', 'genres']
movies = pd.read_csv(fitem, sep=';', names=m_cols,
                     dtype={'title': object, 'genres': object})

In [4]:
movies.ftypes

movie_id     int64:dense
title       object:dense
genres      object:dense
dtype: object

In [5]:
ratings.ftypes

user_id           int64:dense
movie_id          int64:dense
rating            int64:dense
unix_timestamp    int64:dense
dtype: object

In [6]:
def to_hdf5_norm(ratings, movies, filename):
    """Write the ratings and movies DataFrames as two tables of one HDF5 file.

    The file is opened in "w" mode and receives two tables under the root
    group, ``/ratings`` and ``/movies``, both compressed with Blosc/Zstd at
    compression level 5.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Must provide the columns ``user_id``, ``movie_id``, ``rating`` and
        ``unix_timestamp``.
    movies : pandas.DataFrame
        Must provide the columns ``movie_id``, ``title`` and ``genres``.
    filename : str
        Path of the HDF5 file to create.
    """

    class Ratings(tables.IsDescription):
        # Row layout of /ratings; `pos` fixes the column order in the table.
        user_id = tables.Int32Col(pos=0)
        movie_id = tables.Int32Col(pos=1)
        rating = tables.Int8Col(pos=2)
        unix_timestamp = tables.Int64Col(pos=3)

    class Movies(tables.IsDescription):
        # Row layout of /movies; strings are fixed-size (100 and 50 bytes).
        movie_id = tables.Int32Col(pos=0)
        title = tables.StringCol(100, pos=1)
        genres = tables.StringCol(50, pos=2)

    filters = tables.Filters(complevel=5, complib="blosc:zstd")
    with tables.open_file(filename, "w", filters=filters) as f:
        table_ratings = f.create_table(f.root, "ratings", Ratings)
        # Pick the DataFrame columns by name, in the table's own column order.
        # This does not depend on the DataFrame's column order, nor on
        # DataFrame.ftypes, which was removed in pandas 1.0.
        table_ratings.append([ratings[col].values for col in table_ratings.colnames])
        table_movies = f.create_table(f.root, "movies", Movies)
        table_movies.append([movies[col].values for col in table_movies.colnames])

In [7]:
# Write the normalized layout (separate ratings and movies tables) to disk
h5norm = "movielens-norm.h5"
to_hdf5_norm(ratings, movies, h5norm)

In [8]:
# Inspect the normalized file: table descriptions plus the first 10 rows of each table
!ptdump -v -R0,10 movielens-norm.h5

/ (RootGroup) ''
/movies (Table(3883,), shuffle, blosc:zstd(5)) ''
  description := {
  "movie_id": Int32Col(shape=(), dflt=0, pos=0),
  "title": StringCol(itemsize=100, shape=(), dflt=b'', pos=1),
  "genres": StringCol(itemsize=50, shape=(), dflt=b'', pos=2)}
  byteorder := 'little'
  chunkshape := (425,)
  Data dump:
[0] (1, b'Toy Story (1995)', b"Animation|Children's|Comedy")
[1] (2, b'Jumanji (1995)', b"Adventure|Children's|Fantasy")
[2] (3, b'Grumpier Old Men (1995)', b'Comedy|Romance')
[3] (4, b'Waiting to Exhale (1995)', b'Comedy|Drama')
[4] (5, b'Father of the Bride Part II (1995)', b'Comedy')
[5] (6, b'Heat (1995)', b'Action|Crime|Thriller')
[6] (7, b'Sabrina (1995)', b'Comedy|Romance')
[7] (8, b'Tom and Huck (1995)', b"Adventure|Children's")
[8] (9, b'Sudden Death (1995)', b'Action')
[9] (10, b'GoldenEye (1995)', b'Action|Adventure|Thriller')
/ratings (Table(1000209,), shuffle, blosc:zstd(5)) ''
  description := {
  "user_id": Int32Col(shape=(), dflt=0, pos=0),
  "movie_id": 

In [9]:
# create one merged DataFrame
# (no `on`/`how` given: pandas joins on the column both frames share, movie_id,
# and keeps only rows present in both)
lens = pd.merge(movies, ratings)

In [10]:
lens.ftypes

movie_id           int64:dense
title             object:dense
genres            object:dense
user_id            int64:dense
rating             int64:dense
unix_timestamp     int64:dense
dtype: object

In [11]:
def to_hdf5_denorm(lens, filename):
    """Write the merged (denormalized) DataFrame as a single HDF5 table.

    The file is opened in "w" mode and receives one table, ``/lens``,
    compressed with Blosc/Zstd at compression level 5.  Only the columns
    named in the table description are written; any other column of `lens`
    (such as ``movie_id``) is left out.

    Parameters
    ----------
    lens : pandas.DataFrame
        Must provide the columns ``user_id``, ``rating``, ``unix_timestamp``,
        ``title`` and ``genres``.
    filename : str
        Path of the HDF5 file to create.
    """

    class Lens(tables.IsDescription):
        # Row layout of /lens; `pos` fixes the column order in the table.
        user_id = tables.Int32Col(pos=0)
        rating = tables.Int8Col(pos=1)
        unix_timestamp = tables.Int64Col(pos=2)
        title = tables.StringCol(100, pos=3)
        genres = tables.StringCol(50, pos=4)

    filters = tables.Filters(complevel=5, complib="blosc:zstd")
    with tables.open_file(filename, "w", filters=filters) as f:
        table_lens = f.create_table(f.root, "lens", Lens)
        # Take the column order from the created table itself, so the data
        # always lines up with the on-disk layout defined by `pos`.
        table_lens.append([lens[col].values for col in table_lens.colnames])


In [12]:
# Write the denormalized layout (one merged table) to disk
h5denorm = "movielens-denorm.h5"
to_hdf5_denorm(lens, h5denorm)

In [13]:
# Inspect the denormalized file: table description plus the first 10 rows
!ptdump -v -R0,10 movielens-denorm.h5

/ (RootGroup) ''
/lens (Table(1000209,), shuffle, blosc:zstd(5)) ''
  description := {
  "user_id": Int32Col(shape=(), dflt=0, pos=0),
  "rating": Int8Col(shape=(), dflt=0, pos=1),
  "unix_timestamp": Int64Col(shape=(), dflt=0, pos=2),
  "title": StringCol(itemsize=100, shape=(), dflt=b'', pos=3),
  "genres": StringCol(itemsize=50, shape=(), dflt=b'', pos=4)}
  byteorder := 'little'
  chunkshape := (402,)
  Data dump:
[0] (1, 5, 978824268, b'Toy Story (1995)', b"Animation|Children's|Comedy")
[1] (6, 4, 978237008, b'Toy Story (1995)', b"Animation|Children's|Comedy")
[2] (8, 4, 978233496, b'Toy Story (1995)', b"Animation|Children's|Comedy")
[3] (9, 5, 978225952, b'Toy Story (1995)', b"Animation|Children's|Comedy")
[4] (10, 5, 978226474, b'Toy Story (1995)', b"Animation|Children's|Comedy")
[5] (18, 4, 978154768, b'Toy Story (1995)', b"Animation|Children's|Comedy")
[6] (19, 5, 978555994, b'Toy Story (1995)', b"Animation|Children's|Comedy")
[7] (21, 3, 978139347, b'Toy Sto

In [14]:
# Compare the on-disk size of the two HDF5 layouts
!ls -lh movielens*.h5

-rw-r--r--  1 faltet  staff   5.4M May 11 10:01 movielens-denorm.h5
-rw-r--r--  1 faltet  staff   4.3M May 11 10:01 movielens-norm.h5


In [15]:
# Size of the source dataset directory, for reference
!du -sh movielens-1m

5.8M	movielens-1m


## Querying in PyTables

### Denormalized tables

In [16]:
# Open the denormalized file and grab a handle to its single table
h5file = tables.open_file(h5denorm)
h5lens = h5file.root.lens

In [17]:
# Display the table's metadata (row count, filters, column description)
h5lens

/lens (Table(1000209,), shuffle, blosc:zstd(5)) ''
  description := {
  "user_id": Int32Col(shape=(), dflt=0, pos=0),
  "rating": Int8Col(shape=(), dflt=0, pos=1),
  "unix_timestamp": Int64Col(shape=(), dflt=0, pos=2),
  "title": StringCol(itemsize=100, shape=(), dflt=b'', pos=3),
  "genres": StringCol(itemsize=50, shape=(), dflt=b'', pos=4)}
  byteorder := 'little'
  chunkshape := (402,)

In [18]:
%%time
# For each rating value 0..5, count the rows of the denormalized table that belong to
# 'Tom and Huck (1995)'.  `rt` in the condition string is not interpolated by Python;
# PyTables looks it up in the calling namespace when the query runs.
# NOTE: this rebinds `ratings`, which until here referred to the ratings DataFrame.
ratings = [0] * 6
for rt in range(0,6):
    ratings[rt] = sum(1 for r in h5lens.where("(title == b'Tom and Huck (1995)') & (rating == rt)"))

CPU times: user 951 ms, sys: 145 ms, total: 1.1 s
Wall time: 1.02 s


In [19]:
# Per-rating counts from the query above (list index = rating value)
ratings

[0, 4, 15, 28, 18, 3]

In [20]:
# Done with the denormalized file
h5file.close()

### Normalized tables

In [21]:
# Open the normalized file (via the h5norm path variable, as done for h5denorm)
# and grab handles to both of its tables
h5file = tables.open_file(h5norm)
h5ratings = h5file.root.ratings
h5movies = h5file.root.movies

In [22]:
# Display the ratings table's metadata
h5ratings

/ratings (Table(1000209,), shuffle, blosc:zstd(5)) ''
  description := {
  "user_id": Int32Col(shape=(), dflt=0, pos=0),
  "movie_id": Int32Col(shape=(), dflt=0, pos=1),
  "rating": Int8Col(shape=(), dflt=0, pos=2),
  "unix_timestamp": Int64Col(shape=(), dflt=0, pos=3)}
  byteorder := 'little'
  chunkshape := (3855,)

In [23]:
# Display the movies table's metadata
h5movies

/movies (Table(3883,), shuffle, blosc:zstd(5)) ''
  description := {
  "movie_id": Int32Col(shape=(), dflt=0, pos=0),
  "title": StringCol(itemsize=100, shape=(), dflt=b'', pos=1),
  "genres": StringCol(itemsize=50, shape=(), dflt=b'', pos=2)}
  byteorder := 'little'
  chunkshape := (425,)

In [24]:
%%time
# Normalized variant of the same count: look up the movie_id for the title in the
# movies table, then count the ratings rows with that movie_id and rating value.
# The title lookup is repeated on every loop iteration and is part of the measured time.
# `th_movie_id` and `rt` in the condition string are resolved by PyTables from the
# calling namespace.
ratings = [0] * 6
for rt in range(0,6):
    th_movie_id = [r['movie_id'] for r in h5movies.where("(title == b'Tom and Huck (1995)')")][0]
    ratings[rt] = sum(1 for r in h5ratings.where("(movie_id == th_movie_id) & (rating == rt)"))

CPU times: user 309 ms, sys: 42.7 ms, total: 352 ms
Wall time: 349 ms


In [25]:
# Per-rating counts from the normalized query (list index = rating value)
ratings

[0, 4, 15, 28, 18, 3]

In [26]:
# Done with the normalized file
h5file.close()

So, the query in the normalized version is more than 2x faster than using the denormalized file.  However, this is just a simple example, and in general experimentation should be done so as to determine the best layout for your data.

## Indexing

### Denormalized case

In [27]:
## Copy the original PyTables table into another file
# (indexes are then added to the copy, leaving the un-indexed file untouched)
import shutil
h5idx = "movielens-denorm-indexed.h5"
# Drop a copy left over from a previous run before copying again
if os.path.exists(h5idx):
    os.unlink(h5idx)
shutil.copyfile(h5denorm, h5idx)

'movielens-denorm-indexed.h5'

In [28]:
# Open the new file in 'a'ppend mode, so that indexes can be added to it
h5i = tables.open_file(h5idx, mode="a")

In [29]:
# Create an index for the 'title' column
h5lens = h5i.root.lens
# Filters used for the index data (Blosc, level 9); reused by the later indexing cells
blosc_filter = tables.Filters(complevel=9, complib="blosc")
%time h5lens.cols.title.create_csindex(filters=blosc_filter)

CPU times: user 1.41 s, sys: 291 ms, total: 1.7 s
Wall time: 1.78 s


1000209

In [30]:
%%time
# Same denormalized query as before, now run against the copy with an index on `title`
ratings = [0] * 6
for rt in range(0,6):
    ratings[rt] = sum(1 for r in h5lens.where("(title == b'Tom and Huck (1995)') & (rating == rt)"))

CPU times: user 9.29 ms, sys: 1.81 ms, total: 11.1 ms
Wall time: 10.5 ms


In [31]:
# Counts must match the un-indexed query
ratings

[0, 4, 15, 28, 18, 3]

In [32]:
# Create an index for the rating column (in addition to the existing title index)
%time h5lens.cols.rating.create_csindex(filters=blosc_filter)

CPU times: user 410 ms, sys: 37.6 ms, total: 448 ms
Wall time: 516 ms


1000209

In [33]:
%%time
# Same denormalized query again, now with indexes on both `title` and `rating`
ratings = [0] * 6
for rt in range(0,6):
    ratings[rt] = sum(1 for r in h5lens.where("(title == b'Tom and Huck (1995)') & (rating == rt)"))

CPU times: user 3.9 ms, sys: 1.19 ms, total: 5.09 ms
Wall time: 6.7 ms


In [34]:
# Counts must match the un-indexed query
ratings

[0, 4, 15, 28, 18, 3]

In [35]:
# Done with the indexed denormalized file
h5i.close()

### Normalized case

In [36]:
## Copy the original PyTables table into another file
# NOTE: `h5idx` is rebound here; from now on it names the indexed copy of the
# normalized file rather than the denormalized one.
import shutil
h5idx = "movielens-norm-indexed.h5"
# Drop a copy left over from a previous run before copying again
if os.path.exists(h5idx):
    os.unlink(h5idx)
shutil.copyfile(h5norm, h5idx)

'movielens-norm-indexed.h5'

In [37]:
# Open the new file in 'a'ppend mode, so that indexes can be added to it,
# and grab handles to both tables of the copy
h5i = tables.open_file(h5idx, mode="a")
h5ratings = h5i.root.ratings
h5movies = h5i.root.movies

In [38]:
# Create an index for the rating column of the normalized ratings table
%time h5ratings.cols.rating.create_csindex(filters=blosc_filter)

CPU times: user 312 ms, sys: 33.9 ms, total: 346 ms
Wall time: 394 ms


1000209

In [39]:
%%time
# Normalized query with only the `rating` column of the ratings table indexed so far
ratings = [0] * 6
for rt in range(6):
    th_movie_id = [r['movie_id'] for r in h5movies.where("(title == b'Tom and Huck (1995)')")][0]
    ratings[rt] = sum(1 for r in h5ratings.where("(movie_id == th_movie_id) & (rating == rt)"))

CPU times: user 299 ms, sys: 38.5 ms, total: 338 ms
Wall time: 315 ms


In [40]:
# Counts must match the un-indexed query
ratings

[0, 4, 15, 28, 18, 3]

In [41]:
# Create an index for the movie_id column of the ratings table
%time h5ratings.cols.movie_id.create_csindex(filters=blosc_filter)

CPU times: user 311 ms, sys: 29.3 ms, total: 341 ms
Wall time: 364 ms


1000209

In [42]:
%%time
# Normalized query with both `rating` and `movie_id` of the ratings table indexed
# (the movies table still has no index on `title`)
ratings = [0] * 6
for rt in range(6):
    th_movie_id = [r['movie_id'] for r in h5movies.where("(title == b'Tom and Huck (1995)')")][0]
    ratings[rt] = sum(1 for r in h5ratings.where("(movie_id == th_movie_id) & (rating == rt)"))

CPU times: user 17.4 ms, sys: 2.47 ms, total: 19.9 ms
Wall time: 27.6 ms


In [43]:
# Counts must match the un-indexed query
ratings

[0, 4, 15, 28, 18, 3]

In [44]:
# Done with the indexed normalized file
h5i.close()

In [45]:
# Disk usage of every movielens* file/directory, largest first.
# `sort -n` compares only the leading number, which is fine here because all sizes are in M.
!du -sh movielens* | sort -nr

9.9M	movielens-denorm-indexed.h5
9.6M	movielens-norm-indexed.h5
5.8M	movielens-1m
5.4M	movielens-denorm.h5
4.3M	movielens-norm.h5


## Exercise

We have not created an index for the title for the normalized case.  Create such an index and determine if there is a noticeable speed-up or not.  Explain why you think that is the case.  Note: the times for a cold query can be **significantly** different from a hot query.

In [46]:
## Copy the already-indexed normalized file (h5idx) into another file,
## so the title index is added to a separate copy
import shutil
h5idx2 = "movielens-norm-indexed2.h5"
# Drop a copy left over from a previous run before copying again
if os.path.exists(h5idx2):
    os.unlink(h5idx2)
shutil.copyfile(h5idx, h5idx2)

'movielens-norm-indexed2.h5'

In [47]:
# Open the new file in 'a'ppend mode, so that another index can be added to it,
# and grab handles to both tables of this second copy
h5i = tables.open_file(h5idx2, mode="a")
h5ratings = h5i.root.ratings
h5movies = h5i.root.movies

In [48]:
# Create an index for the title column of the movies table
%time h5movies.cols.title.create_csindex(filters=blosc_filter)

CPU times: user 13.9 ms, sys: 5.28 ms, total: 19.2 ms
Wall time: 16.3 ms


3883

In [49]:
%%time
# Normalized query on the second copy, where `title` of the movies table is indexed
# in addition to `rating` and `movie_id` of the ratings table
ratings = [0] * 6
for rt in range(6):
    th_movie_id = [r['movie_id'] for r in h5movies.where("(title == b'Tom and Huck (1995)')")][0]
    ratings[rt] = sum(1 for r in h5ratings.where("(movie_id == th_movie_id) & (rating == rt)"))

CPU times: user 155 ms, sys: 25.9 ms, total: 181 ms
Wall time: 211 ms


In [50]:
# Counts must match the un-indexed query
ratings

[0, 4, 15, 28, 18, 3]

In [51]:
# Done with the second indexed copy
h5i.close()

In [52]:
# Disk usage again, now including the second indexed copy (largest first)
!du -sh movielens* | sort -nr

9.9M	movielens-denorm-indexed.h5
9.7M	movielens-norm-indexed2.h5
9.6M	movielens-norm-indexed.h5
5.8M	movielens-1m
5.4M	movielens-denorm.h5
4.3M	movielens-norm.h5
