# bookcave_images

## Set Up

In [None]:
# Math.
import numpy as np
# Visualization.
import matplotlib.pyplot as plt
# File I/O.
import os
# Images
from PIL import Image
import keras

# Data.
import bookcave

Declare file path constants.

In [None]:
# Root directory for figures, one level above the working directory.
FIGURES_ROOT = os.path.join('..', 'figures')
# Create the directory if it is missing. `exist_ok=True` makes re-running this
# cell a no-op, while still raising if the path exists but is not a directory
# (the previous try/except silently ignored that case).
os.makedirs(FIGURES_ROOT, exist_ok=True)

Load the dataset inputs (including the image file names) and the book IDs.

In [None]:
# Fetch the data for the 'text' and 'images' sources with `return_meta=True`.
# Nine values come back; only the first (`inputs`) and the fifth (`book_ids`)
# are kept here, the others are discarded.
inputs, _, _, _, book_ids, _, _, _, _ = bookcave.get_data({'text', 'images'}, return_meta=True)

## Images

In [None]:
# Each entry of `inputs['images']` is a sequence of file names for one book;
# the first one is taken as that book's image.
first_fnames = [book_images[0] for book_images in inputs['images']]
image_fnames = np.array(first_fnames)
# Display how many images there are.
len(image_fnames)

See an example cover image.

In [None]:
# Show the ID of the book at index 42, the index used for the example image.
book_ids[42]

In [None]:
# Open the image file at index 42 and render it inline. The image is left
# open on purpose: `example_image` is reused by the following cell.
example_fname = image_fnames[42]
example_image = Image.open(example_fname)
plt.imshow(example_image)

In [None]:
# Convert the example image to an array with Keras' `img_to_array` helper.
example_tensor = keras.preprocessing.image.img_to_array(example_image)
# Display the array's shape.
# NOTE(review): presumably (height, width, channels), which is how the shape
# entries are interpreted later in this notebook — confirm against the Keras
# image data format in use.
example_tensor.shape

What does the file size distribution look like?

In [None]:
# Size on disk, in bytes, of every image file.
image_sizes = np.array(list(map(os.path.getsize, image_fnames)))
# Histogram of the file sizes using 40 bins.
plt.hist(image_sizes, bins=40)
plt.show()

Which are the largest image files?

In [None]:
# Indices of the (up to) 10 largest files, ordered from smaller to larger.
# A stable argsort followed by a slice is used rather than
# `np.argpartition(image_sizes, -10)`, which raises a ValueError when there are
# fewer than 10 images; the slice simply returns whatever is available.
# Ties in file size are ordered by their position in `image_sizes`.
sorted_largest_image_size_indices = np.argsort(image_sizes, kind='stable')[-10:]
# Kept so that any code still referring to the unsorted name keeps working.
largest_image_size_indices = sorted_largest_image_size_indices
# Pair each selected book ID with its file size in bytes.
largest_book_id_sizes = list(zip(book_ids[sorted_largest_image_size_indices],
                                 image_sizes[sorted_largest_image_size_indices]))
largest_book_id_sizes

Zoom in on the files smaller than 150,000 bytes.

In [None]:
# Keep only the files smaller than 150,000 bytes so the bulk of the
# distribution is visible. A boolean mask does the filtering inside NumPy
# instead of looping over the array in Python, and keeps the array's dtype
# even when nothing matches.
majority_image_sizes = image_sizes[image_sizes < 150000]
plt.hist(majority_image_sizes, 40)
plt.show()

Which are the smallest image files?

In [None]:
# Indices of the (up to) 20 smallest files, ordered from smaller to larger.
# A stable argsort followed by a slice is used rather than
# `np.argpartition(image_sizes, 20)`, which raises a ValueError when there are
# 20 or fewer images; the slice simply returns whatever is available.
# Ties in file size are ordered by their position in `image_sizes`.
sorted_smallest_image_size_indices = np.argsort(image_sizes, kind='stable')[:20]
# Kept so that any code still referring to the unsorted name keeps working.
smallest_image_size_indices = sorted_smallest_image_size_indices
# Pair each selected book ID with its file size in bytes.
smallest_book_id_sizes = list(zip(book_ids[sorted_smallest_image_size_indices],
                                  image_sizes[sorted_smallest_image_size_indices]))
smallest_book_id_sizes

In [None]:
# Render each of the smallest image files in its own figure.
for fname in image_fnames[sorted_smallest_image_size_indices]:
    # The context manager closes the underlying file once the figure has been
    # drawn, instead of leaving the handle open until garbage collection.
    with Image.open(fname) as image:
        plt.figure()
        plt.imshow(image)
        plt.show()

Zoom in further, on the files between 90,000 and 150,000 bytes.

In [None]:
# Narrow the already-filtered sizes to those above 90,000 bytes. A boolean
# mask does the filtering inside NumPy instead of looping in Python.
more_majority_image_sizes = majority_image_sizes[majority_image_sizes > 90000]
plt.hist(more_majority_image_sizes, 40)
plt.show()

View the distribution of image dimensions in pixels.

In [None]:
# Record the shape of every image tensor: one row of three integers per image.
# `np.empty` is the documented way to allocate an uninitialized array; the
# low-level `np.ndarray` constructor is not meant to be called directly.
shapes = np.empty((len(image_fnames), 3), dtype=np.int32)
for i, fname in enumerate(image_fnames):
    # The context manager closes each image file as soon as it has been
    # converted, rather than leaving the handles to the garbage collector.
    with Image.open(fname) as image_file:
        tensor = keras.preprocessing.image.img_to_array(image_file)
    shapes[i] = tensor.shape

In [None]:
# Image widths: the second entry of every recorded shape. Slicing the column
# replaces the Python-level loop over rows; `.copy()` keeps `widths`
# independent of `shapes`, as it was before.
widths = shapes[:, 1].copy()
plt.hist(widths, 40)
plt.show()

In [None]:
# Image heights: the first entry of every recorded shape. Slicing the column
# replaces the Python-level loop over rows; `.copy()` keeps `heights`
# independent of `shapes`, as it was before.
heights = shapes[:, 0].copy()
plt.hist(heights, 40)
plt.show()

In [None]:
# Aspect ratio (width / height) of every image, computed for all rows at once
# instead of one row at a time in Python. True division of the integer
# columns yields floating-point ratios.
aspects = shapes[:, 1] / shapes[:, 0]
plt.hist(aspects, 40)
plt.show()

In [None]:
# Histogram of only the aspect ratios that are at most 1.0 (images no wider
# than they are tall), selected with a boolean mask rather than a Python loop.
plt.hist(aspects[aspects <= 1.0], 40)
plt.show()