In [24]:
import struct
from struct import unpack

import json
import numpy as np
import pandas as pd
import os
import time
import re

import PIL
from PIL import ImageDraw

# NOTE(review): %pylab star-imports numpy and matplotlib names into the
# interactive namespace (see the "Populating the interactive namespace"
# message in this cell's output). Other cells may rely on those implicit
# names, so confirm before replacing it with `%matplotlib inline`.
%pylab inline
import matplotlib.pylab as plt
import seaborn as sns
# Plot styling is applied in order, so where settings overlap the later
# call takes precedence.
plt.style.use('ggplot')
plt.style.use('seaborn-poster')
sns.set(style='white', context='notebook', palette='deep')

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [25]:
def unpack_drawing(file_handle):
    """Read a single drawing record from an open binary stream.

    Record layout (little-endian, no padding between fields):
    key_id (uint64), countrycode (2 bytes), recognized (int8),
    timestamp (uint32), n_strokes (uint16), then for every stroke
    n_points (uint16) followed by n_points x-bytes and n_points y-bytes.

    Parameters
    ----------
    file_handle : binary file-like object
        Must be positioned at the start of a record and support ``read(n)``.

    Returns
    -------
    dict
        ``'countrycode'`` (bytes of length 2), ``'recognized'`` (int) and
        ``'image'`` (list of ``(x, y)`` pairs, each a tuple of ints).

    Raises
    ------
    struct.error
        If the stream ends before a complete record could be read. This
        includes the normal end-of-file case, where ``read`` returns ``b''``.
    """
    # The '<' prefix pins little-endian byte order with standard sizes and no
    # alignment padding. Without it, struct uses the host's native byte order
    # and the multi-byte fields would be misread on a big-endian machine.
    header_fmt = '<Q2sbIH'
    key_id, countrycode, recognized, timestamp, n_strokes = unpack(
        header_fmt, file_handle.read(struct.calcsize(header_fmt)))

    image = []
    for _ in range(n_strokes):
        n_points, = unpack('<H', file_handle.read(2))
        fmt = '<{}B'.format(n_points)
        x = unpack(fmt, file_handle.read(n_points))
        y = unpack(fmt, file_handle.read(n_points))
        image.append((x, y))

    # key_id and timestamp are parsed (to advance the stream) but
    # intentionally left out of the result.
    return {
        #'key_id': key_id,
        'countrycode': countrycode,
        'recognized': recognized,
        #'timestamp': timestamp,
        'image': image
    }

def unpack_drawings(filename):
    """Lazily yield every drawing record stored in the file `filename`.

    The file is opened in binary mode and records are decoded one at a time
    with `unpack_drawing`. Iteration stops as soon as a record cannot be
    decoded (`struct.error`), which is how the end of the file is detected.
    """
    with open(filename, 'rb') as stream:
        try:
            while True:
                yield unpack_drawing(stream)
        except struct.error:
            # No further complete record could be read: stop iterating.
            return


In [27]:
# Upper bound on the number of drawings kept from each category file.
max_imgs_per_category = 5e4
# Only these categories are loaded; every other .bin file in data/ is skipped.
categories_to_load = ['airplane', 'bat', 'bird', 'car', 'bus', 'mug', 'cup']

df = []    # One metadata frame per category; concatenated after the loop.
imgs = []  # Stroke data of every kept drawing, in the same order as the rows of df.
# os.listdir returns entries in arbitrary order; sort so the row order of
# df/imgs is reproducible between runs and machines.
files_to_read = sorted(f for f in os.listdir('data/') if f.endswith('.bin'))
t_start = time.time()

for i, f in enumerate(files_to_read):

    category = f.split('-')[-1].split('.')[0]  # Cut from the filename.
    if category not in categories_to_load:
        continue

    imgs_meta = []
    for j, drawing in enumerate(unpack_drawings('data/'+f)):
        # Split the record: strokes go to imgs, the remaining fields to imgs_meta.
        img = drawing.pop('image')
        imgs.append(img)
        imgs_meta.append(drawing)

        if j % 1000 == 0:
            time_elapsed = round((time.time() - t_start)/60, 1)
            print('Processing file {} out of {}. Done {}k images in current file. Minutes elapsed {}      '
                  .format(i+1, len(files_to_read), int(j/1000), time_elapsed), end='\r')
        if j >= max_imgs_per_category-1:
            break

    df_tmp = pd.DataFrame()
    # countrycode is a bytes object; decode it explicitly rather than taking
    # its repr, then keep only the upper-case letters.
    df_tmp['countrycode'] = [re.sub('[^A-Z]+', '', d['countrycode'].decode('ascii', 'ignore'))
                             for d in imgs_meta]
    df_tmp['recognized'] = [d['recognized'] for d in imgs_meta]
    df_tmp['target'] = category
    df.append(df_tmp)


if not df:
    # pd.concat([]) would fail with an unhelpful message; say what is missing.
    raise ValueError("No .bin file for any of the categories {} was found in 'data/'"
                     .format(categories_to_load))

df = pd.concat(df, axis=0).reset_index(drop=True)
# Drawings have differing numbers of strokes and points, so the nested
# sequences are ragged. NumPy >= 1.24 refuses to build an array from ragged
# input unless dtype=object is requested explicitly.
imgs = np.array(imgs, dtype=object)

del imgs_meta, df_tmp

Processing file 7 out of 7. Done 49k images in current file. Minutes elapsed 0.1      

In [28]:
# Report how many metadata rows were collected, then persist both artifacts.
n_rows = len(df)
print(n_rows)
# NOTE(review): if imgs has dtype=object, np.save stores it via pickle and
# np.load will need allow_pickle=True to read it back. Confirm in the
# notebook that loads this file.
np.save('data/strokes_raw.npy', imgs)
df.to_csv('data/df.csv', index=False)

350000
