In [1]:
%load_ext autoreload
%autoreload 2
%pylab inline
%matplotlib inline



Populating the interactive namespace from numpy and matplotlib


In [2]:
import pandas as pd

In [3]:
# Load the tag <-> image association table and attach an integer code per tag.
raw_assocs = pd.read_csv('../data/tag_walk/assocs.csv')
tag_as_category = raw_assocs['tag'].astype('category')
assocs_df = raw_assocs.assign(
    tag=tag_as_category,                  # replace the raw column with its categorical form
    tag_index=tag_as_category.cat.codes,  # integer code of each row's category
)
assocs_df.head()

Unnamed: 0.1,Unnamed: 0,tag,image,tag_index
0,0,1920,04.jpg,0
1,1,1920,06.jpg,0
2,2,1920,08.jpg,0
3,3,1920,14234E_FA16_PP_RUNWAY_SHOW_LOOK_51.jpg,0
4,4,1920,14234E_FA16_PP_RUNWAY_SHOW_LOOK_55.jpg,0


In [None]:
# Collapse the (tag, image) association rows into one row per image holding the
# list of all tags attached to that image.
#
# A vectorized groupby replaces the row-by-row `iterrows` loop. `sort=False`
# keeps images in order of first appearance, so the resulting frame has a
# deterministic row order that does not depend on dict ordering.
tags_by_image = (
    assocs_df
    .groupby('image', sort=False)['tag']
    .apply(list)
)

# Kept for any later cell that still looks tags up by image name.
grouped_images = tags_by_image.to_dict()

tagged_images_df = pd.DataFrame({
    'image': tags_by_image.index.values,
    'tags': tags_by_image.values,
})

# Every distinct (non-null) image must end up in exactly one row.
assert tagged_images_df['image'].is_unique

tagged_images_df.head()

In [None]:
# Dataset summary: number of distinct tags and distinct images.
n_tags = len(assocs_df['tag'].drop_duplicates())
n_images = len(assocs_df['image'].drop_duplicates())

# Single-argument print(...) behaves identically on Python 2 and Python 3,
# whereas the bare `print "..."` statement is a SyntaxError on Python 3.
print("Number of tags in dataset: %s" % n_tags)
print("Number of images in dataset: %s" % n_images)


In [None]:
# Hold-out sizes, expressed as absolute row counts of `tagged_images_df`:
# one third of all tagged images, and half of that third.
n_tagged_images = len(tagged_images_df)
test_size = n_tagged_images // 3
validation_size = test_size // 2

In [None]:
# Show the hold-out sizes. The parenthesised single-argument form prints the
# same text on Python 2 and also parses on Python 3.
print(test_size)
print(validation_size)

In [None]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer `model_selection` and fall back only on older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# One seed for every random step in this cell so the split is reproducible.
SPLIT_SEED = 42

# Shuffle. `sort_index()` first restores the original row order (`sample`
# keeps the index labels), so re-running this cell yields the same permutation
# instead of shuffling an already-shuffled frame. Without a seed here, the
# `random_state` passed to the splits below would not make them reproducible.
tagged_images_df = (
    tagged_images_df
    .sort_index()
    .sample(frac=1, random_state=SPLIT_SEED)
)

# First cut: train vs. a hold-out of `test_size` rows.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    tagged_images_df.image,
    tagged_images_df.tags,
    test_size=test_size,
    random_state=SPLIT_SEED,
)

# Second cut: divide the hold-out. `train_test_split` returns the remainder
# first, so X_val gets `test_size - validation_size` rows and X_test gets
# `validation_size` rows.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout,
    y_holdout,
    test_size=validation_size,
    random_state=SPLIT_SEED,
)

# The three parts must partition the full set of tagged images.
assert len(X_train) + len(X_val) + len(X_test) == len(tagged_images_df)

In [53]:
import os
from shutil import copyfile


def build_data_tree(inputs, labels, name, base_path):
    """Copy images into one sub-directory per tag under ``base_path/name``.

    Each image in ``inputs`` is read from ``<base_path>/all/<image>`` and
    copied to ``<base_path>/<name>/<tag>/<image>`` once for every tag in the
    matching entry of ``labels``. Missing directories are created on demand and
    the output directory is printed before copying starts.

    Parameters
    ----------
    inputs : sized iterable of str
        Image file names, relative to ``<base_path>/all``.
    labels : sized iterable of iterables
        Tags per image; paired with ``inputs`` by position.
    name : str
        Output directory relative to ``base_path`` (e.g. ``'tagged/train'``).
    base_path : str
        Root directory containing ``all/``; a trailing slash is accepted.

    Raises
    ------
    ValueError
        If ``inputs`` and ``labels`` have different lengths.

    Errors from ``os.makedirs`` and ``shutil.copyfile`` (for example a missing
    source image) are not caught and propagate to the caller.
    """
    if len(inputs) != len(labels):
        raise ValueError(
            "inputs and labels must have the same length (got %d and %d)"
            % (len(inputs), len(labels))
        )

    # os.path.join avoids the doubled '//' that plain '/'.join produced when
    # base_path already ended with a slash.
    output_path = os.path.join(base_path, name)
    print(output_path)
    source_dir = os.path.join(base_path, 'all')

    if not os.path.isdir(output_path):
        os.makedirs(output_path)

    for image, tags in zip(inputs, labels):
        # The source file is the same for every tag of this image.
        src = os.path.join(source_dir, image)
        for tag in tags:
            # '%s' formatting turns non-string tags (e.g. an int) into a
            # directory name while leaving str/unicode tags untouched.
            tag_path = os.path.join(output_path, '%s' % (tag,))
            if not os.path.isdir(tag_path):
                os.makedirs(tag_path)
            copyfile(src, os.path.join(tag_path, image))

In [None]:
# Materialise the training split as a directory tree with one folder per tag.
build_data_tree(
    inputs=X_train,
    labels=y_train,
    name='tagged/train',
    base_path='/Volumes/bobby/tag_walk/tag_walk/data/tag_walk/images/',
)

In [None]:
# Materialise the validation split as a directory tree with one folder per tag.
build_data_tree(
    inputs=X_val,
    labels=y_val,
    name='tagged/validation',
    base_path='/Volumes/bobby/tag_walk/tag_walk/data/tag_walk/images/',
)