In [1]:
########################################################################
#
# Functions for downloading and extracting data-files from the internet.
#
# Implemented in Python 3.5
#
########################################################################
#
# This file is part of the TensorFlow Tutorials available at:
#
# https://github.com/Hvass-Labs/TensorFlow-Tutorials
#
# Published under the MIT License. See the file LICENSE for details.
#
# Copyright 2016 by Magnus Erik Hvass Pedersen
#
########################################################################

import sys
import os
import urllib.request
import tarfile
import zipfile
import pydoop
import hops.hdfs as hdfs

########################################################################


def _print_download_progress(count, block_size, total_size):
    """
    Function used for printing the download progress.
    Used as a call-back function in maybe_download_and_extract().
    """

    # Percentage completion.
    pct_complete = float(count * block_size) / total_size

    # Status-message. Note the \r which means the line should overwrite itself.
    msg = "\r- Download progress: {0:.1%}".format(pct_complete)

    # Print it.
    sys.stdout.write(msg)
    sys.stdout.flush()


########################################################################


def maybe_download_and_extract(url):
    """
    Download and extract the data if it doesn't already exist.
    Assumes the url is a tar-ball file.

    :param url:
        Internet URL for the tar-file to download.
        Example: "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"

    :return:
        Nothing.
    """

    download_dir = os.environ['PDIR']
    # Filename for saving the file downloaded from the internet.
    # Use the filename from the URL and add it to the download_dir.
    filename = url.split('/')[-1]
    file_path = os.path.join(download_dir, filename)
    print("Path is: " + file_path)

    # Check if the file already exists.
    # If it exists then we assume it has also been extracted,
    # otherwise we need to download and extract it now.
    if not os.path.exists(file_path):
        # Check if the download directory exists, otherwise create it.
        if not os.path.exists(download_dir):
            os.makedirs(download_dir)

        # Download the file from the internet.
        file_path, _ = urllib.request.urlretrieve(url=url,
                                                  filename=file_path,
                                                  reporthook=_print_download_progress)

        print()
        print("Download finished. Extracting files.")

        if file_path.endswith(".zip"):
            # Unpack the zip-file.
            zipfile.ZipFile(file=file_path, mode="r").extractall(download_dir)
        elif file_path.endswith((".tar.gz", ".tgz")):
            # Unpack the tar-ball.
            tarfile.open(name=file_path, mode="r:gz").extractall(download_dir)

    else:
        print("Data has apparently already been downloaded and unpacked.")

    (name,ext) = os.path.splitext(file_path)
    print(name)
    pydoop.hdfs.put(name, hdfs.project_path() + "Resources")
    print("Done.")

########################################################################

maybe_download_and_extract("http://snurran.sics.se/hops/knifey-spoony.zip")

Path is: /srv/hops/staging/private_dirs/b3357fba7f118c6154cb038110bd6932b825be59a3d8d58ffbfe08dbd66911cd/knifey-spoony.zip
Data has apparently already been downloaded and unpacked.
/srv/hops/staging/private_dirs/b3357fba7f118c6154cb038110bd6932b825be59a3d8d58ffbfe08dbd66911cd/knifey-spoony


OSError: '/Projects/demo_tensorflow_admin000/Resources/knifey-spoony' already exists

In [11]:
########################################################################
#
# Class for creating a data-set consisting of all files in a directory.
#
# Example usage is shown in the file knifey.py and Tutorial #09.
#
# Implemented in Python 3.5
#
########################################################################
#
# This file is part of the TensorFlow Tutorials available at:
#
# https://github.com/Hvass-Labs/TensorFlow-Tutorials
#
# Published under the MIT License. See the file LICENSE for details.
#
# Copyright 2016 by Magnus Erik Hvass Pedersen
#
########################################################################

import numpy as np
import os
import hops.hdfs as hdfs
import pydoop.hdfs.path as hpath

########################################################################


def one_hot_encoded(class_numbers, num_classes=None):
    """
    Generate the One-Hot encoded class-labels from an array of integers.

    For example, if class_number=2 and num_classes=4 then
    the one-hot encoded label is the float array: [0. 0. 1. 0.]

    :param class_numbers:
        Array of integers with class-numbers.
        Assume the integers are from zero to num_classes-1 inclusive.

    :param num_classes:
        Number of classes. If None then use max(class_numbers)+1.

    :return:
        2-dim array of shape: [len(class_numbers), num_classes]
    """

    # Find the number of classes if None is provided.
    # Assumes the lowest class-number is zero.
    if num_classes is None:
        num_classes = np.max(class_numbers) + 1

    return np.eye(num_classes, dtype=float)[class_numbers]


def inodes_in_dir(hdfs_path, kind):
    """
    :param fs:
        Pydoop hdfs objcet
    
    :param hdfs_path:
        Full pathname as a string to a hdfs directory
    
    :param kind:
        'file' or 'directory'
        
    :return:
        List of either files or directories in the supplied hdfs_path
    
    """
    fs = hdfs.get()
    print("dirs in dir")
    list_files=[]
    if hpath.isdir(hdfs_path):
        for path_spec in fs.list_directory(hdfs_path):
            if path_spec["kind"] == kind:
                filename=path_spec['name']
                if filename.lower().endswith(".jpg"):
                    list_files.append(filename)
    else:
        print("Was not a directory: " + hdfs_path)  
    fs.close()
    return list_files
    
def files_in_dir(hdfs_path):
    print("files in dir")
    return inodes_in_dir(hdfs_path, 'file')

def dirs_in_dir(hdfs_path):
    print("dirs in dir")
    return inodes_in_dir(hdfs_path, 'directory')

########################################################################


class DataSet:
    def __init__(self, in_dir, exts='.jpg'):
        """
        Create a data-set consisting of the filenames in the given directory
        and sub-dirs that match the given filename-extensions.

        For example, the knifey-spoony data-set (see knifey.py) has the
        following dir-structure:

        knifey-spoony/forky/
        knifey-spoony/knifey/
        knifey-spoony/spoony/
        knifey-spoony/forky/test/
        knifey-spoony/knifey/test/
        knifey-spoony/spoony/test/

        This means there are 3 classes called: forky, knifey, and spoony.

        If we set in_dir = "knifey-spoony/" and create a new DataSet-object
        then it will scan through these directories and create a training-set
        and test-set for each of these classes.

        The training-set will contain a list of all the *.jpg filenames
        in the following directories:

        knifey-spoony/forky/
        knifey-spoony/knifey/
        knifey-spoony/spoony/

        The test-set will contain a list of all the *.jpg filenames
        in the following directories:

        knifey-spoony/forky/test/
        knifey-spoony/knifey/test/
        knifey-spoony/spoony/test/

        See the TensorFlow Tutorial #09 for a usage example.

        :param in_dir:
            Root-dir for the files in the data-set.
            This would be 'knifey-spoony/' in the example above.

        :param exts:
            String or tuple of strings with valid filename-extensions.
            Not case-sensitive.

        :return:
            Object instance.
        """

        # Extend the input directory to the full path.
        #in_dir = os.path.abspath(in_dir)

        # Input directory.
        self.in_dir = in_dir

        # Convert all file-extensions to lower-case.
        self.exts = tuple(ext.lower() for ext in exts)

        # Names for the classes.
        self.class_names = []

        # Filenames for all the files in the training-set.
        self.filenames = []

        # Filenames for all the files in the test-set.
        self.filenames_test = []

        # Class-number for each file in the training-set.
        self.class_numbers = []

        # Class-number for each file in the test-set.
        self.class_numbers_test = []

        # Total number of classes in the data-set.
        self.num_classes = 0

        # Pydoop HDFS Clients from hops
        self.fs = hdfs.get()
        self.dfs = hdfs.get_fs()

        # For all files/dirs in the input directory.
        for name in self.fs.list_directory(in_dir):
            fullpath=name['name']
            dirname=hpath.dirname(fullpath)
            # If it is a directory.
            if hpath.isdir(fullpath):
                # Add the dir-name to the list of class-names.
                self.class_names.append(hpath.basename(fullpath))

                # Training-set.

                # Get all the valid filenames in the dir (not sub-dirs).
                #filenames = self._get_filenames(current_dir)
                print(fullpath)
                filenames = files_in_dir(fullpath)

                # Append them to the list of all filenames for the training-set.
                self.filenames.extend(filenames)

                # The class-number for this class.
                class_number = self.num_classes

                # Create an array of class-numbers.
                class_numbers = [class_number] * len(filenames)

                # Append them to the list of all class-numbers for the training-set.
                self.class_numbers.extend(class_numbers)

                # Test-set.

                # Get all the valid filenames in the sub-dir named 'test'.
#                filenames_test = self._get_filenames(os.path.join(current_dir, 'test'))
                filenames_test = files_in_dir(fullpath + "/test")

                
                # Append them to the list of all filenames for the test-set.
                self.filenames_test.extend(filenames_test)

                # Create an array of class-numbers.
                class_numbers = [class_number] * len(filenames_test)

                # Append them to the list of all class-numbers for the test-set.
                self.class_numbers_test.extend(class_numbers)

                # Increase the total number of classes in the data-set.
                self.num_classes += 1



                
    def get_paths(self, test=False):
        """
        Get the full paths for the files in the data-set.

        :param test:
            Boolean. Return the paths for the test-set (True) or training-set (False).

        :return:
            Iterator with strings for the path-names.
        """

        if test:
            # Use the filenames and class-numbers for the test-set.
            filenames = self.filenames_test
            class_numbers = self.class_numbers_test

            # Sub-dir for test-set.
            test_dir = "test/"
        else:
            # Use the filenames and class-numbers for the training-set.
            filenames = self.filenames
            class_numbers = self.class_numbers

            # Don't use a sub-dir for test-set.
            test_dir = ""

        for filename, cls in zip(filenames, class_numbers):
            # Full path-name for the file.
            path = hpath.join(self.in_dir, self.class_names[cls], test_dir, filename)

            yield path

    def get_training_set(self):
        """
        Return the list of paths for the files in the training-set,
        and the list of class-numbers as integers,
        and the class-numbers as one-hot encoded arrays.
        """

        return list(self.get_paths()), \
               np.asarray(self.class_numbers), \
               one_hot_encoded(class_numbers=self.class_numbers,
                               num_classes=self.num_classes)

    def get_test_set(self):
        """
        Return the list of paths for the files in the test-set,
        and the list of class-numbers as integers,
        and the class-numbers as one-hot encoded arrays.
        """

        return list(self.get_paths(test=True)), \
               np.asarray(self.class_numbers_test), \
               one_hot_encoded(class_numbers=self.class_numbers_test,
                               num_classes=self.num_classes)

dataset = DataSet(in_dir="hdfs:///Projects/demo_tensorflow_admin000/Resources/knifey-spoony")
dataset.get_training_set()

hdfs://10.0.2.15:8020/Projects/demo_tensorflow_admin000/Resources/knifey-spoony/forky
files in dir
dirs in dir
files in dir
dirs in dir
Was not a directory: hdfs://10.0.2.15:8020/Projects/demo_tensorflow_admin000/Resources/knifey-spoony/forky/test
hdfs://10.0.2.15:8020/Projects/demo_tensorflow_admin000/Resources/knifey-spoony/knifey
files in dir
dirs in dir
files in dir
dirs in dir
Was not a directory: hdfs://10.0.2.15:8020/Projects/demo_tensorflow_admin000/Resources/knifey-spoony/knifey/test
hdfs://10.0.2.15:8020/Projects/demo_tensorflow_admin000/Resources/knifey-spoony/spoony
files in dir
dirs in dir
files in dir
dirs in dir
hdfs://10.0.2.15:8020/Projects/demo_tensorflow_admin000/Resources/knifey-spoony/test
files in dir
dirs in dir
files in dir
dirs in dir
Was not a directory: hdfs://10.0.2.15:8020/Projects/demo_tensorflow_admin000/Resources/knifey-spoony/test/test
hdfs://10.0.2.15:8020/Projects/demo_tensorflow_admin000/Resources/knifey-spoony/train
files in dir
dirs in dir
files in

(['hdfs://10.0.2.15:8020/Projects/demo_tensorflow_admin000/Resources/knifey-spoony/forky/forky-01-0001.jpg',
  'hdfs://10.0.2.15:8020/Projects/demo_tensorflow_admin000/Resources/knifey-spoony/forky/forky-01-0002.jpg',
  'hdfs://10.0.2.15:8020/Projects/demo_tensorflow_admin000/Resources/knifey-spoony/forky/forky-01-0003.jpg',
  'hdfs://10.0.2.15:8020/Projects/demo_tensorflow_admin000/Resources/knifey-spoony/forky/forky-01-0004.jpg',
  'hdfs://10.0.2.15:8020/Projects/demo_tensorflow_admin000/Resources/knifey-spoony/forky/forky-01-0005.jpg',
  'hdfs://10.0.2.15:8020/Projects/demo_tensorflow_admin000/Resources/knifey-spoony/forky/forky-01-0006.jpg',
  'hdfs://10.0.2.15:8020/Projects/demo_tensorflow_admin000/Resources/knifey-spoony/forky/forky-01-0007.jpg',
  'hdfs://10.0.2.15:8020/Projects/demo_tensorflow_admin000/Resources/knifey-spoony/forky/forky-01-0008.jpg',
  'hdfs://10.0.2.15:8020/Projects/demo_tensorflow_admin000/Resources/knifey-spoony/forky/forky-01-0009.jpg',
  'hdfs://10.0.2.15