In [1]:
from collections import defaultdict
import os
import requests
from shutil import copyfileobj
from lxml import html
import random

In [2]:
# Fetch and parse the root directory listing once, so that later cells can reuse the
# cached tree instead of requesting the initial directory multiple times.
dir_request = requests.get(
    'http://people.csail.mit.edu/brussell/research/LabelMe/Images/',
    timeout=30,  # without a timeout an unresponsive server would block the kernel forever
)
# Fail fast on an HTTP error instead of parsing an error page as if it were the listing.
dir_request.raise_for_status()
tree = html.fromstring(dir_request.content)

In [3]:
class LabelMe:
    """
    Simple API interface to download images from LabelMe's image database, a project that provides
    digital images and annotations. Basic directory search is also supported.
    For more info, visit http://labelme2.csail.mit.edu/Release3.0/index.php.

    Attributes:
        image_html_tree: Parsed HTML tree of the root image directory listing.
        cached_search_stats: Maps a search term to the stats dict computed for it.
    """

    BASE_IMAGE_DIR_URL = 'http://people.csail.mit.edu/brussell/research/LabelMe/Images/'
    VALID_IMAGE_EXT = ['jpg', 'jpeg', 'png']
    # Seconds to wait on the server before giving up on a request; without a timeout
    # a stalled connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, tree=None):
        """Initialize with an existing tree or get the source tree from LabelMe's site.

        Args:
            tree: Optional pre-parsed HTML tree of the root directory listing. When
                omitted (None) the listing is fetched from BASE_IMAGE_DIR_URL.
        """
        # Compare against None explicitly: an lxml element without children is falsy,
        # so `tree or ...` could discard a tree the caller passed on purpose.
        if tree is None:
            tree = self.get_tree_from_source(self.BASE_IMAGE_DIR_URL)
        self.image_html_tree = tree
        self.cached_search_stats = {}

    def get_tree_from_source(self, dir_url):
        """Retrieves the parsed HTML tree from a given directory URL."""
        dir_request = requests.get(dir_url, timeout=self.REQUEST_TIMEOUT)
        return html.fromstring(dir_request.content)

    def _list_image_files(self, dir_path):
        """Returns the hrefs of all valid image links found in a remote directory listing."""
        listing_url = self.BASE_IMAGE_DIR_URL + dir_path
        link_elements = self.get_tree_from_source(listing_url).xpath('.//a')
        hrefs = (element.get('href') for element in link_elements)
        return [href for href in hrefs if self.valid_image(href)]

    def search_links(self, search_term):
        """Returns the matching search links for a given search term.

        Args:
            search_term: Text that must appear in the link text of a directory entry.

        Returns:
            List of href strings for matching links that pass valid_dir().
        """
        # The term is passed as an XPath variable rather than formatted into the
        # expression, so quotes inside the term cannot break the query.
        links_results = self.image_html_tree.xpath('.//a[contains(text(), $term)]', term=search_term)

        # Only get the links from the results
        hrefs = (link_result.get('href') for link_result in links_results)
        return [href for href in hrefs if self.valid_dir(href)]

    def search_stats(self, search_term):
        """Prints and returns stats for a given search term.

        The stats hold the number of matching dirs ('number_of_matching_dirs'), the
        total number of images ('total_number_of_images') and one entry per matching
        directory link with that directory's image count. Results are cached per
        search term, so repeated calls do not hit the network again.

        Returns:
            The stats dict for search_term.
        """
        if search_term not in self.cached_search_stats:
            search_stats = defaultdict(int)
            found_dir_links = self.search_links(search_term)
            search_stats['number_of_matching_dirs'] += len(found_dir_links)

            for dir_link in found_dir_links:
                image_count = len(self._list_image_files(dir_link))
                search_stats['total_number_of_images'] += image_count
                search_stats[dir_link] += image_count

            self.cached_search_stats[search_term] = dict(search_stats)

        print(self.cached_search_stats[search_term])
        return self.cached_search_stats[search_term]

    def download_images_from_dir(self, dir_path, current_count=0, limit=1000, user_path=None):
        """Downloads valid images from a given directory to a local directory and outputs process stats.

        Args:
            dir_path: Remote directory (relative to BASE_IMAGE_DIR_URL) to download from.
            current_count: Number of images the caller has already accumulated.
            limit: Stop once current_count plus this call's successes and skips reaches it.
            user_path: Local target directory; defaults to 'images/' + dir_path.

        Returns:
            defaultdict(int) with the counts for 'success', 'skip' and 'fail'.
        """
        download_statuses = defaultdict(int)

        # Nothing to do (and no reason to hit the network) when the caller is
        # already at or above the limit.
        if current_count < limit:
            # Only gets the valid image links from within this directory
            image_urls = self._list_image_files(dir_path)
            self.maybe_create_directory((user_path or ('images/' + dir_path)))

            for image_url in image_urls:
                download_statuses[self.download_image(dir_path, image_url, user_path)] += 1
                if current_count + download_statuses['success'] + download_statuses['skip'] >= limit:
                    break

        print("Downloaded: {}, Skipped: {}, Failed: {}".format(download_statuses['success'], download_statuses['skip'], download_statuses['fail']))
        return download_statuses

    def download_image(self, target_dir, image_file, user_path=None):
        """Downloads an image and returns a status.

        Args:
            target_dir: Remote directory (relative to BASE_IMAGE_DIR_URL) holding the image.
            image_file: File name (href) of the image inside target_dir.
            user_path: Local target directory; defaults to 'images/' + target_dir.

        Returns:
            'skip' if the local file already exists, 'success' if it was downloaded,
            'fail' if the request failed or the server did not answer with HTTP 200.
        """
        image_url_source = self.BASE_IMAGE_DIR_URL + target_dir + image_file
        local_dir = user_path or ('images/' + target_dir)
        # os.path.join copes with a user_path lacking a trailing slash; basename keeps
        # a remote-supplied href from steering the write outside local_dir.
        local_target_path = os.path.join(local_dir, os.path.basename(image_file))

        if os.path.isfile(local_target_path):
            print("Duplicate file detected, skipping image file: {}".format(image_file))
            return 'skip'

        try:
            image_request = requests.get(image_url_source, stream=True, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            return 'fail'

        # Download into a temporary name first: an interrupted transfer must not leave
        # a truncated file that a later run would skip as a "duplicate".
        partial_path = local_target_path + '.part'
        try:
            if image_request.status_code != 200:
                return 'fail'
            with open(partial_path, 'wb') as f:
                image_request.raw.decode_content = True
                copyfileobj(image_request.raw, f)
            os.replace(partial_path, local_target_path)
            return 'success'
        finally:
            # A streamed response keeps its connection open until closed explicitly.
            image_request.close()
            if os.path.exists(partial_path):
                os.remove(partial_path)

    def valid_image(self, url):
        """Checks that a given image URL ends with one of the valid image extensions.

        Returns False for a missing href (None or empty string).
        """
        if not url:
            return False
        return url.endswith(tuple(self.VALID_IMAGE_EXT))

    def valid_dir(self, dir_path):
        """Exclude repetitive movie or sequential frame dirs.

        Returns False for a missing href (None or empty string).
        """
        if not dir_path:
            return False
        return all(inv not in dir_path for inv in ['mvi', 'seq'])

    def maybe_create_directory(self, dir_name):
        """Creates a directory (including parents) if one does not previously exist."""
        # exist_ok avoids the race between checking for the directory and creating it.
        os.makedirs(dir_name, exist_ok=True)

In [4]:
# Hand the already-parsed directory listing to the client so it is not fetched again.
label_me = LabelMe(tree=tree)



In [8]:
matching_dirs = []
number_of_images = 5000

for group_type in ['living', 'hotel']:
    group_dir = 'images/{}/'.format(group_type)

    # On a fresh run the local directory does not exist yet; treat that as
    # "no images" instead of letting os.listdir raise FileNotFoundError.
    if os.path.isdir(group_dir):
        existing_images = sum(
            1 for file_name in os.listdir(group_dir)
            if file_name.endswith('jpg') and os.path.isfile(os.path.join(group_dir, file_name))
        )
    else:
        existing_images = 0

    # Target already met locally: skip the group without touching the network.
    if existing_images >= number_of_images:
        continue

    # total_images counts images confirmed present during THIS pass: files that
    # already existed are reported as 'skip', new ones as 'success'. Starting from
    # zero (rather than from existing_images) keeps existing files from being
    # counted twice.
    total_images = 0

    # Walk the matching directories exactly once. Re-walking them when the target
    # is not reached would only re-skip the same files (or spin forever when
    # nothing matches) without ever adding a new image.
    matching_dirs = label_me.search_links(group_type)
    for matching_dir in matching_dirs:
        image_counts = label_me.download_images_from_dir(matching_dir, total_images, number_of_images, group_dir)
        total_images += (image_counts['success'] + image_counts['skip'])
        if total_images >= number_of_images:
            break

    if total_images < number_of_images:
        print("Only {} of {} requested images are available for '{}'".format(total_images, number_of_images, group_type))

Duplicate file detected, skipping image file: aa016553.jpg
Duplicate file detected, skipping image file: aa016556.jpg
Duplicate file detected, skipping image file: at_01_5a_1604_05_l.jpg
Duplicate file detected, skipping image file: at_01_6b_5490_30a_l.jpg
Duplicate file detected, skipping image file: at_98_5_950_34_l.jpg
Duplicate file detected, skipping image file: book_living_room.jpg
Duplicate file detected, skipping image file: cdmc1298.jpg
Duplicate file detected, skipping image file: easyst013.jpg
Duplicate file detected, skipping image file: easyst020.jpg
Duplicate file detected, skipping image file: easyst022.jpg
Duplicate file detected, skipping image file: easyst035.jpg
Duplicate file detected, skipping image file: familyroom22.jpg
Duplicate file detected, skipping image file: familyroom97.jpg
Duplicate file detected, skipping image file: in108.jpg
Duplicate file detected, skipping image file: int4.jpg
Duplicate file detected, skipping image file: int13.jpg
Duplicate file de

KeyboardInterrupt: 