# Creating your own datasets

In [1]:
from fastai import *
from fastai.vision.all import *

As a fun project I want to build a classifier which can tell different animals (cats, horses, tigers and frogs) apart. To do this, I first need to create a dataset of images for each animal, for which I am going to use the `search_images_ddg` function defined in the [fastai docs here](https://course.fast.ai/images#DuckDuckGo).

In [2]:
def search_images_ddg(term, max_images):
    """Search DuckDuckGo for `term` and return the unique URLs of about `max_images` images.

    Results are fetched one page at a time until at least `max_images` distinct
    URLs have been collected, the service reports no further page, or too many
    requests in a row have failed. Because whole pages are kept, the returned
    list can be longer than `max_images`; it can also be shorter when the
    results run out.

    Args:
        term: Query string sent to the DuckDuckGo image search.
        max_images: Target number of image URLs; must be below 1000.

    Returns:
        An `L` of unique image URL strings, in no particular order.

    Raises:
        AssertionError: If `max_images` is 1000 or more, or if the `vqd`
            token cannot be found in the DuckDuckGo landing page.
    """
    assert max_images<1000, "max_images must be below 1000"

    url = 'https://duckduckgo.com/'
    res = urlread(url,data={'q':term})
    # The landing page embeds a `vqd` token which is passed on to the image endpoint below.
    searchObj = re.search(r'vqd=([\d-]+)\&', res)
    assert searchObj, "could not find the vqd token in the DuckDuckGo response"

    requestUrl = url + 'i.js'
    params = dict(l='us-en', o='json', q=term, vqd=searchObj.group(1), f=',,,', p='1', v7exp='a')
    urls,data = set(),{'next':1}   # the placeholder 'next' entry lets the loop start
    max_failures,failures = 5,0    # stop after this many consecutive failed requests

    while len(urls)<max_images and 'next' in data:
        try:
            page = urljson(requestUrl,data=params)
        except (URLError,HTTPError):
            # Best-effort: retry the same page, but only a bounded number of
            # times so a persistently failing endpoint cannot loop forever.
            failures += 1
            if failures>=max_failures: break
        else:
            failures = 0
            data = page
            urls.update(L(data.get('results', [])).itemgot('image'))
            # A page without a 'next' key is the last one; the loop condition
            # ends the search, so there is no further URL to build.
            if 'next' in data: requestUrl = url + data['next']
        time.sleep(0.2)  # short pause between consecutive requests

    return L(urls)

In [3]:
# The animal classes for the dataset; each name is used both as the search
# term and as the name of the folder the images are saved to
animals = L([
    "cats",
    "horses",
    "tigers",
    "frogs",
])

In [4]:
# Map every animal to the image links returned for it by `search_images_ddg`,
# showing a progress bar while the searches run
animalsLinks = {
    animal: search_images_ddg(animal, max_images = 64)
    for animal in progress_bar(animals, comment = "Extracting image links")
}

In [5]:
# Now that we have the links for every item, download the images into one
# sub-folder per animal.
# NOTE(review): this is an absolute, machine-specific location; adjust it when
# running the notebook elsewhere.
path = Path("/home/vinayak/Animals")

# parents=True so a missing parent folder does not abort the run;
# exist_ok=True so re-running the cell is harmless.
path.mkdir(parents=True, exist_ok=True)
for animal, results in progress_bar(animalsLinks.items(), comment = "Downloading images..."):
    dest = path/animal
    dest.mkdir(exist_ok=True)
    # Skip classes that already contain images so a re-run does not download
    # them again, while classes that an interrupted earlier run never reached
    # are still fetched.
    if len(get_image_files(dest)) > 0: continue
    download_images(dest, urls=results, timeout=2)


 Download of http://www.zastavki.com/pictures/originals/2013/Animals___Horses_Thoroughbred_horses_053765_.jpg has failed after 5 retries
 Fix the download manually:
$ mkdir -p /home/vinayak/Animals/horses
$ cd /home/vinayak/Animals/horses
$ wget -c http://www.zastavki.com/pictures/originals/2013/Animals___Horses_Thoroughbred_horses_053765_.jpg
$ tar xf Animals___Horses_Thoroughbred_horses_053765_.jpg
 And re-run your code once the download is successful


 Download of http://www.zastavki.com/pictures/originals/2013/Animals___Horses_Trotting_horses_053825_.jpg has failed after 5 retries
 Fix the download manually:
$ mkdir -p /home/vinayak/Animals/horses
$ cd /home/vinayak/Animals/horses
$ wget -c http://www.zastavki.com/pictures/originals/2013/Animals___Horses_Trotting_horses_053825_.jpg
$ tar xf Animals___Horses_Trotting_horses_053825_.jpg
 And re-run your code once the download is successful



In [7]:
# Compare, per animal, the number of links collected with the number of image
# files actually present on disk
for animal, links in animalsLinks.items():
    n_downloaded = len(get_image_files(path/animal))
    print(f"{animal:<20}: n_image links = {len(links)}; downloaded images = {n_downloaded}")

cats                : n_image links = 100; downloaded images = 88
horses              : n_image links = 100; downloaded images = 87
tigers              : n_image links = 100; downloaded images = 85
frogs               : n_image links = 100; downloaded images = 86


In [8]:
# Gather every downloaded image file under `path`, across all the animal
# sub-folders, and display the collection (its size is the total image count)
filenames = get_image_files(path)
filenames

(#346) [Path('/home/vinayak/Animals/horses/00000057.jpg'),Path('/home/vinayak/Animals/horses/00000045.jpg'),Path('/home/vinayak/Animals/horses/00000047.jpg'),Path('/home/vinayak/Animals/horses/00000009.jpg'),Path('/home/vinayak/Animals/horses/00000073.jpg'),Path('/home/vinayak/Animals/horses/00000024.jpg'),Path('/home/vinayak/Animals/horses/00000080.jpg'),Path('/home/vinayak/Animals/horses/00000051.jpg'),Path('/home/vinayak/Animals/horses/00000088.jpg'),Path('/home/vinayak/Animals/horses/00000034.jpg')...]

In [9]:
# Look for downloaded files that are corrupt or otherwise unusable as images
# (e.g. incomplete downloads); an empty result means every file passed the check
failed = verify_images(filenames)
failed

(#0) []

In [10]:
# Remove from disk every file that failed verification
for bad_file in failed:
    bad_file.unlink()

This is how we can use fastai helper functions along with basic Python data structures to create our very own dataset for training.