# Creating your own datasets

In [1]:
from fastai import *
from fastai.vision.all import *

As a fun project, I want to build a classifier that can tell one cartoon character apart from another. To do that, I first need a dataset of images for each cartoon. I will collect the image links with the `search_images_ddg` function described in the [fastai docs here](https://course.fast.ai/images#DuckDuckGo).

In [2]:
def search_images_ddg(term, max_images, max_failures=5):
    """Search for `term` with DuckDuckGo and return unique urls of about `max_images` images.

    Args:
        term: The search query.
        max_images: Target number of image urls; must be below 1000. Whole result
            pages are added at a time, so the returned list can be longer than this.
        max_failures: Number of consecutive failed requests tolerated before the
            search gives up and returns whatever has been collected so far.

    Returns:
        An `L` of unique image urls.

    Raises:
        AssertionError: If `max_images` is 1000 or more, or if no `vqd` value can be
            found in the response to the initial request.
    """

    assert max_images<1000

    url = 'https://duckduckgo.com/'
    res = urlread(url,data={'q':term})
    # Pull the `vqd` value out of the response; it is forwarded with the image requests
    searchObj = re.search(r'vqd=([\d-]+)\&', res)
    assert searchObj

    requestUrl = url + 'i.js'
    params = dict(l='us-en', o='json', q=term, vqd=searchObj.group(1), f=',,,', p='1', v7exp='a')
    # `data` starts with a dummy 'next' entry so that the loop body runs at least once
    urls,data = set(),{'next':1}
    failures = 0

    while len(urls)<max_images and 'next' in data:
        try:
            data = urljson(requestUrl,data=params)
            urls.update(L(data.get('results', [])).itemgot('image'))
            failures = 0
            # The last page carries no 'next' entry; the loop condition then ends the
            # search instead of a KeyError being raised here
            if 'next' in data: requestUrl = url + data['next']
        except (URLError,HTTPError):
            # Best-effort retry, but bounded: `data` is unchanged after a failure, so
            # without a cap a persistent error would keep this loop running forever
            failures += 1
            if failures >= max_failures: break
        time.sleep(0.2)

    return L(urls)

In [3]:
# The cartoon characters to collect images for (each one becomes a search term and a class label)
cartoons = L([
    "bugs bunny",
    "donald duck",
    "mickey mouse",
    "shinchan",
])

In [4]:
# Map every cartoon name to the image links that the search function returns for it
cartoonsLinks = {
    cartoon: search_images_ddg(cartoon, max_images = 200)
    for cartoon in progress_bar(cartoons, comment = "Extracting image links")
}

In [5]:
# Now that we have the links for every item, let's download the images
path = Path("/home/vinayak/cartoons")

# Create the root folder (and any missing parent folders); a no-op if it already exists
path.mkdir(parents=True, exist_ok=True)

# Loop over each cartoon's links
for cartoon in progress_bar(cartoonsLinks.keys(), comment = "Downloading images..."):
    # Create a folder to hold the cartoon's images (if a folder already exists, don't raise an error)
    dest = (path/cartoon)
    dest.mkdir(exist_ok=True)

    # Skip cartoons that already have files on disk, so re-running the cell does not
    # download them again while an interrupted earlier run can still be completed
    if any(dest.iterdir()): continue

    # Retrieve the cartoon's links and download the images to the specified path
    results = cartoonsLinks[cartoon]
    download_images(dest, urls=results, timeout=2)

In [10]:
# Per cartoon, compare the number of links collected with the number of image files on disk
for cartoon, links in cartoonsLinks.items():
    n_links = len(links)
    n_downloaded = len(get_image_files(path/cartoon))
    print(f"{cartoon:<20}: n_image links = {n_links};     downloaded images = {n_downloaded}")

bugs bunny          : n_image links = 265;     downloaded images = 231
donald duck         : n_image links = 255;     downloaded images = 217
mickey mouse        : n_image links = 285;     downloaded images = 243
shinchan            : n_image links = 254;     downloaded images = 240


In [7]:
# Collect every image file found under `path` (this includes the per-cartoon sub-folders).
# Displaying the resulting list shows the total count as `(#N)` followed by the first few paths.
filenames = get_image_files(path)
filenames

(#961) [Path('/home/vinayak/cartoons/donald duck/00000112.jpg'),Path('/home/vinayak/cartoons/donald duck/00000216.jpg'),Path('/home/vinayak/cartoons/donald duck/00000154.jpg'),Path('/home/vinayak/cartoons/donald duck/00000045.jpg'),Path('/home/vinayak/cartoons/donald duck/00000029.png'),Path('/home/vinayak/cartoons/donald duck/00000069.jpeg'),Path('/home/vinayak/cartoons/donald duck/00000187.jpg'),Path('/home/vinayak/cartoons/donald duck/00000090.png'),Path('/home/vinayak/cartoons/donald duck/00000132.png'),Path('/home/vinayak/cartoons/donald duck/00000123.png')...]

In [8]:
# Run `verify_images` over all downloaded files and keep the paths it reports as failed,
# i.e. images that are corrupt or were not downloaded properly.
failed = verify_images(filenames)
failed



(#30) [Path('/home/vinayak/cartoons/donald duck/00000029.png'),Path('/home/vinayak/cartoons/donald duck/00000132.png'),Path('/home/vinayak/cartoons/donald duck/00000105.png'),Path('/home/vinayak/cartoons/donald duck/00000137.jpg'),Path('/home/vinayak/cartoons/donald duck/00000244.png'),Path('/home/vinayak/cartoons/donald duck/00000236.png'),Path('/home/vinayak/cartoons/donald duck/00000225.png'),Path('/home/vinayak/cartoons/donald duck/00000151.png'),Path('/home/vinayak/cartoons/donald duck/00000115.png'),Path('/home/vinayak/cartoons/donald duck/00000198.png')...]

In [9]:
# Delete the images which were corrupt/weren't downloaded properly.
# The existence check keeps this cell safe to re-run: files removed by an earlier run are
# skipped instead of raising FileNotFoundError.
for failed_file in failed:
    if failed_file.exists():
        failed_file.unlink()

This is how we can combine fastai's helper functions with basic Python data structures to create our very own dataset for training.