In [None]:
import os
import json
import random
from copy import deepcopy
import shutil
from fastai.vision import *
from pathlib import Path

In [None]:
class Breed:
    """A breed name together with its rank and the image files collected for it.

    Attributes:
        name: Breed identifier.
        rank: Rank value supplied by the caller.
        files: Collection of image files for the breed (only its length is
            used by ``__repr__``).
    """

    def __init__(self, name, rank, files):
        self.name = name
        # Keep the caller's rank; it used to be discarded in favour of -1.
        self.rank = rank
        self.files = files

    def __repr__(self):
        # "<name> - (<number of files>)"
        return f"{self.name} - ({len(self.files)})"

In [None]:
# Folder setup: remove the image trees produced by an earlier run, if present.
for staleDir in ("./data/images/unprocessed", "./data/images/processed"):
    if os.path.exists(staleDir):
        shutil.rmtree(staleDir)

In [None]:
# URL-list folders (one text file per breed) and the folder for filtered copies.
pathGoogleImagesSource = Path("./data/google-images-source")
pathGoogleImagesSourceCleaned = Path("./data/google-images-source-cleaned")
pathCC0ImagesSource = Path("./data/cc0-images-source")

# Image folders, all rooted under data/images.
pathImages = Path("./data/images")
pathProcessed = pathImages/"processed"
pathUnprocessed = pathImages/"unprocessed"
pathUnprocessedOxford = pathUnprocessed/"oxford"
pathUnprocessedStanford = pathUnprocessed/"stanford"
pathUnprocessedGoogle = pathUnprocessed/"google"
pathUnprocessedCC0 = pathUnprocessed/"cc0"

In [None]:
# Create every working directory.
# exist_ok=True keeps this cell re-runnable: the folder-setup cell only deletes
# the unprocessed/processed image trees, so pathGoogleImagesSourceCleaned is
# still there on a second run and a plain os.makedirs() would raise
# FileExistsError.
for workDir in (
    pathUnprocessed,
    pathUnprocessedOxford,
    pathUnprocessedStanford,
    pathProcessed,
    pathUnprocessedGoogle,
    pathGoogleImagesSourceCleaned,
    pathUnprocessedCC0,
):
    os.makedirs(workDir, exist_ok=True)

In [None]:
# Download the Oxford and Stanford image archives into their unprocessed folders.
oxfordUrl = "https://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz"
stanfordUrl = "http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar"

# NOTE(review): ext = "" presumably stops download_data from appending its
# default extension to the URL -- confirm against the fastai docs.
pathArchiveOxford = download_data(oxfordUrl, pathUnprocessedOxford/"oxford.tar.gz", ext = "")
pathArchiveStanford = download_data(stanfordUrl, pathUnprocessedStanford/"stanford.tar", ext = "")

(pathArchiveOxford, pathArchiveStanford)

In [None]:
# Per-breed list of zero-based line numbers to drop from that breed's Google
# URL file before downloading. Keys must match the URL file names (minus ".txt").
remove = {
    "bichons-frise": [44, 45, 50, 78],
    "american-curl": [46, 193],
    "american-shorthair": [45, 89, 94],
    "australian-cattle-dog": [77, 78, 97],
    "black-mouth-cur": [81, 88],
    "brussels-griffon": [39],
    "burmese": [90],
    "chinese-crested": [57],
    "chinese-shar-pei": [90],
    "cornish-rex": [62, 66],
    "coton-de-tulear": [49, 87],
    "dalmation": [53, 70, 75, 76, 79, 98],
    "devon-rex": [0, 56, 75],
    "dogues-de-bordeaux": [41, 62, 64, 72, 91],
    "exotic-shorthair": [22, 41, 47, 49, 69, 78, 86, 92, 93, 95],
    "lagotti-romagnoli": [10, 20, 29, 34, 39, 43, 45, 51, 57, 58, 73, 74, 77, 78, 87, 97, 98],
    "mastiff": [42, 45, 57],
    "norwegian-forest-cat": [48],
    "nova-scotia-duck-tolling-retriever": [68, 71, 83, 85],
    "portuguese-water-dog": [40, 57],
    "ragamuffin": [29, 31, 45, 60, 66, 75, 81, 99],
    "rat-terrier": [42],
    "scottish-fold": [41, 69],
    "siberian": [50, 62, 104, 144, 156, 158, 177],
}

In [None]:
# google: download and verify the images listed in each breed's URL file.
for urlList in pathGoogleImagesSource.ls():
    listName = os.path.basename(urlList)
    breedName = listName.replace(".txt", "")
    downloadList = urlList

    if breedName in remove:
        # Download from a filtered copy of the list that omits the rejected
        # line numbers; the source list itself is left untouched.
        downloadList = pathGoogleImagesSourceCleaned/listName
        rejected = remove[breedName]
        with open(urlList) as source:
            kept = [line for lineNo, line in enumerate(source.readlines()) if lineNo not in rejected]
        with open(downloadList, "w") as cleaned:
            cleaned.writelines(kept)

    print(breedName)
    breedDir = pathUnprocessedGoogle/breedName
    download_images(downloadList, breedDir)
    verify_images(breedDir)

In [None]:
# CC0: download and verify the images listed in each breed's URL file.
for urlList in pathCC0ImagesSource.ls():
    breedName = os.path.basename(urlList).replace(".txt", "")
    print(breedName)
    breedDir = pathUnprocessedCC0/breedName
    download_images(urlList, breedDir)
    verify_images(breedDir)

In [None]:
#extract
def extract(inputFile, outputPath):
    """Unpack the tar archive ``inputFile`` into the directory ``outputPath``.

    The archive is opened with a context manager so its file handle is closed
    even when extraction raises part-way through.
    """
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use this on archives from a trusted source.
    with tarfile.open(inputFile) as tar:
        tar.extractall(outputPath)

# Unpack both downloaded archives next to where they were saved.
print("extraction started")
for archive, destination, doneMessage in (
    (pathArchiveOxford, pathUnprocessedOxford, "oxford complete"),
    (pathArchiveStanford, pathUnprocessedStanford, "stanford complete"),
):
    extract(archive, destination)
    print(doneMessage)

In [None]:
#catalog
# Maps breed key -> list of image files, filled in source by source.
data = {}

# oxford: the breed key is the lower-cased file name without ".jpg", minus its
# last "_"-separated piece, with the remaining pieces joined by "-".
for imageFile in (pathUnprocessedOxford/"images").ls():
    stem = os.path.basename(imageFile).replace(".jpg", "").lower()
    breedKey = "-".join(stem.split("_")[:-1])
    data.setdefault(breedKey, []).append(imageFile)

In [None]:
# stanford: every entry under Images/ is a breed folder; the breed key is the
# part of the folder name after its first "-", lower-cased, with "_" -> "-".
for breedFolder in (pathUnprocessedStanford/"Images").ls():
    # Parse the folder's own name only. Splitting the full path string on "-"
    # also split the parent directories, so a hyphen anywhere in the path
    # leaked path fragments into the breed key.
    folderName = os.path.basename(breedFolder)
    name = "-".join(folderName.split("-")[1:]).replace("_", "-").lower()
    data.setdefault(name, []).extend(breedFolder.ls())

In [None]:
# google: one sub-folder per breed; the folder name is the breed key.
for breedFolder in pathUnprocessedGoogle.ls():
    data.setdefault(os.path.basename(breedFolder), []).extend(breedFolder.ls())

In [None]:
# cc0: one sub-folder per breed; the folder name is the breed key.
for breedFolder in pathUnprocessedCC0.ls():
    data.setdefault(os.path.basename(breedFolder), []).extend(breedFolder.ls())

In [None]:
# Load the dog rankings as {rank: name}.
with open("data/dog-rankings-source.json") as rankingFile:
    dogRankings = dict((entry["rank"], entry["name"]) for entry in json.load(rankingFile))

# Breed names ordered by ascending rank.
rankedDogs = [dogRankings[position] for position in sorted(dogRankings)]

# Give every ranked dog a catalog entry, even if no images were found for it.
for breedName in rankedDogs:
    data.setdefault(breedName, [])

In [None]:
# Load the cat rankings as {rank: name}.
with open("data/cat-rankings-source.json") as rankingFile:
    catRankings = dict((entry["rank"], entry["name"]) for entry in json.load(rankingFile))

# Breed names ordered by ascending rank.
rankedCats = [catRankings[position] for position in sorted(catRankings)]

# Give every ranked cat a catalog entry, even if no images were found for it.
for breedName in rankedCats:
    data.setdefault(breedName, [])

In [None]:
#clean duplicates
# (canonical key, alias key): the alias entry is removed and its files are
# appended to the canonical entry. Both keys must already exist in the catalog.
breedAliases = [
    ("basset-hound", "basset"),
    ("german-short-haired-pointer", "german-shorthaired"),
    ("leonberger", "leonberg"),
    # ("samoyed", "samyed") -- this merge is disabled
    ("japanese-chin", "japanese-spaniel"),
    ("scottish-terrier", "scotch-terrier"),
    ("wheaten-terrier", "soft-coated-wheaten-terrier"),
    ("staffordshire-bull-terrier", "staffordshire-bullterrier"),
    ("chow-chow", "chow"),
    ("doberman-pinscher", "doberman"),
    ("pembroke-welsh-corgi", "pembroke"),
]

# Work on a copy so the raw catalog in `data` is left untouched.
cleanedData = deepcopy(data)
for canonical, alias in breedAliases:
    cleanedData[canonical].extend(cleanedData.pop(alias))

In [None]:
#sort dogs
# One Breed per ranked dog, in rank order; a breed missing from the cleaned
# catalog gets an empty file list.
dogs = [
    Breed(breedName, position, cleanedData.get(breedName, []))
    for position, breedName in enumerate(rankedDogs)
]
dogs[:5]

In [None]:
#sort cats
# One Breed per ranked cat, in rank order; a breed missing from the cleaned
# catalog gets an empty file list.
cats = [
    Breed(breedName, position, cleanedData.get(breedName, []))
    for position, breedName in enumerate(rankedCats)
]
cats[:5]

In [None]:
#top dogs
# The first 100 ranked dogs, plus black-mouth-cur as an extra breed.
topDogs = dogs[:100]
# Look for the extra breed only below the cutoff: if it is already among the
# first 100, adding it again would list the breed twice and make the later
# os.makedirs() for its output folder fail.
topDogs.extend([x for x in dogs[100:] if x.name == "black-mouth-cur"])

In [None]:
# Top dogs that have no image at all, keyed by their position in topDogs.
topDogsMissingData = {}
for position, breed in enumerate(topDogs):
    if len(breed.files) == 0:
        topDogsMissingData[position] = breed

topDogsMissingData, len(topDogsMissingData)

In [None]:
#top cats
# Keep only the cat breeds that have at least one image.
topCats = list(filter(lambda breed: len(breed.files) > 0, cats))
topCats

In [None]:
def _uniqueDestination(directory, fileName):
    """Return a path for ``fileName`` inside ``directory`` that does not exist yet.

    The plain name is used when it is free; otherwise "_1", "_2", ... is
    inserted before the extension until an unused name is found.
    """
    stem, suffix = os.path.splitext(fileName)
    candidate = directory/fileName
    counter = 1
    while candidate.exists():
        candidate = directory/f"{stem}_{counter}{suffix}"
        counter += 1
    return candidate


# Start from an empty output tree so results of an earlier run cannot linger.
if os.path.exists(pathProcessed): shutil.rmtree(pathProcessed)

topBreeds = topDogs + topCats
for d in topBreeds:
    breedDir = pathProcessed/d.name
    os.makedirs(breedDir)

    for file in d.files:
        # A breed's files come from several source folders and may share a
        # base name (presumably the downloaded sets are numbered per folder --
        # confirm). Copying by bare name would silently overwrite the earlier
        # image, so a colliding file gets a suffixed name instead.
        shutil.copy(file, _uniqueDestination(breedDir, os.path.basename(file)))
    print(f"copied {d.name}")

In [None]:
# Print every selected breed that has fewer than 100 image files.
for sparseBreed in (breed for breed in topBreeds if len(breed.files) < 100):
    print(sparseBreed)