In [1]:
#imports
import sys, os, numpy, pandas, cv2, csv, operator, json
from time import sleep
from tqdm import tnrange, tqdm_notebook

In [2]:
#globals
# Locations of the CSV file and the image folders, relative to the working directory.
csvpath = "train_data/train.csv"
path = "train_data/originals/"
graypath = "train_data/grayscale/"
# Zero-based positions of the columns that readCSV() reads from each CSV row.
petID_column, adoptionspeed_column = 21, 23
# Accumulators that the functions defined later in the notebook append to.
image_set, label_set = list(), list()
# Dictionary keyed by pet ID.
pets = dict()

##### makeDirectories() <br/>
_This function creates the working directories if they are missing, plus one grayscale sub-directory for each pet ID found in the image file names (the text before the first `-`)._

##### removeEmptyFolders() <br/>
_This method deletes empty folders inside of a specified folder path._

##### loadImages() <br/>
_This function reads the original image files one by one as grayscale, resizes them, and saves each converted image unless a converted copy already exists. The `img_preview` argument is accepted, but no preview is currently implemented._

In [3]:
def makeDirectories():
    """Create the working folders and one grayscale sub-folder per pet ID.

    Makes sure ``train_data``, ``path`` and ``graypath`` exist, then scans
    ``path``. For every file with a supported image extension, the text
    before the first ``-`` in the file name is taken as the pet ID. That ID
    is appended to the global ``label_set`` (once per image file, so IDs
    repeat) and a folder named after it is created under ``graypath`` if it
    does not exist yet.

    Files with any other extension are skipped. Earlier the loop used
    ``return`` here, so the first non-image file ended the whole scan.
    """
    for folder in ("train_data", path, graypath):
        # makedirs creates missing parents and tolerates existing folders.
        os.makedirs(folder, exist_ok=True)

    valid_types = (".jpg", ".gif", ".png", ".tga")
    for f in tqdm_notebook(os.listdir(path), desc='Directory Labelling Progress'):
        ext = os.path.splitext(f)[1]
        if ext.lower() not in valid_types:
            # Skip only this entry; the remaining files still need processing.
            continue
        n = f.split("-")[0]
        label_set.append(n)
        pet_dir = graypath + n + "/"
        if not os.path.exists(pet_dir):
            os.mkdir(pet_dir)
            print("\rCreated Directory: " + pet_dir, end='')
    return

In [4]:
def removeEmptyFolders(folderpath="train_data/grayscale/"):
    """Delete every empty sub-folder located directly inside ``folderpath``.

    Args:
        folderpath: Folder whose immediate children are inspected. A trailing
            path separator is optional.

    Entries that are not directories are left untouched, as are folders that
    contain at least one entry. Each removal is reported with ``print``.
    """
    for folder in tqdm_notebook(os.listdir(folderpath), desc='Empty folder deletion Progress'):
        candidate = os.path.join(folderpath, folder)
        # os.scandir raises on plain files, so only folders are examined.
        if not os.path.isdir(candidate):
            continue
        # Close the scan handle before attempting to remove the folder.
        with os.scandir(candidate) as entries:
            is_empty = next(entries, None) is None
        if is_empty:
            print("\rRemoving empty folder: " + folder + "\tfrom: " + folderpath)
            os.rmdir(candidate)
    return

In [5]:
def loadImages(img_preview=0, sizex=200, sizey=200):
    """Write a resized grayscale copy of every original image.

    Args:
        img_preview: Accepted for compatibility; not used by this function.
        sizex: Width passed to ``cv2.resize`` (first element of ``dsize``).
        sizey: Height passed to ``cv2.resize`` (second element of ``dsize``).

    Scans ``path`` and, for each file with a supported extension whose copy
    ``graypath/<pet id>/<file name>`` does not exist yet, reads the file in
    grayscale, resizes it, appends the array to the global ``image_set`` and
    writes it to that location. The pet ID is the text before the first ``-``
    in the file name.

    Files with other extensions, and files OpenCV cannot decode, are skipped.
    Earlier the loop used ``return`` for the former, so the first non-image
    file ended the whole conversion.
    """
    for folder in ("train_data", path, graypath):
        os.makedirs(folder, exist_ok=True)

    valid_types = (".jpg", ".gif", ".png", ".tga")
    for f in tqdm_notebook(os.listdir(path), desc='Grayscale Conversion Progress'):
        ext = os.path.splitext(f)[1]
        if ext.lower() not in valid_types:
            # Skip only this entry; keep converting the remaining files.
            continue
        n = f.split("-")[0]
        target = graypath + n + "/" + f
        if os.path.isfile(target):
            # Already converted on a previous run.
            continue
        image = cv2.imread(os.path.join(path, f), cv2.IMREAD_GRAYSCALE)
        if image is None:
            # imread returns None when it cannot read or decode the file.
            continue
        image = cv2.resize(image, dsize=(sizex, sizey), interpolation=cv2.INTER_CUBIC)
        image_set.append(image)
        # Make sure the per-pet folder exists even if makeDirectories() was not run.
        os.makedirs(graypath + n, exist_ok=True)
        # Only report the file when OpenCV confirms that it was written.
        if cv2.imwrite(target, image):
            print("\rCreated File: " + target, end='')
    return

##### initial_image_processing() <br/>
_This function runs the previous three functions in a fixed order: `makeDirectories()`, then `loadImages()`, then `removeEmptyFolders()`._

In [6]:
def initial_image_processing():
    """Run the three image preparation steps back to back.

    The steps run with their default arguments in this order: create the
    folders, convert the images, then remove the folders left empty.
    """
    makeDirectories()      # step 1: folders and labels
    loadImages()           # step 2: grayscale conversion
    removeEmptyFolders()   # step 3: clean-up

##### isEnglish() <br/>
_This function returns `True` if the given string contains only ASCII characters (the basic English alphabet, digits, and common punctuation), and `False` otherwise._

##### readCSV() <br/>
_This function goes through each row of the CSV file specified in the globals and matches its PetID column against the pet ID labels collected while scanning the images. For each match, the pet ID and its adoption speed are stored in the `pets` dictionary._

##### pandasCSV() <br/>
_This function uses pandas to read only the PetID and AdoptionSpeed columns of the CSV file and, for each label in the label set built earlier, stores the matching entries in the `pets` dictionary. I had problems with this version running out of memory (or something similar) in Jupyter Notebook, so I simply used `readCSV()` instead, although this one seems to compare faster._

In [7]:
def isEnglish(s):
    """Report whether ``s`` is made up solely of ASCII characters.

    The string is encoded as UTF-8 and the resulting bytes are decoded as
    ASCII. A ``UnicodeDecodeError`` from that decode gives ``False``; a clean
    decode gives ``True``.
    """
    utf8_bytes = s.encode(encoding='utf-8')
    try:
        utf8_bytes.decode('ascii')
        return True
    except UnicodeDecodeError:
        return False

def readCSV(printout=None):
    """Fill the global ``pets`` dictionary from the CSV file at ``csvpath``.

    Args:
        printout: When not ``None``, every matching row is reported with
            ``print`` (once per matching row).

    Returns:
        The global ``pets`` dictionary. Each pet ID that appears both in
        ``label_set`` and in column ``petID_column`` of the file maps to a
        one-element list holding the text of column ``adoptionspeed_column``
        from the first such row. IDs already present in ``pets`` are left
        unchanged.

    Side effects:
        Sorts the global ``label_set`` in place.
    """
    # newline="" lets the csv module handle line breaks inside quoted fields.
    with open(csvpath, 'r', encoding="utf-8", newline="") as csv_file:
        csv_reader = csv.reader(csv_file)
        label_set.sort()
        # label_set repeats each ID once per image; a set makes each lookup O(1).
        wanted = set(label_set)
        needed_len = max(petID_column, adoptionspeed_column) + 1
        for line in tqdm_notebook(csv_reader, desc="csv label matching progress"):
            if len(line) < needed_len:
                # Blank or truncated row: it cannot hold both columns.
                continue
            key = line[petID_column]
            if key not in wanted:
                continue
            if printout is not None:
                print(key + ", " + key)
            if key not in pets:
                pets[key] = [line[adoptionspeed_column]]
        return pets

def pandasCSV():
    """pandas-based alternative to readCSV() that fills the global ``pets``.

    Reads only the ``PetID`` and ``AdoptionSpeed`` columns of the file at
    ``csvpath`` as text. For every label in ``label_set`` that appears in the
    ``PetID`` column and is not yet a key of ``pets``, it stores a one-element
    list with the adoption speed of the first matching row. This is the same
    value layout that readCSV() produces.

    Earlier versions stored a DataFrame slice per label occurrence, which was
    memory-heavy and could not be written by save_json().

    Returns:
        The global ``pets`` dictionary.
    """
    with open(csvpath, 'r', encoding="utf-8") as csv_file:
        # dtype=str and na_filter=False keep every cell as the text found in the file.
        df = pandas.read_csv(csv_file, usecols=['PetID', 'AdoptionSpeed'],
                             dtype=str, na_filter=False)
    # Build the lookup once instead of filtering the whole frame per label.
    first_rows = df.drop_duplicates(subset='PetID')
    speed_by_pet = dict(zip(first_rows['PetID'], first_rows['AdoptionSpeed']))
    for l in tqdm_notebook(label_set, desc="csv label matching progress"):
        if l in speed_by_pet and l not in pets:
            pets[l] = [speed_by_pet[l]]
    return pets

##### save_json() <br/>
_This function takes a file path and file name (both with defaults) and the object to save, which defaults to the `pets` dictionary but can be any other JSON-serializable variable, such as a list. It writes that data out to a .json file._

##### load_json() <br/>
_This function takes a file path and file name (both with defaults), loads the data from that .json file, and returns it. The caller assigns the result to whatever variable should hold it (for example `pets` or `label_set`); the `dictionary` argument itself is not modified._

In [8]:
def save_json(filepath="train_data/", filename="pets_dictionary.json", dictionary=None):
    """Write ``dictionary`` to ``filepath + filename`` as indented JSON.

    Args:
        filepath: Folder prefix. It is concatenated directly with
            ``filename``, so include the trailing separator.
        filename: Name of the JSON file to write.
        dictionary: Object to serialize, for example a dict or a list. When
            omitted, the module-level ``pets`` is used. It is looked up at
            call time, so the value written is the one ``pets`` is bound to
            at that moment rather than at function definition.

    Dict keys are written in sorted order with a 4-space indent.
    """
    if dictionary is None:
        dictionary = pets
    with open(filepath + filename, 'w', encoding="utf-8") as file:
        json.dump(dictionary, file, sort_keys=True, indent=4)
    return
def load_json(filepath="train_data/", filename="pets_dictionary.json", dictionary=None):
    """Load and return the JSON content of ``filepath + filename``.

    Args:
        filepath: Folder prefix. It is concatenated directly with
            ``filename``, so include the trailing separator.
        filename: Name of the JSON file to read.
        dictionary: Kept so existing calls keep working; it is ignored and
            never modified. Assign the return value instead.

    Returns:
        The deserialized data, for example a dict or a list.
    """
    with open(filepath + filename, 'r', encoding="utf-8") as file:
        return json.load(file)

## current compilation of methods up until now.

In [9]:
# Create the folders, convert the images and remove the empty folders.
initial_image_processing()
# Match the CSV rows against label_set and keep the resulting dictionary.
pets = readCSV()
print(pets)
# Write pets (the default) and label_set to JSON files under the default folder.
save_json()
save_json(filename="label_set.json", dictionary=label_set)
# Read both files back; load_json returns the data, so rebind the names.
pets = load_json()
print(pets)
label_set = load_json(filename="label_set.json", dictionary=label_set)
print(label_set)

HBox(children=(IntProgress(value=0, description='Directory Labelling Progress', max=58311, style=ProgressStyle…




HBox(children=(IntProgress(value=0, description='Grayscale Conversion Progress', max=58311, style=ProgressStyl…




HBox(children=(IntProgress(value=0, description='Empty folder deletion Progress', max=14652, style=ProgressSty…




HBox(children=(IntProgress(value=1, bar_style='info', description='csv label matching progress', max=1, style=…




{'0008c5398': ['3'], '000a290e4': ['2'], '000fb9572': ['3'], '0011d7c25': ['2'], '00156db4a': ['1'], '001a1aaad': ['2'], '001b1507c': ['1'], '002230dea': ['1'], '0025a8313': ['4'], '0038234c6': ['4'], '0038c9343': ['2'], '003dd2e26': ['2'], '0045ed62a': ['3'], '004a26127': ['3'], '004c2f355': ['1'], '0052dcf47': ['4'], '00553ae55': ['1'], '0058586f1': ['4'], '005afe792': ['2'], '005bb92d8': ['1'], '0063bd7e0': ['4'], '0063f83c9': ['1'], '00648f96f': ['2'], '006610fe3': ['4'], '006d301e9': ['3'], '006ffebaf': ['2'], '00709d75b': ['1'], '0073c33d0': ['3'], '007bc1db4': ['4'], '007ffeccd': ['2'], '00904fc2c': ['3'], '0090eb457': ['4'], '009239f92': ['3'], '00a1a8af4': ['2'], '00a1f270a': ['4'], '00ac364a4': ['4'], '00af25fc0': ['2'], '00b00d5c1': ['2'], '00b23513e': ['2'], '00be0a63e': ['1'], '00bfa5da9': ['4'], '00c19f4fa': ['2'], '00c71a320': ['4'], '00e50c7a7': ['1'], '00eca0391': ['4'], '00eced1f0': ['3'], '00f3a3993': ['2'], '00f63c6fc': ['4'], '00f997463': ['4'], '010a59236': ['4'],

{'0008c5398': ['3'], '000a290e4': ['2'], '000fb9572': ['3'], '0011d7c25': ['2'], '00156db4a': ['1'], '001a1aaad': ['2'], '001b1507c': ['1'], '002230dea': ['1'], '0025a8313': ['4'], '0038234c6': ['4'], '0038c9343': ['2'], '003dd2e26': ['2'], '0045ed62a': ['3'], '004a26127': ['3'], '004c2f355': ['1'], '0052dcf47': ['4'], '00553ae55': ['1'], '0058586f1': ['4'], '005afe792': ['2'], '005bb92d8': ['1'], '0063bd7e0': ['4'], '0063f83c9': ['1'], '00648f96f': ['2'], '006610fe3': ['4'], '006d301e9': ['3'], '006ffebaf': ['2'], '00709d75b': ['1'], '0073c33d0': ['3'], '007bc1db4': ['4'], '007ffeccd': ['2'], '00904fc2c': ['3'], '0090eb457': ['4'], '009239f92': ['3'], '00a1a8af4': ['2'], '00a1f270a': ['4'], '00ac364a4': ['4'], '00af25fc0': ['2'], '00b00d5c1': ['2'], '00b23513e': ['2'], '00be0a63e': ['1'], '00bfa5da9': ['4'], '00c19f4fa': ['2'], '00c71a320': ['4'], '00e50c7a7': ['1'], '00eca0391': ['4'], '00eced1f0': ['3'], '00f3a3993': ['2'], '00f63c6fc': ['4'], '00f997463': ['4'], '010a59236': ['4'],




['0008c5398', '0008c5398', '0008c5398', '0008c5398', '0008c5398', '0008c5398', '000a290e4', '000a290e4', '000fb9572', '000fb9572', '000fb9572', '000fb9572', '000fb9572', '000fb9572', '0011d7c25', '0011d7c25', '0011d7c25', '00156db4a', '00156db4a', '00156db4a', '00156db4a', '00156db4a', '001a1aaad', '001a1aaad', '001a1aaad', '001a1aaad', '001b1507c', '001b1507c', '001b1507c', '001b1507c', '001b1507c', '001b1507c', '002230dea', '002230dea', '002230dea', '002230dea', '002278114', '002278114', '002278114', '002278114', '002278114', '002278114', '002278114', '002278114', '002278114', '002278114', '002278114', '002278114', '002278114', '0025a8313', '0025a8313', '0038234c6', '0038234c6', '0038234c6', '0038c9343', '0038c9343', '0038c9343', '0038c9343', '0038c9343', '003dd2e26', '003dd2e26', '003dd2e26', '0045ed62a', '0045ed62a', '0045ed62a', '004709939', '004709939', '004709939', '004a26127', '004a26127', '004c2f355', '004c2f355', '0052dcf47', '0052dcf47', '0052dcf47', '00553ae55', '00553ae55'