# The Picnic Custom Computer Vision Model

As an experiment to measure how well existing image classification services perform, this notebook uses the Custom Vision solution provided by Microsoft Azure.

So let's get started!

In [1]:
import pandas as pd
import requests
import io
import os
import zipfile

In [2]:
# Extract the dataset archive once; skip the work when the folder already exists.
if not os.path.exists("The Picnic Hackathon 2019/"):
    # Change this to the folder containing your data zipfile
    # The context manager closes the archive handle as soon as extraction finishes.
    with zipfile.ZipFile("../The Picnic Hackathon 2019.zip") as zf:
        zf.extractall()

In [3]:
# The filename/labels of the training data (tab-separated, first row is the header).
train_tsv_path = 'The Picnic Hackathon 2019/train.tsv'
df = pd.read_csv(train_tsv_path, sep='\t', header=0)
df.head()

Unnamed: 0,file,label
0,0.png,"Bananas, apples & pears"
1,1.png,Berries & cherries
2,2.jpeg,"Pork, beef & lamb"
3,3.png,Berries & cherries
4,4.jpeg,"Bell peppers, zucchinis & eggplants"


In [4]:
# Class distribution: count the non-null entries of every column per label.
by_label = df.groupby("label")
by_label.count()

Unnamed: 0_level_0,file
label,Unnamed: 1_level_1
"Asparagus, string beans & brussels sprouts",129
"Bananas, apples & pears",851
"Bell peppers, zucchinis & eggplants",543
Berries & cherries,273
"Broccoli, cauliflowers, carrots & radish",248
Cheese,288
Citrus fruits,307
"Cucumber, tomatoes & avocados",515
Eggs,251
Fish,93


We cannot use any files that exceed the 6MB file size limit, so we need to filter those out.

In [5]:
base_image_url = "The Picnic Hackathon 2019/train/"
MAX_TRAIN_IMAGE_BYTES = 6 * 1024 * 1024  # 6291456 bytes, i.e. 6MB

print("Checking images...")

# Build a boolean mask in one pass instead of appending rows one by one:
# DataFrame.append copied the whole frame on every call and no longer exists
# in pandas 2.0.
is_small_enough = df["file"].map(
    lambda file_name: os.stat(base_image_url + file_name).st_size < MAX_TRAIN_IMAGE_BYTES
).astype(bool)

# Keep only the rows whose image file is smaller than 6MB; the original index
# labels are preserved.
df_filtered = df[is_small_enough].copy()

Checking images...


### Image Duplication

It turns out that our training data contains duplicate images, and we do not want duplicates!

Let's compute a hash for every image and use it to find and remove the duplicates.

In [6]:
import os, sys
from PIL import Image, ImageOps, ImageStat
 
def findDup(parentFolder):
    """Group the files below ``parentFolder`` by their image hash.

    Walks ``parentFolder`` recursively, prints each directory as it is
    scanned, and hashes every file with ``hash_image2``.

    Returns a dict in the format ``{hash: [names]}``. The names are bare file
    names (without the directory), listed in the order they were found.
    """
    names_by_hash = {}
    for folder, _subfolders, names in os.walk(parentFolder):
        print('Scanning %s...' % folder)
        for name in names:
            digest = hash_image2(os.path.join(folder, name))
            # Start a new group for an unseen hash, then record this file in it.
            names_by_hash.setdefault(digest, []).append(name)
    return names_by_hash

def hash_image2(image_path):
    """Return an average-hash integer for the image stored at ``image_path``.

    The image is resized to 8x8 pixels with Lanczos resampling and converted
    to mode "L". Bit ``i`` of the result is set when pixel ``i`` (in
    ``getdata()`` order) is strictly brighter than the mean of all pixels, so
    the hash has at most 64 bits.
    """
    # Open inside a context manager so the file handle is released as soon as
    # the 8x8 thumbnail exists, rather than whenever the object is collected.
    with Image.open(image_path) as source:
        img = source.resize((8, 8), Image.LANCZOS).convert(mode="L")
    mean = ImageStat.Stat(img).mean[0]
    return sum((1 if p > mean else 0) << i for i, p in enumerate(img.getdata()))

In [7]:
# Hash every file in the training folder; dupdict maps hash -> list of file names.
train_image_dir = "The Picnic Hackathon 2019/train/"
dupdict = findDup(train_image_dir)

Scanning The Picnic Hackathon 2019/train/...


  ' expressed in bytes should be converted ' +


In [8]:
# Number of distinct hashes found among the scanned files.
unique_hash_count = len(dupdict)
print(unique_hash_count)

7208


In [9]:
# For every hash keep the first file name and mark all the others for removal;
# a group with a single file contributes nothing because its tail is empty.
to_drop = [
    duplicate_name
    for same_hash_names in dupdict.values()
    for duplicate_name in same_hash_names[1:]
]

is_duplicate = df_filtered["file"].isin(to_drop)
df_filtered = df_filtered[~is_duplicate]

### Creating the trainer, adding labels and adding training data

In [26]:
from azure.cognitiveservices.vision.customvision.training import CustomVisionTrainingClient
from azure.cognitiveservices.vision.customvision.training.models import ImageFileCreateEntry

ENDPOINT = "https://westeurope.api.cognitive.microsoft.com"


def _read_secret(path):
    """Return the text stored in ``path`` with surrounding whitespace removed.

    Key files usually end with a newline; stripping it keeps the newline out
    of the credential. The file is closed before returning.
    """
    with open(path, "r") as secret_file:
        return secret_file.read().strip()


# Replace with a valid key
training_key = _read_secret("../training_key.txt") # change this line to use your own training key
prediction_key = _read_secret("../prediction_key.txt") # change this line to use your own prediction key
prediction_resource_id = _read_secret("../prediction_resource_id.txt")

publish_iteration_name = "classifyModel"

trainer = CustomVisionTrainingClient(training_key, endpoint=ENDPOINT)

# Index the domains offered by the service by name so "Food" can be looked up below.
domains = {domain.name: domain for domain in trainer.get_domains()}

# Create a new project
print ("Creating project...")
project = trainer.create_project("My Picnic Classifier 2", domain_id=domains["Food"].id, classification_type="Multiclass")

Creating project...


In [27]:
# Make tags in the new project: one tag per label, created in this order.
tag_names = [
    "Asparagus, string beans & brussels sprouts",
    "Bananas, apples & pears",
    "Bell peppers, zucchinis & eggplants",
    "Berries & cherries",
    "Broccoli, cauliflowers, carrots & radish",
    "Cheese",
    "Citrus fruits",
    "Cucumber, tomatoes & avocados",
    "Eggs",
    "Fish",
    "Fresh bread",
    "Fresh herbs",
    "Kiwis, grapes & mango",
    "Lunch & Deli Meats",
    "Milk",
    "Minced meat & meatballs",
    "Nectarines, peaches & apricots",
    "Onions, leek, garlic & beets",
    "Pineapples, melons & passion fruit",
    "Pork, beef & lamb",
    "Potatoes",
    "Poultry",
    "Pre-baked breads",
    "Pudding, yogurt & quark",
    "Salad & cress",
]

# Bind each created tag to its short variable name; the order matches tag_names.
(asp, ban, bel, ber, bro, che, cit, cuc, egg, fis, frb, frh, kiw,
 lun, mil, mim, nec, oni, pin, por, pot, pou, pre, pud, sal) = [
    trainer.create_tag(project.id, tag_name) for tag_name in tag_names
]

In [28]:
# All created tags, in the order they were created.
labels = [
    asp, ban, bel, ber, bro,
    che, cit, cuc, egg, fis,
    frb, frh, kiw, lun, mil,
    mim, nec, oni, pin, por,
    pot, pou, pre, pud, sal,
]

In [29]:
base_image_url = "The Picnic Hackathon 2019/train/"

print("Adding images...")

# Look tags up by name once instead of scanning the whole `labels` list for
# every row. NOTE(review): assumes tag names in `labels` are unique.
tag_by_name = {tag.name: tag for tag in labels}

image_list = []
skipped_count = 0

for file_name, label_name in zip(df_filtered["file"], df_filtered["label"]):
    tag = tag_by_name.get(label_name)
    if tag is None:
        # No tag matches this label; skip the row as before, but keep count so
        # the omission is visible.
        skipped_count += 1
        continue
    with open(base_image_url + file_name, "rb") as image_contents:
        image_list.append(ImageFileCreateEntry(name=file_name, contents=image_contents.read(), tag_ids=[tag.id]))

if skipped_count:
    print("Skipped %d images whose label has no matching tag." % skipped_count)

Adding images...


According to the documentation of the API that uploads images:

        This API accepts a batch of files, and optionally tags, to create
        images. There is a limit of 64 images and 20 tags.
        
Since we have far more than 64 images (even a small class such as Fish contains 93 images), we need to manually create batches.

In [30]:
import math

BATCH_SIZE = 64  # maximum number of images accepted per upload call

# create mini-batches and add them to a list
batches = [
    image_list[start:start + BATCH_SIZE]
    for start in range(0, len(image_list), BATCH_SIZE)
]

for batch in batches:
    upload_result = trainer.create_images_from_files(project.id, images=batch)
    if not upload_result.is_batch_successful:
        print("Image batch upload failed.")
        for result in upload_result.images:
            print("Image status: ", result.status, result.image.created)
        # Raise instead of calling exit(): the cell stops with a clear error
        # and the kernel (with everything loaded so far) stays alive.
        raise RuntimeError("Image batch upload failed.")

In [32]:
import time

POLL_INTERVAL_SECONDS = 10

print ("Training...")
iteration = trainer.train_project(project.id)
while iteration.status != "Completed":
    # Stop on a failed run instead of polling forever.
    # NOTE(review): assumes the service reports the status string "Failed" for
    # an unsuccessful training run - confirm against the API documentation.
    if iteration.status == "Failed":
        raise RuntimeError("Training failed for iteration %s" % iteration.id)
    # Wait before each poll, so no time is wasted sleeping after completion.
    time.sleep(POLL_INTERVAL_SECONDS)
    iteration = trainer.get_iteration(project.id, iteration.id)
    print ("Training status: " + iteration.status)

# The iteration is now trained. Publish it to the project endpoint
trainer.publish_iteration(project.id, iteration.id, publish_iteration_name, prediction_resource_id)
print ("Done!")

Training...
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training
Training status: Training


### Using the published model to classify test data

In [33]:
# Read the tsv containing the test submissions (tab-separated, first row is the header):
test_tsv_path = 'The Picnic Hackathon 2019/test.tsv'
df_test = pd.read_csv(test_tsv_path, sep='\t', header=0)

In [34]:
print("Checking images...")

test_image_dir = "The Picnic Hackathon 2019/test/"
MAX_TEST_IMAGE_BYTES = 4 * 1024 * 1024  # 4194304 bytes, i.e. 4MB

# Collect the test files that are larger than 4MB (the threshold used here is
# 4MB, not the 6MB used for the training images).
large_files = [
    file_name
    for file_name in df_test["file"]
    if os.stat(test_image_dir + file_name).st_size > MAX_TEST_IMAGE_BYTES
]

large_files

Checking images...


[]

In [35]:
# Re-save every oversized test image in place with stronger compression.
# NOTE(review): `quality` is presumably only honoured for JPEG output, so PNG
# files may not shrink much - confirm against the Pillow documentation.
test_image_dir = "The Picnic Hackathon 2019/test/"
for large_name in large_files:
    large_path = test_image_dir + large_name
    Image.open(large_path).save(large_path, quality=20, optimize=True)

In [36]:
def classify_file(row):
    """Classify one test image and return the name of its most probable tag.

    ``row`` must provide the image file name under the key "file"; the file is
    read from the test folder and sent to the published iteration through the
    module-level ``predictor``.

    The most probable prediction is chosen explicitly by ``probability``
    instead of trusting the order of the returned list. Raises ``ValueError``
    when the service returns no predictions.
    """
    with open("The Picnic Hackathon 2019/test/" + row["file"], "rb") as image_contents:
        results = predictor.classify_image(project.id, publish_iteration_name, image_contents.read())
    best_prediction = max(results.predictions, key=lambda prediction: prediction.probability)
    return best_prediction.tag_name

In [37]:
from azure.cognitiveservices.vision.customvision.prediction import CustomVisionPredictionClient

# The trained iteration is published, so a prediction client can now query it.
predictor = CustomVisionPredictionClient(prediction_key, endpoint=ENDPOINT)

# Classify the test images row by row and store the returned tag names as the label column.
predicted_labels = df_test.apply(classify_file, axis=1)
df_test["label"] = predicted_labels

In [38]:
# Preview the first predictions instead of rendering the whole frame.
df_test.head(10)

Unnamed: 0,file,label
0,7263.jpeg,"Bell peppers, zucchinis & eggplants"
1,7264.jpeg,Eggs
2,7265.jpeg,"Broccoli, cauliflowers, carrots & radish"
3,7266.png,Lunch & Deli Meats
4,7267.jpeg,Potatoes
5,7268.png,"Bananas, apples & pears"
6,7269.jpeg,"Broccoli, cauliflowers, carrots & radish"
7,7270.png,Fresh herbs
8,7271.png,Eggs
9,7272.jpeg,Berries & cherries


In [39]:
# Write the predictions as a tab-separated submission file (index column included).
submission_path = "11-04-19_2.tsv"
df_test.to_csv(submission_path, sep="\t")