In [7]:
# Will use pre-trained MXNet model for detecting cats.
# See https://gluon-cv.mxnet.io/build/examples_detection/demo_yolo.html#sphx-glr-build-examples-detection-demo-yolo-py
from gluoncv import model_zoo, data, utils
import mxnet
import numpy as np

# Utils used to extract images from tars
import tarfile
import os

# Used to display progress message
from tqdm import tqdm_notebook

# Used to read dataset
import pandas
import pyarrow.parquet as parquet

# Where the downloaded data are
input_path = '/home/adeykin/projects/SNAHackaton/'
# Where to store results
output_path = '/home/adeykin/projects/SNAHackaton/'

In [3]:
#ctx = mxnet.gpu(0)
ctx = mxnet.cpu(0)

In [4]:
# Using the fastest model for detection
net = model_zoo.get_model('yolo3_darknet53_coco', pretrained=True, ctx=ctx)

In [5]:
def extractFeatures(inputFile, outputFile, batchSize = 37*2):

    # Batches speedup inference significantly
    #numImages = 352758
    numImages = 0
    with tarfile.open(input_path + inputFile) as thumbnails:
        numImages = sum(1 for member in thumbnails if member.isreg())
    print('numImages = ' + str(numImages))

    path = './tmp/'
    with mxnet.Context(mxnet.gpu(0)):
        # Iterate over images in tar, etract, predict, persist results
        with tarfile.open(input_path + inputFile) as thumbnails:
            with open(output_path + outputFile, 'w') as out:
                for batch in tqdm_notebook(range(int(numImages / batchSize))):
                    # Accumulate batch results (mxnet has lazy computation model - 
                    # actuall computation occure when you access the results)
                    classes = list()
                    scores = list()
                    coords = list()
                    names = list()
                    sizes = list()
                    for i in range(batchSize):

                        # Extract the image
                        image = thumbnails.next()
                        imagePath = path + image.name
                        thumbnails.extract(image, path)

                        # Load and pre-process
                        x, _ = data.transforms.presets.yolo.load_test(imagePath, short=120)

                        # Predict
                        class_IDs, scores_v, coords_v = net(x.as_in_context(ctx))

                        # Accumulate
                        names.append(image.name)
                        classes.append(class_IDs)
                        scores.append(scores_v)
                        coords.append(coords_v)
                        sizes.append((x.shape[2],x.shape[3]))

                        # Cleanup
                        os.remove(path + image.name)

                    # P rocess accumulated batch results
                    for class_IDs, scores, name, coords, size in zip(classes,scores,names,coords,sizes):
                        resultList = []
                        resultList.append(name[0:32])
                        resultList.append(str(size[0]))
                        resultList.append(str(size[1]))
                        classIDsArr = class_IDs.flatten()[0].asnumpy()
                        scoresArr = scores.flatten()[0].asnumpy()
                        coordsArr = coords.flatten()[0].asnumpy().reshape((-1,4))
                        indexes = classIDsArr >= 0
                        resultList.append( ','.join(classIDsArr[indexes].astype('int32').astype('str')) )
                        resultList.append( ','.join(scoresArr[indexes].astype('str')) )
                        resultList.append( ','.join(coordsArr[indexes,:].flatten().astype('str')) )

                        out.write( ','.join(resultList) + '\n')

In [15]:
extractFeatures('/imagesThumbnails/testThumbnails.tar', '/testFeatures.csv', 1024)

HBox(children=(IntProgress(value=0, max=344), HTML(value='')))

In [16]:
extractFeatures('/imagesThumbnails/trainThumbnails.tar', '/trainFeatures.csv', 1024)

HBox(children=(IntProgress(value=0, max=344), HTML(value='')))

In [6]:
extractFeatures('/imagesThumbnails/trainThumbnails.tar', '/trainFeatures2.csv', 1024)

numImages = 1629380


HBox(children=(IntProgress(value=0, max=1591), HTML(value='')))




In [9]:
extractFeatures('/imagesThumbnails/tmp.tar', '/testFeatures_append.csv', 1)

numImages = 249


HBox(children=(IntProgress(value=0, max=249), HTML(value='')))


