# Image Recognition using SynapseML

---

S.Yu. Papulin (papulin_bmstu@mail.ru)

Load NumPy, Matplotlib, and scikit-learn packages:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

Set up the Spark environment and load Spark-related packages:

In [None]:
import os
import sys

# Tell PySpark where the Spark distribution and the Python interpreter live
os.environ.update({
    "SPARK_HOME": "/home/ubuntu/BigData/spark",
    "PYSPARK_PYTHON": "/home/ubuntu/ML/anaconda3/bin/python",
    "PYSPARK_DRIVER_PYTHON": "/home/ubuntu/ML/anaconda3/bin/python",
})

# Put the py4j archive and Spark's Python sources at the front of sys.path
# (py4j first, then the Spark "python" directory)
spark_home = os.environ.get("SPARK_HOME")
sys.path[:0] = [
    os.path.join(spark_home, "python/lib/py4j-0.10.7-src.zip"),
    os.path.join(spark_home, "python"),
]

In [None]:
import pyspark
from pyspark.sql import SparkSession

import pyspark.sql.functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import (
    StructType, StructField, StringType,
    ArrayType, FloatType, IntegerType
)

## Loading Dataset

In [None]:
def load_dataset(url=None, local_path=None):
    """Download (if needed) the CIFAR-10 archive and load its test batch.

    The archive is downloaded only when `local_path` is not an existing
    file. The download goes to a temporary "<local_path>.part" file that is
    renamed on success, so an interrupted download is never mistaken for a
    valid cached archive.

    Parameters
    ----------
    url : str, optional
        Location of the "cifar-10-python.tar.gz" archive. Defaults to the
        University of Toronto URL.
    local_path : str, optional
        Where the archive is (or will be) stored locally. Defaults to a
        placeholder path that has to be adapted to the local machine.

    Returns
    -------
    tuple
        `(test_batch, meta_info)`: the objects unpickled from the archive
        members "cifar-10-batches-py/test_batch" and
        "cifar-10-batches-py/batches.meta".

    Raises
    ------
    KeyError
        If one of the expected members is missing from the archive.
    ValueError
        If one of the expected members is not a regular file.
    """

    import os
    import urllib.request
    import tarfile
    import pickle

    CIFAR_URL = "http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
    CIFAR_LOCAL_PATH = "/YOUR_PATH/data/cifar-106/cifar-10-python.tar.gz"

    def show_download_progress():
        """Build a `urlretrieve` report hook printing "received/total"."""
        def _show_progress(num, size, total):
            # urlretrieve calls the hook once with num == 0 before any data
            # is read, then once per block, so num * size is the amount
            # received so far (the last block may be shorter than `size`).
            received = num * size
            if total > 0:
                received = min(received, total)
            print("{}/{}".format(received, total), end="\r")
        return _show_progress

    def _load_member(tar, member_name):
        """Unpickle a single regular-file member of an open tar archive."""
        member = tar.extractfile(member_name)
        if member is None:
            raise ValueError(
                "{} is not a regular file in the archive".format(member_name))
        with member:
            # SECURITY: unpickling executes arbitrary code from the archive.
            # Only use archives obtained from a trusted source.
            return pickle.load(member, encoding="latin1")

    url = CIFAR_URL if url is None else url
    local_path = CIFAR_LOCAL_PATH if local_path is None else local_path

    # Download the dataset archive only if it is not available locally
    if not os.path.isfile(local_path):
        target_dir = os.path.dirname(local_path)
        if target_dir:
            # The directory may already exist (e.g. after a failed download)
            os.makedirs(target_dir, exist_ok=True)
        partial_path = local_path + ".part"
        urllib.request.urlretrieve(url, partial_path,
                                   reporthook=show_download_progress())
        os.replace(partial_path, local_path)

    # Unpack the test set and the metadata
    with tarfile.open(local_path, "r:gz") as tar:
        test_batch = _load_member(tar, "cifar-10-batches-py/test_batch")
        meta_info = _load_member(tar, "cifar-10-batches-py/batches.meta")

    return test_batch, meta_info

In [None]:
# Load the CIFAR-10 test batch and the dataset metadata
test_batch, meta_info = load_dataset()

In [None]:
# Display the dataset metadata (unpickled from "batches.meta")
meta_info

In [None]:
# Class label names stored in the metadata
meta_info["label_names"]

In [None]:
# Print every field of the test-set dictionary together with its contents
for field, contents in test_batch.items():
    print(field, contents)

In [None]:
# Pixel array of the first image in the test set
test_batch["data"][0]

In [None]:
# Shape of a single image array
test_batch["data"][0].shape

In [None]:
# Shape of the whole test-set data array
test_batch["data"].shape

Display random sample images for each class:

In [None]:
# Number of sample images to draw for each class
NUM_DISPLAY_IMAGES = 10
# Number of classes, taken from the list of label names in the metadata
NUM_CLASSES = len(meta_info["label_names"])

In [None]:
# Grid of random samples: one row per class, NUM_DISPLAY_IMAGES columns
plt.figure(figsize=[16, 2*NUM_CLASSES])

labels = np.array(test_batch["labels"])

for class_id, class_name in enumerate(meta_info["label_names"]):
    # Draw distinct random images that belong to the current class
    candidates = np.where(labels == class_id)[0]
    sampled = np.random.choice(candidates, NUM_DISPLAY_IMAGES, replace=False)
    for col, image_idx in enumerate(sampled):
        # Flat row -> (3, 32, 32) -> (32, 32, 3), the layout passed to imshow
        flat_pixels = test_batch["data"][image_idx]
        picture = flat_pixels.reshape(3, 32, 32).transpose(1, 2, 0).astype("uint8")
        plt.subplot(NUM_CLASSES, NUM_DISPLAY_IMAGES,
                    class_id * NUM_DISPLAY_IMAGES + col + 1)
        plt.title("{}".format(class_name))
        plt.imshow(picture)
        plt.axis("off")

## Starting Spark Session

In [None]:
# Spark configuration: local master plus the MMLSpark package and the
# Maven repository it is resolved from
conf = pyspark.SparkConf()
conf.setMaster("local[*]")
conf.set("spark.jars.packages", "com.microsoft.ml.spark:mmlspark_2.11:0.18.0")
conf.set("spark.jars.repositories", "https://mmlspark.azureedge.net/maven")

In [None]:
# Create (or reuse) the Spark session for this notebook and display it
spark = (
    SparkSession.builder
    .appName("imageRecognition")
    .config(conf=conf)
    .getOrCreate()
)
spark

In [None]:
# Load SynapseML-related packages (the library is published as `mmlspark` in this version)
from mmlspark.cntk import CNTKModel
from mmlspark.downloader import ModelDownloader

## Creating Image DataFrame

In [None]:
# DataFrame schema: integer label, image as an array of floats, file name
schema = (
    StructType()
    .add("label", IntegerType(), True)
    .add("image", ArrayType(FloatType()), True)
    .add("filename", StringType(), True)
)

In [None]:
def convert2float(row):
    """Convert the image of a (label, image, filename) row to a list of floats.

    The first and third elements are passed through unchanged; the second
    one is cast to float and turned into a plain Python list.
    """
    label, filename = row[0], row[2]
    pixels = row[1].astype("float").tolist()
    return label, pixels, filename


# Pair every label with its pixel row and file name
images = zip(test_batch["labels"], test_batch["data"], test_batch["filenames"])

# Distribute the rows, convert the pixels to floats and apply the schema
rdd_images = spark.sparkContext.parallelize(images).map(convert2float)
df_images = rdd_images.toDF(schema)

# Cache the DataFrame and materialize it by counting its rows
df_images.persist().count()

In [None]:
# Preview the first 5 rows of the image DataFrame
df_images.show(5)

In [None]:
# Print the schema of the image DataFrame
df_images.printSchema()

## Loading Model

In [None]:
# Name of the pre-trained model requested from ModelDownloader
model_name = "ConvNet"
# Directory URI handed to ModelDownloader
# (presumably where downloaded models are stored — confirm)
model_dir = "file:///tmp/models/"

In [None]:
# Fetch the model by name; the returned object exposes the model `uri`
downloader = ModelDownloader(spark, model_dir)
model = downloader.downloadByName(model_name)

In [None]:
# URI of the downloaded model (used as the CNTK model location)
model.uri

## Recognition

In [None]:
# CNTK transformer: reads the "image" column, writes the values of the
# "z" output node to the "output" column
cntk_model = (
    CNTKModel()
    .setInputCol("image")
    .setOutputCol("output")
    .setModelLocation(model.uri)
    .setOutputNode("z")
)

In [None]:
# Score every image with the CNTK model, cache the result and count the rows
df_images__predict_proba = cntk_model.transform(df_images)
df_images__predict_proba.persist()
num_images = df_images__predict_proba.count()
num_images

In [None]:
# Preview the first 5 rows, including the model "output" column
df_images__predict_proba.show(5)

In [None]:
@udf(IntegerType())
def predict(proba):
    """Return the position of the largest value in `proba` as an int.

    NOTE(review): relies on `proba` exposing a NumPy-style `argmax()`;
    presumably the CNTK model output column provides this — confirm.
    """
    return int(proba.argmax())

In [None]:
# Keep only the predicted class (derived from "output") and the true label
df_images__predict = df_images__predict_proba.select(
    predict("output").alias("prediction"),
    "label",
)

df_images__predict.show(5)

## Evaluation

In [None]:
# Accuracy: share of images whose prediction equals the true label
is_correct = F.col("prediction") == F.col("label")
correct_count = df_images__predict.filter(is_correct).count()

correct_count / num_images

In [None]:
# Collect the predictions on the driver as a pandas DataFrame
images__predict = df_images__predict.toPandas()
# True labels and predicted labels as separate columns
y = images__predict["label"]
y_hat = images__predict["prediction"]

In [None]:
# Class names are used as tick labels; tick positions are the class ids
labels = meta_info["label_names"]
tick_marks = np.arange(len(labels))

# Pin the class order explicitly so the matrix always has one row/column per
# class, even if some class never occurs in `y` or `y_hat`; otherwise the
# fixed tick positions below could be misaligned with the matrix cells.
cm = confusion_matrix(y, y_hat, labels=tick_marks)

plt.figure(figsize=[8,8])
plt.imshow(cm, cmap=plt.cm.Blues)
plt.colorbar()
plt.xticks(tick_marks, labels, rotation=90)
plt.yticks(tick_marks, labels)
plt.xlabel("Predicted label")
plt.ylabel("True Label")

plt.show()

## References

- [Model ConvNet CIFAR10](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/Classification/ConvNet/BrainScript/ConvNet_CIFAR10.cntk)
- [Ingesting CIFAR Images into Spark DataFrames and Evaluating Pre-Trained CNTK Models](https://github.com/microsoft/SynapseML/blob/v0.18.0/notebooks/samples/DeepLearning%20-%20CIFAR10%20Convolutional%20Network.ipynb)
- [Ingesting CIFAR Images into Spark DataFrames and Evaluating Pre-Trained CNTK Models](https://notebook.community/rastala/mmlspark/notebooks/samples/301%20-%20CIFAR10%20CNTK%20CNN%20Evaluation)