# Celebrity Quote Analysis with The Cognitive Services on Spark

<img src="https://mmlspark.blob.core.windows.net/graphics/SparkSummit2/cog_services.png" width="800" style="float: center;"/>

In [1]:
from mmlspark.cognitive import *
from pyspark.ml import PipelineModel
from pyspark.sql.functions import col, udf
from pyspark.ml.feature import SQLTransformer
import os

# get api key from AzureKeyVault linked service: https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/microsoft-spark-utilities?pivots=programming-language-python
# Every "<...>" value below is a placeholder that must be replaced before running.
# TEXT_API_KEY is passed to the TextSentiment stage.
TEXT_API_KEY = mssparkutils.credentials.getSecret("<akv-service-name>", "<akv-secret-name>", "<linked-service-name>")
# VISION_API_KEY is passed to both RecognizeDomainSpecificContent and RecognizeText.
VISION_API_KEY = mssparkutils.credentials.getSecret("<akv-service-name>", "<akv-secret-name>", "<linked-service-name>")
# BING_IMAGE_SEARCH_KEY is passed to the BingImageSearch stage.
BING_IMAGE_SEARCH_KEY = mssparkutils.credentials.getSecret("<akv-service-name>", "<akv-secret-name>", "<linked-service-name>")

# Locations handed to setLocation() on the text and vision stages respectively.
TEXT_API_LOCATION = "<cognitive-service-location>"
VISION_API_LOCATION = "<cognitive-service-location>"

StatementMeta(SamplePool, 31, 1, Finished, Available)



### Extracting celebrity quote images using Bing Image Search on Spark

Here we define two Transformers to extract celebrity quote images.

<img src="https://mmlspark.blob.core.windows.net/graphics/Cog%20Service%20NB/step%201.png" width="600" style="float: center;"/>

In [2]:
# The number of images Bing will return for each query.
imgsPerBatch = 2
# One single-field tuple per results page; each value is the offset used to page into the search results.
offsets = [(page * imgsPerBatch,) for page in range(5)]
bingParameters = spark.createDataFrame(offsets, ["offset"])

# Image search stage: queries "celebrity quotes", reading the paging offset from the "offset" column.
bingSearch = (
    BingImageSearch()
    .setSubscriptionKey(BING_IMAGE_SEARCH_KEY)
    .setOffsetCol("offset")
    .setQuery("celebrity quotes")
    .setCount(imgsPerBatch)
    .setOutputCol("images")
)

# Transformer that extracts and flattens the richly structured output of Bing Image Search into a simple URL column.
getUrls = BingImageSearch.getUrlTransformer("images", "url")

StatementMeta(SamplePool, 31, 2, Finished, Available)



### Recognizing Images of Celebrities
This block identifies the names of the celebrities in each of the images returned by the Bing Image Search.

<img src="https://mmlspark.blob.core.windows.net/graphics/Cog%20Service%20NB/step%202.png" width="600" style="float: center;"/>

In [3]:
# Celebrity recognition stage: applies the "celebrities" model to the image at each "url" value.
celebs = (
    RecognizeDomainSpecificContent()
    .setSubscriptionKey(VISION_API_KEY)
    .setModel("celebrities")
    .setLocation(VISION_API_LOCATION)
    .setImageUrlCol("url")
    .setOutputCol("celebs")
)

# Keep all existing columns and add the name of the first celebrity found in the structured response.
firstCeleb = SQLTransformer(
    statement="SELECT *, celebs.result.celebrities[0].name as firstCeleb FROM __THIS__"
)

StatementMeta(SamplePool, 31, 3, Finished, Available)



### Reading the quote from the image.
This stage performs OCR on the images to recognize the quotes.

<img src="https://mmlspark.blob.core.windows.net/graphics/Cog%20Service%20NB/step%203.png" width="600" style="float: center;"/>

In [4]:
from mmlspark.stages import UDFTransformer 

# OCR stage: reads text from the image at each "url" value in "Printed" mode and
# writes the structured response to the "ocr" column (concurrency set to 5).
recognizeText = (
    RecognizeText()
    .setSubscriptionKey(VISION_API_KEY)
    .setLocation(VISION_API_LOCATION)
    .setImageUrlCol("url")
    .setMode("Printed")
    .setOutputCol("ocr")
    .setConcurrency(5)
)

def getTextFunction(ocrRow):
    """Flatten a structured OCR result into one newline-separated string.

    Args:
        ocrRow: Object exposing ``recognitionResult.lines``, where every line
            has a ``text`` attribute, or ``None`` when there is no OCR output.

    Returns:
        The text of every recognized line joined with ``"\\n"``, or ``None``
        when ``ocrRow``, its ``recognitionResult``, or the ``lines`` collection
        is missing.
    """
    if ocrRow is None:
        return None
    result = ocrRow.recognitionResult
    # A response can arrive without a recognition result (presumably when the
    # service could not read the image -- TODO confirm); return null for that
    # row rather than raising inside the UDF and failing the whole job.
    if result is None or result.lines is None:
        return None
    return "\n".join(line.text for line in result.lines)

# This transformer will extract a simpler string from the structured output of
# recognize text: it applies getTextFunction to the "ocr" column and writes "text".
getText = (
    UDFTransformer()
    .setUDF(udf(getTextFunction))
    .setInputCol("ocr")
    .setOutputCol("text")
)


StatementMeta(SamplePool, 31, 4, Finished, Available)



### Understanding the Sentiment of the Quote

<img src="https://mmlspark.blob.core.windows.net/graphics/Cog%20Service%20NB/step4.jpg" width="600" style="float: center;"/>

In [5]:
# Sentiment stage: analyzes the "text" column and writes the response to "sentiment".
sentimentTransformer = (
    TextSentiment()
    .setTextCol("text")
    .setLocation(TEXT_API_LOCATION)
    .setSubscriptionKey(TEXT_API_KEY)
    .setOutputCol("sentiment")
)

# Extract the sentiment label of the first element in the API response body.
getSentiment = SQLTransformer(
    statement="SELECT *, sentiment[0].sentiment as sentimentLabel FROM __THIS__"
)

StatementMeta(SamplePool, 31, 5, Finished, Available)



### Tying it all together

Now that we have built the stages of our pipeline, it's time to chain them together into a single model that can be used to process batches of incoming data.

<img src="https://mmlspark.blob.core.windows.net/graphics/Cog%20Service%20NB/full_pipe_2.jpg" width="800" style="float: center;"/>

In [6]:
from mmlspark.stages import SelectColumns
# Select the final columns.
cleanupColumns = SelectColumns().setCols(
    ["url", "firstCeleb", "text", "sentimentLabel"]
)

# Chain every stage, in execution order, into a single pipeline model.
celebrityQuoteAnalysis = PipelineModel(
    stages=[
        bingSearch,
        getUrls,
        celebs,
        firstCeleb,
        recognizeText,
        getText,
        sentimentTransformer,
        getSentiment,
        cleanupColumns,
    ]
)

# Run the pipeline over the search offsets and print up to 5 result rows.
celebrityQuoteAnalysis.transform(bingParameters).show(5)

StatementMeta(SamplePool, 31, 6, Finished, Available)

+---+----------+----+--------------+
|url|firstCeleb|text|sentimentLabel|
+---+----------+----+--------------+
+---+----------+----+--------------+

In [7]:
# Stop the Spark session now that the pipeline has run; cells that use `spark`
# after this point will presumably need a fresh session -- verify for your environment.
spark.stop()

StatementMeta(SamplePool, 31, 7, Finished, Available)

