# Text classification on Spark with MMLSpark

This notebook shows how to make a text classfication web service using MML Spark serving and deploy it to a Sparl cluster.

## Get data

In [1]:
# get the text data from the github repo and unzip it
from fit_and_store_pipeline import unzip_file_here
import urllib
import os

if not os.path.isfile('./text_data/attack_data.csv'):
    if not os.path.isfile('./text_data.zip'): 
        urllib.request.urlretrieve('https://activelearning.blob.core.windows.net/activelearningdemo/text_data.zip', 'text_data.zip')
    unzip_file_here('text_data.zip')

if not os.path.isfile('miniglove_6B_50d_w2v.txt'):
    unzip_file_here('miniglove_6B_50d_w2v.zip')
    
print('Data files here')

Data files here


In [22]:
# make a train-test data pair

from fit_and_store_pipeline import create_train_test_split

# requires training_set_01.csv and test_set_01.csv to be present
training_data, test_data = create_train_test_split()

In [49]:
# put data in the spark format

train_sdf = spark.createDataFrame(training_data)
train_sdf = train_sdf\
            .withColumn("label", train_sdf["is_attack"].cast('integer'))\
            .select(["comment", "label"])
                                                             
test_sdf = spark.createDataFrame(test_data)
test_sdf = test_sdf\
            .withColumn("label", test_sdf["is_attack"].cast('integer'))\
            .select(["comment", "label"])

# What have we?
# train_sdf.limit(10).toPandas()
# train_sdf.groupBy("label").count().toPandas()

In [58]:
# make an ML-Lib pipeline involving preprocessor and vectorizer

from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, Word2Vec
from pyspark.ml.classification import RandomForestClassifier

# comment is the text field
tokenizer = Tokenizer(inputCol="comment", outputCol="words")
partitions = train_sdf.rdd.getNumPartitions()
word2vec = Word2Vec(maxIter=4, seed=44, inputCol="words", outputCol="features",
                    numPartitions=partitions)
rfc = RandomForestClassifier(labelCol="label")
textClassifier = Pipeline(stages = [tokenizer, word2vec, rfc]).fit(train_sdf)

In [61]:
# if you are going to try a couple different models, pre-featurize first
textFeaturizer = Pipeline(stages = [tokenizer, word2vec]).fit(train_sdf)
ptrain = textFeaturizer.transform(train_sdf).select(["label", "features"])
ptest = textFeaturizer.transform(test_sdf).select(["label", "features"])
ptrain.limit(5).toPandas()

Unnamed: 0,label,features
0,0,"[-0.197263322771, 0.021295234561, 0.0276349233..."
1,0,"[-0.240008686802, 0.117446979774, -0.058907808..."
2,0,"[-0.274001607099, 0.127858318913, -0.087485656..."
3,0,"[-0.166309199855, 0.141414361075, -0.047271387..."
4,0,"[-0.270946531485, 0.090287042728, -0.037509560..."


In [62]:
# test prediction on some new data
import pandas as pd

test_attacks = ['You are scum.', 'I like your shoes.', 'You are pxzx.', 
             'Your mother was a hamster and your father smelt of elderberries',
             'One bag of hagfish slime, please']

ta_sdf = spark.createDataFrame(pd.DataFrame({"comment" : test_attacks}))

prediction = textClassifier.transform(ta_sdf)
prediction.toPandas()

Unnamed: 0,comment,words,features,rawPrediction,probability,prediction
0,You are scum.,"[you, are, scum.]","[-0.421156624953, 0.194889614979, -0.121429090...","[9.5761742881, 10.4238257119]","[0.478808714405, 0.521191285595]",1.0
1,I like your shoes.,"[i, like, your, shoes.]","[-0.217787927017, 0.201306108385, 0.0849744305...","[11.6430517863, 8.35694821373]","[0.582152589314, 0.417847410686]",0.0
2,You are pxzx.,"[you, are, pxzx.]","[-0.421156624953, 0.194889614979, -0.121429090...","[9.5761742881, 10.4238257119]","[0.478808714405, 0.521191285595]",1.0
3,Your mother was a hamster and your father smel...,"[your, mother, was, a, hamster, and, your, fat...","[-0.299377459017, 0.195304373449, -0.186904550...","[9.65799917433, 10.3420008257]","[0.482899958716, 0.517100041284]",1.0
4,"One bag of hagfish slime, please","[one, bag, of, hagfish, slime,, please]","[-0.28979960084, 0.345299651225, -0.2011535658...","[11.4189348584, 8.5810651416]","[0.57094674292, 0.42905325708]",0.0


In [63]:
# test prediction on the larger test set

scored_test = textClassifier.transform(test_sdf)
scored_test.groupBy(["label", "prediction"]).count()\
            .toPandas().pivot(index="label", columns="prediction")

Unnamed: 0_level_0,count,count
prediction,0.0,1.0
label,Unnamed: 1_level_2,Unnamed: 2_level_2
0,818,31
1,130,21


In [66]:
# why does this not work? Is it because the model is a pipeline?
from mmlspark import ComputeModelStatistics
metrics = ComputeModelStatistics().transform(scored_test)
metrics.limit(10).toPandas()

Py4JJavaError: An error occurred while calling o4523.transform.
: java.lang.Exception: Please score the model prior to evaluating
	at com.microsoft.ml.spark.metrics.MetricUtils$.getSchemaInfo(MetricUtils.scala:27)
	at com.microsoft.ml.spark.ComputeModelStatistics.transform(ComputeModelStatistics.scala:71)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)


## Deploy the model as a Spark Streaming job

In [64]:
# now deploy the trained classifier as a streaming job
# define the interface to be like the model's input

from pyspark.sql.functions import col, from_json
from pyspark.sql.types import *
import uuid

serving_inputs = spark.readStream.server() \
    .address("localhost", 9977, "text_api") \
    .load()\
    .withColumn("variables", from_json(col("value"), test_sdf.schema))\
    .select("id","variables.*")

# says to extract "variables" from the "value" field of json-encoded webservice input

In [67]:
serving_outputs = textClassifier.transform(serving_inputs) \
  .withColumn("prediction", col("prediction").cast("string"))

In [68]:
server = serving_outputs.writeStream \
    .server() \
    .option("name", "text_api") \
    .queryName("mml_text_query") \
    .option("replyCol", "prediction") \
    .option("checkpointLocation", "checkpoints-{}".format(uuid.uuid1())) \
    .start()

In [76]:
# if we want to change something above, we'll need
# to stop the active server

server.stop()


## Test web service

In [70]:
# inputs and outputs - schema
serving_inputs

DataFrame[id: bigint, comment: string, label: int]

In [71]:
serving_outputs

DataFrame[id: bigint, comment: string, label: int, words: array<string>, features: vector, rawPrediction: vector, probability: vector, prediction: string]

In [73]:
import requests
import json
import time

# calling the service
data = pd.DataFrame({ "comment" : test_attacks })

for instance in range(len(test_attacks)):    
    row_as_dict = data.to_dict('records')[instance]        
    r = requests.post(data=json.dumps(row_as_dict), url="http://localhost:9977/text_api")
    time.sleep(0.5)
    print("Response to : '{}' is {}".format(test_attacks[instance], r.text))

Response to : 'You are scum.' is 1.0
Response to : 'I like your shoes.' is 0.0
Response to : 'You are pxzx.' is 1.0
Response to : 'Your mother was a hamster and your father smelt of elderberries' is 1.0
Response to : 'One bag of hagfish slime, please' is 0.0
