In [None]:
# Deploying a scikit pipeline onto MML Spark
This notebook shows how to take your scikit pipeline and make a web service out of it using MML Spark Serving.

## Get data

In [1]:
# get the text data from the github repo and unzip it
from fit_and_store_pipeline import unzip_file_here
import urllib
import os

if not os.path.isfile('./text_data/attack_data.csv'):
    if not os.path.isfile('./text_data.zip'): 
        urllib.request.urlretrieve('https://activelearning.blob.core.windows.net/activelearningdemo/text_data.zip', 'text_data.zip')
    unzip_file_here('text_data.zip')

if not os.path.isfile('miniglove_6B_50d_w2v.txt'):
    unzip_file_here('miniglove_6B_50d_w2v.zip')
    
print('Data files here')

Data files here


## Doing it the scikit way

In [None]:
# install all the required dependencies
# !pip install gensim

In [66]:
# make the pipeline

import pandas as pd
import numpy as np
import gensim
import random
import gensim
from gensim.models import KeyedVectors
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from fit_and_store_pipeline import *

# text preprocessor
preprocessor = GensimPreprocessor()

# word2vec featurizer
w2v_file = 'miniglove_6B_50d_w2v.txt' # convert glove file to w2v format using gensim.scripts.glove2word2vec
word_vectors = KeyedVectors.load_word2vec_format(w2v_file, binary=False)
vectorizer = AvgWordVectorFeaturizer(word_vectors)

# classifier
classifier = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=1)

# assemble a scikit-learn pipeline
model_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('vectorizer', vectorizer),
        ('classifier', classifier),
    ])


In [67]:
# make a train-test data pair

from fit_and_store_pipeline import create_train_test_split

# requires training_set_01.csv and test_set_01.csv to be present
training_data, test_data = create_train_test_split()

In [68]:
# test - does the pipeline fit happily in a scikit fashion?
fitted_pipe = model_pipeline.fit(training_data.comment, [int(x) for x in training_data.is_attack])
fitted_pipe

Pipeline(memory=None,
     steps=[('preprocessor', GensimPreprocessor(newline_token=None)), ('vectorizer', <fit_and_store_pipeline.AvgWordVectorFeaturizer object at 0x7f0ffe92ff28>), ('classifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_...estimators=100, n_jobs=1,
            oob_score=True, random_state=1, verbose=0, warm_start=False))])

In [69]:
# test prediction from the scikit pipeline
test_attacks = ['You are scum.', 'I like your shoes.', 'You are pxzx.', 
             'Your mother was a hamster and your father smelt of elderberries',
             'One bag of hagfish slime, please']

fitted_pipe.predict(test_attacks)

array([1, 0, 0, 0, 1])

You could now deploy the native scikit pipeline using Azure ML Services. But today, we want to show the open-source way - MMLSpark. 

## Deploy the model as a Spark Streaming job

In [6]:
# turn the training data from a pandas.DataFrame into a spark.DataFrame
train_sdf = spark.createDataFrame(training_data)
train_sdf

DataFrame[comment: string, year: bigint, logged_in: boolean, ns: string, sample: string, split: string, count: bigint, avg_attack: double, is_attack: boolean]

In [8]:
# transforms - only pipeline
trans_pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('vectorizer', vectorizer)
    ])

trans_pipe.fit(training_data.comment, [int(x) for x in training_data.is_attack])
featurized_train = trans_pipe.transform(training_data.comment)
feature_names = [ 'F' + str(i) for i in range(featurized_train.shape[1]) ]

ft_df = pd.DataFrame(featurized_train, columns=feature_names);
ft_df['is_attack'] = [1 if x else 0 for x in training_data['is_attack']]

print(training_data.head())
print(ft_df.head())

featurized_train_sdf = spark.createDataFrame(ft_df)
featurized_train_sdf

                                                     comment  year  logged_in  \
rev_id                                                                          
416170166  NEWLINE_TOKENNEWLINE_TOKEN== Please stop? ==NE...  2011      False   
180046743  NEWLINE_TOKENfine I get threatened and he is a...  2007       True   
536943251  Lets see double standard of some one who tries...  2013       True   
93070973   my comments in on the arbitration request he f...  2006       True   
272666818  `NEWLINE_TOKENNEWLINE_TOKEN==You maybe interes...  2009       True   

                ns   sample  split  count  avg_attack  is_attack  
rev_id                                                            
416170166     user  blocked  train      8    0.125000      False  
180046743     user  blocked  train      9    0.222222      False  
536943251  article  blocked    dev     10    0.200000      False  
93070973      user  blocked   test      9    0.000000      False  
272666818     user   random  t

DataFrame[F0: double, F1: double, F2: double, F3: double, F4: double, F5: double, F6: double, F7: double, F8: double, F9: double, F10: double, F11: double, F12: double, F13: double, F14: double, F15: double, F16: double, F17: double, F18: double, F19: double, F20: double, F21: double, F22: double, F23: double, F24: double, F25: double, F26: double, F27: double, F28: double, F29: double, F30: double, F31: double, F32: double, F33: double, F34: double, F35: double, F36: double, F37: double, F38: double, F39: double, F40: double, F41: double, F42: double, F43: double, F44: double, F45: double, F46: double, F47: double, F48: double, F49: double, is_attack: bigint]

In [70]:
from mmlspark import TrainClassifier
from pyspark.ml.classification import RandomForestClassifier
model = TrainClassifier(model=RandomForestClassifier(), 
                        labelCol="is_attack", 
                        numFeatures=256).fit(featurized_train_sdf)

In [71]:
# test the model

# first, featurize the test data using the same pipeline
featurized_test = trans_pipe.transform(test_data.comment)
# use the same feature names we've had
# feature_names = [ 'F' + str(i) for i in range(featurized_test.shape[1]) ]

ftst_df = pd.DataFrame(featurized_test, columns=feature_names);
ftst_df['is_attack'] = [1 if x else 0 for x in test_data['is_attack']]

# second, make a prediction on the test set
ftst_sdf = spark.createDataFrame(ft_df)

prediction = model.transform(ftst_sdf)

In [72]:
# third, compute performance stats
from mmlspark import ComputeModelStatistics, TrainedClassifierModel
metrics = ComputeModelStatistics().transform(prediction)
metrics.limit(10).toPandas()

Unnamed: 0,evaluation_type,confusion_matrix,accuracy,precision,recall,AUC
0,Classification,"DenseMatrix([[ 276., 6.],\n [ ...",0.922727,0.955882,0.822785,0.98218


In [73]:
# now deploy the trained classifier as a streaming job
# define the interface to be like the model's input

from pyspark.sql.functions import col, from_json
from pyspark.sql.types import *
import uuid

serving_inputs = spark.readStream.server() \
    .address("localhost", 9999, "text_api") \
    .load()\
    .withColumn("variables", from_json(col("value"), ftst_sdf.schema))\
    .select("id","variables.*")

# says to extract "variables" from the "value" field of json-encoded webservice input

In [74]:
serving_outputs = model.transform(serving_inputs) \
  .withColumn("scored_labels", col("scored_labels").cast("string"))

In [77]:
server = serving_outputs.writeStream \
    .server() \
    .option("name", "text_api") \
    .queryName("my_query") \
    .option("replyCol", "scored_labels") \
    .option("checkpointLocation", "checkpoints-{}".format(uuid.uuid1())) \
    .start()

In [76]:
# if we want to change something above, we'll need
# to stop the active server

server.stop()


## Test web service

In [78]:
import numbers
import numpy as np

data = pd.DataFrame(trans_pipe.transform(test_attacks), columns=feature_names)

# float32s are not json-serializable, float64s are
data = data.applymap(lambda x: np.float64(x) if isinstance(x, numbers.Number) else x)

In [64]:
# inputs and outputs - schema
serving_inputs

DataFrame[id: bigint, F0: double, F1: double, F2: double, F3: double, F4: double, F5: double, F6: double, F7: double, F8: double, F9: double, F10: double, F11: double, F12: double, F13: double, F14: double, F15: double, F16: double, F17: double, F18: double, F19: double, F20: double, F21: double, F22: double, F23: double, F24: double, F25: double, F26: double, F27: double, F28: double, F29: double, F30: double, F31: double, F32: double, F33: double, F34: double, F35: double, F36: double, F37: double, F38: double, F39: double, F40: double, F41: double, F42: double, F43: double, F44: double, F45: double, F46: double, F47: double, F48: double, F49: double, is_attack: bigint]

In [19]:
serving_outputs

DataFrame[id: bigint, F0: double, F1: double, F2: double, F3: double, F4: double, F5: double, F6: double, F7: double, F8: double, F9: double, F10: double, F11: double, F12: double, F13: double, F14: double, F15: double, F16: double, F17: double, F18: double, F19: double, F20: double, F21: double, F22: double, F23: double, F24: double, F25: double, F26: double, F27: double, F28: double, F29: double, F30: double, F31: double, F32: double, F33: double, F34: double, F35: double, F36: double, F37: double, F38: double, F39: double, F40: double, F41: double, F42: double, F43: double, F44: double, F45: double, F46: double, F47: double, F48: double, F49: double, is_attack: bigint, scores: vector, scored_probabilities: vector, scored_labels: string]

In [79]:
import requests
import json
import time

# calling the service

for instance in range(len(test_attacks)):    
    row_as_dict = data.to_dict('records')[instance]        
    r = requests.post(data=json.dumps(row_as_dict), url="http://localhost:9999/text_api")
    time.sleep(0.5)
    print("Response to : '{}' is {}".format(test_attacks[instance], r.text))

Response to : 'You are scum.' is 1.0
Response to : 'I like your shoes.' is 0.0
Response to : 'You are pxzx.' is 0.0
Response to : 'Your mother was a hamster and your father smelt of elderberries' is 1.0
Response to : 'One bag of hagfish slime, please' is 1.0
