In [1]:
# install all the required dependencies
!pip install gensim

SDK version: 0.1.0.1005480


In [1]:
# get the text data from the github repo and unzip it
from fit_and_store_pipeline import unzip_file_here
import urllib
import os

if not os.path.isfile('./text_data/attack_data.csv'):
    if not os.path.isfile('./text_data.zip'): 
        urllib.request.urlretrieve('https://activelearning.blob.core.windows.net/activelearningdemo/text_data.zip', 'text_data.zip')
    unzip_file_here('text_data.zip')

if not os.path.isfile('miniglove_6B_50d_w2v.txt'):
    unzip_file_here('miniglove_6B_50d_w2v.zip')
    
print('Data files here')

Data files here


In [2]:
# make the pipeline

import pandas as pd
import numpy as np
import gensim
import random
import gensim
from gensim.models import KeyedVectors
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from pipeline_parts import *

# text preprocessor
preprocessor = GensimPreprocessor()

# word2vec featurizer
w2v_file = 'miniglove_6B_50d_w2v.txt' # convert glove file to w2v format using gensim.scripts.glove2word2vec
word_vectors = KeyedVectors.load_word2vec_format(w2v_file, binary=False)
vectorizer = AvgWordVectorFeaturizer(word_vectors)

# classifier
classifier = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=1)

# assemble a scikit-learn pipeline
model_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('vectorizer', vectorizer),
        ('classifier', classifier),
    ])


In [5]:
# make a train-test data pair

from fit_and_store_pipeline import create_train_test_split

# requires training_set_01.csv and test_set_01.csv to be present
training_data, test_data = create_train_test_split()

In [8]:
# test - does the pipeline fit happily in a scikit fashion?
fitted_pipe = model_pipeline.fit(training_data.comment, [int(x) for x in training_data.is_attack])
fitted_pipe

Pipeline(memory=None,
     steps=[('preprocessor', GensimPreprocessor(newline_token=None)), ('vectorizer', <pipeline_parts.AvgWordVectorFeaturizer object at 0x7ff6a0901a90>), ('classifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nod...estimators=100, n_jobs=1,
            oob_score=True, random_state=1, verbose=0, warm_start=False))])

In [14]:
# test prediction from the scikit pipeline
test_attacks = ['You are scum.', 'I like your shoes.', 'You are pxzx.', 
             'Your mother was a hamster and your father smelt of elderberries',
             'One bag of hagfish slime, please']

fitted_pipe.predict(test_attacks)

array([1, 0, 0, 0, 1])

## Deploy the model as a Spark Streaming job

In [9]:
# Plan:
#
# 1. Collect the training data in a spark dataframe
# 2. Build a model (pipeline duck-typed like the model in notebook 108)
# 3. Use TrainClassifier to train it
# 4. Use SparkStreaming to make a web service

In [16]:
# turn the training data from a pandas.DataFrame into a spark.DataFrame
# type(training_data)
# training_data
train_sdf = spark.createDataFrame(training_data)
train_sdf

DataFrame[comment: string, year: bigint, logged_in: boolean, ns: string, sample: string, split: string, count: bigint, avg_attack: double, is_attack: boolean]

In [None]:
# fit the same pipeline using the MMLSpark TrainClassifier (Will not work)

from mmlspark import TrainClassifier
model = TrainClassifier(model=fitted_pipe, labelCol="is_attack", numFeatures=256).fit(train_sdf)

# we cannot use a scikit pipeline in here:
# 'Pipeline' object has no attribute '_to_java'
# It looks like only native classifiers are OK
#
# The new game plan is to run the scikit pipeline for featurization,
# then feed the featurized data into a native MML Spark classifier for training


In [47]:
# transforms-only pipeline
trans_pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('vectorizer', vectorizer)
    ])

trans_pipe.fit(training_data.comment, [int(x) for x in training_data.is_attack])
featurized_train = trans_pipe.transform(training_data.comment)
feature_names = [ 'F' + str(i) for i in range(featurized_train.shape[1]) ]

ft_df = pd.DataFrame(featurized_train, columns=feature_names);
ft_df['is_attack'] = [1 if x else 0 for x in training_data['is_attack']]

print(training_data.head())
print(ft_df.head())

featurized_train_sdf = spark.createDataFrame(ft_df)
featurized_train_sdf

                                                     comment  year  logged_in  \
rev_id                                                                          
416170166  NEWLINE_TOKENNEWLINE_TOKEN== Please stop? ==NE...  2011      False   
180046743  NEWLINE_TOKENfine I get threatened and he is a...  2007       True   
536943251  Lets see double standard of some one who tries...  2013       True   
93070973   my comments in on the arbitration request he f...  2006       True   
272666818  `NEWLINE_TOKENNEWLINE_TOKEN==You maybe interes...  2009       True   

                ns   sample  split  count  avg_attack  is_attack  
rev_id                                                            
416170166     user  blocked  train      8    0.125000      False  
180046743     user  blocked  train      9    0.222222      False  
536943251  article  blocked    dev     10    0.200000      False  
93070973      user  blocked   test      9    0.000000      False  
272666818     user   random  t

DataFrame[F0: double, F1: double, F2: double, F3: double, F4: double, F5: double, F6: double, F7: double, F8: double, F9: double, F10: double, F11: double, F12: double, F13: double, F14: double, F15: double, F16: double, F17: double, F18: double, F19: double, F20: double, F21: double, F22: double, F23: double, F24: double, F25: double, F26: double, F27: double, F28: double, F29: double, F30: double, F31: double, F32: double, F33: double, F34: double, F35: double, F36: double, F37: double, F38: double, F39: double, F40: double, F41: double, F42: double, F43: double, F44: double, F45: double, F46: double, F47: double, F48: double, F49: double, is_attack: bigint]

In [48]:
from mmlspark import TrainClassifier
from pyspark.ml.classification import LogisticRegression
model = TrainClassifier(model=LogisticRegression(), 
                        labelCol="is_attack", 
                        numFeatures=256).fit(featurized_train_sdf)

In [56]:
# test the model

# first, featurize the test data using the same pipeline
featurized_test = trans_pipe.transform(test_data.comment)
# use the same feature names we've had
# feature_names = [ 'F' + str(i) for i in range(featurized_test.shape[1]) ]

ftst_df = pd.DataFrame(featurized_test, columns=feature_names);
ftst_df['is_attack'] = [1 if x else 0 for x in test_data['is_attack']]

# second, make a prediction on the test set
ftst_sdf = spark.createDataFrame(ft_df)

prediction = model.transform(ftst_sdf)

root
 |-- F0: double (nullable = true)
 |-- F1: double (nullable = true)
 |-- F2: double (nullable = true)
 |-- F3: double (nullable = true)
 |-- F4: double (nullable = true)
 |-- F5: double (nullable = true)
 |-- F6: double (nullable = true)
 |-- F7: double (nullable = true)
 |-- F8: double (nullable = true)
 |-- F9: double (nullable = true)
 |-- F10: double (nullable = true)
 |-- F11: double (nullable = true)
 |-- F12: double (nullable = true)
 |-- F13: double (nullable = true)
 |-- F14: double (nullable = true)
 |-- F15: double (nullable = true)
 |-- F16: double (nullable = true)
 |-- F17: double (nullable = true)
 |-- F18: double (nullable = true)
 |-- F19: double (nullable = true)
 |-- F20: double (nullable = true)
 |-- F21: double (nullable = true)
 |-- F22: double (nullable = true)
 |-- F23: double (nullable = true)
 |-- F24: double (nullable = true)
 |-- F25: double (nullable = true)
 |-- F26: double (nullable = true)
 |-- F27: double (nullable = true)
 |-- F28: double (nullabl

In [57]:
# third, compute performance stats
from mmlspark import ComputeModelStatistics, TrainedClassifierModel
metrics = ComputeModelStatistics().transform(prediction)
metrics.limit(10).toPandas()

Unnamed: 0,evaluation_type,confusion_matrix,accuracy,precision,recall,AUC
0,Classification,"DenseMatrix([[ 250., 32.],\n [ ...",0.779545,0.744,0.588608,0.847675


In [58]:
# now deploy the trained classifier as a streaming job
# define the interface to be like the model's input

from pyspark.sql.functions import col, from_json
from pyspark.sql.types import *
import uuid

serving_inputs = spark.readStream.server() \
    .address("localhost", 9999, "text_api") \
    .load()\
    .withColumn("variables", from_json(col("value"), ftst_sdf.schema))\
    .select("id","variables.*")

# says to extract "variables" from the "value" field of json-encoded webservice input

In [59]:
serving_outputs = model.transform(serving_inputs) \
  .withColumn("scored_labels", col("scored_labels").cast("string"))

In [63]:
server = serving_outputs.writeStream \
    .server() \
    .option("name", "text_api") \
    .queryName("my_query") \
    .option("replyCol", "scored_labels") \
    .option("checkpointLocation", "checkpoints-{}".format(uuid.uuid1())) \
    .start()

In [62]:
# if we want to change something above, we'll need
# to stop the active server
server.stop()


## Test web service

In [78]:
import requests
import json

print(test_attacks)
data = pd.DataFrame(trans_pipe.transform(test_attacks), columns=feature_names)
# turn data into a json string representation of a dictionary
row_as_dict = data.to_dict('records')[1]
print(row_as_dict)
ws_input = json.dumps(row_as_dict)

print(ws_input)

r = requests.post(data=ws_input, url="http://localhost:9999/text_api")
print("Response {}".format(r.text))



['You are scum.', 'I like your shoes.', 'You are pxzx.', 'Your mother was a hamster and your father smelt of elderberries', 'One bag of hagfish slime, please']
{'F0': -0.21418767, 'F1': 0.40059671, 'F2': 0.0071733347, 'F3': -0.33567235, 'F4': 0.50522667, 'F5': 0.080480002, 'F6': -0.45744696, 'F7': -0.746481, 'F8': 0.26008001, 'F9': 0.071898997, 'F10': -0.098540008, 'F11': 0.65155435, 'F12': -0.36874136, 'F13': 0.16127668, 'F14': 0.53228664, 'F15': 0.55447334, 'F16': -0.29821333, 'F17': 0.42311099, 'F18': 0.43166566, 'F19': -1.27487, 'F20': -0.058775995, 'F21': 0.48372331, 'F22': 0.13969998, 'F23': 0.076476656, 'F24': 0.085326649, 'F25': -1.6213666, 'F26': -1.1134467, 'F27': 0.58301669, 'F28': 0.98441666, 'F29': -1.1452667, 'F30': 2.8082666, 'F31': 0.97427672, 'F32': -0.29472566, 'F33': 0.48504066, 'F34': -0.058868002, 'F35': 0.60678333, 'F36': -0.069305666, 'F37': 0.26238334, 'F38': 0.23104002, 'F39': -0.42316332, 'F40': 0.21481667, 'F41': 0.27335998, 'F42': 0.044187605, 'F43': 0.61054

TypeError: Object of type 'float32' is not JSON serializable

Feed 100 rows of data to get predictions.

In [52]:
import json

test_samples = json.dumps({"data": vectors.tolist()})
test_samples = bytes(test_samples, encoding = 'utf8')

# Call scoring service
service.run(input_data = test_samples)

'{"result": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 

Deleting ACI is super fast!