# Deploy Scikit Learn and Keras Models

## Operationalize Scikit Learn model

In [4]:
import os

working_dir = r"D:\Sentiment140_Classification"

## Set training and test files
training_tweet_filename = os.path.join(working_dir, 'training_text.csv')
training_label_filename = os.path.join(working_dir, 'training_label.csv')
test_tweet_filename = os.path.join(working_dir, 'testing_text.csv')
test_label_filename = os.path.join(working_dir, 'testing_label.csv')

## Set word2vec file, parameter file for model, and model pipeline file
w2v_embeddings_filename = os.path.join(working_dir, 'w2vec.txt')
params_file_path = os.path.join(working_dir, "params.tsv")
model_file = os.path.join(working_dir, 'sk_model.zip') 

In [5]:
from tatk.feature_extraction.word2vec_vectorizer import Word2VecVectorizer

from __future__ import absolute_import
from __future__ import division
import tatk
import collections
import math
import sys
import random
import numpy as np
from six.moves import urllib
from six.moves import xrange  
from timeit import default_timer as timer
import pandas as pd
import re
import io
from nltk.tokenize import TweetTokenizer
import num2words

import math
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from tatk.pipelines.text_classification.text_classifier import TextClassifier
from tatk.pipelines.text_classification.keras_text_classifier import KerasTextClassifier
#from sklearn.model_selection import GridSearchCV
#from sklearn.model_selection import RandomizedSearchCV 

from azureml.api.schema.dataTypes import DataTypes
from azureml.api.schema.sampleDefinition import SampleDefinition
from azureml.api.realtime.services import generate_schema

!pip show azureml-tatk

%reload_ext autoreload
%autoreload 2
%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Name: azureml-tatk
Version: 0.0.687318
Summary: Microsoft Azure Machine Learning Text Analytics Toolkit
Home-page: https://msdata.visualstudio.com/DefaultCollection/AlgorithmsAndDataScience/_git/TATK
Author: Microsoft Corporation
Author-email: azml-tatk@microsoft.com
License: UNKNOWN
Location: c:\users\remoteuser\appdata\local\amlworkbench\python\lib\site-packages
Requires: lxml, validators, ipython, tensorflow-gpu, unidecode, scipy, bqplot, azure-ml-api-sdk, numpy, gensim, pytest, requests, matplotlib, nose, pdfminer.six, docker, azure-storage, pandas, sklearn-crfsuite, nltk, h5py, pyspark, keras, scikit-learn


You are using pip version 9.0.1, however version 9.0.3 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


## Load and preprocess tweet data

### Define data loading the preprocessing functions for tweets

In [6]:
# Data processing
# In the following code, we replace Emails, URLS, emoticons etc with special labels
pos_emoticons=["(^.^)","(^-^)","(^_^)","(^_~)","(^3^)","(^o^)","(~_^)","*)",":)",":*",":-*",":]",":^)",":}",
               ":>",":3",":b",":-b",":c)",":D",":-D",":O",":-O",":o)",":p",":-p",":P",":-P",":Þ",":-Þ",":X",
               ":-X",";)",";-)",";]",";D","^)","^.~","_)m"," ~.^","<=8","<3","<333","=)","=///=","=]","=^_^=",
               "=<_<=","=>.<="," =>.>="," =3","=D","=p","0-0","0w0","8D","8O","B)","C:","d'-'","d(>w<)b",":-)",
               "d^_^b","qB-)","X3","xD","XD","XP","ʘ‿ʘ","❤","💜","💚","💕","💙","💛","💓","💝","💖","💞",
               "💘","💗","😗","😘","😙","😚","😻","😀","😁","😃","☺","😄","😆","😇","😉","😊","😋","😍",
               "😎","😏","😛","😜","😝","😮","😸","😹","😺","😻","😼","👍"]

neg_emoticons=["--!--","(,_,)","(-.-)","(._.)","(;.;)9","(>.<)","(>_<)","(>_>)","(¬_¬)","(X_X)",":&",":(",":'(",
               ":-(",":-/",":-@[1]",":[",":\\",":{",":<",":-9",":c",":S",";(",";*(",";_;","^>_>^","^o)","_|_",
               "`_´","</3","<=3","=/","=\\",">:(",">:-(","💔","☹️","😌","😒","😓","😔","😕","😖","😞","😟",
               "😠","😡","😢","😣","😤","😥","😦","😧","😨","😩","😪","😫","😬","😭","😯","😰","😱","😲",
               "😳","😴","😷","😾","😿","🙀","💀","👎"]

# Emails
emailsRegex=re.compile(r'[\w\.-]+@[\w\.-]+')

# Mentions
userMentionsRegex=re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9]+)')

#Urls
urlsRegex=re.compile('r(f|ht)(tp)(s?)(://)(.*)[.|/][^ ]+') # It may not be handling all the cases like t.co without http

#Numerics
numsRegex=re.compile(r"\b\d+\b")

punctuationNotEmoticonsRegex=re.compile(r'(?<=\w)[^\s\w](?![^\s\w])')

emoticonsDict = {}
for i,each in enumerate(pos_emoticons):
    emoticonsDict[each]=' POS_EMOTICON_'+num2words.num2words(i).upper()+' '
    
for i,each in enumerate(neg_emoticons):
    emoticonsDict[each]=' NEG_EMOTICON_'+num2words.num2words(i).upper()+' '
    
# use these three lines to do the replacement
rep = dict((re.escape(k), v) for k, v in emoticonsDict.items())
emoticonsPattern = re.compile("|".join(rep.keys()))


##########################
def read_tweets(filename):
    """Read the raw tweet data from a file. Replace Emails etc with special tokens """
    with open(filename, 'r') as f:
        all_lines=f.readlines()
        padded_lines=[]
        for line in all_lines:
            line = emoticonsPattern.sub(lambda m: rep[re.escape(m.group(0))], line.lower().strip())
            line = userMentionsRegex.sub(' USER ', line )
            line = emailsRegex.sub(' EMAIL ', line )
            line=urlsRegex.sub(' URL ', line)
            line=numsRegex.sub(' NUM ',line)
            line=punctuationNotEmoticonsRegex.sub(' PUN ',line)
            line=re.sub(r'(.)\1{2,}', r'\1\1',line)
            words_tokens=[token for token in TweetTokenizer().tokenize(line)]                    
            line= ' '.join(token for token in words_tokens )         
            padded_lines.append(line)
    return padded_lines

### Read in tweet training data and training labels, and combine into one data-frame

In [7]:
## Read in tweet file. Labels are 4 and 0 (4 being positive and 0 being negative)
tweets = read_tweets(test_tweet_filename)

with open(test_label_filename, 'r') as f:
    all_lines=f.readlines()
    tweet_labels = []
    for line in all_lines:
        line = line.strip()
        label = 0
        if line == '4':
            label = 1
        tweet_labels.append(label)

df = pd.DataFrame({'tweets':tweets, 'labels':tweet_labels}, columns=['tweets','labels'])
display(df[:3])

Unnamed: 0,tweets,labels
0,USER hehe enjoy i say i wish i kept a stock of...,1
1,is not happy she is driving a loaner car ! !,0
2,heyy guyzz howz evrionee PUN x,1


In [8]:
## Count the labels in training set
df['labels'].value_counts()

0    160257
1    159743
Name: labels, dtype: int64

### Load and the training pipleine, predict and evaluate the accuracy on held-out validation set

In [8]:
text_classifier_reloaded = TextClassifier.load(model_file)

BaseTextModel::load ==> start
TatkPipeline::load ==> start
Word '<UNK>' is already in vocabulary.
Word2VecVectorizer: Word2Vec model loaded from D:\Sentiment140_Classification\sk_model.zip 2018-03-21 19.26.48\sk_model\pipeline\tweets_word2vec\embedding_table.txt
Time taken: 0.06 mins
TatkPipeline::load ==> end
Time taken: 1.13 mins
BaseTextModel::load ==> end


In [9]:
predicted_df = text_classifier_reloaded.predict(df)
display(predicted_df[:3])

TextClassifier::predict ==> start
NltkPreprocessor::tatk_transform ==> start
NltkPreprocessor::tatk_transform ==> end 	 Time taken: 0.09 mins
Word2VecVectorizer::tatk_transform ==> start
Word2VecVectorizer::tatk_transform ==> end 	 Time taken: 0.59 mins
VectorAssembler::transform ==> start, num of input records=320000
(320000, 50)
all_features::
(320000, 50)
Time taken: 0.0 mins
VectorAssembler::transform ==> end
LogisticRegression::tatk_predict ==> start
LogisticRegression::tatk_predict ==> end 	 Time taken: 0.0 mins
Time taken: 0.69 mins
TextClassifier::predict ==> end


Unnamed: 0,tweets,labels,prediction
0,USER hehe enjoy i say i wish i kept a stock of...,1,0
1,is not happy she is driving a loaner car ! !,0,0
2,heyy guyzz howz evrionee PUN x,1,1


In [10]:
text_classifier_reloaded.evaluate(df)

TextClassifier::evaluate ==> start
NltkPreprocessor::tatk_transform ==> start
NltkPreprocessor::tatk_transform ==> end 	 Time taken: 0.09 mins
Word2VecVectorizer::tatk_transform ==> start
Word2VecVectorizer::tatk_transform ==> end 	 Time taken: 0.6 mins
VectorAssembler::transform ==> start, num of input records=320000
(320000, 50)
all_features::
(320000, 50)
Time taken: 0.0 mins
VectorAssembler::transform ==> end
LogisticRegression::tatk_predict ==> start
LogisticRegression::tatk_predict ==> end 	 Time taken: 0.0 mins
[[117205  43052]
 [ 41360 118383]]
macro_f1 = 0.7362089252084587
Time taken: 0.7 mins
TextClassifier::evaluate ==> end


(array([[117205,  43052],
        [ 41360, 118383]], dtype=int64), 0.7362089252084587)

### Create schema for web service

#### Define the functions to be used in score_sentiments.py file

In [46]:
os.chdir('C:\\Users\\remoteuser\\Desktop\\Projects\\v2Tatk_Classification_Sent140\\code\\03_deployment')
def init():
    from tatk.pipelines.text_classification.text_classifier import TextClassifier
    from sklearn.linear_model import LogisticRegression
    import os

    home_dir = os.getcwd() 
    model_file = os.path.join(home_dir, 'sk_model.zip')

    # load the model from file into a global object
    global model
    model = TextClassifier.load(model_file)

In [47]:
def run(input_df):
    from tatk.pipelines.text_classification.text_classifier import TextClassifier
    from sklearn.linear_model import LogisticRegression
    import json

    pred = model.predict(input_df)
    json_str = pred.to_json()
    return json.dumps(json_str)

#### Test run function from score_sentiments.py prior to creation of web-service

In [40]:
#init()
df1 = pd.DataFrame(data=[['add your good text here.']], columns=['tweets'])
#df1 = df.head(2)
run(df1)

TextClassifier::predict ==> start
NltkPreprocessor::tatk_transform ==> start
NltkPreprocessor::tatk_transform ==> end 	 Time taken: 0.0 mins
Word2VecVectorizer::tatk_transform ==> start
Word2VecVectorizer::tatk_transform ==> end 	 Time taken: 0.0 mins
VectorAssembler::transform ==> start, num of input records=1
(1, 50)
all_features::
(1, 50)
Time taken: 0.0 mins
VectorAssembler::transform ==> end
LogisticRegression::tatk_predict ==> start
LogisticRegression::tatk_predict ==> end 	 Time taken: 0.0 mins
Time taken: 0.0 mins
TextClassifier::predict ==> end


'"{\\"tweets\\":{\\"0\\":\\"add your good text here.\\"},\\"prediction\\":{\\"0\\":1}}"'

#### Generate schema

In [42]:
df1 = list(df.head(1)["tweets"])
df2 = pd.DataFrame(data=df1, columns = ['tweets'])

# Turn on data collection debug mode to view output in stdout
os.environ["AML_MODEL_DC_DEBUG"] = 'true'

# Test the output of the functions
#init(model_file)
  
inputs = {"input_df": SampleDefinition(DataTypes.PANDAS, df2)}
  
#Genereate the schema
generate_schema(run_func=run, inputs=inputs, filepath='service_schema.json')
print("Schema generated")

Schema generated


In [None]:
#az ml env setup -n debrajtatkclass1 --location eastus2
#az ml env show -g debrajtatkclass1rg -n debrajtatkclass1
#az ml env set -g debrajtatkclass1rg -n debrajtatkclass1

#az ml account modelmanagement create --location eastus2 -n debrajtatkclass1mms -g debrajtatkclass1rg --sku-name S1
#az ml account modelmanagement set -n debrajtatkclass1mms -g debrajtatkclass1rg

#az ml env setup --cluster -n tatkakscluster -l eastus2 -g debrajtatkclass1rg
#az ml env set -n tatkakscluster -g debrajtatkclass1aksrg

In [None]:
#az ml model register -m D:\Sentiment140_Classification\sk_model.zip -n sk_model2.zip
#az ml manifest create --manifest-name sk_model_manifest3 -f score_sentiments.py -r python -i 0d2fc4debf0b4632a555ec6d9e349d6e -s service_schema.json
#az ml image create -n sentimentappimage --manifest-id b6eab771-b719-458b-a4d2-4be416a275c6 -c conda_dependencies.yml
#az ml service create realtime --image-id 9f2660fe-d116-46f0-8d99-b3cfe21828a5 -n sentimentapp3 --collect-model-data true

In [None]:
#az ml service create realtime -f score_sentiments.py -m sk_model.zip -s service_schema.json -n sentimentapp -r python -c aml_config\conda_dependencies.yml

In [None]:
#az ml service run realtime -i sentimentapp.tatkakscluster-a0515695.eastus2 -d "{\"input_df\": [{\"tweets\": \"USER hehe enjoy i say i wish i kept a stock of bottles @ home PUN\"}]}"