# Loading Libraries

In [2]:
%matplotlib inline
import os

import numpy as np
import pandas as pd
import seaborn as sn
import sklearn
from azureml.core import Workspace, Dataset
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

In [2]:
# Load the labelled Amazon feedback CSV from the local Dataset/ folder into a
# DataFrame; the head() shown below lists the columns S, Feedback and Sentiment.

df = pd.read_csv('Dataset/amazonLabelled - amazonLabelled.csv')

In [3]:
df.head()

Unnamed: 0,S,Feedback,Sentiment
0,1,"Good case, Excellent value.",Positive
1,2,Great for the jawbone.,Positive
2,3,Tied to charger for conversations lasting more...,Negative
3,4,The mic is great.,Positive
4,5,I have to jiggle the plug to get it to line up...,Negative


In [4]:
df.shape

(999, 3)

In [5]:
df.isnull().values.any()

False

In [6]:
df["Sentiment"].value_counts()

Positive    500
Negative    499
Name: Sentiment, dtype: int64

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
lb=LabelEncoder()

In [9]:
lb.fit(df["Sentiment"])

LabelEncoder()

In [10]:
df["Sentiment"]=lb.transform(df["Sentiment"])

In [11]:
df.head()

Unnamed: 0,S,Feedback,Sentiment
0,1,"Good case, Excellent value.",1
1,2,Great for the jawbone.,1
2,3,Tied to charger for conversations lasting more...,0
3,4,The mic is great.,1
4,5,I have to jiggle the plug to get it to line up...,0


# Train Test Split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# reproducible, so re-running the notebook regenerates the same train/test files.
X_train,X_test,y_train,y_test=train_test_split(df.drop("Sentiment",axis=1),df["Sentiment"],test_size=0.2,random_state=42)

In [14]:
X_train.shape

(799, 2)

In [15]:
# Recombine features and label, keep the frame in train_df, then write it out.
# DataFrame.to_csv returns None when given a path, so chaining it into the
# assignment would leave train_df bound to None instead of the data.
train_df=pd.concat([X_train,y_train],axis=1)
train_df.to_csv("Dataset/train_set.csv",index=False)

In [16]:
# Recombine features and label, keep the frame in test_df, then write it out.
# DataFrame.to_csv returns None when given a path, so chaining it into the
# assignment would leave test_df bound to None instead of the data.
test_df=pd.concat([X_test,y_test],axis=1)
test_df.to_csv("Dataset/test_set.csv",index=False)

# Register dataset to the workspace

In [4]:
# Azure ML workspace coordinates. Each value can be overridden through an
# environment variable so the notebook can target another workspace without
# editing this cell; the fallbacks are the values originally hard-coded here.
subscription_id = os.environ.get('AZURE_SUBSCRIPTION_ID', 'b34c1109-5941-40da-be9e-960b4f30511d')
resource_group = os.environ.get('AZURE_RESOURCE_GROUP', 'Learn_MLOps')
workspace_name = os.environ.get('AZURE_WORKSPACE_NAME', 'MLOps_WS')

In [5]:
workspace = Workspace(subscription_id, resource_group, workspace_name)

In [6]:
# get the datastore to upload prepared data
datastore = workspace.get_default_datastore()

In [20]:
# upload the local file from src_dir to the target_path in datastore
datastore.upload(src_dir='Dataset', target_path='nlpdata')

Uploading an estimated of 3 files
Uploading Dataset/amazonLabelled - amazonLabelled.csv
Uploaded Dataset/amazonLabelled - amazonLabelled.csv, 1 files out of an estimated total of 3
Uploading Dataset/test_set.csv
Uploaded Dataset/test_set.csv, 2 files out of an estimated total of 3
Uploading Dataset/train_set.csv
Uploaded Dataset/train_set.csv, 3 files out of an estimated total of 3
Uploaded 3 files


$AZUREML_DATAREFERENCE_d25668d33ec04d86b31d4608fe9967bf

In [21]:
train_dataset = Dataset.Tabular.from_delimited_files(datastore.path('nlpdata/train_set.csv'))

In [22]:
test_dataset = Dataset.Tabular.from_delimited_files(datastore.path('nlpdata/test_set.csv'))

In [23]:
# Register the tabular training dataset in the workspace under the name
# 'nlp_train_set'.
train_ds = train_dataset.register(
    workspace=workspace,
    name='nlp_train_set',
    description='Training data for nlp usecase',
)

In [24]:
# Register the tabular test dataset in the workspace under the name
# 'nlp_test_set'.
test_ds = test_dataset.register(
    workspace=workspace,
    name='nlp_test_set',
    description='Test data for nlp usecase',
)

# Data ingestion step - Training dataset

In [25]:
dataset = Dataset.get_by_name(workspace, name='nlp_train_set')
print(dataset.name, dataset.version)

nlp_train_set 1


In [26]:
df = dataset.to_pandas_dataframe()

In [27]:
df.head()

Unnamed: 0,S,Feedback,Sentiment
0,785,This allows the possibility of double booking ...,0
1,908,"I can hear while I'm driving in the car, and u...",1
2,36,It has kept up very well.,1
3,862,So far it has worked like a charm.,1
4,298,Customer service was terrible.,0


In [28]:
df.shape

(799, 3)

# Preprocessing Data

In [30]:
!pip install nltk



In [29]:
import nltk

In [32]:
# preprocess_data needs two NLTK corpora: "stopwords" for the stop-word filter
# and "wordnet" for WordNetLemmatizer. Fetch both so a fresh machine can run it.
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [35]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
lemmatizer=WordNetLemmatizer()

In [36]:
def preprocess_data(data):
    """Normalise raw feedback strings into space-joined, lemmatized tokens.

    Each text has every character other than ASCII letters and digits replaced
    by a space, is lower-cased and split on whitespace. English stop words are
    dropped and the remaining tokens are lemmatized with the module-level
    ``lemmatizer`` before being re-joined with single spaces.

    Args:
        data: Iterable of strings (e.g. the "Feedback" column).

    Returns:
        list[str]: One cleaned string per input item, in the same order.
    """
    # Build the stop-word set once: stopwords.words() returns a list, so
    # calling it per token re-reads the corpus and does a linear scan each time.
    stop_words = set(stopwords.words("english"))
    corpus = []
    for text in data:
        tokens = re.sub("[^a-zA-Z0-9]", " ", text).lower().split()
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        corpus.append(" ".join(tokens))
    return corpus

In [37]:
corpus=preprocess_data(df["Feedback"])

In [40]:
len(corpus)

799

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

In [44]:
cv=CountVectorizer(ngram_range=(1,2))

In [45]:
cv.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [46]:
count_train=cv.transform(corpus)

In [47]:
count_train.shape

(799, 4436)

# Creating an experiment and run to log metrics and hyperparameters

In [48]:
from azureml.core.experiment import Experiment
# Create (or attach to) the experiment named "rf_sent_analysis" in the workspace.
myexperiment = Experiment(workspace, "rf_sent_analysis")
# initialize a run in AzureML; `run` is used by later cells to log values and
# is closed with run.complete() after training
run = myexperiment.start_logging()

In [49]:
run.log("dataset name", dataset.name)
run.log("dataset Version", dataset.version)

# Model Training

In [50]:
from sklearn.ensemble import RandomForestClassifier

In [51]:
rf=RandomForestClassifier()

In [53]:
rf.fit(count_train,df["Sentiment"])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [54]:
rf.score(count_train,df["Sentiment"])

0.9974968710888611

In [55]:
from sklearn.model_selection import cross_val_score

In [56]:
scores=cross_val_score(rf,count_train,df["Sentiment"],cv=3)

In [57]:
scores.mean()

0.7621628978814076

In [58]:
scores.std()

0.023789217408980542

# Hyperparameter Tuning

In [59]:
from sklearn.model_selection import GridSearchCV

In [60]:
# Candidate forest sizes and split thresholds explored by the grid search.
param_grid = {
    'n_estimators': [100, 400, 700, 1000, 2000, 2500],
    'min_samples_split': [2, 4, 8, 16],
}

In [61]:
grid=GridSearchCV(rf,param_grid,n_jobs=-1)

In [62]:
grid.fit(count_train,df["Sentiment"])

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              ra

In [63]:
# best_params_ holds the winning combination found by the fitted search.
# get_params()['estimator__n_estimators'] only echoes the constructor argument
# of the untuned estimator passed to GridSearchCV, so the tuning was ignored.
n_est=grid.best_params_['n_estimators']

In [64]:
# best_params_ holds the winning combination found by the fitted search.
# get_params()['estimator__min_samples_split'] only echoes the constructor
# argument of the untuned estimator passed to GridSearchCV.
min_sam_splt=grid.best_params_['min_samples_split']

In [65]:
rf=RandomForestClassifier(n_estimators=n_est,min_samples_split=min_sam_splt)

In [66]:
rf.fit(count_train,df["Sentiment"])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [67]:
# Logging training parameters to the AzureML experiment run.
# Read them from the retrained `rf` so the logged values are exactly the ones
# the saved model was built with, rather than the grid's untuned base estimator.
run.log("n_estimators", rf.get_params()["n_estimators"])
run.log("min_samples_split", rf.get_params()["min_samples_split"])

In [68]:
run.complete()

# Model Packaging Step

In [69]:
import joblib

In [70]:
joblib.dump(cv,"outputs/count_vectorizer.pkl")

['outputs/count_vectorizer.pkl']

In [71]:
joblib.dump(rf,"outputs/rf_sent_model.pkl")

['outputs/rf_sent_model.pkl']

In [92]:
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies

# to install required packages
env = Environment('tutorial-env')
# scikit-learn is pinned exactly once (through conda); a second, different pip
# pin made the specification self-contradictory. nltk.corpus and nltk.stem are
# submodules of the single "nltk" distribution, so only "nltk" is listed.
cd = CondaDependencies.create(
    pip_packages=[
        'azureml-dataset-runtime[pandas,fuse]',
        'azureml-defaults',
        "numpy",
        "joblib",
        "azureml-core",
        "azureml-monitoring",
        "inference-schema",
        "inference-schema[numpy-support]",
        "nltk",
    ],
    conda_packages=['scikit-learn==0.22.1'],
)

env.python.conda_dependencies = cd

# Register environment to re-use later
env.register(workspace = workspace)

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210615.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "tutorial-env",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
                "conda-forge"
 

# Model Registering Step

In [13]:
from azureml.core.model import Model

In [79]:
# Register the fitted CountVectorizer on the AzureML workspace.
# The pickle holds a scikit-learn object, so it is tagged with the scikit-learn
# framework constant and the library version of this kernel (the previous
# 'pandas==0.23.4' value did not describe the artifact).
model = Model.register(model_path = './outputs/count_vectorizer.pkl', # this points to a local file 
                       model_name = "NLP_Count_Vectorizer", # this is the name the model is registered as
                       tags = {'dataset': dataset.name, 'version': dataset.version, }, 
                       model_framework=Model.Framework.SCIKITLEARN,
                       model_framework_version=sklearn.__version__,
                       description = "Count Vectorizer",
                       workspace = workspace)

print('Name:', model.name)
print('Version:', model.version)

Registering model NLP_Count_Vectorizer
Name: NLP_Count_Vectorizer
Version: 1


In [80]:
# Register the trained random-forest classifier on the AzureML workspace.
# The pickle holds a scikit-learn object, so it is tagged with the scikit-learn
# framework constant and the library version of this kernel (the previous
# 'pandas==0.23.4' value did not describe the artifact).
model = Model.register(model_path = './outputs/rf_sent_model.pkl', # this points to a local file 
                       model_name = "NLP_RF_Model", # this is the name the model is registered as
                       tags = {'dataset': dataset.name, 'version': dataset.version, }, 
                       model_framework=Model.Framework.SCIKITLEARN,
                       model_framework_version=sklearn.__version__,
                       description = "Random Forest Model",
                       workspace = workspace)

print('Name:', model.name)
print('Version:', model.version)

Registering model NLP_RF_Model
Name: NLP_RF_Model
Version: 1


# Deploy model as a webservice on Azure Container Instance

In [7]:
%%writefile score.py
import json
import numpy as np
import os
import pickle
import joblib
import time
from azureml.core.model import Model
import nltk
nltk.download("stopwords")
nltk.download("wordnet")
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
lemmatizer=WordNetLemmatizer()

def preprocess_data(data):
    """Normalise raw feedback strings into space-joined, lemmatized tokens.

    Each text has every character other than ASCII letters and digits replaced
    by a space, is lower-cased and split on whitespace. English stop words are
    dropped and the remaining tokens are lemmatized with the module-level
    ``lemmatizer`` before being re-joined with single spaces.

    Args:
        data: Iterable of strings to clean.

    Returns:
        list[str]: One cleaned string per input item, in the same order.
    """
    # Build the stop-word set once per call: stopwords.words() returns a list,
    # so calling it per token re-reads the corpus and does a linear scan.
    stop_words = set(stopwords.words("english"))
    corpus = []
    for text in data:
        tokens = re.sub("[^a-zA-Z0-9]", " ", text).lower().split()
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        corpus.append(" ".join(tokens))
    return corpus


def init():
    """Load the count vectorizer and the random-forest model into module globals.

    Each artifact's path is resolved with Model.get_model_path using its
    registered name and the file is deserialised with joblib; the results are
    stored in ``count_vect`` and ``rf_model`` for use by ``run``.
    """
    global count_vect,rf_model

    # Vectorizer first, then classifier.
    # NOTE(review): joblib.load unpickles the file, so the registered artifacts
    # must come from a trusted source.
    count_vect = joblib.load(Model.get_model_path('NLP_Count_Vectorizer'))
    rf_model = joblib.load(Model.get_model_path('NLP_RF_Model'))
    
def run(data):
    """Score a JSON request and return the predicted labels.

    Args:
        data: JSON document (string) whose top-level "data" key holds the
            feedback texts to classify.

    Returns:
        list: The output of ``rf_model.predict`` converted with ``tolist()``,
        one entry per input text.
    """
    # Parse the function's own argument. The body previously read an undefined
    # name `raw_data`, which raised NameError on every request.
    texts = np.array(json.loads(data)['data'])
    corpus=preprocess_data(texts)
    count_test=count_vect.transform(corpus)
    prediction=rf_model.predict(count_test)
    # you can return any data type as long as it is JSON-serializable
    return prediction.tolist()
                    
            


Overwriting score.py


# Define Environment

In [8]:
from azureml.core.environment import Environment
from azureml.core.environment import CondaDependencies


# Build the inference environment used by the scoring service.
myenv = Environment(name="MyEnvironment")

# Describe the conda / pip dependencies of score.py.
# NOTE(review): scikit-learn is unpinned here while the model was pickled in the
# training kernel -- consider pinning it to the training version; confirm.
print("Creating dependencies....")
myenv_dep = CondaDependencies.create(
    conda_packages=['scikit-learn', 'pip'],
    pip_packages=[
        'azureml-defaults',
        'joblib',
        'numpy',
        'azureml-core',
        "azureml-monitoring",
        "inference-schema",
        "inference-schema[numpy-support]",
        "nltk",
    ],
)

myenv.python.conda_dependencies = myenv_dep

# Store the environment definition in the workspace.
print("Registering the environment...")
myenv.register(workspace)

Creating dependencies....
Registering the environment...


{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210615.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "MyEnvironment",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
                "conda-forge"


In [9]:
from azureml.core.model import InferenceConfig

In [10]:
inference_config = InferenceConfig(entry_script="score.py", environment=myenv)

In [11]:
from azureml.core.webservice import AciWebservice

# ACI sizing for the web service: one CPU core and 1 GB of memory, with
# model data collection switched on.
deployment_config = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    collect_model_data=True,
)

In [14]:
# Look up the two models registered earlier in this notebook by their names;
# both are handed to the deployment so score.py can load them in init().
model1 = Model(workspace, 'NLP_Count_Vectorizer')
model2 = Model(workspace, 'NLP_RF_Model')

# Name under which the web service is created.
service_name = 'amazon-feedback-analysis'

In [15]:
# Deploy both models behind one web service using the scoring script /
# environment from inference_config and the ACI sizing from deployment_config.
# overwrite=True is passed so an existing service with this name is replaced.
service = Model.deploy(workspace, service_name, models=[model1, model2], inference_config=inference_config, deployment_config=deployment_config, overwrite=True)
# Block until the deployment finishes, streaming progress into the cell output.
service.wait_for_deployment(show_output = True)
# Print the final service state (the captured output below shows "Healthy").
print(service.state)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-08-19 18:00:25+00:00 Creating Container Registry if not exists.
2021-08-19 18:00:25+00:00 Registering the environment.
2021-08-19 18:00:29+00:00 Building image..
2021-08-19 18:07:31+00:00 Generating deployment configuration.
2021-08-19 18:07:32+00:00 Submitting deployment to compute..
2021-08-19 18:07:35+00:00 Checking the status of deployment amazon-feedback-analysis..
2021-08-19 18:12:49+00:00 Checking the status of inference endpoint amazon-feedback-analysis.
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy
