# Loading Libraries

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sn
from azureml.core import Workspace, Dataset

In [2]:
# import dataset

df = pd.read_csv('Dataset/amazonLabelled - amazonLabelled.csv')

In [3]:
df.head()

Unnamed: 0,S,Feedback,Sentiment
0,1,"Good case, Excellent value.",Positive
1,2,Great for the jawbone.,Positive
2,3,Tied to charger for conversations lasting more...,Negative
3,4,The mic is great.,Positive
4,5,I have to jiggle the plug to get it to line up...,Negative


In [4]:
df.shape

(999, 3)

In [5]:
df.isnull().values.any()

False

In [6]:
df["Sentiment"].value_counts()

Positive    500
Negative    499
Name: Sentiment, dtype: int64

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
lb=LabelEncoder()

In [9]:
lb.fit(df["Sentiment"])

LabelEncoder()

In [10]:
df["Sentiment"]=lb.transform(df["Sentiment"])

In [11]:
df.head()

Unnamed: 0,S,Feedback,Sentiment
0,1,"Good case, Excellent value.",1
1,2,Great for the jawbone.,1
2,3,Tied to charger for conversations lasting more...,0
3,4,The mic is great.,1
4,5,I have to jiggle the plug to get it to line up...,0


# Train Test Split

In [13]:
from sklearn.model_selection import train_test_split

In [18]:
X_train,X_test,y_train,y_test=train_test_split(df.drop("Sentiment",axis=1),df["Sentiment"],test_size=0.2)

In [19]:
X_train.shape

(799, 2)

In [44]:
train_df=pd.concat([X_train,y_train],axis=1).to_csv("Dataset/train_set.csv",index=False)

In [45]:
test_df=pd.concat([X_test,y_test],axis=1).to_csv("Dataset/test_set.csv",index=False)

# Register dataset to the workspace

In [26]:
subscription_id = 'b34c1109-5941-40da-be9e-960b4f30511d'
resource_group = 'Learn_MLOps'
workspace_name = 'MLOps_WS'

In [27]:
workspace = Workspace(subscription_id, resource_group, workspace_name)

In [28]:
# get the datastore to upload prepared data
datastore = workspace.get_default_datastore()

In [46]:
# upload the local file from src_dir to the target_path in datastore
datastore.upload(src_dir='Dataset', target_path='nlpdata')

Uploading an estimated of 3 files
Uploading Dataset/amazonLabelled - amazonLabelled.csv
Uploaded Dataset/amazonLabelled - amazonLabelled.csv, 1 files out of an estimated total of 3
Uploading Dataset/test_set.csv
Uploaded Dataset/test_set.csv, 2 files out of an estimated total of 3
Uploading Dataset/train_set.csv
Uploaded Dataset/train_set.csv, 3 files out of an estimated total of 3
Uploaded 3 files


$AZUREML_DATAREFERENCE_c6f34be1d5c64a8cafedee3e6f004d71

In [47]:
train_dataset = Dataset.Tabular.from_delimited_files(datastore.path('nlpdata/train_set.csv'))

In [48]:
test_dataset = Dataset.Tabular.from_delimited_files(datastore.path('nlpdata/test_set.csv'))

In [49]:
train_ds = train_dataset.register(workspace=workspace,
                                 name='nlp_train_set',
                                 description='Training data for nlp usecase')

In [50]:
test_ds = test_dataset.register(workspace=workspace,
                                 name='nlp_test_set',
                                 description='Test data for nlp usecase')

# Data ingestion step - Training dataset

In [51]:
dataset = Dataset.get_by_name(workspace, name='nlp_train_set')
print(dataset.name, dataset.version)

nlp_train_set 1


In [52]:
df = dataset.to_pandas_dataframe()

In [53]:
df.head()

Unnamed: 0,S,Feedback,Sentiment
0,173,The iGo chargers and tips are really great.,1
1,846,Used and dirty.,0
2,639,Disappointing accessory from a good manufacturer.,0
3,435,Warning - Stay away.,0
4,88,Product was excellent and works better than th...,1


In [54]:
df.shape

(799, 3)

# Preprocessing Data

In [60]:
from sklearn.feature_extraction.text import CountVectorizer

In [62]:
cv=CountVectorizer(stop_words="english",ngram_range=(1,2))

In [63]:
cv.fit(df["Feedback"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [64]:
corpus=cv.transform(df["Feedback"])

In [66]:
corpus.shape

(799, 4234)

# Creating experiment and run to log metrics and hypermeters

In [109]:
from azureml.core.experiment import Experiment
myexperiment = Experiment(workspace, "rf_sent_analysis")
# initialize a run in Azureml
run = myexperiment.start_logging()

In [110]:
run.log("dataset name", dataset.name)
run.log("dataset Version", dataset.version)

# Model Training

In [67]:
from sklearn.ensemble import RandomForestClassifier

In [68]:
rf=RandomForestClassifier()

In [70]:
rf.fit(corpus,df["Sentiment"])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [71]:
rf.score(corpus,df["Sentiment"])

0.9962453066332916

In [72]:
from sklearn.model_selection import cross_val_score

In [73]:
scores=cross_val_score(rf,corpus,df["Sentiment"],cv=3)

In [74]:
scores.mean()

0.758445739817709

In [75]:
scores.std()

0.002018405024948868

# Hyperparameter Tuning

In [76]:
from sklearn.model_selection import GridSearchCV

In [100]:
param_grid={'n_estimators': [100, 400,700,1000,2000,2500], 'min_samples_split': [2,4,8,16]}

In [101]:
grid=GridSearchCV(rf,param_grid,n_jobs=-1)

In [102]:
grid.fit(corpus,df["Sentiment"])

GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              ra

In [105]:
n_est=grid.get_params(deep=True)['estimator__n_estimators']

In [106]:
min_sam_splt=grid.get_params(deep=True)['estimator__min_samples_split']

In [107]:
rf=RandomForestClassifier(n_estimators=n_est,min_samples_split=min_sam_splt)

In [108]:
rf.fit(corpus,df["Sentiment"])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [112]:
# Logging training parameters to AzureML and MLFlow experiments
run.log("n_estimators", grid.get_params(deep=True)['estimator__n_estimators'])
run.log("min_samples_split", grid.get_params(deep=True)['estimator__min_samples_split'])

In [113]:
run.complete()

# Model Packaging Step

In [114]:
import joblib

In [115]:
joblib.dump(cv,"outputs/count_vectorizer.pkl")

['outputs/count_vectorizer.pkl']

In [116]:
joblib.dump(rf,"outputs/rf_sent_model.pkl")

['outputs/rf_sent_model.pkl']

In [117]:
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies

# to install required packages
env = Environment('tutorial-env')
cd = CondaDependencies.create(pip_packages=['azureml-dataset-runtime[pandas,fuse]', 'azureml-defaults',"numpy",  "joblib", "azureml-core", "azureml-monitoring", "azureml-defaults", "scikit-learn==0.20.3", "inference-schema", "inference-schema[numpy-support]"], conda_packages = ['scikit-learn==0.22.1'])

env.python.conda_dependencies = cd

# Register environment to re-use later
env.register(workspace = workspace)

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210615.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "tutorial-env",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
                "conda-forge"
 

# Model Registering Step

In [119]:
from azureml.core.model import Model

In [120]:
# Register Model on AzureML WS
model = Model.register(model_path = './outputs/count_vectorizer.pkl', # this points to a local file 
                       model_name = "nlp_count_vectorizer", # this is the name the model is registered as
                       tags = {'dataset': dataset.name, 'version': dataset.version, }, 
                       model_framework='pandas==0.23.4',
                       description = "Count Vectorizer",
                       workspace = workspace)

print('Name:', model.name)
print('Version:', model.version)

Registering model nlp_count_vectorizer
Name: nlp_count_vectorizer
Version: 1


In [121]:
# Register Model on AzureML WS
model = Model.register(model_path = './outputs/rf_sent_model.pkl', # this points to a local file 
                       model_name = "nlp_rf_model", # this is the name the model is registered as
                       tags = {'dataset': dataset.name, 'version': dataset.version, }, 
                       model_framework='pandas==0.23.4',
                       description = "Random Forest Model",
                       workspace = workspace)

print('Name:', model.name)
print('Version:', model.version)

Registering model nlp_rf_model
Name: nlp_rf_model
Version: 1
