## Aim: Produce 100% trained Individual AA Resolver Model and Action Model. Compute the Evaluation Scores. 

[CR] + [AA] Resolver Model

[CR] + [AA] Action Model

### Objective: To Create Individual AA Models [Resolver, Action] using different algorithms and compute their Evaluation Scores. 


In [1]:
import pandas as pd
import numpy as np
import pickle
import re
import string

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

import sklearn
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, average_precision_score, recall_score, f1_score
# from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer

from textblob import TextBlob
import psycopg2

import os
import time
import requests
from datetime import datetime
import sys
import logging

from sklearn.model_selection import KFold 
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def preprocess_error_message_aa(text):
    wn = WordNetLemmatizer()

    def black_txt(token):
        stop_words_ = set(stopwords.words('english'))
        my_sw = ['please', 'open', 'task', 'workbench', 'view', 'action']
        return token not in stop_words_ and token not in list(string.punctuation) and len(token) > 2 and token not in my_sw
    
    clean_text = []
    clean_text2 = []
    text = text.replace('(', '').replace(')', '')

    # first part of sentence
    if re.match('^.*(?=(An error occurred at line number))', text):
        text = re.match('^.*(?=(An error occurred at line number))', text)[0]
    # removing filename
    text = re.sub(r"'(.*).xlsx'", '', text)
    text = re.sub(r"'(.*).atmx'", '', text)
    text = re.sub(r"(?<=\\)(.*).csv", '', text)
    text = re.sub(r"(?<=\\)(.*).xlsx", '', text)
    text = re.sub(r"(?<=\\)(.*).xls", '', text)
    text = re.sub(r"\s\w+.xlsx", '', text)
    text = re.sub(r"(?<=\\).*.pdf", '', text)
    text = re.sub(r"(?<=\\).*.mbot", '', text)
    text = re.sub(r"(?<=\s)\w+.mbot", '', text)
    text = re.sub(r'\\', '', text)
    # removing bot name
    text = re.sub(r"( bot)?.*(.atmx)", '', text)
    # remove machine number
    text = re.sub('\w{3}\d{6}', '', text, re.IGNORECASE)
    # remove machine name
    text = re.sub('(?<=machine)\s\w+', '', text, re.IGNORECASE)
    # remove variable name after removing ''
    text = re.sub('(?<=variable)\s\w+', '', text, re.IGNORECASE)
    #remove pushbutton within ''
    text = re.sub('(?<=PushButton)\s\'[\w+\s]+\'', '', text, re.IGNORECASE)
    #remove Parameter name
    text = re.sub('(?<=Parameter name):?\s*\w+', '', text, re.IGNORECASE)
    #remove all quotes
    text = re.sub("'(.*?)'", "", text)
    #remove special characters
    text = re.sub('[^\w\s]', '', text) 
    clean_text = [
        wn.lemmatize(word, pos="v") for word in word_tokenize(text.lower())
        if black_txt(word)
    ]
    clean_text_tokens = [word for word in clean_text if black_txt(word)]
    clean_text_joined = " ".join(clean_text_tokens)
    return clean_text_joined


In [3]:
### Training Data Preparation

df = pd.DataFrame(pd.read_excel('AA Error Logs.xlsx'))
    
df = df.rename(columns={"Error Message": "error_message", "Error Classification": "error_classification", "Resolver Group": "resolver_group", "Recommended Action": "action_classification"})
df = df[['error_message', 'resolver_group', 'action_classification']]
print(df.shape)

# Missing Values - drop rows with missing values
df.dropna(inplace=True)
print(df.shape)
            
# create 'preprocessed_error_message' by applying 'preprocess_error_message_aa(text)' on 'error_message'
df['preprocessed_error_message'] = df['error_message'].apply(preprocess_error_message_aa)

df['rpa_name'] = 'Automation Anywhere'
# df['rpa_id'] = 1
df['source_type'] = 'CR'
# df['source_id'] = 1

display(df)

(684, 3)
(554, 3)


Unnamed: 0,error_message,resolver_group,action_classification,preprocessed_error_message,rpa_name,source_type
0,Please provide valid task file. Task File with...,L1,Validate File Path Manually.If path/file dosen...,provide valid file file name find,Automation Anywhere,CR
1,AAE was unable to unlock or logon onto the mac...,L1,Enable/Validate Auto Login Settings in Tools->...,aae unable unlock logon onto machine run check...,Automation Anywhere,CR
2,The credential variable DB_Connection String c...,L1,Check if the Credential is assigned to your lo...,credential variable string could access due on...,Automation Anywhere,CR
3,The Variable 'vDBLog_Type' does not exist. An...,L2,Possible code Issue.Validate if the variable e...,variable exist,Automation Anywhere,CR
4,[Microsoft][ODBC Driver Manager] Data source n...,L1,Define the data source using the ODBC Administ...,microsoftodbc driver manager data source name ...,Automation Anywhere,CR
...,...,...,...,...,...,...
679,Outgoing mail server is not specified. Unable ...,L1,Validate the configuration parameters (hostnam...,outgo mail server specify unable send mailplea...,Automation Anywhere,CR
680,Unable to rename file as the source path '\\ro...,L1,Validate File Path Manually.If path/file dosen...,unable rename file source path exist,Automation Anywhere,CR
681,Cannot find window or application titled 'Prep...,L1,Re-Run Bot,find window application title record,Automation Anywhere,CR
682,Unable to log the text in the file. Path not f...,L1,Validate File Path Manually.If path/file dosen...,unable log text file path find,Automation Anywhere,CR


In [4]:
# Check for duplicates and unique counts 
print(df.nunique())

error_message                 554
resolver_group                  3
action_classification          20
preprocessed_error_message     84
rpa_name                        1
source_type                     1
dtype: int64


No duplicate values exist in "error_message" column; 
Duplicate values exist in "preprocessed_error_message" column => as there are only 84 distinct values. 

In [5]:
# Check for duplicates and unique counts - in-depth view 
print("\npreprocessed_error_message value_counts: ")
print(df['preprocessed_error_message'].value_counts())
print("------------------------------------------------------------------------------------------------------------------")
print("\nresolver_group value_counts: ")
print(df['resolver_group'].value_counts())
print("------------------------------------------------------------------------------------------------------------------")
print("\naction_classification value_counts: ")
print(df['action_classification'].value_counts())
print("------------------------------------------------------------------------------------------------------------------")
print("\nrpa_name value_counts: ")
print(df['rpa_name'].value_counts())
print("------------------------------------------------------------------------------------------------------------------")
print("\nsource_type value_counts: ")
print(df['source_type'].value_counts())


preprocessed_error_message value_counts: 
unable copy file source path exist                                                                                                                       113
process access file use another process                                                                                                                   65
find window application title record                                                                                                                      30
outgo mail server specify unable send mailplease specify server name toolsoptionsemail settings try                                                       23
unable rename file source path exist                                                                                                                      22
                                                                                                                                                        ... 
microsoftodbc d

In [6]:
#drop duplicates of preprocessed_error_message in each rpa_id 

df = df.drop_duplicates(subset = ["preprocessed_error_message", "rpa_name", "source_type"], keep='last')
print(len(df))

84


The cleaned dataset is now ready => This dataset can be used for 100% trained model. 

But, need the evaluation score for the model => so splitting the dataset from this point. 

In [62]:
print(df.shape)
df.head()

(84, 6)


Unnamed: 0,error_message,resolver_group,action_classification,preprocessed_error_message,rpa_name,source_type
1,AAE was unable to unlock or logon onto the mac...,L1,Enable/Validate Auto Login Settings in Tools->...,aae unable unlock logon onto machine run check...,Automation Anywhere,CR
2,The credential variable DB_Connection String c...,L1,Check if the Credential is assigned to your lo...,credential variable string could access due on...,Automation Anywhere,CR
4,[Microsoft][ODBC Driver Manager] Data source n...,L1,Define the data source using the ODBC Administ...,microsoftodbc driver manager data source name ...,Automation Anywhere,CR
5,The Variable 'vDBLog_Type' does not exist. An...,L2,Possible code Issue.Validate if the variable e...,variable exist,Automation Anywhere,CR
6,AAE was unable to unlock or logon onto the mac...,L1,Enable/Validate Auto Login Settings in Tools->...,aae unable unlock logon onto machine run check...,Automation Anywhere,CR


In [63]:
def dtm2df(wm, error_words):
    """creates and returns a dataframe from a word matrix"""
    doc_names = ['Doc{:d}'.format(idx) for idx, _ in enumerate(wm)]
    df = pd.DataFrame(data=wm.toarray(), index=doc_names, columns=error_words)
    return(df)

# start Resolver Model --- --- --- 

## 100% Train Model

In [64]:
### Train Resolver Model - 100 % Training --------------------------------------------------------------------------------------

"""---------- Resolver Model Training ----------"""

# Define Features
X = df[['preprocessed_error_message', 'rpa_name', 'source_type']].values
y = df['resolver_group']

### Embedding and Encoding X
err_msg_embedding = TfidfVectorizer(max_features=None,
                            strip_accents='unicode',
                            analyzer='word',
                            token_pattern=r'\w{1,}',
                            ngram_range=(1, 10),
                            use_idf=1,
                            smooth_idf=1,
                            sublinear_tf=1,
                            stop_words=None)
rpa_name_encoder = LabelEncoder()
source_type_encoder = LabelEncoder()

X_values = X
X1 = [X_values[t][0] for t in range(len(X_values))]
X1_tf_idf = err_msg_embedding.fit_transform(X1)
terms = err_msg_embedding.get_feature_names()
final_X = dtm2df(X1_tf_idf ,terms)
# add - rpa_name
X2 = [X_values[i][1] for i in range(len(X_values))]
X2_encoded = rpa_name_encoder.fit_transform(X2)
final_X['rpa_name'] = X2_encoded
# add - source_type
X3 = [X_values[i][2] for i in range(len(X_values))]
X3_encoded = source_type_encoder.fit_transform(X3)
final_X['source_type'] = X3_encoded
final_X = final_X.to_numpy()
X = final_X # embedded X


### Encoding y
resolver_encoder = LabelEncoder()
y = resolver_encoder.fit_transform(y)


# define model architecture
model = SGDClassifier(max_iter=200, loss='log')
model.fit(X,y)

# saving model, err_msg_embedding, and encoder in the 'saved_models' directory
resolver_model_file = 'saved_models/M1 - resolver/resolver_model_r.sav'
pickle.dump(model, open(resolver_model_file, 'wb'))
err_msg_embedding_file = 'saved_models/M1 - resolver/err_msg_embedding_vocab_r.pkl'
pickle.dump(err_msg_embedding, open(err_msg_embedding_file, 'wb'))
resolver_encoder_file = 'saved_models/M1 - resolver/resolver_encoder_r.sav'
pickle.dump(resolver_encoder, open(resolver_encoder_file,'wb'))
rpa_name_encoder_file = 'saved_models/M1 - resolver/rpa_name_encoder_r.sav'
pickle.dump(rpa_name_encoder, open(rpa_name_encoder_file,'wb'))
source_type_encoder_file = 'saved_models/M1 - resolver/source_type_encoder_r.sav'
pickle.dump(source_type_encoder, open(source_type_encoder_file,'wb'))

## Train-Eval Model

In [65]:
# Dataset preparation for Train-Eval
te_df = df.copy()

In [66]:
# Unique Values in the dataset  
print(te_df.nunique())

# Check for duplicates and unique counts - in-depth view 
print("\npreprocessed_error_message value_counts: ")
print(te_df['preprocessed_error_message'].value_counts())
print("------------------------------------------------------------------------------------------------------------------")
print("\nresolver_group value_counts: ")
print(te_df['resolver_group'].value_counts())
print("------------------------------------------------------------------------------------------------------------------")
print("\naction_classification value_counts: ")
print(te_df['action_classification'].value_counts())
print("------------------------------------------------------------------------------------------------------------------")
print("\nrpa_name value_counts: ")
print(te_df['rpa_name'].value_counts())
print("------------------------------------------------------------------------------------------------------------------")
print("\nsource_type value_counts: ")
print(te_df['source_type'].value_counts())

error_message                 84
resolver_group                 3
action_classification         20
preprocessed_error_message    84
rpa_name                       1
source_type                    1
dtype: int64

preprocessed_error_message value_counts: 
aae unable unlock logon onto machine run check login settings toolsoptions aae client vrpa15p refer aae clientâs help documentation auto login detail                                1
unable find command use session name default1                                                                                                                                        1
unable delete folder source folder exist                                                                                                                                             1
access path deny                                                                                                                                                                     1
file contain s

In [67]:
# remove orphan categories (in resolver_group) i.e., the categories for which only 1 record exist. 

te_df = te_df.groupby('resolver_group').filter(lambda x : len(x)>1)
te_df

Unnamed: 0,error_message,resolver_group,action_classification,preprocessed_error_message,rpa_name,source_type
1,AAE was unable to unlock or logon onto the mac...,L1,Enable/Validate Auto Login Settings in Tools->...,aae unable unlock logon onto machine run check...,Automation Anywhere,CR
2,The credential variable DB_Connection String c...,L1,Check if the Credential is assigned to your lo...,credential variable string could access due on...,Automation Anywhere,CR
4,[Microsoft][ODBC Driver Manager] Data source n...,L1,Define the data source using the ODBC Administ...,microsoftodbc driver manager data source name ...,Automation Anywhere,CR
5,The Variable 'vDBLog_Type' does not exist. An...,L2,Possible code Issue.Validate if the variable e...,variable exist,Automation Anywhere,CR
6,AAE was unable to unlock or logon onto the mac...,L1,Enable/Validate Auto Login Settings in Tools->...,aae unable unlock logon onto machine run check...,Automation Anywhere,CR
...,...,...,...,...,...,...
679,Outgoing mail server is not specified. Unable ...,L1,Validate the configuration parameters (hostnam...,outgo mail server specify unable send mailplea...,Automation Anywhere,CR
680,Unable to rename file as the source path '\\ro...,L1,Validate File Path Manually.If path/file dosen...,unable rename file source path exist,Automation Anywhere,CR
681,Cannot find window or application titled 'Prep...,L1,Re-Run Bot,find window application title record,Automation Anywhere,CR
682,Unable to log the text in the file. Path not f...,L1,Validate File Path Manually.If path/file dosen...,unable log text file path find,Automation Anywhere,CR


In [81]:
### --- Resolver Model - Training and Evaluation --- 

# define features
# X = te_df['preprocessed_error_message']
# y = te_df['resolver_group']

df_train, df_test = train_test_split(te_df, test_size=0.20, stratify=te_df['resolver_group'], random_state=1)
X_train = df_train[['preprocessed_error_message', 'rpa_name', 'source_type']].values
X_test = df_test[['preprocessed_error_message', 'rpa_name', 'source_type']].values
y_train = df_train['resolver_group']
y_test = df_test['resolver_group']

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


# embedding & encoding
err_msg_embedding = TfidfVectorizer(max_features=None,
                            strip_accents='unicode',
                            analyzer='word',
                            token_pattern=r'\w{1,}',
                            ngram_range=(1, 10),
                            use_idf=1,
                            smooth_idf=1,
                            sublinear_tf=1,
                            stop_words=None)
rpa_name_encoder = LabelEncoder()
source_type_encoder = LabelEncoder()

X_values = X_train
X1 = [X_values[t][0] for t in range(len(X_values))]
X1_tf_idf = err_msg_embedding.fit_transform(X1)
terms = err_msg_embedding.get_feature_names()
final_X = dtm2df(X1_tf_idf ,terms)
# add - rpa_name
X2 = [X_values[i][1] for i in range(len(X_values))]
X2_encoded = rpa_name_encoder.fit_transform(X2)
final_X['rpa_name'] = X2_encoded
# add - source_type
X3 = [X_values[i][2] for i in range(len(X_values))]
X3_encoded = source_type_encoder.fit_transform(X3)
final_X['source_type'] = X3_encoded
final_X = final_X.to_numpy()
X_train = final_X # embedded X_train

X_values = X_test
X1 = [X_values[t][0] for t in range(len(X_values))]
X1_tf_idf = err_msg_embedding.transform(X1)
terms = err_msg_embedding.get_feature_names()
final_X = dtm2df(X1_tf_idf ,terms)
# add - rpa_name
X2 = [X_values[i][1] for i in range(len(X_values))]
X2_encoded = rpa_name_encoder.transform(X2)
final_X['rpa_name'] = X2_encoded
# add - source_type
X3 = [X_values[i][2] for i in range(len(X_values))]
X3_encoded = source_type_encoder.transform(X3)
final_X['source_type'] = X3_encoded
final_X = final_X.to_numpy()
X_test = final_X # embedded X_train


### Encoding y
resolver_encoder = LabelEncoder()
y_train = resolver_encoder.fit_transform(y_train)
y_test = resolver_encoder.transform(y_test)


(57, 3)
(15, 3)
(57,)
(15,)


In [82]:
### define, train, evaluate model

# model = LogisticRegression() # LogisticRegression(solver= 'liblinear')
model = SGDClassifier(max_iter=200, loss='log')

model.fit(X_train,y_train)
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)

# TPR, FPR, PR  - yet to 

# classification report - precision, recall, f1-score, accuracy
report = classification_report(y_test, y_pred) #, target_names=['Infra', 'L1', 'L2'])
print('Classification report : \n', report)

# inividual values of precision, recall, fscore, accuracy - from the above classification report
clf_evals = precision_recall_fscore_support(y_test, y_pred, average='macro')
print('precision score: ', clf_evals[0])
print('recall score: ', clf_evals[1])
print('fscore score: ', clf_evals[2])
# accuracy
acc = accuracy_score(y_pred , y_test)
print('accuracy score: ', acc)
# Log Loss - lower log loss value means better predictions; range(0,1)
log_loss_score = log_loss(y_test, y_pred_prob)
print('logloss score: ', log_loss_score)
# roc-auc score
r_score = roc_auc_score(y_test, y_pred_prob, multi_class='ovo', average='weighted')
print('roc-auc score: ', r_score)

Classification report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       0.93      1.00      0.96        13
           2       0.00      0.00      0.00         1

    accuracy                           0.93        15
   macro avg       0.64      0.67      0.65        15
weighted avg       0.87      0.93      0.90        15

precision score:  0.6428571428571429
recall score:  0.6666666666666666
fscore score:  0.654320987654321
accuracy score:  0.9333333333333333
logloss score:  0.25794012550905693
roc-auc score:  0.9282051282051282


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [70]:
"""
accepted input values: 
preprocessed_error_message = 'string of any length'
rpa_name = 'Automation Anywhere'
source_type = 'CR'
"""

message = "Error Occured at line 102."
rpa_name = "Automation Anywhere"
source_type = "CR"

if rpa_name == 'Automation Anywhere': 
    pp_err_msg = preprocess_error_message_aa(message)
# elif rpa_name == 'UiPath': 
#     pp_err_msg = preprocess_error_message_uipath(message)
# elif rpa_name == 'Blue Prism': 
#     pp_err_msg = preprocess_error_message_bp(message)

# loading pipeline for pulling features of text
loaded_err_msg_vocab = pickle.load(open('saved_models/M1 - resolver/err_msg_embedding_vocab_r.pkl', 'rb'))
# loading the saved model
loaded_resolver_model = pickle.load(open('saved_models/M1 - resolver/resolver_model_r.sav', 'rb'))
# loading the saved resolver encoder
loaded_resolver_encoder = pickle.load(open('saved_models/M1 - resolver/resolver_encoder_r.sav', 'rb'))
# loading the saved rpa_name encoder
loaded_rpa_name_encoder = pickle.load(open('saved_models/M1 - resolver/rpa_name_encoder_r.sav', 'rb'))
# loading the saved source_type encoder
loaded_source_type_encoder = pickle.load(open('saved_models/M1 - resolver/source_type_encoder_r.sav', 'rb'))

# creating record Series
terms = loaded_err_msg_vocab.get_feature_names()
vectorized = loaded_err_msg_vocab.transform([pp_err_msg])
record = dtm2df(vectorized, terms)
record['rpa_name'] = loaded_rpa_name_encoder.transform([rpa_name])
record['source_type'] = loaded_source_type_encoder.transform([source_type])
prediction = loaded_resolver_model.predict(record)
confidence_long = loaded_resolver_model.predict_proba(record)[0][prediction][0]*100
confidence = round(confidence_long,2)
# saving predictions in dataframe
prediction_decoded = loaded_resolver_encoder.inverse_transform(prediction)[0]
print('Preprocessed Error Message: ', pp_err_msg, '\n')
print(prediction_decoded, confidence, '\n')

Preprocessed Error Message:  error occur line 102 

L1 74.46 



# end Resolver Model --- --- --- 

# start Action Model --- --- --- 

## 100% Train Model

In [71]:
### Train Action Model - 100 % Training --------------------------------------------------------------------------------------

"""---------- Action Model Training ----------"""

# Define Features
X = df[['preprocessed_error_message', 'rpa_name', 'source_type', 'resolver_group']].values
y = df['action_classification'].values

### Embedding and Encoding X
err_msg_embedding = TfidfVectorizer(max_features=None,
                            strip_accents='unicode',
                            analyzer='word',
                            token_pattern=r'\w{1,}',
                            ngram_range=(1, 10),
                            use_idf=1,
                            smooth_idf=1,
                            sublinear_tf=1,
                            stop_words=None)
rpa_name_encoder = LabelEncoder()
source_type_encoder = LabelEncoder()
resolver_encoder = LabelEncoder()

X_values = X
X1 = [X_values[t][0] for t in range(len(X_values))]
X1_tf_idf = err_msg_embedding.fit_transform(X1)
terms = err_msg_embedding.get_feature_names()
final_X = dtm2df(X1_tf_idf ,terms)
# add - rpa_name
X2 = [X_values[i][1] for i in range(len(X_values))]
X2_encoded = rpa_name_encoder.fit_transform(X2)
final_X['rpa_name'] = X2_encoded
# add - source_type
X3 = [X_values[i][2] for i in range(len(X_values))]
X3_encoded = source_type_encoder.fit_transform(X3)
final_X['source_type'] = X3_encoded
# add - resolver_group
X4 = [X_values[i][3] for i in range(len(X_values))]
X4_encoded = resolver_encoder.fit_transform(X4)
final_X['resolver_group'] = X4_encoded
final_X = final_X.to_numpy()
X = final_X # embedded X


### Encoding y
action_encoder = LabelEncoder()
y = action_encoder.fit_transform(y)


# define model architecture
model = SGDClassifier(max_iter=200, loss='log')
model.fit(X, y)

# saving model, err_msg_embedding, and encoder in the 'saved_models' directory
action_model_file = 'saved_models/M1 - action/action_model_a.sav'
pickle.dump(model, open(action_model_file, 'wb'))
err_msg_embedding_file = 'saved_models/M1 - action/err_msg_embedding_vocab_a.pkl'
pickle.dump(err_msg_embedding, open(err_msg_embedding_file, 'wb'))
rpa_name_encoder_file = 'saved_models/M1 - action/rpa_name_encoder_a.sav'
pickle.dump(rpa_name_encoder, open(rpa_name_encoder_file,'wb'))
source_type_encoder_file = 'saved_models/M1 - action/source_type_encoder_a.sav'
pickle.dump(source_type_encoder, open(source_type_encoder_file,'wb'))
resolver_encoder_file = 'saved_models/M1 - action/resolver_encoder_a.sav'
pickle.dump(resolver_encoder, open(resolver_encoder_file,'wb'))
action_encoder_file = 'saved_models/M1 - action/action_encoder_a.sav'
pickle.dump(action_encoder, open(action_encoder_file,'wb'))

## Train-Eval Model

In [72]:
# Dataset preparation for Train-Eval
te_df = df.copy()

In [73]:
# Unique Values in the dataset  
print(te_df.nunique())

# Check for duplicates and unique counts - in-depth view 
print("\npreprocessed_error_message value_counts: ")
print(te_df['preprocessed_error_message'].value_counts())
print("------------------------------------------------------------------------------------------------------------------")
print("\nresolver_group value_counts: ")
print(te_df['resolver_group'].value_counts())
print("------------------------------------------------------------------------------------------------------------------")
print("\naction_classification value_counts: ")
print(te_df['action_classification'].value_counts())
print("------------------------------------------------------------------------------------------------------------------")
print("\nrpa_name value_counts: ")
print(te_df['rpa_name'].value_counts())
print("------------------------------------------------------------------------------------------------------------------")
print("\nsource_type value_counts: ")
print(te_df['source_type'].value_counts())

error_message                 84
resolver_group                 3
action_classification         20
preprocessed_error_message    84
rpa_name                       1
source_type                    1
dtype: int64

preprocessed_error_message value_counts: 
aae unable unlock logon onto machine run check login settings toolsoptions aae client vrpa15p refer aae clientâs help documentation auto login detail                                1
unable find command use session name default1                                                                                                                                        1
unable delete folder source folder exist                                                                                                                                             1
access path deny                                                                                                                                                                     1
file contain s

In [74]:
# remove orphan categories (in action_classification) i.e., the categories for which only 1 record exist. 

# clearly resolver group does not contain any orphan categories but action_classification does 

te_df = te_df.groupby('action_classification').filter(lambda x : len(x)>1)
te_df

Unnamed: 0,error_message,resolver_group,action_classification,preprocessed_error_message,rpa_name,source_type
1,AAE was unable to unlock or logon onto the mac...,L1,Enable/Validate Auto Login Settings in Tools->...,aae unable unlock logon onto machine run check...,Automation Anywhere,CR
2,The credential variable DB_Connection String c...,L1,Check if the Credential is assigned to your lo...,credential variable string could access due on...,Automation Anywhere,CR
6,AAE was unable to unlock or logon onto the mac...,L1,Enable/Validate Auto Login Settings in Tools->...,aae unable unlock logon onto machine run check...,Automation Anywhere,CR
7,Machine VRPA18P was logged off. AAE was unable...,Infra,Restart the Machine.Enable/Validate Auto Login...,machine vrpa18p log aae unable logon machine run,Automation Anywhere,CR
8,AAE was unable to unlock or logon onto the mac...,L1,Enable/Validate Auto Login Settings in Tools->...,aae unable unlock logon onto machine run check...,Automation Anywhere,CR
...,...,...,...,...,...,...
678,Unable to copy file as the source path '\\inte...,L1,Validate File Path Manually.If path/file dosen...,unable copy file source path exist,Automation Anywhere,CR
679,Outgoing mail server is not specified. Unable ...,L1,Validate the configuration parameters (hostnam...,outgo mail server specify unable send mailplea...,Automation Anywhere,CR
680,Unable to rename file as the source path '\\ro...,L1,Validate File Path Manually.If path/file dosen...,unable rename file source path exist,Automation Anywhere,CR
681,Cannot find window or application titled 'Prep...,L1,Re-Run Bot,find window application title record,Automation Anywhere,CR


In [83]:
### --- Action Model - Training and Evaluation --- 

# define features
# X_values = te_df[['preprocessed_error_message', 'resolver_group']].values
# y_values = te_df['action_classification'].values

df_train, df_test = train_test_split(te_df, test_size=0.20, stratify=te_df['action_classification'], random_state=1)
X_train = df_train[['preprocessed_error_message', 'rpa_name', 'source_type', 'resolver_group']].values
X_test = df_test[['preprocessed_error_message', 'rpa_name', 'source_type', 'resolver_group']].values
y_train = df_train['action_classification'].values
y_test = df_test['action_classification'].values

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

### Embedding and Encoding X
err_msg_embedding = TfidfVectorizer(max_features=None,
                            strip_accents='unicode',
                            analyzer='word',
                            token_pattern=r'\w{1,}',
                            ngram_range=(1, 10),
                            use_idf=1,
                            smooth_idf=1,
                            sublinear_tf=1,
                            stop_words=None)
rpa_name_encoder = LabelEncoder()
source_type_encoder = LabelEncoder()
resolver_encoder = LabelEncoder()

X_values = X_train
X1 = [X_values[t][0] for t in range(len(X_values))]
X1_tf_idf = err_msg_embedding.fit_transform(X1)
terms = err_msg_embedding.get_feature_names()
final_X = dtm2df(X1_tf_idf ,terms)
# add - rpa_name
X2 = [X_values[i][1] for i in range(len(X_values))]
X2_encoded = rpa_name_encoder.fit_transform(X2)
final_X['rpa_name'] = X2_encoded
# add - source_type
X3 = [X_values[i][2] for i in range(len(X_values))]
X3_encoded = source_type_encoder.fit_transform(X3)
final_X['source_type'] = X3_encoded
# add - resolver_group
X4 = [X_values[i][3] for i in range(len(X_values))]
X4_encoded = resolver_encoder.fit_transform(X4)
final_X['resolver_group'] = X4_encoded
final_X = final_X.to_numpy()
X_train = final_X # embedded X

X_values = X_test
X1 = [X_values[t][0] for t in range(len(X_values))]
X1_tf_idf = err_msg_embedding.transform(X1)
terms = err_msg_embedding.get_feature_names()
final_X = dtm2df(X1_tf_idf ,terms)
# add - rpa_name
X2 = [X_values[i][1] for i in range(len(X_values))]
X2_encoded = rpa_name_encoder.transform(X2)
final_X['rpa_name'] = X2_encoded
# add - source_type
X3 = [X_values[i][2] for i in range(len(X_values))]
X3_encoded = source_type_encoder.transform(X3)
final_X['source_type'] = X3_encoded
# add - resolver_group
X4 = [X_values[i][3] for i in range(len(X_values))]
X4_encoded = resolver_encoder.transform(X4)
final_X['resolver_group'] = X4_encoded
final_X = final_X.to_numpy()
X_test = final_X # embedded X

### Encoding y
action_encoder = LabelEncoder()
y_train = action_encoder.fit_transform(y_train)
y_test = action_encoder.transform(y_test)


(57, 4)
(15, 4)
(57,)
(15,)


In [84]:
print(X_train)
print(X_test)
print(y_train)
print(y_test)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 2.]
 [0. 0. 0. ... 0. 0. 1.]]
[4 5 7 3 3 5 5 5 5 5 4 3 1 3 1 5 1 3 5 7 3 3 0 6 3 5 5 5 5 5 5 6 5 3 5 2 3
 4 5 5 5 1 3 3 5 5 4 2 3 5 5 4 1 0 3 3 5]
[4 5 6 3 1 5 5 3 5 3 1 5 3 5 5]


In [85]:
### define, train, evaluate model

# model = LogisticRegression() # LogisticRegression(solver= 'liblinear')
model = SGDClassifier(max_iter=200, loss='log')

model.fit(X_train,y_train)
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)

# inividual values of precision, recall, fscore, accuracy - from the above classification report
clf_evals = precision_recall_fscore_support(y_test, y_pred, average='macro')
print('precision score: ', clf_evals[0])
print('recall score: ', clf_evals[1])
print('fscore score: ', clf_evals[2])
# accuracy
acc = accuracy_score(y_pred , y_test)
print('accuracy score: ', acc)

precision score:  0.775
recall score:  0.8
fscore score:  0.7866666666666667
accuracy score:  0.9333333333333333


  _warn_prf(average, modifier, msg_start, len(result))


In [78]:
"""
accepted input values: 
preprocessed_error_message = 'string of any length'
rpa_name = 'Automation Anywhere'
source_type = 'CR'
resolver_group = ['L1', 'L2', 'Infra', 'Business']
"""

message = ""
rpa_name = "Automation Anywhere"
source_type = "CR"
resolver_group = "L1"

pp_err_msg = preprocess_error_message_aa(message)
    
# loading pipeline for pulling features of text
loaded_err_msg_vocab = pickle.load(open('saved_models/M1 - action/err_msg_embedding_vocab_a.pkl', 'rb'))
# loading the saved model
loaded_action_model = pickle.load(open('saved_models/M1 - action/action_model_a.sav', 'rb'))
# loading the saved resolver encoder
loaded_resolver_encoder = pickle.load(open('saved_models/M1 - action/resolver_encoder_a.sav', 'rb'))
# loading the saved rpa_name encoder
loaded_rpa_name_encoder = pickle.load(open('saved_models/M1 - action/rpa_name_encoder_a.sav', 'rb'))
# loading the saved source_type encoder
loaded_source_type_encoder = pickle.load(open('saved_models/M1 - action/source_type_encoder_a.sav', 'rb'))
# loading the saved action encoder
loaded_action_encoder = pickle.load(open('saved_models/M1 - action/action_encoder_a.sav', 'rb'))

# creating record Series
terms = loaded_err_msg_vocab.get_feature_names()
vectorized = loaded_err_msg_vocab.transform([pp_err_msg])
record = dtm2df(vectorized, terms)
record['rpa_name'] = loaded_rpa_name_encoder.transform([rpa_name])
record['source_type'] = loaded_source_type_encoder.transform([source_type])
record['resolver_group'] = loaded_resolver_encoder.transform([resolver_group])
prediction = loaded_action_model.predict(record)
confidence_long = loaded_action_model.predict_proba(record)[0][prediction][0]*100
confidence = round(confidence_long,2)
# saving predictions in dataframe
prediction_decoded = loaded_action_encoder.inverse_transform(prediction)[0]
print('Preprocessed Error Message: ', pp_err_msg, '\n')
print(prediction_decoded, confidence, '\n')

Preprocessed Error Message:   

Validate File Path Manually.If path/file dosen't exists, place the expected file at the location. 89.5 



# end Action Model --- --- --- 

# Important Residue 

In [None]:
### define, train, evaluate model

# model = LogisticRegression() # LogisticRegression(solver= 'liblinear')
model = SGDClassifier(max_iter=200, loss='log')

model.fit(X_train,y_train)
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)

# TPR, FPR, PR  - yet to 

# classification report - precision, recall, f1-score, accuracy
report = classification_report(y_test, y_pred) #, target_names=['Infra', 'L1', 'L2'])
print('Classification report : \n', report)

# inividual values of precision, recall, fscore, accuracy - from the above classification report
clf_evals = precision_recall_fscore_support(y_test, y_pred, average='macro')
print('precision score: ', clf_evals[0])
print('recall score: ', clf_evals[1])
print('fscore score: ', clf_evals[2])
# accuracy
acc = accuracy_score(y_pred , y_test)
print('accuracy score: ', acc)
# Log Loss - lower log loss value means better predictions; range(0,1)
log_loss_score = log_loss(y_test, y_pred_prob)
print('logloss score: ', log_loss_score)
# roc-auc score
r_score = roc_auc_score(y_test, y_pred_prob, multi_class='ovo', average='weighted')
print('roc-auc score: ', r_score)
# roc curve for classes
"""fpr = {}
tpr = {}
thresh ={}
n_class = 3
for i in range(n_class):    
    fpr[i], tpr[i], thresh[i] = roc_curve(y_test, y_pred_prob[:,i], pos_label=i)
# plotting    
plt.plot(fpr[0], tpr[0], linestyle='--',color='orange', label='Class 0 vs Rest')
plt.plot(fpr[1], tpr[1], linestyle='--',color='green', label='Class 1 vs Rest')
plt.plot(fpr[2], tpr[2], linestyle='--',color='blue', label='Class 2 vs Rest')
#plt.plot(fpr[3], tpr[3], linestyle='--',color='yellow', label='Class 3 vs Rest')
plt.title('Multiclass ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.savefig('Multiclass ROC',dpi=300);   """ 





In [None]:
"""
How to measure the model’s bias-variance?
After k-fold cross validation, we’ll get k different model estimation errors (e1, e2 …..ek). In an ideal scenario, these error values should sum up to zero. To return the model’s bias, we take the average of all the errors. Lower the average value, better the model.
Similarly for calculating the model variance, we take standard deviation of all the errors. A low value of standard deviation suggests our model does not vary a lot with different subsets of training data.

for train_index, val_index in skf.split(X,y): 
    # print("Train:", train_index, "Validation:", val_index) 
    X_train, X_test = X[train_index], X[val_index] 
    y_train, y_test = y[train_index], y[val_index]
# for train_index , test_index in kf.split(X):
#     X_train , X_test = X[train_index,:],X[test_index,:]
#     y_train , y_test = y[train_index] , y[test_index]

"""
# K-fold Cross Validation 

k = 5
kf = KFold(n_splits=k, random_state=1, shuffle=True)
model = LogisticRegression() # LogisticRegression(solver= 'liblinear')
# model = SGDClassifier(max_iter=200, loss='log')
 
acc_scores_list = []
precision_scores_list = []
recall_scores_list = []
fscore_scores_list = []
log_loss_scores_list = []
roc_auc_scores_list = []

for train_index , test_index in kf.split(X):
    X_train , X_test = X[train_index,:],X[test_index,:]
    y_train , y_test = y[train_index] , y[test_index]
     
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)
     
    # TPR, FPR, PR  - yet to 
    
    # classification report - precision, recall, f1-score, accuracy
    report = classification_report(y_test, y_pred, target_names=['Infra', 'L1', 'L2'])
    print('Classification report : \n', report)
    
    # inividual values of precision, recall, fscore, accuracy - from the above classification report
    clf_evals = precision_recall_fscore_support(y_test, y_pred, average='macro')
    precision_scores_list.append(clf_evals[0])
    recall_scores_list.append(clf_evals[1])
    fscore_scores_list.append(clf_evals[2])
    # accuracy
    acc = accuracy_score(y_pred , y_test)
    acc_scores_list.append(acc)
    
    # Log Loss - lower log loss value means better predictions; range(0,1)
    log_loss_score = log_loss(y_test, y_pred_prob)
    log_loss_scores_list.append(log_loss_score)
    
    # roc-auc score
    r_score = roc_auc_score(y_test, y_pred_prob, multi_class='ovo', average='weighted')
    roc_auc_scores_list.append(r_score)
    # roc curve for classes
    fpr = {}
    tpr = {}
    thresh ={}
    n_class = 3
    for i in range(n_class):    
        fpr[i], tpr[i], thresh[i] = roc_curve(y_test, y_pred_prob[:,i], pos_label=i)
    # plotting    
    plt.plot(fpr[0], tpr[0], linestyle='--',color='orange', label='Class 0 vs Rest')
    plt.plot(fpr[1], tpr[1], linestyle='--',color='green', label='Class 1 vs Rest')
    plt.plot(fpr[2], tpr[2], linestyle='--',color='blue', label='Class 2 vs Rest')
    #plt.plot(fpr[3], tpr[3], linestyle='--',color='yellow', label='Class 3 vs Rest')
    plt.title('Multiclass ROC curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive rate')
    plt.legend(loc='best')
    plt.savefig('Multiclass ROC',dpi=300);    
    
     
avg_acc_score = sum(acc_scores_list)/k
avg_precision_score = sum(precision_scores_list)/k
avg_recall_score = sum(recall_scores_list)/k
avg_fscore_score = sum(fscore_scores_list)/k
avg_log_loss_score = sum(log_loss_scores_list)/k
avg_roc_auc_score = sum(roc_auc_scores_list)/k
 
print('Accuracy of each fold - {}'.format(acc_scores_list))
print('Avg Accuracy : {}'.format(avg_acc_score))
print('Precision of each fold - {}'.format(acc_scores_list))
print('Avg Precision : {}'.format(avg_precision_score))
print('Recall of each fold - {}'.format(acc_scores_list))
print('Avg Recall : {}'.format(avg_recall_score))
print('FScore of each fold - {}'.format(acc_scores_list))
print('Avg FScore : {}'.format(avg_fscore_score))
print('LogLoss of each fold - {}'.format(acc_scores_list))
print('Avg LogLoss : {}'.format(avg_log_loss_score))
print('ROC-AUC of each fold - {}'.format(acc_scores_list))
print('Avg ROC-AUC : {}'.format(avg_roc_auc_score))

print(model, ' f1-scores: ', fscore_scores_list)
print(("Mean f1-score: {0:.3f} (+/-{1:.3f})").format(np.mean(fscore_scores_list),np.std(fscore_scores_list)),end='\n')


# print('The accuracy of the Decision Tree is','{:.3f}'.format(accuracy_score(prediction,y_test)))



In [9]:
OLD CODE
### Train Resolver Model - 100 % Training --------------------------------------------------------------------------------------

"""---------- Resolver Model Training ----------"""

# Define Features
X = df['preprocessed_error_message']
y = df['resolver_group']

# embedding & encoding
err_msg_embedding = TfidfVectorizer(max_features=None,
                            strip_accents='unicode',
                            analyzer='word',
                            token_pattern=r'\w{1,}',
                            ngram_range=(1, 10),
                            use_idf=1,
                            smooth_idf=1,
                            sublinear_tf=1,
                            stop_words=None)
X = err_msg_embedding.fit_transform(X)

resolver_encoder = LabelEncoder()
y = resolver_encoder.fit_transform(y)

# define model architecture
model = SGDClassifier(max_iter=200, loss='log')
model.fit(X,y)

# saving model, err_msg_embedding, and encoder in the 'saved_models' directory
resolver_model_file = 'saved_models/M1 - resolver/resolver_model_r.sav'
pickle.dump(model, open(resolver_model_file, 'wb'))
err_msg_embedding_file = 'saved_models/M1 - resolver/err_msg_embedding_vocab_r.pkl'
pickle.dump(err_msg_embedding, open(err_msg_embedding_file, 'wb'))
resolver_encoder_file = 'saved_models/M1 - resolver/resolver_encoder_r.sav'
pickle.dump(resolver_encoder, open(resolver_encoder_file,'wb'))

In [None]:
# Dataset preparation for Train-Eval
te_df = df.copy()

# Saving some rows for unseen data Testing 
final_test_cases_aa = pd.DataFrame(te_df[-5:])
print("\nFinal Test Cases: \n")
print(1, te_df.iloc[-5], "\n")
print(2, te_df.iloc[-4], "\n")
print(3, te_df.iloc[-3], "\n")
print(4, te_df.iloc[-2], "\n")
print(5, te_df.iloc[-1], "\n")

te_df = te_df[:-5] # these last 5 rows will be used later for real-time testing 
print(len(te_df))

In [131]:
### real-Time Action Predictions (Testing)
final_test_cases_aa

Unnamed: 0,error_message,resolver_group,action_classification,preprocessed_error_message
679,Outgoing mail server is not specified. Unable ...,L1,Validate the configuration parameters (hostnam...,outgo mail server specify unable send mailplea...
680,Unable to rename file as the source path '\\ro...,L1,Validate File Path Manually.If path/file dosen...,unable rename file source path exist
681,Cannot find window or application titled 'Prep...,L1,Re-Run Bot,find window application title record
682,Unable to log the text in the file. Path not f...,L1,Validate File Path Manually.If path/file dosen...,unable log text file path find
683,Provider cannot be found. It may not be proper...,Infra,SQL Server Installation Issue.Check https://w...,provider find may properly instal


In [None]:
for i in range(len(final_test_cases_aa)):
    message = final_test_cases_aa.iloc[i]['error_message']
    resolver = final_test_cases_aa.iloc[i]['resolver_group']
    
    pp_err_msg = preprocess_error_message_aa(message)
    # loading pipeline for pulling features of text
    loaded_vocab = pickle.load(open('saved_models/M1 - action/err_msg_embedding_vocab_a.pkl', 'rb'))
    # loading the saved model
    loaded_model = pickle.load(open('saved_models/M1 - action/action_model_a.sav', 'rb'))
    # loading the saved resolver encoder
    resolver_encoder = pickle.load(open('saved_models/M1 - action/resolver_encoder_a.sav', 'rb'))
    # loading the saved resolver encoder
    action_encoder = pickle.load(open('saved_models/M1 - action/action_encoder_a.sav', 'rb'))

    # creating record Series
    terms = loaded_vocab.get_feature_names()
    vectorized = loaded_vocab.transform([message])
    record = dtm2df(vectorized, terms)
    record['resolver_group'] = resolver_encoder.transform([resolver])
    predicted = loaded_model.predict(record)
    confidence_long = loaded_model.predict_proba(record)[0][predicted][0]*100
    confidence = round(confidence_long,2)
    # saving predictions in dataframe
    action_predicted = action_encoder.inverse_transform(predicted)[0]
    print('Error Message ', i+1, ': ', message, '\n')
    print(action_predicted, confidence, '\n')