In [3]:
import pandas as pd


def read_multiline_messages(file_path, indent='   '):
    with open(file_path, 'r', encoding='latin1') as file:
        messages = []
        current_message = ''
        in_exception_block = False
        request_info_lines_remaining = 0  # Counter for 'Request Information' lines

        for line in file:
            stripped_line = line.strip()

            # Check for the start of an exception block
            if 'Throwing an exception.' in stripped_line:
                if current_message:
                    messages.append(current_message)
                current_message = stripped_line
                in_exception_block = True
            elif in_exception_block:
                current_message += ' ' + stripped_line
                # Check if the line is the end of an exception block
                if stripped_line.startswith("ExceptionId:"):
                    in_exception_block = False
            elif stripped_line.startswith("Request Information"):
                request_info_lines_remaining = 7  # Next 7 lines are part of the message
                current_message += ' ' + stripped_line
            elif request_info_lines_remaining > 0:
                current_message += ' ' + stripped_line
                request_info_lines_remaining -= 1
            elif line.startswith(indent) and not in_exception_block and request_info_lines_remaining == 0:
                current_message += ' ' + stripped_line
            else:
                if current_message:
                    messages.append(current_message)
                current_message = stripped_line
                request_info_lines_remaining = 0  # Reset counter when a new message starts

        # Add the last message if it exists
        if current_message:
            messages.append(current_message)

    return pd.DataFrame(messages, columns=['message'])

# File path to your CSV file
file_path = 'data\Copy of messages1.txt'

# Read the messages into a DataFrame
messages_df = read_multiline_messages(file_path)

pd.options.display.max_colwidth = None

# Import Messages and Levels

In [4]:
messages_df.shape

(969316, 1)

In [12]:
labels = pd.read_csv('preprocessedLogFiles\preprocessedLogFile2.csv')

  labels = pd.read_csv('preprocessedLogFiles\preprocessedLogFile2.csv')


In [14]:
labels['Level']

0         0
1         0
2         0
3         0
4         0
         ..
968731    4
968732    4
968733    4
968734    4
968735    4
Name: Level, Length: 968736, dtype: int64

In [46]:
labels

0         0
1         0
2         0
3         0
4         0
         ..
499995    4
499996    4
499997    4
499998    4
499999    4
Name: Level, Length: 500000, dtype: int64

In [67]:
#subset both labels and messages to ensure lineup:

messages_df = messages_df.iloc[0:500000]
#labels = labels['Level'].iloc[0:500000]


df = pd.concat([messages_df, labels], axis=1)
df.rename(columns={'message':'text', 'Level': 'label'}, inplace=True)
df = df.iloc[6:,:] #remove first 6 rows with Level 

In [68]:
df.columns

Index(['text', 'label'], dtype='object')

# Export of Messages 4 Classes:

In [69]:
#save dataframe to csv:

df.to_csv('data/labeled_data/4class.csv', index=False)

# Export of Messages 2 Classes:


In [72]:
#collapse the 4 classes into 2 classes:
biclass = df.copy()
biclass.replace({'label': {1: 'Error', 2: 'Error', 3: 'NoError', 4: 'NoError'}}, inplace=True)

In [74]:
biclass

Unnamed: 0,text,label
6,"Aborting messaging object. Name = seapod01srs1sbpk732:queue:srsargfullsync~95, Object type = Microsoft.ApplicationServer.Messaging.Broker.SoapMessageGroupConsumer, Reason = The operation did not complete within the allotted timeout of 00:02:00. The time allotted to this operation may have been a portion of a longer timeout. For more information on exception types and proper exception handling, please refer to https://aka.ms/ServiceBusExceptions .. TrackingId: 3b7789ac-0dc5-4aae-a153-683f0d081233_B27, SubsystemId: MessagingDispatcher",NoError
7,"A new client has connected: Address: 'seapod01srs1sbpk732:queue:srsargfullsync~95', Contract: SbmpMessageReceiver, SessionId: Channel:uuid:fe87369e-7ab5-4f91-ba25-6f84f94a1918;id=2;Link:pe_4dcca37230664e22882c608478edf201_6794. TrackingId: 36602b79-fd13-42aa-9970-bb0191991036_G13_B27, SubsystemId: seapod01srs1sbpk732:queue:srsargfullsync~95",NoError
8,"TrackingId: 79d63568-3faf-448e-9e7c-3932b340d74a_G18_B27, lock message: entityName: stsentiencetoskyspark:Queue:stskyspark2_sentiencetoskyspark, SubQueue: 7602291, sequenceNumber: 32651569751457907, messageId: iencetoskyspark:Queue:stskyspark2_sentiencetoskyspark, sessionId: , partitionId: 552927232, lockToken: Å",NoError
9,"MessageCursorBuffer completed an operation. SequenceNumber: 254800187, MessageId: 72c2b6a5b51c4e26a2a1db9e0f93fef9, SessionId: null, OperationType: Completed, TxnId:, LockToken: a01e38cc-6c3d-44ab-873a-43abf07a00cb, SubQueue: 0, DeliveryCount:1. TrackingId: 8a1249f40000fd77000002fb64e06334_G19_B27, SubsystemId: sera-monica-prod:Queue:monicaqueue~207",NoError
10,"in-connection1743(G24-234900): namespace: *, operation: read, duration: 120042, total bytes: 3145776, elapsed: :24:12.1836439, frame count: amqp:O=0,B=0,A=4,F=1,T=3,P=0,D=5,E=0,C=0,io:n=12,B=3702,S=65536,t=10003,T=45882, latency count: 0, avg: 3222470966, max: 77515569767320",NoError
...,...,...
499995,"TrackingId: b4c5ce7b-e466-4066-aebc-08b994707310_B27, partitioned entity: message log: append message: entityName: gmi-pxoptservicebus:Topic:auditlog~47|auditlog, sequenceNumber: 81802595, messageId: 11771cc304674544833cdf90283cfe20, sessionId: null, partitionCount: 256, beginPartitionRange: 32, partitionRangeWidth: 16, partitionId: 35, partitionKey: null",NoError
499996,"TrackingId: b4c5ce7b-e466-4066-aebc-08b994707310_B27, SubsystemId: gmi-pxoptservicebus:Topic:auditlog~47|auditlog, MessageLog AppendMessage: subqueueType: 0, sequenceNumber: 81802595, bodyId: 126061728, sessionId: (null), messageId: 11771cc304674544833cdf90283cfe20, transferSequenceNumber: 0, scheduledEnqueueTimeUtc: 01/01/0001 00:00:00.000, messageTtlTimeUtc: 12/31/9999 23:59:59.997",NoError
499997,"TrackingId: b09cdfc30000fd7100001b3264e0633d_G31_B27, partitioned entity: message log: append message: entityName: onedrive-media-southeastasia-prod:Queue:southeastasia-prod-video0~223, sequenceNumber: 36265266, messageId: es/b!MqVl9nHxLkeg22Jli6PIFj4xUFG5S45ApeawWkuyf7XdvjSDx_B8R58SXwzB4EBH/items/01BHPNBFHFSHOCI4HQJNAYMXSAKQSY6G4O, sessionId: null, partitionCount: 256, beginPartitionRange: 208, partitionRangeWidth: 16, partitionId: 208, partitionKey: null",NoError
499998,"TrackingId: b09cdfc30000fd7100001b3264e0633d_G31_B27, SubsystemId: onedrive-media-southeastasia-prod:queue:southeastasia-prod-video0~223, MessageLog AppendMessage: subqueueType: 0, sequenceNumber: 36265266, bodyId: 0, sessionId: (null), messageId: es/b!MqVl9nHxLkeg22Jli6PIFj4xUFG5S45ApeawWkuyf7XdvjSDx_B8R58SXwzB4EBH/items/01BHPNBFHFSHOCI4HQJNAYMXSAKQSY6G4O, transferSequenceNumber: 0, scheduledEnqueueTimeUtc: 01/01/0001 00:00:00.000, messageTtlTimeUtc: 08/26/2023 07:06:30.092",NoError


In [75]:
biclass.to_csv('data/labeled_data/2class.csv', index=False)

# Balanced Data: 

After running the initial BERT model on the dataset, we can see that the data set heavily favors non-erroneous data, which isn't particularly useful in the case of trying to highlight erroneous data.

In the following section, we will be undersampling the large bias to try and enforce some class parity.

In [22]:
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

biclass = pd.read_csv('data/labeled_data/2class.csv')

rus = RandomUnderSampler(random_state=0)

X = biclass['text']
y = biclass['label']

X_resampled, y_resampled = rus.fit_resample(X.values.reshape(-1, 1), y)


In [55]:
joined = zip(X_resampled, y_resampled)

In [56]:
bi_class_undersample = pd.DataFrame(joined, columns=['text', 'label'], index=None)

In [57]:
bi_class_undersample

Unnamed: 0,text,label
0,[in-connection12270(G21-404):session12271: Rem...,Error
1,[TrackingId: 9fbab2e5-f52f-479f-aca4-4ab3d89d7...,Error
2,[Aborting messaging object. Name = bwobarossa-...,Error
3,[TrackingId: 291b7ae2-ed3f-4442-9dfd-92925ca2c...,Error
4,[MessageCursorBuffer completed an operation. S...,Error
...,...,...
5959,[TrackingId: 2b0e9e610005fd710000035a64e06326_...,NoError
5960,[TrackingId: cb91d4db-63c6-4954-b5c0-96e28a753...,NoError
5961,[Cursor wait for messages from the log returne...,NoError
5962,[MessageCursorBuffer completed an operation. S...,NoError


In [None]:
bi_class_undersample.to_csv('data/labeled_data/2class_undersample.csv', index=False)

# Masked Data

In [5]:
import tensorflow as tf
import tensorflow_text as tf_text
import itertools
import pandas as pd

bi_class_undersample = pd.read_csv('data/labeled_data/2class_undersample.csv')

splitter = tf_text.RegexSplitter('\s+|:|!|\.')
#tokens = splitter.split([test],)
#words = tokens.to_list()

In [6]:
def num_mask(word):
  for i, x in enumerate(word[0]):
    token = x.decode()
    word[0][i] = token
    if any(c.isdigit() for c in token) == True:
      word[0][i] = '[NUM]'
  return word[0]

In [7]:
a = 0
m_data = []
while a < len(bi_class_undersample):
  message = bi_class_undersample['text'].iloc[a]
  tokens = splitter.split([message],)
  words = tokens.to_list()
  masked = num_mask(words)
  m_data.append(masked)
  a = a + 1


In [38]:
print(' '.join(m_data[0]))

[NUM] [NUM] Remove [NUM] (local [NUM] remote [NUM] [NUM] [NUM] VEHICLE_SERVICE) ']


In [39]:
import numpy as np
flattened_data = []

for x in m_data:
    flattened_column = ' '.join(x)
    flattened_data.append(flattened_column)


In [43]:
pd.DataFrame(zip(flattened_data, bi_class_undersample['label']), columns=['text', 'label']).to_csv('data/labeled_data/2class_undersample_masked.csv', index=False)

In [15]:
var = [x for xs in m_data for x in xs]

In [16]:
var

['[NUM]',
 '[NUM]',
 'Remove',
 '[NUM]',
 '(local',
 '[NUM]',
 'remote',
 '[NUM]',
 '[NUM]',
 '[NUM]',
 'VEHICLE_SERVICE)',
 "']",
 "['TrackingId",
 '[NUM]',
 'partitioned',
 'entity',
 'message',
 'log',
 'append',
 'message',
 'entityName',
 'onedrive-media-southeastasia-prod',
 'Queue',
 '[NUM]',
 'sequenceNumber',
 '[NUM]',
 'messageId',
 'es/b',
 '[NUM]',
 'sessionId',
 'null,',
 'partitionCount',
 '[NUM]',
 'beginPartitionRange',
 '[NUM]',
 'partitionRangeWidth',
 '[NUM]',
 'partitionId',
 '[NUM]',
 'partitionKey',
 '[NUM]',
 "['Aborting",
 'messaging',
 'object',
 'Name',
 '=',
 'bwobarossa-prim-lowprio',
 'topic',
 '[NUM]',
 'Object',
 'type',
 '=',
 'Microsoft',
 'ApplicationServer',
 'Messaging',
 'Broker',
 'SoapMessageGroupConsumer,',
 'Reason',
 '=',
 'The',
 'operation',
 'did',
 'not',
 'complete',
 'within',
 'the',
 'allotted',
 'timeout',
 'of',
 '[NUM]',
 '[NUM]',
 '[NUM]',
 'The',
 'time',
 'allotted',
 'to',
 'this',
 'operation',
 'may',
 'have',
 'been',
 'a',
 '