In [1]:
import pandas as pd
import os
import re
from tqdm import tqdm
from logparser import LogParser
import numpy as np

In [2]:
def log2df(path):
    """Load a text file into a one-column DataFrame.

    Each element returned by ``readlines()`` (line terminator included)
    becomes one row of the single column named ``'log'``.

    Parameters
    ----------
    path : str
        Location of the log file to read (opened in text mode).

    Returns
    -------
    pandas.DataFrame
        Frame with one ``'log'`` column and one row per line of the file.
    """
    with open(path, "r") as handle:
        raw_lines = handle.readlines()
    # The file is already closed here; the frame only needs the in-memory list.
    return pd.DataFrame(raw_lines, columns=['log'])

In [3]:
def _clean_msg(msg):
    """Reduce an already lower-cased message body to plain alphabetic words."""
    msg = re.sub(r'[^a-z0-9]', ' ', msg)      # anything that is not a-z / 0-9 becomes a space
    msg = re.sub(r'[0-9]\S*\b', '', msg)      # mask device serial numbers
    msg = re.sub(r'\b\S*\d\S*\b', '', msg)    # drop tokens that contain digits
    msg = re.sub(r'\b[a-z]\b', '', msg)       # drop stand-alone single letters
    msg = re.sub(r' +', ' ', msg)             # collapse runs of spaces (not stripped)
    return msg


def logclean(string):
    """Split one raw log line into ``[timestamp, level, msg]``.

    The line is lower-cased, newlines and the ``'\\tat'`` prefix are removed,
    embedded ``YYYY-MM-DD HH:MM:SS...`` timestamps are blanked out, and the
    remainder is split on single spaces.

    Parameters
    ----------
    string : object
        Raw log line; it is converted with ``str()`` first.

    Returns
    -------
    list
        ``[timestamp, level, msg]`` where

        * ``timestamp`` is the first space-separated token,
        * ``level`` is ``'error'`` or ``'info'`` when the second token is one
          of those, otherwise ``'unknown'``,
        * ``msg`` is the scrubbed remainder of the line, or ``None`` when
          nothing follows the timestamp (or the level). The message may still
          be an empty string or a lone space after scrubbing.
    """
    text = str(string).lower()
    text = text.replace('\n', '').replace('\tat', '')
    # Pattern kept exactly as before (including the unescaped '.', which
    # matches any separator before the fractional part).
    text = re.sub(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}.\d*', ' ', text)
    tokens = re.sub(r' +', ' ', text).strip().split(' ')

    timestamp = tokens[0]

    # Only a single token: no level and no message available.
    if len(tokens) == 1:
        return [timestamp, 'unknown', None]

    # Recognised level: the message is whatever follows it (if anything).
    if tokens[1] in ('error', 'info'):
        msg = _clean_msg(' '.join(tokens[2:])) if len(tokens) > 2 else None
        return [timestamp, tokens[1], msg]

    # No recognised level: everything after the timestamp is the message.
    return [timestamp, 'unknown', _clean_msg(' '.join(tokens[1:]))]

# 制作语料库

In [4]:
# Raw log files that make up the training split.
train_log_paths = (
    '../dataset/sockshop/train/logs/carts-64f7c6d9d7-5mmd6.log',
    '../dataset/sockshop/train/logs/carts-db-844bf68bff-k7fvm.log',
    '../dataset/sockshop/train/logs/catalogue-6d5b4c4c4b-tpl5w.log',
    '../dataset/sockshop/train/logs/front-end-8648798447-tf4cj.log',
    '../dataset/sockshop/train/logs/orders-5ffb5f8596-d74jv.log',
    '../dataset/sockshop/train/logs/orders-db-9fd454cff-rrttb.log',
    '../dataset/sockshop/train/logs/payment-796848994-wwhv4.log',
    '../dataset/sockshop/train/logs/queue-master-6bf566994f-hvrpm.log',
    '../dataset/sockshop/train/logs/session-db-8f64655d5-qtnwh.log',
    '../dataset/sockshop/train/logs/shipping-899f9897f-96484.log',
    '../dataset/sockshop/train/logs/user-55b65b94bf-s98kg.log',
)

# Stack the per-file frames in the order listed above; each frame keeps its own
# 0..n-1 index, so index labels repeat across files.
train = pd.concat([log2df(log_path) for log_path in train_log_paths])

In [5]:
# Parse every raw line into [timestamp, level, msg] and expand the parts into columns.
train['log'] = train['log'].apply(logclean)
train['timestamp'] = train['log'].apply(lambda parts: parts[0])
train['level'] = train['log'].apply(lambda parts: parts[1])
train['msg'] = train['log'].apply(lambda parts: parts[2])
train['src'] = 'train'

# Keep only rows that carry a message: drop None and messages that are a lone space.
# An explicit mask replaces `train['msg'].replace(' ', None, inplace=True)`, whose
# result depends on the pandas version (value=None used to mean forward-fill, and a
# chained in-place call on a column selection does not reach `train` under Copy-on-Write).
has_message = train['msg'].notna() & (train['msg'] != ' ')
train = train[has_message].copy()
train.head()

Unnamed: 0,log,timestamp,level,msg,src
0,"[2022-03-24t07:32:38.494696400z, info, carts ...",2022-03-24t07:32:38.494696400z,info,carts dbbdbc true nio exec org mongodb driver...,train
1,"[2022-03-24t07:42:35.815209408z, error, carts...",2022-03-24t07:42:35.815209408z,error,carts nio exec dispatcherservlet servlet serv...,train
3,"[2022-03-24t07:42:35.815234953z, unknown, java...",2022-03-24t07:42:35.815234953z,unknown,java lang illegalargumentexception cannot find...,train
4,"[2022-03-24t07:42:35.815236753z, unknown, work...",2022-03-24t07:42:35.815236753z,unknown,works weave socks cart item founditem lambda g...,train
5,"[2022-03-24t07:42:35.815238631z, unknown, java...",2022-03-24t07:42:35.815238631z,unknown,java util optional orelsethrow optional java n...,train


In [6]:
# Raw log files that make up the test split.
test_log_paths = (
    '../dataset/sockshop/test/logs/carts-64f7c6d9d7-5mmd6.log',
    '../dataset/sockshop/test/logs/carts-db-844bf68bff-k7fvm.log',
    '../dataset/sockshop/test/logs/catalogue-6d5b4c4c4b-tpl5w.log',
    '../dataset/sockshop/test/logs/front-end-8648798447-tf4cj.log',
    '../dataset/sockshop/test/logs/orders-5ffb5f8596-d74jv.log',
    '../dataset/sockshop/test/logs/orders-db-9fd454cff-rrttb.log',
    '../dataset/sockshop/test/logs/payment-796848994-wwhv4.log',
    '../dataset/sockshop/test/logs/queue-master-6bf566994f-hvrpm.log',
    '../dataset/sockshop/test/logs/session-db-8f64655d5-qtnwh.log',
    '../dataset/sockshop/test/logs/shipping-899f9897f-96484.log',
    '../dataset/sockshop/test/logs/user-55b65b94bf-s98kg.log',
)

# Stack the per-file frames in the order listed above; each frame keeps its own
# 0..n-1 index, so index labels repeat across files.
test = pd.concat([log2df(log_path) for log_path in test_log_paths])

In [7]:
# Parse every raw line into [timestamp, level, msg] and expand the parts into columns.
test['log'] = test['log'].apply(logclean)
test['timestamp'] = test['log'].apply(lambda parts: parts[0])
test['level'] = test['log'].apply(lambda parts: parts[1])
test['msg'] = test['log'].apply(lambda parts: parts[2])
test['src'] = 'test'

# Keep only rows that carry a message: drop None and messages that are a lone space.
# An explicit mask replaces `test['msg'].replace(' ', None, inplace=True)`, whose
# result depends on the pandas version (value=None used to mean forward-fill, and a
# chained in-place call on a column selection does not reach `test` under Copy-on-Write).
has_message = test['msg'].notna() & (test['msg'] != ' ')
test = test[has_message].copy()
test.head()

Unnamed: 0,log,timestamp,level,msg,src
0,"[2022-03-26t02:51:09.497025118z, info, bootst...",2022-03-26t02:51:09.497025118z,info,bootstrap main annotationconfigapplicationcon...,test
1,"[2022-03-26t02:51:19.897268291z, info, bootst...",2022-03-26t02:51:19.897268291z,info,bootstrap main trationdelegate beanpostproces...,test
9,"[2022-03-26t02:51:27.001723735z, unknown, spr...",2022-03-26t02:51:27.001723735z,unknown,spring boot release,test
11,"[2022-03-26t02:51:27.504687985z, info, carts ...",2022-03-26t02:51:27.504687985z,info,carts main works weave socks cart cartapplica...,test
12,"[2022-03-26t02:51:28.299650567z, info, carts ...",2022-03-26t02:51:28.299650567z,info,carts main ationconfigembeddedwebapplicationc...,test


In [8]:
# Merge both splits and dump them as one space-separated corpus file:
# "<src> <timestamp> <level> <msg>", one record per line.
df = pd.concat([train, test])
with open('../dataset/processed/tmp/log.log', "w") as f:
    records = zip(df['src'].values, df['timestamp'].values,
                  df['level'].values, df['msg'].values)
    for src, timestamp, level, msg in tqdm(records, total=len(df)):
        record = src + ' ' + timestamp + ' ' + level + ' ' + msg
        # A message may start with a space, which would produce a double space.
        f.write(record.replace('  ', ' ') + '\n')

100%|███████████████████████████████| 829222/829222 [00:04<00:00, 175006.47it/s]


# 日志解析

In [9]:
# Field layout of each corpus line: four space-separated fields.
log_format = '<Src> <Timestamp> <Level> <Content>'

# Input and output share one directory.
tmp_dir = '../dataset/processed/tmp/'

# NOTE(review): the meaning of `tau` and `keep_para` is defined by the project's
# LogParser implementation, which is not visible here — confirm before tuning.
parser = LogParser(
    indir=tmp_dir,
    outdir=tmp_dir,
    log_format=log_format,
    keep_para=False,
    tau=1,
)
parser.parse('log.log')

Parsing file: ../dataset/processed/tmp/log.log
load log ...


100%|███████████████████████████████| 829222/829222 [00:03<00:00, 232232.67it/s]


parse log ...


100%|████████████████████████████████| 826170/826170 [00:23<00:00, 35232.91it/s]


Parsing done. [Time taken: 0:00:29.403291]


# 日志结构化

In [10]:
# Load the template table written by the parser and give every EventId a dense
# integer id equal to its row position in that table.
df = pd.read_csv('../dataset/processed/tmp/log.log_templates.csv')
EventId2num = {
    event_id: position
    for position, event_id in enumerate(df['EventId'].values)
}
len(EventId2num)

445

In [11]:
# Load the structured log, swap each EventId for its integer id (an EventId missing
# from the mapping raises KeyError) and add a parsed `timestamp` column next to the
# raw `Timestamp` string.
df = pd.read_csv('../dataset/processed/tmp/log.log_structured.csv').assign(
    EventId=lambda frame: frame['EventId'].apply(EventId2num.__getitem__),
    timestamp=lambda frame: pd.to_datetime(frame['Timestamp']),
)
df.head()

Unnamed: 0,LineId,Src,Timestamp,Level,Content,EventId,EventTemplate,timestamp
0,1,train,2022-03-24t07:32:38.494696400z,info,carts dbbdbc true nio exec org mongodb driver ...,0,carts dbbdbc true nio exec org mongodb driver ...,2022-03-24 07:32:38.494696+00:00
1,2,train,2022-03-24t07:42:35.815209408z,error,carts nio exec dispatcherservlet servlet servi...,1,<*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*...,2022-03-24 07:42:35.815209+00:00
2,3,train,2022-03-24t07:42:35.815234953z,unknown,java lang illegalargumentexception cannot find...,1,<*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*> <*...,2022-03-24 07:42:35.815234+00:00
3,4,train,2022-03-24t07:42:35.815236753z,unknown,works weave socks cart item founditem lambda g...,2,works weave socks cart item founditem <*> get ...,2022-03-24 07:42:35.815236+00:00
4,5,train,2022-03-24t07:42:35.815238631z,unknown,java util optional orelsethrow optional java n...,3,java util optional orelsethrow optional java n...,2022-03-24 07:42:35.815238+00:00


In [12]:
# Training-split rows, ordered by their parsed timestamp.
train = df[df['Src'] == 'train'].sort_values('timestamp')

# Truncate each timestamp to the start of its minute, drop the time zone (the parsed
# values are UTC, see the "+00:00" in the preview above) and shift by 8 hours.
# `dt.floor` replaces the former `.view(np.int64) // ns_per_minute` arithmetic, which
# relied on the deprecated `Series.view` and on nanosecond resolution.
# NOTE(review): the +8h shift presumably converts to UTC+8 local time — confirm.
train = train.assign(
    timestamp=train['timestamp'].dt.floor('min').dt.tz_convert(None) + pd.to_timedelta('8h')
)

# Keep the observation window (both bounds inclusive) and export the columns of interest.
train = train[train['timestamp'].between('2022-03-24 15:20', '2022-03-25 15:19')]
train[['timestamp', 'Level', 'Content', 'EventId', 'EventTemplate']].to_csv('../dataset/processed/train/logs/logs.csv', index=False)

In [13]:
# Test-split rows, ordered by their parsed timestamp.
test = df[df['Src'] == 'test'].sort_values('timestamp')

# Truncate each timestamp to the start of its minute, drop the time zone (the parsed
# values are UTC, see the "+00:00" in the preview above) and shift by 8 hours.
# `dt.floor` replaces the former `.view(np.int64) // ns_per_minute` arithmetic, which
# relied on the deprecated `Series.view` and on nanosecond resolution.
# NOTE(review): the +8h shift presumably converts to UTC+8 local time — confirm.
test = test.assign(
    timestamp=test['timestamp'].dt.floor('min').dt.tz_convert(None) + pd.to_timedelta('8h')
)

# Keep the observation window (both bounds inclusive) and export the columns of interest.
test = test[test['timestamp'].between('2022-03-26 08:30', '2022-03-26 20:29')]
test[['timestamp', 'Level', 'Content', 'EventId', 'EventTemplate']].to_csv('../dataset/processed/test/logs/logs.csv', index=False)