In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from tqdm import tqdm
tqdm.pandas()

In [2]:
template = pd.read_csv('./dataset/v1/log.log_templates.csv')
n_template = len(template)

In [3]:
def getEmbedding(log,logid, num):
    logid = list(map(int, logid.split(' ')))
    templateid = log['EventId'].values[logid]
    
    templateid = np.eye(num)[templateid]
    templateid = templateid.max(axis=0)
    return templateid

In [4]:
def getTimeFeature(log, fault_time, logid):
    logid = list(map(int, logid.split(' ')))
    df = log.iloc[logid, :][['timestamp']]
    df = df.sort_values('timestamp')
    
    df['delta'] = df['timestamp'].apply(lambda x: (x-fault_time).total_seconds())
    df['delta_diff'] = df['delta'].diff(1)
    
    min_delta = np.min(np.abs(df['delta'].values))
    max_delta = np.max(np.abs(df['delta'].values))
    mean_delta = np.mean(np.abs(df['delta'].values))
    std_delta = np.std(np.abs(df['delta'].values))
    
    num_log = len(logid)
    span_delta = df['delta'].values[-1]-df['delta'].values[0]
    before = df['delta'].values[np.argmin(np.abs(df['delta'].values))] / min_delta
    
    if num_log > 1:
        min_delta_diff = np.min(df['delta_diff'].values[1:])
        max_delta_diff = np.max(df['delta_diff'].values[1:])
        mean_delta_diff = np.mean(df['delta_diff'].values[1:])
        std_delta_diff = np.std(df['delta_diff'].values[1:])
    else:
        min_delta_diff=0
        max_delta_diff=0
        mean_delta_diff=0
        std_delta_diff=0
    
    return [num_log, span_delta, before, 
            min_delta, max_delta, mean_delta, std_delta, 
            min_delta_diff, max_delta_diff, mean_delta_diff, std_delta_diff]

In [5]:
def getSM(log, logid):
    logid = list(map(int, logid.split(' ')))
    SM = log.iloc[logid, :]['server_model'].values
    
    # 返回众数
    counts = np.bincount(SM)
    SM = np.argmax(counts)
    return SM

In [6]:
label = pd.read_csv('./dataset/v1/label.csv')
train = pd.read_csv('./dataset/v1/train.csv')

label['fault_time'] = pd.to_datetime(label['fault_time'])
train['timestamp'] = pd.to_datetime(train['timestamp'])

label['EventId'] = label['LogID'].progress_apply(lambda x: getEmbedding(train, x, n_template))
for i in tqdm(range(n_template)):
    label['Event_Id_'+str(i)] = label['EventId'].apply(lambda x: x[i])

label['TimeFeature'] = label.progress_apply(lambda x: getTimeFeature(train, x['fault_time'], x['LogID']),axis=1)

label['num_log'] =  label['TimeFeature'].apply(lambda x: x[0])
label['span_delta'] =  label['TimeFeature'].apply(lambda x: x[1])
label['before'] =  label['TimeFeature'].apply(lambda x: x[2])
label['min_delta'] =  label['TimeFeature'].apply(lambda x: x[3])
label['max_delta'] =  label['TimeFeature'].apply(lambda x: x[4])
label['mean_delta'] =  label['TimeFeature'].apply(lambda x: x[5])
label['std_delta'] =  label['TimeFeature'].apply(lambda x: x[6])
label['min_delta_diff'] =  label['TimeFeature'].apply(lambda x: x[7])
label['max_delta_diff'] =  label['TimeFeature'].apply(lambda x: x[8])
label['mean_delta_diff'] =  label['TimeFeature'].apply(lambda x: x[9])
label['std_delta_diff'] =  label['TimeFeature'].apply(lambda x: x[10])

label['server_model'] = label['LogID'].progress_apply(lambda x: getSM(train, x))

label.drop(['LogID', 'EventId', 'TimeFeature'], axis=1, inplace=True)
label.to_csv('./dataset/v1/label_pro.csv', index=False)

100%|██████████████████████████████████| 16669/16669 [00:00<00:00, 62072.60it/s]
100%|████████████████████████████████████████| 104/104 [00:00<00:00, 296.26it/s]
100%|███████████████████████████████████| 16669/16669 [00:16<00:00, 1039.43it/s]
100%|███████████████████████████████████| 16669/16669 [00:01<00:00, 9394.68it/s]


In [7]:
submit = pd.read_csv('./dataset/v1/submit.csv')
test = pd.read_csv('./dataset/v1/test.csv')

submit['fault_time'] = pd.to_datetime(submit['fault_time'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

submit['EventId'] = submit['LogID'].progress_apply(lambda x: getEmbedding(test, x, n_template))
for i in tqdm(range(n_template)):
    submit['Event_Id_'+str(i)] = submit['EventId'].apply(lambda x: x[i]).copy()
    
submit['TimeFeature'] = submit.progress_apply(lambda x: getTimeFeature(test, x['fault_time'], x['LogID']),axis=1)

submit['num_log'] =  submit['TimeFeature'].apply(lambda x: x[0])
submit['span_delta'] =  submit['TimeFeature'].apply(lambda x: x[1])
submit['before'] =  submit['TimeFeature'].apply(lambda x: x[2])
submit['min_delta'] =  submit['TimeFeature'].apply(lambda x: x[3])
submit['max_delta'] =  submit['TimeFeature'].apply(lambda x: x[4])
submit['mean_delta'] =  submit['TimeFeature'].apply(lambda x: x[5])
submit['std_delta'] =  submit['TimeFeature'].apply(lambda x: x[6])
submit['min_delta_diff'] =  submit['TimeFeature'].apply(lambda x: x[7])
submit['max_delta_diff'] =  submit['TimeFeature'].apply(lambda x: x[8])
submit['mean_delta_diff'] =  submit['TimeFeature'].apply(lambda x: x[9])
submit['std_delta_diff'] =  submit['TimeFeature'].apply(lambda x: x[10])

submit['server_model'] = submit['LogID'].progress_apply(lambda x: getSM(test, x))

submit.drop(['LogID', 'EventId', 'TimeFeature'], axis=1, inplace=True)
submit.to_csv('./dataset/v1/submit_pro.csv', index=False)

100%|███████████████████████████████████| 3030/3030 [00:00<00:00, 107618.20it/s]
100%|███████████████████████████████████████| 104/104 [00:00<00:00, 1339.05it/s]
100%|█████████████████████████████████████| 3030/3030 [00:02<00:00, 1202.30it/s]
100%|█████████████████████████████████████| 3030/3030 [00:00<00:00, 9733.72it/s]
