In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
def GroupbyTime(df, time):
    """Aggregate raw trace spans into per-minute mean durations per call pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw spans. The columns read here are ``timestamp`` (interpreted as
        microseconds since the epoch), ``span_id``, ``parent_id``, ``cmdb_id``
        and ``duration``. The frame is not modified.
    time : pandas.DataFrame
        Frame with a ``timestamp`` column listing the minutes to report. It is
        the left side of a left merge, so every one of its rows is kept.

    Returns
    -------
    pandas.DataFrame
        ``time`` plus one column per ``'<parent> -> <child>'`` pair holding the
        mean ``duration`` of that pair in that minute (NaN where no span fell
        into the minute).

    Raises
    ------
    KeyError
        If a surviving span references a ``parent_id`` that has no matching
        ``span_id`` in the cleaned data.
    """
    # Work on a fresh frame: the caller's data stays untouched, and the
    # duplicate index labels produced by an un-reset pd.concat are discarded.
    spans = df.reset_index(drop=True)

    # Truncate to the minute, then shift by 8 hours.
    # NOTE(review): the shift presumably converts UTC to UTC+8 local time -- confirm.
    spans['timestamp'] = (
        pd.to_datetime(spans['timestamp'], unit='us').dt.floor('min')
        + pd.Timedelta(hours=8)
    )

    # De-duplicate after bucketing, so rows that differ only within the same
    # minute collapse into one.
    spans = spans.drop_duplicates()

    # Data cleaning: a span id reported under more than one cmdb_id is treated
    # as corrupt; drop those spans and their direct children.
    services_per_span = spans.groupby('span_id')['cmdb_id'].nunique(dropna=False)
    bad_ids = services_per_span.index[services_per_span > 1]
    is_bad = spans['span_id'].isin(bad_ids) | spans['parent_id'].isin(bad_ids)
    spans = spans[~is_bad]

    # Lookup table span_id -> cmdb_id; a missing parent id stands for the root.
    span_to_service = spans.groupby('span_id')['cmdb_id'].first().to_dict()
    span_to_service['nan'] = 'root'

    # Build the call pair label. Unknown parents keep raising KeyError, exactly
    # like a plain dict lookup would.
    parent_key = spans['parent_id'].fillna('nan')
    unknown = parent_key[~parent_key.isin(list(span_to_service))]
    if not unknown.empty:
        raise KeyError(unknown.iloc[0])
    trace_pair = (
        parent_key.map(span_to_service) + ' -> ' + spans['cmdb_id'].fillna('nan')
    )

    # Average only the duration column: averaging every column fails on string
    # columns with pandas >= 2.0, and nothing else is used downstream.
    mean_duration = (
        spans.assign(trace_pair=trace_pair)
        .groupby(['timestamp', 'trace_pair'])['duration']
        .mean()
        .unstack('trace_pair')
    )

    return pd.merge(time, mean_duration, on='timestamp', how='left')

In [3]:
# Fixed output layout: the timestamp first, then the call pairs in
# alphabetical order, so train and test files share the same columns.
trace_pairs = [
    'carts -> carts',
    'orders -> orders',
    'orders -> payment',
    'orders -> user',
    'payment -> payment',
    'root -> carts',
    'root -> catalogue',
    'root -> orders',
    'root -> user',
    'user -> user',
]
col = ['timestamp', *sorted(trace_pairs)]

# 训练集处理

In [4]:
# One row per minute of the training window (both endpoints included).
# 'min' replaces the 'T' frequency alias, which is deprecated since pandas 2.2.
time = pd.DataFrame(pd.date_range(start='2022-03-24 15:20', end='2022-03-25 15:19', freq='min', name='timestamp'))

In [5]:
# Read the per-service trace exports of the training split, stack them, and
# reduce them to the per-minute call-pair matrix with the fixed column layout.
train_trace_files = [
    '../dataset/sockshop/train/traces/carts.csv',
    '../dataset/sockshop/train/traces/catalogue.csv',
    '../dataset/sockshop/train/traces/orders.csv',
    '../dataset/sockshop/train/traces/payment.csv',
    '../dataset/sockshop/train/traces/shipping.csv',
    '../dataset/sockshop/train/traces/user.csv',
]
train = pd.concat([pd.read_csv(path) for path in train_trace_files])
train = GroupbyTime(train, time)[col]
train.sort_values('timestamp').head()

Unnamed: 0,timestamp,carts -> carts,orders -> orders,orders -> payment,orders -> user,payment -> payment,root -> carts,root -> catalogue,root -> orders,root -> user,user -> user
0,2022-03-24 15:20:00,,,,,,,,,,
1,2022-03-24 15:21:00,20662.409091,933051.7,113226.0,1606.666667,110.0,21021.227273,,2013917.5,19551.4,5660.0
2,2022-03-24 15:22:00,4801.478261,23847.975,250.5,1533.333333,82.5,5122.826087,,92963.25,1830.4,1349.052632
3,2022-03-24 15:23:00,4286.653846,28345.0,198.5,1776.166667,82.5,4589.923077,,105392.25,1146.0,1297.9
4,2022-03-24 15:24:00,4501.875,25522.0,177.5,1104.5,58.5,4782.5625,,60778.5,1189.571429,959.619048


In [6]:
# Persist the training matrix. The output directory is created first so the
# write does not fail when it is missing (e.g. on a fresh checkout).
train_out = Path('../dataset/processed/train/traces/traces.csv')
train_out.parent.mkdir(parents=True, exist_ok=True)
train.to_csv(train_out, index=False)

# 测试集处理

In [7]:
# One row per minute of the test window (both endpoints included).
# 'min' replaces the 'T' frequency alias, which is deprecated since pandas 2.2.
time = pd.DataFrame(pd.date_range(start='2022-03-26 08:30', end='2022-03-26 20:29', freq='min', name='timestamp'))

In [8]:
# Read the per-service trace exports of the test split, stack them, and
# reduce them to the per-minute call-pair matrix with the fixed column layout.
test_trace_files = [
    '../dataset/sockshop/test/traces/carts.csv',
    '../dataset/sockshop/test/traces/catalogue.csv',
    '../dataset/sockshop/test/traces/orders.csv',
    '../dataset/sockshop/test/traces/payment.csv',
    '../dataset/sockshop/test/traces/shipping.csv',
    '../dataset/sockshop/test/traces/user.csv',
]
test = pd.concat([pd.read_csv(path) for path in test_trace_files])
test = GroupbyTime(test, time)[col]
test.head()

Unnamed: 0,timestamp,carts -> carts,orders -> orders,orders -> payment,orders -> user,payment -> payment,root -> carts,root -> catalogue,root -> orders,root -> user,user -> user
0,2022-03-26 08:30:00,,,,,,,,,,
1,2022-03-26 08:31:00,,,,,,,,,,
2,2022-03-26 08:32:00,,,,,,,,,,
3,2022-03-26 08:33:00,,,,,,,,,,
4,2022-03-26 08:34:00,,,,,,,,,,


In [9]:
# Persist the test matrix. The output directory is created first so the
# write does not fail when it is missing (e.g. on a fresh checkout).
test_out = Path('../dataset/processed/test/traces/traces.csv')
test_out.parent.mkdir(parents=True, exist_ok=True)
test.to_csv(test_out, index=False)