In [None]:
# 3GB
# Download the labelled training data archive (label_training_data.zip)
!wget -O label_training_data.zip https://cloud.tsinghua.edu.cn/f/c1ea3426ce444bc9baae/?dl=1

# Unzip into the current directory
!unzip label_training_data.zip -d ./

# Run the archive's own unzip_all.sh from inside the extracted folder, then come back
%cd AIOps挑战赛数据/
!bash unzip_all.sh
%cd ..

In [None]:
# 885MB
# Download the training_data archive (training_data.zip)
!wget -O training_data.zip https://cloud.tsinghua.edu.cn/f/7eece510dc784e70a083/?dl=1

# Unzip into the current directory
!unzip training_data.zip -d ./

# Remove unnecessary files: the macOS metadata folder and the downloaded archive
%rm -r training_data/__MACOSX/
%rm training_data.zip

# Check final result
!ls training_data/

In [None]:
# Using kaggle dataset: the archive is read from ../input/anm-data instead of being downloaded
# Unzip into the current directory
!unzip ../input/anm-data/label_training_data.zip -d ./

# Run the archive's own unzip_all.sh from inside the extracted folder, then come back
%cd AIOps挑战赛数据/
!bash unzip_all.sh
%cd ..

In [None]:
import pandas as pd
import numpy as np
import datetime, pytz

import seaborn as sns
import matplotlib.pyplot as plt

import pickle

import xgboost as xgb
from sklearn.model_selection import train_test_split

# Folder extracted from label_training_data.zip; DataUtils.load_data reads
# "<training_data_path>/<date>/..." from it
training_data_path = "AIOps挑战赛数据"

# Acquire data

In [None]:
import pandas as pd
import numpy as np

import datetime
import math
import gc

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

class DataUtils:
    def __init__(self, verbose=False):
        self.verbose = verbose
        self.node_le = LabelEncoder()
        self.kpi_le = LabelEncoder()
        self.data_stats = DataStats()

    def reduce_mem_usage(self, df):
        """ Function to reduce the DF size """
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        start_mem = df.memory_usage().sum() / 1024**2
        for col in df.columns:
            col_type = df[col].dtypes
            if col_type in numerics:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)
        end_mem = df.memory_usage().sum() / 1024**2
        if self.verbose:
            print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
                end_mem, 100 * (start_mem - end_mem) / start_mem))
        return df
    
    def load_data(self, training_data_path, dates, nrows=None, dataset='all'):
        assert dataset in ['all', 'host', 'esb', 'trace']
        arr_host = []
        arr_esb = []
        arr_trace = [] 
        arr_failures = []

        for date in dates:
            if dataset in ['all', 'esb']:
                # ESB
                print("{:=^30}".format('  ESB  '))
                filename = training_data_path + "/{}/业务指标/esb.csv".format(date)
                print("opening {}".format(filename))
                esb_df = pd.read_csv(filename, header=0, nrows=nrows)

                esb_df.rename(columns={
                    'serviceName': 'service_name',
                    'startTime': 'start_time'},
                    inplace=True)

                esb_df['start_time'] = pd.to_datetime(
                    esb_df['start_time'], unit='ms')
                esb_df['time'] = (esb_df['start_time'] - esb_df['start_time'].min()) / \
                    datetime.timedelta(seconds=1)  # should be 0 to 24h in seconds
                arr_esb.append(esb_df)

            if dataset in ['all', 'host']:
                # HOST
                print("{:=^30}".format('  host  '))
                host_df_lst = []
                for s_name in ["db_oracle_11g", "dcos_container", "dcos_docker", "mw_redis", "os_linux"]:
                    filename = training_data_path + "/" + date + "/平台指标/" + s_name + ".csv"
                    print("opening {}".format(filename))
                    temp_df = pd.read_csv(filename, header=0)
                    # rename columns so that is follows convention some_special_name
                    temp_df.rename(columns={'itemid': 'item_id'}, inplace=True)

                    temp_df['timestamp'] = pd.to_datetime(temp_df['timestamp'], unit='ms')

                    # changing data type to save memory
                    temp_df['item_id'] = temp_df['item_id'].astype(int)
                    temp_df[['cmdb_id', 'bomc_id', 'name']] = temp_df[[
                        'cmdb_id', 'bomc_id', 'name']].astype('category')

                    host_df_lst.append(temp_df)
                    del temp_df

                host_df = pd.concat(host_df_lst)
                del host_df_lst

                arr_host.append(host_df)

            if dataset in ['all', 'trace']:
                # TRACE
                print("{:=^30}".format('   trace   '))
                trace_df_lst = []
                for c_type in ["csf", "fly_remote", "jdbc", "local", "osb", "remote_process"]:
                    filename = training_data_path + "/" + date + "/调用链指标/trace_" + c_type + ".csv"
                    print("opening {}".format(filename))
                    # chunks = pd.read_csv(filename, header=0, chunksize=100000)
                    # temp_df = pd.concat(chunks)
                    temp_df = pd.read_csv(filename, header=0, nrows=nrows)
                    temp_df.rename(
                        columns={
                            'callType': 'call_type',
                            'startTime': 'start_time',
                            'elapsedTime': 'elapsed_time',
                            'traceId': 'trace_id',
                            'serviceName': 'service_name',
                            'dsName': 'ds_name'},
                        inplace=True)  # rename columns so that is follows convention some_special_name except for Id
                    temp_df['start_time'] = pd.to_datetime(temp_df['start_time'], unit='ms')
                    trace_df_lst.append(temp_df)
                    del temp_df

                trace_df = pd.concat(trace_df_lst)
                arr_trace.append(trace_df)

            # FAILURES
            print("{:=^30}".format('   failures   '))
            filename = "故障整理（预赛）.csv"
            print(f"opening {filename}")
            # read csv
            failures_df = pd.read_csv(filename, index_col='index')

            # interpret as datetime objects
            failures_df['start_time'] = pd.to_datetime(
                failures_df['start_time'], format='%Y/%m/%d %H:%M', infer_datetime_format=True).dt.tz_localize('Asia/Shanghai').dt.tz_convert(None)
            failures_df['log_time'] = pd.to_datetime(
                failures_df['log_time'], format='%Y/%m/%d %H:%M', infer_datetime_format=True).dt.tz_localize('Asia/Shanghai').dt.tz_convert(None)

            # load failures for the day date
            datetime_date = datetime.datetime.strptime(date, "%Y_%m_%d").date()
            failures_df = failures_df[failures_df['start_time'].dt.date == datetime_date]
            arr_failures.append(failures_df)
        

        if dataset == 'all':
            return pd.concat(arr_esb), pd.concat(arr_host), pd.concat(arr_trace), pd.concat(arr_failures)
        else: 
            dataset_arr_dict = {'esb': arr_esb, 'host': arr_host, 'trace': arr_trace}
            return pd.concat(dataset_arr_dict[dataset]), pd.concat(arr_failures)
    
    def add_timeseries_features(self, df, w_period, w_time_unit, w_length):
        """
        Convert Timeseries data to supervised dataset by shifting data.
        Params: 
            - df: host_df
            - w_period: int, window period ie. how frequent you want to sample data
            - w_time_unit: str, time unit for w_perdiod
            - w_length: int, window length ie. how far in the past we want to see
        
        Returns:
            - train_df: transformed df with new features and target_kpi/node/value
            - supervised_dataset: transformed with new features only (train_df.shape[0] == supervised_dataset.shape[0]*#of_kpis)
        
        Example:
            (df, 1, 'min', 5) you want your model to predict the next value based on the previous 5minutes with data sampling every minute (thus based on 4 values)
        
        Note: this code is designed for the global xgboost model
        """

        # Check parameters
        supported_time_unit = ['h', 'min', 's', 'ms']
        assert w_time_unit in supported_time_unit, "Only {} are supported for w_time_unit".format(supported_time_unit)

        # Gather data
        temp_df = df.loc[:, ['cmdb_id', 'name', 'timestamp', 'value']]
        temp_df['timestamp'] = temp_df['timestamp'].dt.round('30s')  # reduce precision of timestamp


        # Create features: all kpis ('name') become features
        temp_df = pd.pivot_table(temp_df, index=['timestamp', 'cmdb_id'], columns='name', values='value', dropna=True)
        temp_df.reset_index(['cmdb_id'], inplace=True)

        # Fill missing data using forward fill
        temp_df.fillna(method='ffill', inplace=True)

        # Drop rows with Nan values
        temp_df.dropna(axis=0, how='any',inplace=True)

        # Reduce mem usage
        temp_df = self.reduce_mem_usage(temp_df)
        
        # Shift data and create even more features (e.g. 'CPU_used (t-3min)')
        #  while creating supervised_dataset
        supervised_dataset = temp_df.reset_index()
        for i in range(1, w_length + w_period, w_period):
            s = temp_df.shift(periods=i, freq=w_time_unit)
            s.columns = ["{}(t-{}{:s})".format(_n, i, w_time_unit) for _n in s.columns]
            s.reset_index(inplace=True)

            supervised_dataset = pd.merge(supervised_dataset, s, left_on=['cmdb_id', 'timestamp'], right_on=['cmdb_id(t-{}{:s})'.format(i, w_time_unit), 'timestamp'])
            supervised_dataset.drop('cmdb_id(t-{}{:s})'.format(i, w_time_unit), axis=1, inplace=True)

        # Drop Nan and adding time features
        supervised_dataset.dropna(how='any', inplace=True)          
        supervised_dataset['hour'] = supervised_dataset['timestamp'].dt.hour
        supervised_dataset.drop('timestamp', axis=1, inplace=True)

        # Check missing data
        percent_missing = self.data_stats.missing_statistics(temp_df)
        # print(percent_missing)

        return supervised_dataset
    
    def to_multiindex(self, df, unique_kpi, w_period, w_time_unit, w_length):
        # pandas multi index
        time_idx = ["t"] + [f"t-{i}{w_time_unit}" for i in range(1, w_length + w_period, w_period)]
        idx = pd.MultiIndex.from_product([time_idx, unique_kpi], names=['time', 'kpi'])
        
        # values
        vals = df.loc[:, ~df.columns.isin(['hour', 'cmdb_id'])].values

        # transform to multi index
        multiindex_supervised_dataset = pd.DataFrame(vals, columns=idx)

        return multiindex_supervised_dataset
    
    def to_tensor(self, multi_df, w_length):
        # shape = (samples, timesteps, features)
        shape = (multi_df['t'].shape[0], w_length+1, multi_df['t'].shape[1])
        tensor = multi_df.values.reshape(shape)
        return tensor
    
    def transform_to_lstm_data(self, df, unique_kpi, w_period, w_time_unit, w_length, scaler=None):
        # supervised dataset
        supervised_dataset = self.add_timeseries_features(df, w_period, w_time_unit, w_length)
        
        # get values
        supervised_dataset = supervised_dataset.loc[:, ~supervised_dataset.columns.isin(['hour', 'cmdb_id'])].values

        # scale data and transform to tensor
        assert supervised_dataset.shape[0] > 0, "Supervised dataset is empty !"
        if scaler:
            supervised_dataset = scaler.fit_transform(supervised_dataset)
        
        # shape = (samples, timesteps, features)
        n_samples = supervised_dataset.shape[0]
        n_timesteps = w_length+1
        n_features = unique_kpi.shape[0]
        shape = (n_samples, n_timesteps, n_features)
        tensor = supervised_dataset.reshape(shape)
        return tensor
            
    def create_ts_files(self, df, history_length, step_size, lag_unit, target_step, data_folder, num_rows_per_file):
        """ like add_timeseries_features but creates files along the way """
        # Check parameters
        supported_time_unit = ['h', 'min', 's', 'ms']
        assert lag_unit in supported_time_unit, f"Only {supported_time_unit} are supported for w_time_unit"

        # Gather data
        temp_df = df.loc[:, ['cmdb_id', 'name', 'timestamp', 'value']]
        temp_df['timestamp'] = temp_df['timestamp'].dt.round('30s')  # reduce precision of timestamp

        # Create features: all kpis ('name') become features
        temp_df = pd.pivot_table(temp_df, index=[
                                 'timestamp', 'cmdb_id'], columns='name', values='value', dropna=True)
        temp_df.reset_index(['cmdb_id'], inplace=True)

        # Fill missing data using forward fill
        temp_df.fillna(method='ffill', inplace=True)

        # Drop rows with Nan values
        temp_df.dropna(axis=0, how='any', inplace=True)

        # Reduce mem usage
        temp_df = self.reduce_mem_usage(temp_df)

        # CREATE FILES
        num_rows = len(temp_df)
        num_files = math.ceil(num_rows/num_rows_per_file)

        print(f'Creating {num_files} files.')
        for i in range(num_files):
            filename = f'{data_folder}/ts_file{i}.pkl'
            
            if i % 10 == 0:
                print(f'{filename}')
            
            left_idx = i * num_rows_per_file
            right_idx = min(left_idx + num_rows_per_file, num_rows)


            # Shift data and create even more features (e.g. 'CPU_used (t-3min)')
            #  while creating supervised_dataset
            supervised_dataset = temp_df.iloc[left_idx:right_idx].reset_index()
            for i in range(history_length, 0, -step_size):
                s = temp_df.shift(periods=i, freq=lag_unit)
                s.columns = ["{}(t-{}{:s})".format(_n, i, lag_unit)
                            for _n in s.columns]
                s.reset_index(inplace=True)

                supervised_dataset = pd.merge(supervised_dataset, s, left_on=['cmdb_id', 'timestamp'], right_on=[
                                            'cmdb_id(t-{}{:s})'.format(i, lag_unit), 'timestamp'])
                supervised_dataset.drop(
                    'cmdb_id(t-{}{:s})'.format(i, lag_unit), axis=1, inplace=True)

            # Drop Nan and adding time features
            supervised_dataset.dropna(how='any', inplace=True)
            supervised_dataset['hour'] = supervised_dataset['timestamp'].dt.hour
            supervised_dataset.drop('timestamp', axis=1, inplace=True)

            supervised_dataset.to_pickle(filename)

        num_ts = temp_df.shape[1]
        num_timesteps = history_length
        return num_timesteps, num_ts
    
    def create_training_data(self, supervised_dataset, save_dir="./"):
        """
        Returns:
            - filenames: list of filename where training data is saved
                
        """
        unique_kpi = supervised_dataset.columns.str.extract(r'([a-zA-Z_\s]+)')[0].unique()
        unique_kpi = unique_kpi[~np.isin(unique_kpi, ['hour', 'cmdb_id', 'target_node', 'target_value', 'target_kpi'])]
        
        # Create train_df
        print(">The derived supervised_dataset has {} rows, now creating train_df with {}x more rows ({}).".format(
            supervised_dataset.shape[0], 
            unique_kpi.shape[0],
            unique_kpi.shape[0] * supervised_dataset.shape[0])
            )
        supervised_dataset.rename({ 'cmdb_id':  'target_node'}, axis=1, inplace=True)

        names = supervised_dataset.columns.tolist() + ['target_value', 'target_kpi']
        train_df = pd.DataFrame(columns=names)
        filenames = []
        i = 1
        for _kpi in unique_kpi:
            temp_df = supervised_dataset.rename({   _kpi: 'target_value'}, axis=1)
            temp_df['target_kpi'] = _kpi
            train_df = train_df.append(temp_df, ignore_index=True)

            memory = train_df.memory_usage().sum() / 1024**2
            if memory > 1000: # save every 1Go
                filename = save_dir + "training_data{}".format(i)
                print(">Saving {:.1f} MB of data into {:s}".format(memory, filename))
                train_df.to_parquet(filename, index=None)
                filenames.append(filename)
                i += 1
                del train_df
                train_df = pd.DataFrame(columns=names)
                gc.collect()
        train_df.rename({'cmdb_id':'target_node'}, axis=1, inplace=True)

        return filenames
    
    def scale_data(self, supervised_dataset):
        unique_kpi = supervised_dataset.columns.str.extract(r'([a-zA-Z_\s]+)')[0].unique()
        unique_kpi = unique_kpi[~np.isin(unique_kpi, ['hour', 'cmdb_id', 'target_node', 'target_value', 'target_kpi'])]
        
        self.scaler = MinMaxScaler(feature_range=(1, len(unique_kpi)))
        return self.scaler.fit_transform(supervised_dataset)        
    
    def create_testing_data(self, supervised_dataset, node, kpi):
        test_df = supervised_dataset[supervised_dataset['cmdb_id'] == node]
        test_df.rename({ 'cmdb_id': 'target_node', kpi: 'target_value'}, axis=1, inplace=True)
        test_df['target_kpi'] = kpi
        return test_df
            
#################################################################################
#                           DATA STATISTICS CLASS
#################################################################################
class DataStats:
    """Per-column data-quality summaries (missing and infinite values)."""

    def __init__(self):
        pass

    # source: https://www.kaggle.com/aitude/ashrae-missing-weather-data-handling
    def missing_statistics(self, df):
        """
        Count the missing values of every column.

        Returns:
            - DataFrame with one row per column of df and the columns
              'COLUMN NAME', 'MISSING VALUES', 'TOTAL ROWS', '% MISSING'
        """
        statistics = pd.DataFrame(df.isnull().sum()).reset_index()
        statistics.columns = ['COLUMN NAME', "MISSING VALUES"]
        statistics['TOTAL ROWS'] = df.shape[0]
        statistics['% MISSING'] = round(
            (statistics['MISSING VALUES']/statistics['TOTAL ROWS'])*100, 2)
        return statistics

    def infinite_statistics(self, df):
        """
        Count the infinite (+inf / -inf) values of every column.

        Non-numeric columns cannot hold infinities and are reported with a count of 0.

        Returns:
            - DataFrame with one row per column of df and the columns
              'COLUMN NAME', 'INFINITE VALUES', 'TOTAL ROWS', '% INFINITE'
        """
        numeric = df.select_dtypes(include='number')
        # Count the cells that are infinite (summing the cells themselves would give inf)
        counts = (numeric.abs() >= np.inf).sum().reindex(df.columns, fill_value=0)

        statistics = pd.DataFrame(counts).reset_index()
        statistics.columns = ['COLUMN NAME', "INFINITE VALUES"]
        statistics['TOTAL ROWS'] = df.shape[0]
        statistics['% INFINITE'] = round(
            (statistics['INFINITE VALUES']/statistics['TOTAL ROWS'])*100, 2)
        return statistics


In [None]:
# Load one day of data; the 1000-row limit applies to the ESB and trace files
data_utils = DataUtils(verbose=True)
esb_df, host_df, trace_df, failures_df = data_utils.load_data(
    training_data_path,
    dates=["2020_05_23"],
    nrows=1000,
    dataset='all',
)

# Keep the host measurements recorded after the start of the last logged failure
failure_free_host_df = host_df.loc[host_df['timestamp'] > failures_df['start_time'].max()]
failure_free_host_df

In [None]:
data_utils = DataUtils(verbose=True)
host, failures = data_utils.load_data(training_data_path, ["2020_05_23"], 1000, 'host')

# Seeded generator so that the sampled windows (and the counts derived from
# `sizes`) are the same on every run
rng = np.random.default_rng(2020)
timestamps = host['timestamp'].values
half_window = datetime.timedelta(minutes=6)

# For 100 random instants, build the supervised dataset from the +/- 6 minute
# window around the instant and record how many rows come out of it
sizes = []
for _ in range(100):
    rdm_tmsp = rng.choice(timestamps)

    small_host = host[(host['timestamp'] - rdm_tmsp).abs() < half_window]
    supervised_dataset = data_utils.add_timeseries_features(small_host, 1, 'min', 5)
    sizes.append(supervised_dataset.shape[0])

In [None]:
# Number of sampled windows that produced an empty supervised dataset
int(np.count_nonzero(np.asarray(sizes) == 0))

# Test with one kpi

In [None]:
# TEST WITH ONE KPI (container_cpu_used)
CURR_KPI = 'container_cpu_used'

# TRANSFORM DATA
print("Gathering data")
# .copy() so that the columns added below never write into a slice of failure_free_host_df
d = failure_free_host_df.loc[
    failure_free_host_df['name'] == CURR_KPI, ['cmdb_id', 'timestamp', 'value']].copy()
d['timestamp'] = d['timestamp'].dt.round('60s') # reduce precision of timestamp
d['hour'] = d['timestamp'].dt.hour

# print("Here is a plot of every kpi ({nb_of_kpis}) for the node 'db_008'".format(nb_of_kpis=d['name'].unique().shape[0]))
# g = sns.FacetGrid(data=d, row='name', sharex=False, sharey=False, height=2, aspect=4)
# g.map_dataframe(sns.lineplot, x='timestamp', y='value')

# one column per node, one row per (rounded) timestamp
pivot_d = pd.pivot_table(d, index='timestamp', columns='cmdb_id', values='value', dropna=True)
percent_missing = pivot_d.isnull().sum() * 100 / len(pivot_d)
print("Dropping {} columns because percent_missing is over 50%".format(pivot_d.loc[:, percent_missing >= 50].shape[1]))
pivot_d = pivot_d.loc[:, percent_missing < 50]  # drop columns that have too many missing values

# # alternatively
# print("Filling NaN using forward fill (ffill) method")
# pivot_d = pivot_d.fillna(method='ffill')
# pivot_d = pivot_d.dropna(axis=0, how='any')

# Lagged copies of the pivot: the frame shifted by i minutes holds, at time t,
# the values observed at t - i
arr_of_df_with_past_values = []
for i in range(5, 0, -1):
    temp_df = pivot_d.shift(periods=i, freq='min')
    temp_df.columns = ["{}(t-{}min)".format(_n, i) for _n in temp_df.columns]
    temp_df = temp_df.dropna(axis=0, how='all')   # drop rows that are empty
    arr_of_df_with_past_values.append(temp_df)

train_df_arr = []
for _node in pivot_d.columns:
    # Past values of every node plus the current values of the other nodes, placed
    # side by side (axis=1) so that each row describes one timestamp. Stacking them
    # vertically would scatter the features of a timestamp over several rows.
    temp_df = pd.concat(
        arr_of_df_with_past_values + [pivot_d.loc[:, pivot_d.columns != _node]], axis=1)
    temp_df['target_node'] = _node          # fills target_node column
    temp_df['target_value'] = pivot_d[_node]
    train_df_arr.append(temp_df)

# create train_df
train_df = pd.concat(train_df_arr)
# NOTE(review): the forward fill runs over the concatenated per-node frames, so it
# can carry values across node boundaries — confirm this is acceptable
train_df = train_df.ffill()                   # fill nan with previous value (ffill = forward fill)
train_df = train_df.dropna(axis=0, how='any') # drop nan
train_df['target_node'] = train_df['target_node'].astype('category')    # categorize the target columns
train_df = pd.get_dummies(train_df, columns=['target_node'], prefix=['tgt_node'])
train_df = train_df.reset_index().drop(columns=['timestamp'])

train_df

# Hyperparameter tuning using a genetic algorithm


In [None]:
"""
HYPERPARAMETERS TUNING USING GENETIC ALGORITHM
"""
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from scipy.optimize import differential_evolution

class Tuner:
    """
    Hyper-parameter search driven by scipy's differential evolution.

    Params:
        - params_lim: dict {param_name: (low, high)}, search bounds
        - model_init: callable building a model from keyword arguments; the model
          must expose fit(X, y) (returning the model) and predict(X)
        - fixed_params: dict of keyword arguments passed unchanged to model_init
        - eval_metrics: stored for reference only; the objective is always the RMSE
        - kfold_splits: int, number of cross-validation folds
        - dtypes: dict {param_name: 'int' | other}, 'int' parameters are truncated
          with int() before being passed to the model
        - seed: optional int; when given (0 included) the folds are shuffled with it
          and it is also passed to the solver
    """

    def __init__(self, params_lim, model_init, fixed_params, eval_metrics, kfold_splits, dtypes, seed=None):
        self.params_lim = params_lim
        self.params = list(params_lim.keys())
        self.model_init = model_init
        self.fixed_params = fixed_params
        self.eval_metrics = eval_metrics
        self.kfold_splits = kfold_splits
        self.dtypes = dtypes
        self.seed = seed

    def _function(self, hyperparams, X, y):
        """Objective: cross-validated RMSE of the model built with `hyperparams`."""
        # Assign hyper-parameters
        model_params = {
            _name: int(_value) if self.dtypes[_name] == 'int' else _value
            for _name, _value in zip(self.params, hyperparams)
        }

        ## Using kfold cross validation
        if self.seed is not None:
            kf = KFold(n_splits=self.kfold_splits, shuffle=True, random_state=self.seed)
        else:
            kf = KFold(n_splits=self.kfold_splits, shuffle=False)

        y_pred_total = []
        y_test_total = []
        # k-fold cross-validation loop
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Fit the model with (X_train, y_train), and predict X_test
            model = self.model_init(**self.fixed_params, **model_params)
            y_pred = model.fit(X_train, y_train).predict(X_test)
            y_pred_total.append(y_pred)
            y_test_total.append(y_test)

        # Error metric over the out-of-fold predictions of every fold: rmse
        rmse = np.sqrt(mean_squared_error(
            np.concatenate(y_test_total), np.concatenate(y_pred_total)))

        # log message
        print(">Intermediate results: rmse: {}, {}".format(rmse, model_params))
        return rmse

    def tune_hyperparameters(self, X, y, popsize=10, mutation=0.5, recombination=0.7, tol=0.1, workers=1, maxiter=100, init=None):
        """
        Run the differential-evolution search.

        Params:
            - X, y: training features / target, indexable with .iloc
            - popsize, mutation, recombination, tol, workers, maxiter: forwarded to
              scipy.optimize.differential_evolution
            - init: optional initial population (method name or array); when None
              the solver default is used

        Returns:
            - the scipy OptimizeResult; `.x` holds the best hyper-parameters in the
              order of params_lim and `.fun` the matching RMSE
        """
        solver_kwargs = dict(
            func=self._function,
            bounds=list(self.params_lim.values()),  # params boundaries
            args=(X, y),                            # extra arguments of the objective
            strategy='best1bin',
            popsize=popsize,
            mutation=mutation,
            recombination=recombination,
            tol=tol,
            seed=self.seed,
            workers=workers,
            maxiter=maxiter,
        )
        # `init` may be an array, so it must not be tested for truthiness
        if init is not None:
            solver_kwargs['init'] = init

        ## set up and run the Differential Evolution solver
        solver = differential_evolution(**solver_kwargs)

        ## best hyperparameters and resulting rmse
        best_hyperparams = solver.x
        best_rmse = solver.fun

        ## print final results
        print("Converged hyperparameters: {}".format(best_hyperparams))
        print("Minimum rmse: {:.6f}".format(best_rmse))
        return solver

In [None]:
# HYPERPARAMETER TUNING WITH GENETIC ALGORITHM
##  training data
X = train_df.loc[:, train_df.columns != 'target_value']
y = train_df['target_value']

# The tuning is done in two stages. Each stage has its own configuration so that
# selecting one never silently overwrites the other.
TUNING_STAGES = {
    # regularization params (reg_alpha, reg_lambda), tree params fixed
    'regularization': {
        'fixed_params': {
            'learning_rate': 0.1,
            'gamma': 43,
            'max_depth': 4,
            'colsample_bytree': 0.4,
            'min_child_weight': 98,
            'base_score': 0.5,
            'booster': 'gbtree',
            'importance_type': 'gain',
            'n_estimators': 50,
            'nthread': 1,
            'objective': 'reg:squarederror',
            'seed': 2020,
            'subsample': 1,
            'verbosity': 1
        },
        'params_lim': {
            'reg_alpha' : (0, 10),
            'reg_lambda' : (0, 10)
        },
        'dtypes': {
            'reg_alpha': 'float',
            'reg_lambda': 'float'
        },
    },
    # learning_rate, gamma, max_depth, colsample_bytree and min_child_weight,
    # regularization params fixed
    'tree': {
        'fixed_params': {
            'base_score': 0.5,
            'booster': 'gbtree',
            'importance_type': 'gain',
            'n_estimators': 50,
            'nthread': 1,
            'objective': 'reg:squarederror',
            'reg_alpha': 9.2,
            'reg_lambda': 8.9,
            'seed': 2020,
            'subsample': 1,
            'verbosity': 1
        },
        'params_lim': {
            'learning_rate' : (0.00001, 1),
            'gamma' : (0, 100),
            'max_depth' : (3, 20),
            'colsample_bytree' : (0.1, 1),
            'min_child_weight' : (0, 100),
        },
        'dtypes': {
            'learning_rate' : 'float',
            'gamma' : 'float',
            'max_depth' : 'int',
            'colsample_bytree' : 'float',
            'min_child_weight' : 'float',
        },
    },
}

# Stage that is actually run (the 'tree' configuration is the one that was in effect)
ACTIVE_STAGE = 'tree'
fixed_params = TUNING_STAGES[ACTIVE_STAGE]['fixed_params']
params_lim = TUNING_STAGES[ACTIVE_STAGE]['params_lim']
dtypes = TUNING_STAGES[ACTIVE_STAGE]['dtypes']

tuner = Tuner(params_lim=params_lim, model_init=xgb.XGBRegressor, fixed_params=fixed_params, eval_metrics='rmse', kfold_splits=8, dtypes=dtypes)
tuner.tune_hyperparameters(X, y, popsize=10, mutation=0.5, recombination=0.7, tol=0.1, workers=4, maxiter=100)

In [None]:
params ={
    'base_score': 0.5,
    'booster': 'gbtree',
    'importance_type': 'gain',
    'n_estimators': 100,
    'nthread': 4,
    'objective': 'reg:squarederror',
    # evaluation metric set on the estimator: passing it to fit() is deprecated
    'eval_metric': 'rmse',
    'reg_alpha': 9.2,
    'reg_lambda': 8.9,
    'seed': 2020,
    'subsample': 1,
    'verbosity': 1,
    'learning_rate':0.1,
    'gamma': 43,
    'max_depth':10,
    'colsample_bytree':0.4,
    'min_child_weight':98
    }
##  training data
X = train_df.loc[:, train_df.columns != 'target_value']
y = train_df['target_value']
# chronological split: the last 20% of the rows are held out
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=False)

model = xgb.XGBRegressor(**params)
# validation_0 is the training set, validation_1 the held-out set
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])
res = model.evals_result()
n = len(res['validation_0']['rmse'])

# Learning curve and predictions get their own axes instead of being overlaid
eval_fig, (ax_rmse, ax_pred) = plt.subplots(2, 1, figsize=(10, 8))

sns.lineplot(x=np.arange(n), y=res['validation_1']['rmse'], ax=ax_rmse)
ax_rmse.set(title='Held-out RMSE per boosting round', xlabel='boosting round', ylabel='rmse')

ax_pred.plot(model.predict(X_test), 'b-', label='predicted', alpha=0.8)
ax_pred.plot(y_test.reset_index()['target_value'].to_numpy(), 'r--', label='true', alpha=0.8)
ax_pred.set(title='Held-out predictions vs. true values', xlabel='sample', ylabel='target_value')
ax_pred.legend()
eval_fig.tight_layout();

# Multi-output regressor

In [None]:
# FIT XGBOOST MODEL (multiregression)
from sklearn.multioutput import MultiOutputRegressor

print(">Loading data")
data_utils = DataUtils(verbose=True)
day_train = "2020_05_24"
day_test = "2020_05_25"

day_train_dt = datetime.datetime.strptime(day_train, "%Y_%m_%d")
day_test_dt = datetime.datetime.strptime(day_test, "%Y_%m_%d")

host, failures = data_utils.load_data(training_data_path, [day_train, day_test], None, 'host')

# split on the calendar date of the (naive UTC) timestamps
train_host = host[host['timestamp'].dt.date == day_train_dt.date()]
test_host = host[host['timestamp'].dt.date == day_test_dt.date()]

##  training data
print(">Preparing data")
train_ddf = data_utils.add_timeseries_features(train_host, 1, 'min', 5)
train_ddf['cmdb_id'] = data_utils.node_le.fit_transform(train_ddf['cmdb_id'])

unique_kpi = host['name'].unique()

# features: everything but the current KPI values; targets: log1p of the current KPI values
X = train_ddf.loc[:, ~train_ddf.columns.isin(unique_kpi)]
y = np.log1p(train_ddf.loc[:, unique_kpi])

# chronological split (shuffle=False): the last 20% of the rows are held out
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=False, random_state=2020)

## fit model
print(">Initializing model")
params = {
    'learning_rate': 0.910,
    'gamma': 50.107,
    'max_depth': 18,
    'colsample_bytree': 0.814,
    'min_child_weight': 0.967,
    }
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=10,
    **params)


print(">Fitting model")
# NOTE(review): fit_params is defined but never passed to fit() below
fit_params = {
    'eval_set':[(X_train, y_train), (X_test, y_test)],
    'eval_metric':'rmse',
    'verbose': 3
}

# one XGBoost regressor per target column
multioutputregressor = MultiOutputRegressor(model, n_jobs=2).fit(X_train, y_train)

## save model
print(">Saving")
filename = "multioutput_xgboost.pkl"
# context manager so the file is flushed and closed even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(multioutputregressor, model_file)

In [None]:
# Load the pre-trained multi-output model from the attached Kaggle dataset.
# NOTE: unpickling executes code stored in the file — only load pickles you trust.
with open('../input/multioutput-xgboost/multioutput_xgboost.pkl', 'rb') as model_file:
    multioutputregressor = pickle.load(model_file)

In [None]:
N = 400
y_predicted = multioutputregressor.predict(X_test.iloc[:N])

# Squared error per sample and KPI, tagged with the node each sample belongs to
squared_error = (y_predicted - y_test.iloc[:N]) ** 2
squared_error['cmdb_id'] = X_test.iloc[:N]['cmdb_id']

# RMSE per node and KPI: average the squared errors first, take the root last
# (taking the root of each squared error before averaging yields the mean
# absolute error instead)
conclusion = np.sqrt(squared_error.groupby('cmdb_id').mean())

# Keep the (node, KPI) pairs whose error exceeds 1; drop empty rows/columns
detected_anomalies_rmse = conclusion[conclusion > 1].dropna(axis=0, how='all').dropna(axis=1, how='all')
nodes = data_utils.node_le.inverse_transform(detected_anomalies_rmse.index)
detected_anomalies_rmse.index = nodes

# One submission entry per anomalous (node, KPI) pair, stamped with the latest host timestamp
last_timestamp = host['timestamp'].max()
submission = [
    {'timestamp': last_timestamp, 'content': (node_name, kpi_name)}
    for node_name in nodes
    for kpi_name in detected_anomalies_rmse.loc[node_name, :].dropna().axes[0]
]

submission_df = pd.DataFrame(submission)
submission_df

In [None]:
CURR_NODE = 42
N = None
y_predicted = multioutputregressor.predict(X_test.iloc[:N])

node_codes = X_test.iloc[:N]['cmdb_id'].values

predicted = pd.DataFrame(y_predicted, columns=unique_kpi)
predicted['cmdb_id'] = node_codes
# copy before adding a column so that y_test itself is never modified
true = y_test.iloc[:N].copy()
true['cmdb_id'] = node_codes

# last n_figs KPI columns (the trailing 'cmdb_id' column is excluded) of the chosen node
n_figs = 5
plot_predicted = predicted[predicted['cmdb_id']==CURR_NODE].iloc[:, -n_figs-1:-1]
plot_true = true[true['cmdb_id']==CURR_NODE].iloc[:, -n_figs-1:-1]

fig, ax = plt.subplots(n_figs, 2, figsize=(20, 20))
# plot_input.plot(ax=ax[:, 0], subplots=True, sharey=False, xlabel="input")
plot_true.plot(ax=ax[:, 0], subplots=True, sharey=False, sharex=True, xlabel="true")
plot_predicted.plot(ax=ax[:, 1], subplots=True, sharey=False, xlabel="predicted")

# Title derived from the label encoder so it always matches CURR_NODE
node_label = data_utils.node_le.inverse_transform([CURR_NODE])[0]
fig.suptitle(f"KPIs of host '{node_label}'");

In [None]:
# Save the true-vs-predicted KPI figure (`fig` is created by the plotting cell above)
fig.savefig('multioutput_xgboost_os020.png')

# LSTM
- one model for each host
- multivariate time series forecasting



In [None]:
# LOAD DATA
print(">Loading data")
data_utils = DataUtils(verbose=True)
day_train = "2020_05_23" # china time
day_test = "2020_05_24" # china time
tz = pytz.timezone('Asia/Shanghai')

# Local (Asia/Shanghai) midnight of each day, expressed in UTC.
# NOTE(review): after the conversion `.date()` is the previous calendar day
# (2020-05-22 / 2020-05-23) — confirm this is the intended train/test split
day_train_dt = tz.localize(datetime.datetime.strptime(day_train, "%Y_%m_%d")).astimezone(pytz.utc)
day_test_dt = tz.localize(datetime.datetime.strptime(day_test, "%Y_%m_%d")).astimezone(pytz.utc)

host, failures = data_utils.load_data(
    training_data_path, dates=[day_train, day_test], nrows=None, dataset='host')

# Remember the node labels before replacing them by integer codes
nodes = host['cmdb_id'].unique()
host['cmdb_id'] = data_utils.node_le.fit_transform(host['cmdb_id'])

# Split the host data on the calendar date of its timestamps
train_host = host.loc[host['timestamp'].dt.date == day_train_dt.date()]
test_host = host.loc[host['timestamp'].dt.date == day_test_dt.date()]

In [None]:
# Failures logged for host 'os_020' on the test date
test_failures = failures.loc[failures['start_time'].dt.date == day_test_dt.date()]
test_failures = test_failures.loc[test_failures['name'] == 'os_020']

# Keep the test measurements taken less than one hour before/after the first listed failure
tmsp = test_failures['start_time'].iloc[0]
test_host = test_host.loc[
    (test_host['timestamp'] - tmsp).abs() < datetime.timedelta(hours=1)
]

In [None]:
# Compare the KPI names available for node 42 in the two frames.
# NOTE(review): `anomalous_test_host` is not defined in any cell shown above, so
# this cell raises NameError on a fresh kernel — define it (or drop this
# comparison) before relying on Restart & Run All.
print(anomalous_test_host[anomalous_test_host['cmdb_id'] == 42]['name'].unique() )
print("="*50)
print(test_host[test_host['cmdb_id'] == 42]['name'].unique())

In [None]:
# 'Memory_total' measurements of node 42 inside the test window
test_host.loc[
    (test_host['name'] == 'Memory_total') & (test_host['cmdb_id'] == 42)
]

In [None]:
# TRANSFORM DATA
WIN_PERIOD = 1
WIN_LENGTH = 5
TIME_UNIT = "min"

# RESHAPE DATA FOR LSTM
# Dry run over every node: builds (and discards) the scaled tensor to check that
# each node has training data and to show the resulting shape
print(">Reshaping data for LSTM")
node_labels = sorted(nodes)
node_codes = data_utils.node_le.transform(node_labels)
for node_lbl, node in zip(node_labels, node_codes):
    print(f">>Examining node {node_lbl} (n°{node})")
    temp = train_host.loc[train_host['cmdb_id'] == node]
    kpi = temp['name'].unique()

    assert temp.shape[0] > 0, "temp is empty"
    scaler = StandardScaler()
    tensor = data_utils.transform_to_lstm_data(
        temp, kpi, WIN_PERIOD, TIME_UNIT, WIN_LENGTH, scaler)
    print(f"(tensor shape: {tensor.shape})")

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Build the train and test tensors for a single encoded host.
CURR_NODE = 42 # os_020
train = train_host[train_host['cmdb_id'] == CURR_NODE]
test = test_host[test_host['cmdb_id'] == CURR_NODE]
kpi = train['name'].unique()
# `test_kpi` is not used below; both tensors are built from the training KPI list.
test_kpi = test['name'].unique()

assert train.shape[0] > 0, "train is empty"
assert test.shape[0] > 0, "test is empty"

scaler = MinMaxScaler()
WIN_PERIOD = 1
WIN_LENGTH = 5
TIME_UNIT = "min"

# The same scaler instance is handed to both calls.
# NOTE(review): if transform_to_lstm_data fits the scaler, the second call refits it on
# the test data — confirm against DataUtils.
tensor = data_utils.transform_to_lstm_data(train, kpi, WIN_PERIOD, TIME_UNIT, WIN_LENGTH, scaler)
test_tensor = data_utils.transform_to_lstm_data(test, kpi, WIN_PERIOD, TIME_UNIT, WIN_LENGTH, scaler)
print(f"(tensor shape: {tensor.shape})")
print(f"(test tensor shape: {test_tensor.shape})")

In [None]:
# import packages
import math

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import Sequence
from datetime import timedelta
from sklearn.metrics import mean_squared_error

# Position 0 along axis 1 is used as the prediction target; the remaining positions
# form the model input.
# assumes axis 1 is the window axis and axis 2 the KPI axis — TODO confirm against
# DataUtils.transform_to_lstm_data.
X = tensor[:, 1:, :]
y = tensor[:, 0, :]

X_test = test_tensor[:, 1:, :]
y_test = test_tensor[:, 0, :]

# Create the Keras model.
# Input length is the window minus the target position; one feature per slice of axis 2.
n_steps = tensor.shape[1]-1
n_features = tensor.shape[2]

# vanilla lstm
# ts_inputs = tf.keras.Input(shape=(n_steps, n_features))
# x = layers.LSTM(units=100, activation='tanh')(ts_inputs)
# x = layers.Dropout(0.2)(x)
# outputs = layers.Dense(n_features, activation='linear')(x)
# model = tf.keras.Model(inputs=ts_inputs, outputs=outputs)

# stacked lstm
# ts_inputs = tf.keras.Input(shape=(n_steps, n_features))
# x = layers.LSTM(units=100, return_sequences=True, activation='tanh')(ts_inputs)
# x = layers.LSTM(units=100, activation='tanh')(x)
# x = layers.Dropout(0.2)(x)
# outputs = layers.Dense(tensor.shape[2], activation='linear')(x)
# model = tf.keras.Model(inputs=ts_inputs, outputs=outputs)

# CNN into LSTM
# ts_inputs = tf.keras.Input(shape=(n_steps, n_features))
# x = layers.Conv1D(filters=64, kernel_size=4, activation='tanh')(ts_inputs)
# x = layers.MaxPooling1D(pool_size=2)(x)
# x = layers.LSTM(units=50, activation='tanh')(x)
# x = layers.Flatten()(x)
# x = layers.Dense(50, activation='relu')(x)
# x = layers.Dropout(0.2)(x)
# outputs = layers.Dense(tensor.shape[2], activation='linear')(x)
# model = tf.keras.Model(inputs=ts_inputs, outputs=outputs)
# outputs = layers.Dense(n_features, activation='linear')(x)
# model = tf.keras.Model(inputs=ts_inputs, outputs=outputs)

# CNN + LSTM
# input1 = tf.keras.Input(shape=(n_steps, n_features))
# input2 = tf.keras.Input(shape=(n_steps, n_features))
# cnn = layers.Conv1D(filters=64, kernel_size=4, activation='tanh')(input1)
# cnn = layers.MaxPooling1D(pool_size=2)(cnn)
# cnn = layers.Flatten()(cnn)
# lstm = layers.LSTM(units=100, activation='tanh')(input2)
# lstm = layers.Flatten()(lstm)
# x = layers.concatenate([cnn, lstm])
# x = layers.Dropout(0.2)(x)
# outputs = layers.Dense(n_features, activation='linear')(x)
# model = tf.keras.Model(inputs=[input1, input2], outputs=outputs)

# one lstm cell for each time series
# Builds `n_features` independent branches (Input -> LSTM(5) -> Dense(1)) and combines
# them into a single multi-input / multi-output model; the branches share no layers.
inputs = []
outputs = []
for _ in range(n_features):
    inputs.append(tf.keras.Input(shape=(n_steps, 1)))
    x = layers.LSTM(units=5, activation='tanh')(inputs[-1])
#     x = layers.Dropout(0.2)(x)
    outputs.append(layers.Dense(1, activation='linear')(x))
model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Specify the training configuration.
# The same MSE loss and 'mse' metric are applied to every output.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
              loss=tf.keras.losses.MeanSquaredError(),
              metrics=['mse'])



# FIT
BATCH_SIZE = 128

# Split the tensors along the last axis into one array per feature, matching the
# model's list of inputs / outputs. Each input slice is 2-D while the Input layers
# declare (n_steps, 1).
# NOTE(review): relies on Keras accepting the missing trailing axis — confirm on the
# installed version.
xx = [X[:,:,i] for i in range(n_features)]
yy = [y[:, i] for i in range(n_features)]
xx_test = [X_test[:,:,i] for i in range(n_features)]
yy_test = [y_test[:, i] for i in range(n_features)]
# The window around the failure (test tensors) is used as validation data.
model.fit(x=xx, y=yy, epochs=20, batch_size=BATCH_SIZE, validation_data=(xx_test, yy_test))

model.save('one_lstm_for_each_kpi_os_020.h5')

In [None]:
# Draw the layer graph of the current `model`, including tensor shapes.
tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Print the layer-by-layer summary of the current `model`.
model.summary()

In [None]:
# train for all nodes and save models
# For every host: build the windowed tensor, train one multivariate LSTM that predicts
# all KPIs of that host, save the model as HDF5 and keep the fitted scaler.
import os

from sklearn.preprocessing import MinMaxScaler

BATCH_SIZE = 128
WIN_PERIOD = 1
WIN_LENGTH = 5
TIME_UNIT = "min"
# Fraction of each node's samples held out for validation. The previous value (0.8)
# left only 20% of the data for training.
VALIDATION_SPLIT = 0.2

training_data_path = "AIOps挑战赛数据"
save_dir = "./lstm2/"
filename_format = "lstm_{}.h5"
models = {}
scalers = {}
dates = ["2020_05_22", "2020_05_23", "2020_05_24", "2020_05_25","2020_05_26","2020_05_27","2020_05_28", "2020_05_29","2020_05_30","2020_05_31"]

# model.save() does not create missing parent directories for .h5 files.
os.makedirs(save_dir, exist_ok=True)

data_utils = DataUtils(verbose=True)
host, failures = data_utils.load_data(training_data_path, dates, None, 'host')
# Original host labels, captured before the column is replaced by integer codes.
nodes = host['cmdb_id'].unique()

host['cmdb_id'] = data_utils.node_le.fit_transform(host['cmdb_id'])

# Encode all labels once instead of calling transform() on every iteration.
node_codes = data_utils.node_le.transform(nodes)

for node, node_code in zip(nodes, node_codes):
    print(f"===== {node} =====")
    train = host[host['cmdb_id'] == node_code]
    kpi = train['name'].unique()

    # prepare data
    print("Preparing data")
    scaler = MinMaxScaler(feature_range=(0, 1))
    tensor = data_utils.transform_to_lstm_data(train, kpi, WIN_PERIOD, TIME_UNIT, WIN_LENGTH, scaler)
    # Position 0 along axis 1 is the target, the remaining positions are the input.
    X = tensor[:, 1:, :]
    y = tensor[:, 0, :]

    # create the Keras model.
    ts_inputs = tf.keras.Input(shape=(tensor.shape[1]-1, tensor.shape[2]))
    x = layers.LSTM(units=50, activation='tanh')(ts_inputs)
    x = layers.Dropout(0.2)(x)
    outputs = layers.Dense(tensor.shape[2], activation='linear')(x)
    model = tf.keras.Model(inputs=ts_inputs, outputs=outputs)

    # specify the training configuration.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
                loss=tf.keras.losses.MeanSquaredLogarithmicError(),
                metrics=['msle'])

    # fit model
    # `workers` / `use_multiprocessing` only apply to generator or Sequence inputs (and
    # are rejected by newer Keras versions), so they are not passed for array data.
    print("Fitting")
    model.fit(x=X, y=y, batch_size=BATCH_SIZE, epochs=30, validation_split=VALIDATION_SPLIT)

    # save model and scalers
    print("Saving")
    models[node] = model
    scalers[node] = scaler
    filename = save_dir + filename_format.format(node)
    model.save(filename)

# The Keras models are saved individually above; only the scalers are pickled.
with open("all_scalers.pkl", 'wb') as scalers_file:
    pickle.dump(scalers, scalers_file)

In [None]:
# Archive the directory of saved per-node models into lstm2.zip.
!zip -r lstm2.zip ./lstm2/ 

In [None]:
# Flag the KPIs whose mean squared prediction error over the test window exceeds THRESHOLD.
THRESHOLD = 1
# assumes the feature axis of the tensors follows the sorted KPI names — TODO confirm
# against DataUtils.transform_to_lstm_data.
kpi_name= np.sort(test['name'].unique())

# NOTE(review): `model` is whichever model was built last in the kernel and `X_test`
# comes from the single-node cell — confirm they belong to the same host and architecture.
y_predicted = model.predict(X_test)
mse = ((y_predicted-y_test)**2).mean(axis=0)
# `msle` is computed but not used in this cell.
msle = ((np.log1p(y_predicted)-np.log1p(y_test))**2).mean(axis=0)
anomalous_kpis = np.where(mse > THRESHOLD)
kpi_name[anomalous_kpis]

In [None]:
# plot kpis true vs predicted
# Uses the multi-input model: one input array per feature, one prediction array per feature.
kpi_name= np.sort(test['name'].unique())
y_predicted = model.predict([X_test[:,:,i] for i in range(n_features)])

# `xinput` is only referenced by the commented-out plot lines below.
xinput = pd.DataFrame(data=X_test[:, 0, :], columns=kpi_name)
true = pd.DataFrame(data=y_test, columns=kpi_name)
# The per-output predictions are joined column-wise into one frame.
predicted = pd.DataFrame(data=np.concatenate(y_predicted, axis=1), columns=kpi_name)

# Only the last `n_figs` KPI columns are plotted.
n_figs = 5
# plot_input = xinput.iloc[:, -n_figs:]
plot_true = true.iloc[:, -n_figs:]
plot_predicted = predicted.iloc[:, -n_figs:]

# Left column: true values; right column: predictions.
fig, ax = plt.subplots(n_figs, 2, figsize=(20, 20))
# plot_input.plot(ax=ax[:, 0], subplots=True, sharey=False, xlabel="input")
plot_true.plot(ax=ax[:, 0], subplots=True, sharey=False, sharex=True, xlabel="true")
plot_predicted.plot(ax=ax[:, 1], subplots=True, sharey=False, xlabel="predicted")
fig.suptitle("KPIs of host 'os_020'")

In [None]:
# Save the current `fig`; the file name carries no extension.
# NOTE(review): the name says "batch100" while the fit cell sets BATCH_SIZE = 128.
fig.savefig("one lstm for every kpi - batch100 units5 nodropout")

In [None]:
# Per-KPI mean squared error of the multi-output predictions; list KPIs above 0.2.
mse = ((y_test-np.concatenate(y_predicted, axis=1))**2).mean(axis=0)
kpi_name[np.where(mse > 0.2)[0]]

In [None]:
# Reload one of the per-node models written by the training cell, which saves them as
# "lstm_<node>.h5" inside ./lstm2/ (the file extension is part of the path).
tf.keras.models.load_model("lstm2/lstm_db_003.h5")

# Trace Anomaly Detection


In [None]:
# Load the 'trace' data of a single day; this rebinds `data_utils` and `failures`.
data_utils = DataUtils(verbose=True)
trace_df, failures = data_utils.load_data(training_data_path, ["2020_05_23"], None, 'trace')

In [None]:
# Calculate anomaly score
def anomaly_score(traces):
    """Aggregate call statistics for every (node_name, trace_id, id) span.

    Each span is joined with its children (rows whose ``pid`` equals its ``id``),
    its parent (row whose ``id`` equals its ``pid``) and its "cousins" (rows of the
    same trace sharing its ``pid``, the span itself included). The three joins are
    stacked and reduced per span into timing, count, success-rate and name features.

    Parameters
    ----------
    traces : pandas.DataFrame
        Must provide the columns ``trace_id``, ``id``, ``pid``, ``cmdb_id``,
        ``ds_name``, ``call_type``, ``elapsed_time``, ``start_time`` and ``success``.
        If ``node_name`` is missing it is added to ``traces`` IN PLACE (other cells
        read that column afterwards): ``ds_name`` for LOCAL/JDBC calls, ``cmdb_id``
        otherwise.

    Returns
    -------
    pandas.DataFrame
        One row per (node_name, trace_id, id), NaN replaced by 0.
    """
    print(f"Anomaly scores of {len(traces['trace_id'].unique())} traces")

    if 'node_name' not in traces.columns:
        traces['node_name'] = traces['cmdb_id'] # CSF, FlyRemote, OSB, RemoteProcess
        # The mask must come from the frame being processed, not from the notebook-level
        # `trace_df`, otherwise the function only works for that one global frame.
        is_local_or_jdbc = traces['call_type'].isin(['LOCAL', 'JDBC'])
        traces.loc[is_local_or_jdbc, 'node_name'] = traces.loc[is_local_or_jdbc, 'ds_name']

    def f(x):
        """Reduce all joined rows of one span to a single feature Series."""
        d = pd.Series({
            'start_time': x['start_time'].min(),

            # elapsed_time
            'elapsed_time': x['elapsed_time'].mean(),
            'childs_sum_elapsed_time': x['elapsed_time_child'].sum(),
            'childs_average_elapsed_time': x['elapsed_time_child'].mean(),
            'parent_average_elapsed_time': x['elapsed_time_parent'].mean(),

            # call_time
            'parent_call_time': x['elapsed_time_parent'].max() - x['elapsed_time_cousin'].sum(),
            'incoming_call_time': (x['elapsed_time_parent'].max() - x['elapsed_time_cousin'].sum()) / x['id_cousin'].count(),
            'outgoing_call_time': x['elapsed_time'].mean() - x['elapsed_time_child'].sum(),
            'mean_outgoing_call_time': (x['elapsed_time'].mean() - x['elapsed_time_child'].sum()) / x['pid_child'].count() if x['pid_child'].count() > 0 else 0,

            # number of ...
            'number_of_childs': x['pid_child'].count(),
            'number_of_cousins': x['id_cousin'].count(),

            # success_rate
            'child_success_rate': x['success_child'].mean(),
            'parent_success_rate': x['success_parent'].mean(),
            'cousin_success_rate': x['success_cousin'].mean(),
            'success': x['success'].mean(),

            # names
            'childs_name': x['node_name_child'].dropna().unique().tolist(),
            'parents_name': x['node_name_parent'].dropna().unique().tolist(),
            'cousins_name': x['node_name_cousin'].dropna().unique().tolist(),
        })
        return d

    # Three independent self-joins; columns from the right side get the relation suffix.
    childs = traces.merge(traces[['trace_id', 'pid', 'elapsed_time', 'node_name', 'success']], how='left', left_on=['trace_id', 'id'], right_on=['trace_id', 'pid'], suffixes=('', '_child'))
    parents = traces.merge(traces[['trace_id', 'id', 'elapsed_time', 'node_name', 'success']], how='left', left_on=['trace_id', 'pid'], right_on=['trace_id', 'id'], suffixes=('', '_parent'))
    cousins = traces.merge(traces[['trace_id', 'pid', 'id', 'elapsed_time', 'node_name', 'success']], how='left', left_on=['trace_id', 'pid'], right_on=['trace_id', 'pid'], suffixes=('', '_cousin'))

    # Stacking (rather than chaining the joins) leaves the columns of the other two
    # relations as NaN in each part, so the reductions in `f` skip them.
    s = pd.concat([childs, parents, cousins])
    print(f"Merging complete parents, childs and cousin are now reunited ! Applying custom scoring function on dataframe of shape {s.shape}")
    scores = s.groupby(['node_name', 'trace_id', 'id']).apply(f)
    # nan are nodes without childs, or without parents. Thus all ..._time related to either of them is 0 (and not NaN)
    scores = scores.fillna(0)
    return scores
# main
# Score every span of the loaded trace frame; the call also adds a 'node_name' column
# to `trace_df` when it is missing.
anomaly_scores = anomaly_score(trace_df)
anomaly_scores

In [None]:
# testing cell
# Scratch cell: only the value of the last expression is displayed; the other bare
# expressions are evaluated and discarded.
# Requires trace_df['node_name'], which is added as a side effect of anomaly_score(trace_df).
childs = trace_df.merge(trace_df[['trace_id', 'pid', 'elapsed_time', 'node_name', 'success']], how='left', left_on=['trace_id', 'id'], right_on=['trace_id', 'pid'], suffixes=('', '_child'))
parents = trace_df.merge(trace_df[['trace_id', 'id', 'elapsed_time', 'node_name', 'success']], how='left', left_on=['trace_id', 'pid'], right_on=['trace_id', 'id'], suffixes=('', '_parent'))
cousins = trace_df.merge(trace_df[['trace_id', 'pid', 'id', 'elapsed_time', 'node_name', 'success']], how='left', left_on=['trace_id', 'pid'], right_on=['trace_id', 'pid'], suffixes=('', '_cousin'))

print(f"shapes {childs.shape} {parents.shape} {cousins.shape}")
pd.concat([childs, parents, cousins])

####
# Mark spans slower than the 99th percentile; this adds a column to `trace_df` in place.
quantile = trace_df['elapsed_time'].quantile(0.99)
trace_df['outside_quantile'] = trace_df['elapsed_time'] > quantile
# Root spans are identified by the literal string 'None' in `pid`.
trace_roots = trace_df[trace_df['pid'] == 'None']
trace_roots.groupby("trace_id").sum().sort_values(by='elapsed_time', ascending=False)
trace_df[trace_df['trace_id'] == '38171490e1640892ee7b'].sort_values(by='start_time')
trace_df.groupby("trace_id").sum().sort_values(by='elapsed_time', ascending=False)

####
# NOTE(review): `plot_trace` is defined in the following cell, so on a fresh kernel run
# top to bottom this line raises NameError.
plot_trace(trace_df[trace_df['trace_id'] == '6a171001cb3255425da7'])

In [None]:
# PLOT TRACE
from graphviz import Digraph
def plot_trace(trace):
    """Build a Graphviz digraph of the calls contained in `trace`.

    Every row is one call from the span referenced by ``pid`` to the span ``id``.
    JDBC calls point at the callee's ``ds_name``; FlyRemote, OSB and CSF calls are
    not drawn; every other call type points at the callee's ``cmdb_id``. A caller
    whose id is not part of `trace` is labelled with the raw ``pid`` value.

    Returns the (unrendered) ``Digraph``.
    """
    graph = Digraph('unix', format='png', strict=True)
    label_columns = ['cmdb_id', 'service_name', 'ds_name']
    # Lookup table: span id -> labels of the node that executed it.
    span_labels = trace[['id'] + label_columns].set_index('id')
    skipped_call_types = ('FlyRemote', 'OSB', 'CSF')

    for _, span in trace.iterrows():
        call_type = span['call_type']
        caller_id = span['pid']

        if caller_id in span_labels.index:
            caller = span_labels.loc[caller_id][label_columns]
        else:
            # Unknown caller (e.g. the root's pid): fall back to the id itself.
            caller = {'cmdb_id': caller_id, 'service_name': caller_id}
        callee = span_labels.loc[span['id']][label_columns]

        if call_type in ('JDBC',):
            target = callee['ds_name']
        elif call_type in skipped_call_types:
            # skip these call types
            continue
        else:
            target = callee['cmdb_id']

        graph.edge(str(caller['cmdb_id']), str(target), label=call_type)

    return graph
# Draw a single trace, write it to disk via render('drawing') and display it inline.
traces = trace_df[trace_df['trace_id'] == '6a1713243e3266b9c7f7']
# Alternative selection (disabled): all spans within 20 minutes of a given time.
# anomaly_time = datetime.datetime.strptime("23/05/2020 16:05:00", "%d/%m/%Y %H:%M:%S")
# window = datetime.timedelta(minutes=20)
# traces = trace_df[(trace_df['start_time'] - anomaly_time).abs() < window]
drawing = plot_trace(traces)
drawing.render('drawing')
drawing

In [None]:
# Baseline per node: mean of every numeric anomaly-score column, grouped by the first
# index level (node_name). The mean covers ALL rows currently in `anomaly_scores`; no
# time window is applied here.
# `DataFrame.mean(level=...)` was removed in pandas 2.0; the groupby form is equivalent.
past_mean_scores = anomaly_scores.groupby(level=0, sort=False).mean(numeric_only=True)
past_mean_scores

In [None]:
# Plot anomaly_scores against time
# One facet row per parent node, one line per melted score column.
# NOTE(review): `s` is not defined at notebook scope (it is only a local variable inside
# anomaly_score), and 'call_time' / 'cmdb_id_parent' are not among the columns produced
# by anomaly_score — this cell appears to target an earlier version of that function.
data = s[['elapsed_time', 'childs_average_elapsed_time', 'call_time','start_time', 'cmdb_id_parent', 'child_success_rate']].melt(
    ['start_time', 'cmdb_id_parent'], var_name='cols', value_name='value')
g = sns.FacetGrid(data=data, row='cmdb_id_parent',  hue='cols', sharex=True, sharey=False, height=3, aspect=4)
g.map_dataframe(sns.lineplot, x='start_time', y='value')
g.add_legend()

In [None]:
# FindRootCause function
def find_root_cause(anomaly_scores, past_mean_scores):
    """Return ``[node, kpi_name]`` pairs for nodes whose scores deviate from baseline.

    For every node the relative deviation ``(score - baseline) / baseline`` is computed
    for each column of `past_mean_scores`; when a node has several rows the largest
    deviation per column is used. Rule thresholds are compared against that value.

    Parameters
    ----------
    anomaly_scores : pandas.DataFrame
        Indexed by (node_name, trace_id, id); must contain the columns of
        `past_mean_scores`. Rows containing NaN are ignored.
    past_mean_scores : pandas.DataFrame
        Baseline values indexed by node name; needs at least ``outgoing_call_time``
        and ``incoming_call_time``.

    Returns
    -------
    list of [node, str]
        Suspected (node, KPI) pairs, possibly empty. Nodes without a baseline row are
        skipped.
    """
    scores = anomaly_scores.dropna().reset_index(['trace_id', 'id'], drop=True)
    anomalies = []
    for node in scores.index.unique():
        if node not in past_mean_scores.index:
            # No baseline to compare against.
            continue
        baseline = past_mean_scores.loc[node, :]
        # Select with a list so the result is a DataFrame even for a single row; a bare
        # label would return a Series or a DataFrame depending on the row count, and the
        # threshold comparisons below need scalars.
        node_scores = scores.loc[[node], past_mean_scores.columns]
        # A zero baseline yields inf/NaN here; NaN never exceeds a threshold.
        node_anomaly_scores = ((node_scores - baseline) / baseline).max()
        print(node_anomaly_scores)

        # Network loss (docker/os)
        network_loss_threshold = 0.8
        # if sent_queue or receive_queue is anomalous:
        if node_anomaly_scores['outgoing_call_time'] > network_loss_threshold and node_anomaly_scores['incoming_call_time'] > network_loss_threshold:
            anomalies.append([node, "Sent_queue"])
            anomalies.append([node, "Received_queue"])

        # CPU fault (docker)
        # NOTE(review): this rule still uses the same two call-time signals as the
        # network-loss rule and the reported KPI name is a best guess; it previously
        # called anomalies.append() without arguments, which raises TypeError.
        cpu_fault_threshold = 0.8
        if node_anomaly_scores['outgoing_call_time'] > cpu_fault_threshold and node_anomaly_scores['incoming_call_time'] > cpu_fault_threshold:
            anomalies.append([node, "container_cpu_used"])

        # TODO: the thresholds below are placeholders; no rule uses them yet.
        # Network delay (docker/os)
        network_delay_threshold = 0.8

        # DB connection limit (db)
        db_connection_limit_threshold = 0.8

        # DB close (db)
        db_close_threshold = 0.8

    return anomalies
# Evaluate the rules on the 50000 earliest spans against the per-node baseline.
scores_to_test = anomaly_scores.sort_values(by='start_time').head(50000)
# The baseline frame is named `past_mean_scores` (plural); the singular name is undefined.
find_root_cause(scores_to_test, past_mean_scores)

In [None]:
# Largest relative deviation from the baseline per node, for spans starting less than
# one minute away from the given time.
scores_to_test = anomaly_scores[(anomaly_scores['start_time']-datetime.datetime.strptime("23/05/2020 16:05:00", "%d/%m/%Y %H:%M:%S")).abs() < datetime.timedelta(minutes=1)]
k = scores_to_test.reset_index(['trace_id', 'id'], drop=True)
k = (k.loc[:, past_mean_scores.columns] - past_mean_scores) / past_mean_scores
# `DataFrame.max(level=...)` was removed in pandas 2.0; group on the index level instead.
k = k.dropna().groupby(level=0, sort=False).max()
k.sort_values(by=['incoming_call_time', 'outgoing_call_time'], ascending=False)

In [None]:
# Hosts that reported an 'On_Off_State' value below 1.
# The mask is built over the full `host` frame so the resulting positions are valid for
# host.iloc; positions taken from a filtered Series would address unrelated rows.
on_off_below_one = ((host['name'] == 'On_Off_State') & (host['value'] < 1)).to_numpy()
anomalous_on_off_state_idx = np.flatnonzero(on_off_below_one)
anomalous_db = host.iloc[anomalous_on_off_state_idx]['cmdb_id'].to_list()
anomalous_db

In [None]:
# Summary statistics of the 'Sent_queue' KPI, one block per host.
host.loc[host['name'] == 'Sent_queue', ['value', 'cmdb_id', 'timestamp']].groupby('cmdb_id').describe()

In [None]:
#### zscore test
# Z-score of db_003's elapsed times around a given time, relative to the mean and
# standard deviation of all db_003 spans in the loaded trace data.
traces = trace_df.copy()
traces['node_name'] = traces['cmdb_id'] # CSF, FlyRemote, OSB, RemoteProcess
# Here a span is attributed to its data source when `ds_name` contains 'db'
# (anomaly_score selects by call_type LOCAL/JDBC instead).
condition = traces['ds_name'].fillna('').str.contains('db')
traces.loc[condition, 'node_name'] = traces.loc[condition, 'ds_name']

anomaly_time = datetime.datetime.strptime("23/05/2020 16:40:00", "%d/%m/%Y %H:%M:%S")
# Spans starting less than one minute away from `anomaly_time`.
ts = traces[(traces['start_time']-anomaly_time).abs() < datetime.timedelta(minutes=1)]
ts = ts.loc[ts['node_name'] == "db_003", ['elapsed_time', 'start_time']].set_index('start_time')
long_time_ts = traces.loc[traces['node_name'] == "db_003", ['elapsed_time', 'start_time']].set_index('start_time')
zscore = (ts - long_time_ts.mean()) / long_time_ts.std()
zscore.sort_values('elapsed_time')

In [None]:
# Plot 'container_cpu_used' per host for rows less than three minutes away from the given time.
h = host[(host['timestamp']-datetime.datetime.strptime("23/05/2020 16:05:00", "%d/%m/%Y %H:%M:%S")).abs() < datetime.timedelta(minutes=3)]
sns.FacetGrid(h.loc[h['name'] == 'container_cpu_used', ['value', 'cmdb_id', 'timestamp']], col='cmdb_id').map_dataframe(sns.lineplot, x='timestamp',y='value')

In [None]:
# ALL TIME statistics
# Computes mean, std and upper quantiles of `elapsed_time` per node over all dates and
# writes them to alltime_statistics.csv. The traces are loaded in three date chunks and
# reduced to two columns right away to limit peak memory.
# NOTE: this cell rebinds `data_utils`, `failures` and `trace_df` (which afterwards only
# holds 'node_name' and 'elapsed_time').

def _load_node_elapsed_times(data_utils, data_path, chunk_dates):
    """Load the traces of `chunk_dates` and keep only node name and elapsed time.

    A span is attributed to its ``ds_name`` when that name contains 'db', otherwise to
    its ``cmdb_id``. Returns ``(reduced_frame, failures)`` as delivered by
    ``data_utils.load_data``.
    """
    chunk_df, chunk_failures = data_utils.load_data(data_path, chunk_dates, None, 'trace')
    print("done!")

    print("node name")
    chunk_df['node_name'] = chunk_df['cmdb_id'] # CSF, FlyRemote, OSB, RemoteProcess
    condition = chunk_df['ds_name'].fillna('').str.contains('db')
    chunk_df.loc[condition, 'node_name'] = chunk_df.loc[condition, 'ds_name']
    print("done!")

    print("Free memory")
    chunk_df = chunk_df[['node_name', 'elapsed_time']]
    gc.collect()
    print("done!")
    return chunk_df, chunk_failures


data_utils = DataUtils(verbose=True)
dates = ["2020_05_22", "2020_05_23", "2020_05_24", "2020_05_25","2020_05_26","2020_05_27","2020_05_28", "2020_05_29","2020_05_30","2020_05_31"]

# Same three chunks as before: two of len(dates)//3 dates, the remainder in the last one.
third = len(dates) // 3
date_chunks = [dates[:third], dates[third:third * 2], dates[third * 2:]]

trace_chunks = []
failures = None
for chunk_dates in date_chunks:
    chunk_df, chunk_failures = _load_node_elapsed_times(data_utils, training_data_path, chunk_dates)
    if failures is None:
        # As before, `failures` refers to the first chunk only.
        failures = chunk_failures
    trace_chunks.append(chunk_df)

# DataFrame.append was removed in pandas 2.0; a single concat also avoids repeated copies.
trace_df = pd.concat(trace_chunks)
del trace_chunks, chunk_df
gc.collect()

statistics = []
for node_name in trace_df['node_name'].unique():
    print(f"node {node_name}")
    long_ts = trace_df.loc[trace_df['node_name'] == node_name, ['elapsed_time']]
    mean, std = long_ts.mean(), long_ts.std()
    quantile95, quantile99, quantile99_99 = long_ts.quantile(0.95), long_ts.quantile(0.99), long_ts.quantile(0.9999)
    print(f"ALL TIME - mean {mean.values}, std {std.values}, quantiles 95 {quantile95.values}, 99 {quantile99.values}, 99.99 {quantile99_99.values}")
    # NOTE(review): `.values` stores one-element arrays, so the CSV cells look like
    # "[12.3]"; kept unchanged in case a downstream reader relies on that format.
    statistics.append({
        'node_name': node_name,
        'mean' : mean.values,
        'std': std.values,
        'quantile95': quantile95.values, 
        'quantile99': quantile99.values, 
        'quantile99_99': quantile99_99.values
    })

alltime_statistics = pd.DataFrame(statistics)
alltime_statistics.to_csv("alltime_statistics.csv", index=False)
# Display the table that was written (`alltime_zscores` is never created).
alltime_statistics

In [None]:
# Summary statistics of the 'Received_queue' KPI across all hosts.
host.loc[host['name'] == 'Received_queue', ['value', 'timestamp']].describe()