In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path

In [2]:
datasets_path = str(Path(os.getcwd()).parent) + "/datasets/"
df = pd.read_csv(datasets_path + "down_detector_outage_count.csv")
df_rep = pd.read_csv(datasets_path + "down_detector_reported_problem_count.csv")

In [58]:
def extract_event_times(df):
    event_times = dict()
    event_ids = df["event_id"].unique()

    event_started = False
    event_ended = False
    # Extract event_start and event_end from each event
    for ev_id in event_ids:
        start_time, end_time = 0, 0
        df_event = df[df["event_id"] == ev_id]
        threshold = np.mean(df_event["count"])
        duration = 0
        event_ended, event_started = False, False
        for _, row in df_event.sort_values("timestamp").iterrows():

            if row["count"] > threshold:
                duration += 900

        event_times[ev_id] = {"start_time": start_time, 
                              "end_time": end_time, 
                              "duration": duration,
                              "max": np.max(df_event["count"]),
                              "mean": np.mean(df_event["count"]),
                              "median": np.median(df_event["count"]),
                              "min": np.min(df_event["count"])}

    return event_times


In [59]:
extract_event_times(df[df["provider"] == "aws"])

{21: {'start_time': 0,
  'end_time': 0,
  'duration': 20700,
  'max': 57,
  'mean': 2.736842105263158,
  'median': 1.0,
  'min': 0},
 2: {'start_time': 0,
  'end_time': 0,
  'duration': 19800,
  'max': 50,
  'mean': 2.6736842105263157,
  'median': 1.0,
  'min': 0},
 6: {'start_time': 0,
  'end_time': 0,
  'duration': 22500,
  'max': 317,
  'mean': 28.778947368421054,
  'median': 1.0,
  'min': 0},
 26: {'start_time': 0,
  'end_time': 0,
  'duration': 23400,
  'max': 136,
  'mean': 8.789473684210526,
  'median': 1.0,
  'min': 0},
 9: {'start_time': 0,
  'end_time': 0,
  'duration': 19800,
  'max': 136,
  'mean': 9.073684210526316,
  'median': 2.0,
  'min': 0},
 34: {'start_time': 0,
  'end_time': 0,
  'duration': 16200,
  'max': 59,
  'mean': 3.2421052631578946,
  'median': 0.0,
  'min': 0},
 36: {'start_time': 0,
  'end_time': 0,
  'duration': 12600,
  'max': 36,
  'mean': 2.0105263157894737,
  'median': 0.0,
  'min': 0},
 61: {'start_time': 0,
  'end_time': 0,
  'duration': 16200,
  'm

array([ nan, 900., 900., 900., 900., 900., 900., 900., 900., 900., 900.,
       900., 900., 900., 900., 900., 900., 900., 900., 900., 900., 900.,
       900., 900., 900., 900., 900., 900., 900., 900., 900., 900., 900.,
       900., 900., 900., 900., 900., 900., 900., 900., 900., 900., 900.,
       900., 900., 900., 900., 900., 900., 900., 900., 900., 900., 900.,
       900., 900., 900., 900., 900., 900., 900., 900., 900., 900., 900.,
       900., 900., 900., 900., 900., 900., 900., 900., 900., 900., 900.,
       900., 900., 900., 900., 900., 900., 900., 900., 900., 900., 900.,
       900., 900., 900., 900., 900., 900., 900.])

In [83]:
SECOND = 1
MINUTE = SECOND * 60
REPORT_FREQ = MINUTE * 15

# DOESN'T MAKE SENSE
def calc_event_times(df):
    event_times = dict()
    event_ids = df["event_id"].unique()
    
    # Extract event_start and event_end from each event
    for ev_id in event_ids:
        start_time, end_time = 0, 0
        duration = 0
        df_event = df[df["event_id"] == ev_id]
        df_diff = df_event[["count", "timestamp"]].diff()
        is_increasing = False
        for i in range(0, len(df_diff.index)-1):
            if df_diff.iloc[i+1:i+2, :]["count"].values[0] > 0:
                is_increasing = True
            if df_diff.iloc[i:i+1, :]["count"].values[0] > 0 and is_increasing:
                duration += 900
                

        event_times[ev_id] = {"duration": duration,
                              "max": np.max(df_event["count"]),
                              "mean": np.mean(df_event["count"]),
                              "median": np.median(df_event["count"]),
                              "min": np.min(df_event["count"])}

    return event_times

In [84]:
calc_event_times(df[df["provider"] == "azure"])

{2: {'duration': 31500,
  'max': 24,
  'mean': 3.610526315789474,
  'median': 2.0,
  'min': 0},
 50: {'duration': 17100,
  'max': 50,
  'mean': 3.0105263157894737,
  'median': 0.0,
  'min': 0},
 22: {'duration': 12600,
  'max': 45,
  'mean': 2.0526315789473686,
  'median': 0.0,
  'min': 0},
 48: {'duration': 17100,
  'max': 72,
  'mean': 3.5157894736842104,
  'median': 0.0,
  'min': 0},
 21: {'duration': 20700,
  'max': 66,
  'mean': 4.936842105263158,
  'median': 1.0,
  'min': 0},
 7: {'duration': 27000,
  'max': 412,
  'mean': 96.85227272727273,
  'median': 35.5,
  'min': 0},
 1: {'duration': 38700,
  'max': 412,
  'mean': 94.9578947368421,
  'median': 28.0,
  'min': 0},
 17: {'duration': 36000,
  'max': 42,
  'mean': 12.126315789473685,
  'median': 8.0,
  'min': 0},
 27: {'duration': 34200,
  'max': 37,
  'mean': 6.673684210526316,
  'median': 2.0,
  'min': 0},
 29: {'duration': 18000, 'max': 87, 'mean': 4.8, 'median': 0.0, 'min': 0},
 8: {'duration': 27000,
  'max': 620,
  'mean': 