In [1]:
import pandas as pd
import rrcf
import datetime
import numpy as np
from utils.helpers import nearest_after, optimal_fbeta, calculate_metrics, forest_to_dict
import json

# Carga de datos

En primer lugar, se cargan los ficheros de tráfico de la sede que se desee analizar. Estos ficheros deben haber sido previamente procesados:

In [2]:
# Load the pre-processed traffic measurements of the site under analysis.
traffic_data = pd.read_csv(
    filepath_or_buffer='./data/bbva_usa_data//BBVA-USA-HOU-MP-M-075945.csv'
)
# Preview the first rows as a quick sanity check.
traffic_data.head(5)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,SC_ROUTER,DE_INTERFACE,NU_SPEED,NU_TRAFFIC_INPUT,NU_TRAFFIC_OUTPUT,DT_MEASURE_DATETIME,SC_INTERFACE_TYPE,DE_CUSTOMER_NAME,SD_ADMINISTRATIVE_NUMBER,WEEKEND
0,BBVA-USA-HOU-MP-M-075945,GigabitEthernet0/1,3.072,0.05099,0.657667,2018-03-01 00:00:00,WAN,BBVA,75945,False
1,BBVA-USA-HOU-MP-M-075945,GigabitEthernet0/1,3.072,0.15426,2.824734,2018-03-01 00:05:00,WAN,BBVA,75945,False
2,BBVA-USA-HOU-MP-M-075945,GigabitEthernet0/1,3.072,0.124266,1.215409,2018-03-01 00:10:00,WAN,BBVA,75945,False
3,BBVA-USA-HOU-MP-M-075945,GigabitEthernet0/1,3.072,0.191709,1.52566,2018-03-01 00:15:00,WAN,BBVA,75945,False
4,BBVA-USA-HOU-MP-M-075945,GigabitEthernet0/1,3.072,0.185136,1.62025,2018-03-01 00:20:00,WAN,BBVA,75945,False


A continuación se cargan los datos de incidencias correspondientes a la sede, también previamente procesados:

In [3]:
# Load the previously processed incident tickets.
incidents_filtered = pd.read_csv(
    filepath_or_buffer='./data/bbva_usa_data/incidenciasBBVA-USA_filtered.csv'
)
# Preview the first rows as a quick sanity check.
incidents_filtered.head(5)

Unnamed: 0,TICKET_ID,SERVICE_TYPE,SUBMIT_DATE,LAST_MODIFICATION_DATE,ASSIGNED_SUPPORT_COMPANY,REPORTED_DATE,REPORTED_SOURCE_ID,OPERATING_COMPANY_NAME,CONTACT_COMPANY_NAME,END_USER_COUNTRY,...,OPERATIONAL_CATEGORIZATION_TIER_1,OPERATIONAL_CATEGORIZATION_TIER_2,OPERATIONAL_CATEGORIZATION_TIER_3,CLOSURE_CATEGORIZATION_TIER_1,CLOSURE_CATEGORIZATION_TIER_2,CLOSURE_CATEGORIZATION_TIER_3,VENDOR_GROUP,INF2_HPD_OUTAGE_DURATION,ADMIN_NUMBER,INSTANCEID
0,INC000001193617,User Service Restoration,2018-05-07 17:39:17,2019-07-18 19:35:55,Telefonica,2018-05-07 17:38:26,12001.0,Telefónica México (Pegaso),,Spain,...,Wan Services,MWAN - Failure,Damage in Backup Line,NO DATA,NO DATA,NO DATA,XO Communications,0.0,20100322-SITEMPLS-0000075945,AGGAA5V0F5WOOAPI33B7PH6O0AGKSM
1,INC000001209422,User Service Restoration,2018-05-17 09:03:42,2019-07-18 19:35:55,Telefonica,2018-05-17 08:57:08,12001.0,Telefónica México (Pegaso),,Spain,...,Wan Services,MWAN - Failure,Total Isolation,PROVIDER,Configuration,,XO Communications,1260.0,20100322-SITEMPLS-0000075945,IDGAA5V0F5WOOAPIVH53PHY8NVE0NO
2,INC000001311301,User Service Restoration,2018-07-29 15:47:40,2019-07-18 19:35:55,Telefonica,2018-07-29 15:50:18,12001.0,Telefónica México (Pegaso),,Spain,...,Wan Services,MWAN - Failure,Damage in Backup Line,WAN - PROVIDER,LOCAL NNI,LNNI - FIBER CUT,XO Communications,298.0,20100322-SITEMPLS-0000075945,IDGAA5V0F6IPYAP2MXH6P1QCMBJFVK
3,INC000001311313,User Service Restoration,2018-07-29 17:02:42,2019-07-18 19:35:55,Telefonica,2018-07-29 16:49:59,10000.0,Telefónica México (Pegaso),Telefónica México (Pegaso),Spain,...,Wan Services,MWAN - Failure,Damage in Main Line Working with Backup,NO DATA,NO DATA,NO DATA,,0.0,20100322-SITEMPLS-0000075945,IDGAA5V0F6IPYAP2NACGP1QF6SJHHJ
4,INC000001322999,User Service Restoration,2018-08-08 04:17:55,2019-07-18 19:36:48,Telefónica México (Pegaso),2018-08-08 04:17:20,12001.0,Telefónica México (Pegaso),,Spain,...,Wan Services,MWAN - Failure,Damage in Backup Line,WAN - PROVIDER,LOCAL LOOP,LL - RESTORED WITHOUT INTERVENTION,,1.0,20100322-SITEMPLS-0000075945,IDGAA5V0F5V26AP3E9XVP2HOTRH64S


# Fase de entrenamiento

Se seleccionan los meses de entrenamiento del modelo. Para que el funcionamiento del algoritmo sea el deseado, los datos de entrenamiento deben ser lo más "normales" posible, entendiéndose "normal" como libres de anomalías en la medida de lo posible. Por ello se eliminan los 0 presentes en la serie:

In [4]:
# Inbound traffic indexed by the measurement timestamp column
# (the timestamps are used exactly as they come out of the CSV).
traffic_series = pd.Series(
    data=traffic_data['NU_TRAFFIC_INPUT'].values,
    index=traffic_data['DT_MEASURE_DATETIME'].values,
)

# Training window (label-based slice, both ends inclusive) ...
train_data = traffic_series.loc['2019-02-01 00:00:00':'2019-04-30 23:55:00']
# ... from which zero readings are discarded so the model is fitted on
# traffic that is as anomaly-free as possible.
train_data = train_data.loc[train_data.ne(0)]

In [5]:
# Forest hyper-parameters.
num_trees = 50    # number of random cut trees in the forest
tree_size = 512   # number of training points sampled for each tree
random_seed = 42  # fixed seed so that Restart & Run All rebuilds the same forest

# Seed NumPy's global generator before any random draw of this cell.
# NOTE(review): rrcf is expected to draw its random cuts from the global
# NumPy generator when no `random_state` is passed - confirm for the
# installed rrcf version.
np.random.seed(random_seed)

train_index = train_data.index.values

# One draw produces the index labels of every tree at once: a
# (num_trees, tree_size) array sampled with replacement. Because a single
# draw already yields `num_trees` trees, no outer loop is required.
samples = np.random.choice(train_index, size=(num_trees, tree_size), replace=True)

# np.vstack turns the 1-D array of sampled values into a column of
# one-dimensional points, which is then handed to RCTree together with the
# timestamps used as index labels.
forest = [
    rrcf.RCTree(np.vstack(train_data.loc[sample].values), index_labels=sample)
    for sample in samples
]

In [6]:
# Convert the forest with the project helper and store the result as JSON.
forest_dict = forest_to_dict(forest)
with open('./data/bbva_usa_data/forest_dict.json', mode='w') as output_file:
    output_file.write(json.dumps(forest_dict))

# Fase de scoring

Se seleccionan los datos que se desee *scorear*. Se simula un *scoreo* de los datos en *streaming* a través de un bucle *for*:

In [None]:
# Window of data to be scored.
test_data = traffic_series['2019-05-01 00:00:00':'2019-11-30 23:55:00']

# Accumulator for the summed CoDisp of every scored point. It is created as a
# float Series because the accumulated scores are floats; starting from an
# integer Series would make pandas upcast the dtype on assignment.
codisp_no_shingle = pd.Series(0.0, index=test_data.index.values)

# Simulated streaming: every point is inserted into each tree, scored, and
# removed again, so the trees keep only their training points.
for next_index, next_point in test_data.items():
    point_codisp = 0.0
    for tree in forest:
        tree.insert_point(next_point, index=next_index)
        point_codisp += tree.codisp(next_index)
        tree.forget_point(next_index)
    # A single write per point instead of one Series update per tree.
    codisp_no_shingle[next_index] = point_codisp

# Average over the forest.
avg_codisp_no_shingle = codisp_no_shingle / num_trees

El resultado devuelto es un vector de *anomaly scores*. Este vector es convertido a *dataframe* para poder ser guardado y utilizado posteriormente:

In [None]:
# Turn the score Series into a two-column frame: the former index becomes the
# 'Date' column and the values become the 'Anomaly score' column.
avg_codisp_no_shingle = (
    avg_codisp_no_shingle
    .rename('Anomaly score')
    .to_frame()
    .reset_index()
    .rename(columns={'index': 'Date'})
)
# Summary statistics of the resulting frame.
avg_codisp_no_shingle.describe()

In [None]:
# The scores can optionally be saved for later use
# avg_codisp_no_shingle.to_csv('./data/bbva_usa_data/anomaly_score_may_nov.csv', index = False)

# Fase de optimización del threshold 

Para calcular el *threshold* óptimo, es necesario hacer un cálculo previo de los intervalos asociados a las incidencias. Cada intervalo consiste en el lapso de tiempo transcurrido entre la *submit_date* y la *last_resolved_date* de cada incidencia. Las alarmas lanzadas por el algoritmo durante ese intervalo no se consideran "falsos positivos":

In [5]:
# The required anomaly scores can alternatively be loaded from a file
avg_codisp_no_shingle = pd.read_csv(
    filepath_or_buffer='./data/bbva_usa_data/anomaly_score_may_nov.csv'
)

In [7]:
# Sanity check: inspect the container type behind the 'Anomaly score' column.
type(avg_codisp_no_shingle["Anomaly score"].values)

numpy.ndarray

In [10]:
# Timestamp layout shared by the traffic data and the mapped incident dates.
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'


def _in_test_window(value):
    """Return True when str(value) lies strictly inside the test window.

    The comparison is done on strings, exactly as written in the bounds below.
    """
    return '2019-05-01 00:00:00' < str(value) < '2019-11-30 23:55:00'


# Keep only the measurements of the test window and parse their timestamps
# into datetime objects.
traffic_data_test = traffic_data[(traffic_data['DT_MEASURE_DATETIME'] >= '2019-05-01 00:00:00') & (traffic_data['DT_MEASURE_DATETIME'] <= '2019-11-30 23:55:00')].copy()
traffic_dates_test = [
    datetime.datetime.strptime(date, DATETIME_FORMAT)
    for date in traffic_data_test.DT_MEASURE_DATETIME.values
]

# Sort the incidents chronologically by submit date (rebinding the name
# instead of mutating the frame in place).
incidents_filtered = incidents_filtered.sort_values(by='SUBMIT_DATE')

# NOTE(review): submit and resolved dates are filtered independently and are
# later paired by position; an incident with only one of its two dates inside
# the window would shift that pairing - verify against the data.
submit_dates_test = [date for date in incidents_filtered.SUBMIT_DATE.values if _in_test_window(date)]
resolved_dates_test = [date for date in incidents_filtered.LAST_RESOLVED_DATE.values if _in_test_window(date)]

# Map every incident date onto the traffic timeline with the project helper.
# The statements below treat the helper's results as timestamp strings.
mapped_submit_dates = [nearest_after(traffic_dates_test, submit_date) for submit_date in submit_dates_test]
mapped_resolved_dates = [nearest_after(traffic_dates_test, resolved_date) for resolved_date in resolved_dates_test]

# Drop the dates that do not correspond to real anomalies in the traffic series
# (list.remove raises ValueError if an entry is not present).
mapped_submit_dates.remove('2019-08-02 06:15:00')
mapped_submit_dates.remove('2019-09-12 03:50:00')
mapped_resolved_dates.remove('2019-09-12 08:00:00')

# Incidents mapped to the same instant: one of them is moved to the previous instant
mapped_submit_dates[0] = '2019-05-11 13:05:00'

# Length of every incident interval (resolved - submit), expressed as a number
# of 5-minute steps and rounded up.
differences_resolved_submit = [
    np.ceil(
        (
            datetime.datetime.strptime(mapped_resolved_dates[i], DATETIME_FORMAT)
            - datetime.datetime.strptime(mapped_submit_dates[i], DATETIME_FORMAT)
        ).total_seconds() / 60 / 5
    )
    for i in range(len(mapped_resolved_dates))
]

# Flag the measurements at which an incident was submitted. The mask is built
# from the same frame it is applied to, so it does not depend on pandas
# aligning a mask computed on the full `traffic_data` frame.
traffic_data_test['INCIDENT'] = 0.0
traffic_data_test.loc[traffic_data_test['DT_MEASURE_DATETIME'].isin(mapped_submit_dates), 'INCIDENT'] = 1.0
incidents = traffic_data_test.INCIDENT.values

anomaly_scores_50 = avg_codisp_no_shingle['Anomaly score'].values

In [11]:
# Sanity check: inspect the container type of the per-incident interval lengths.
type(differences_resolved_submit)

list

Se lanza la función que hace el cálculo del *threshold* óptimo en función de las incidencias registradas:

In [36]:
# Settings handed to the project helpers that search for the best threshold.
prev_points = 2
initial_threshold, last_threshold = 0, 250
beta_value = 2
num_incidents = len(mapped_resolved_dates)

# Threshold search over the anomaly scores.
best_threshold = optimal_fbeta(
    anomaly_scores_50,
    incidents,
    prev_points,
    differences_resolved_submit,
    initial_threshold,
    last_threshold,
    beta_value,
    num_incidents,
)

# Metrics obtained with the selected threshold.
(fp, fn, tp, recall, precision, f2_score) = calculate_metrics(
    anomaly_scores_50,
    incidents,
    prev_points,
    best_threshold,
    differences_resolved_submit,
    beta_value,
    num_incidents,
)

In [37]:
# Report the selected threshold and its metrics, one line per figure.
print('\n'.join([
    'El threshold óptimo es: {}'.format(best_threshold),
    'El número de falsas alarmas es: {}'.format(fp),
    'El número incidencias no captadas es: {}'.format(fn),
    'El número de incidencias captadas es: {}'.format(tp),
    'La precisión es {} y el recall {}'.format(precision, recall),
    'La f2_score alcanzada es: {}'.format(f2_score),
]))

El threshold óptimo es: 225.0
El número de falsas alarmas es: 89
El número incidencias no captadas es: 1
El número de incidencias captadas es: 5
La precisión es 0.05319148936170213 y el recall 0.8333333333333334
La f2_score alcanzada es: 0.21186440677966104
