In [140]:
# importacion general de librerias y de visualizacion (matplotlib y seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random as rd
%matplotlib inline

plt.style.use('default') # haciendo los graficos un poco mas bonitos en matplotlib
#plt.rcParams['figure.figsize'] = (20, 10)

sns.set(style="whitegrid") # seteando tipo de grid en seaborn

pd.options.display.float_format = '{:20,.2f}'.format # suprimimos la notacion cientifica en los outputs

import warnings
warnings.filterwarnings('ignore')

In [141]:
event_log = pd.DataFrame({
 'event_id':range(1,101),
 'event_type_id': rd.choices(range(1,6),k=100),
 'event_status': rd.choices(['ERROR','DELAYED','CAPTURED'],k=100),
 'country_id': rd.choices(['AR','BR','UY','CO','CL'],k=100),
 'event_producer_id': rd.choices(range(1,6),k=100),
 'event_date': pd.date_range(start='2021-03-01',freq='D',periods=100)
})
event_log

Unnamed: 0,event_id,event_type_id,event_status,country_id,event_producer_id,event_date
0,1,2,ERROR,BR,4,2021-03-01
1,2,3,DELAYED,CL,4,2021-03-02
2,3,4,CAPTURED,UY,4,2021-03-03
3,4,4,ERROR,UY,5,2021-03-04
4,5,1,CAPTURED,CL,4,2021-03-05
...,...,...,...,...,...,...
95,96,4,CAPTURED,CO,4,2021-06-04
96,97,4,ERROR,CL,1,2021-06-05
97,98,3,DELAYED,AR,2,2021-06-06
98,99,1,CAPTURED,BR,1,2021-06-07


In [142]:
event_types = pd.DataFrame({
 'event_type_id': range(1,6),
 'event_type_name': range(1,6),
 'event_consumer_id': range(100,105),
 'event_consumer_target': ["salesforce", "dynamodb", "erp","XXX","YYY"]
})
event_types

Unnamed: 0,event_type_id,event_type_name,event_consumer_id,event_consumer_target
0,1,1,100,salesforce
1,2,2,101,dynamodb
2,3,3,102,erp
3,4,4,103,XXX
4,5,5,104,YYY


a) Top 5 de Consumers que han tenido la mayor cantidad de eventos que resultaron en un event_status de ERROR.

In [143]:
logs_error_status = event_log.loc[event_log['event_status']=='ERROR',['event_id','event_type_id']]
logs_error_status.head()

Unnamed: 0,event_id,event_type_id
0,1,2
3,4,4
6,7,2
8,9,1
11,12,5


In [144]:
freq_errors = logs_error_status.groupby('event_type_id')['event_id'].count().reset_index()
freq_errors = freq_errors.rename(columns={'event_id':'error_freq'})
freq_errors

Unnamed: 0,event_type_id,error_freq
0,1,4
1,2,11
2,3,8
3,4,9
4,5,9


In [145]:
top_5_most_freq_errors = pd.merge(freq_errors,event_types,on='event_type_id',how='inner')
top_5_most_freq_errors = top_5_most_freq_errors.nlargest(5,columns='error_freq')['event_type_name']
top_5_most_freq_errors

1    2
3    4
4    5
2    3
0    1
Name: event_type_name, dtype: int64

b) De los eventos ocurridos para el country_id: BR indicar la cantidad de eventos totales por cada evento ocurridos por event_consumer_target

In [146]:
events_br = event_log.loc[event_log['country_id'] == 'BR']
events_br

Unnamed: 0,event_id,event_type_id,event_status,country_id,event_producer_id,event_date
0,1,2,ERROR,BR,4,2021-03-01
6,7,2,ERROR,BR,4,2021-03-07
19,20,3,ERROR,BR,3,2021-03-20
28,29,4,ERROR,BR,3,2021-03-29
29,30,5,CAPTURED,BR,1,2021-03-30
33,34,2,ERROR,BR,4,2021-04-03
36,37,1,CAPTURED,BR,3,2021-04-06
38,39,1,ERROR,BR,5,2021-04-08
39,40,5,ERROR,BR,3,2021-04-09
44,45,2,DELAYED,BR,5,2021-04-14


In [147]:
events_br_tg = pd.merge(events_br,event_types,on='event_type_id',how='inner')
events_br_tg = events_br_tg[['event_id','event_status','event_consumer_target']]
events_br_tg

Unnamed: 0,event_id,event_status,event_consumer_target
0,1,ERROR,dynamodb
1,7,ERROR,dynamodb
2,34,ERROR,dynamodb
3,45,DELAYED,dynamodb
4,90,ERROR,dynamodb
5,20,ERROR,erp
6,52,ERROR,erp
7,54,DELAYED,erp
8,60,CAPTURED,erp
9,29,ERROR,XXX


In [148]:
table_res = events_br_tg.pivot_table(index='event_status',columns='event_consumer_target',values='event_id',aggfunc='count')
solucion_b = table_res.loc[['ERROR','DELAYED','CAPTURED']]

In [149]:
solucion_b

event_consumer_target,XXX,YYY,dynamodb,erp,salesforce
event_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ERROR,1.0,2.0,4.0,2.0,1.0
DELAYED,,,1.0,1.0,2.0
CAPTURED,2.0,3.0,,1.0,2.0
