# Fetch log analysis

Exploratory analysis of the log data produced while fetching stop times.

## Setup

In [19]:
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import os
import modin.pandas as pd
import requests

%matplotlib notebook

Read the data into a pandas-style DataFrame (via Modin), indexed by `actual_date`, and show it. Also convert the `resp_length` column to numeric.

In [2]:
# Load the parsed fetch log, using the request timestamp as a datetime index.
df = pd.read_csv(
    'fetch_log_parsed',
    parse_dates=['actual_date'],
    infer_datetime_format=True,
    index_col='actual_date',
    # Keep status codes as text so they can be compared with '200' later on.
    dtype={'resp_status': str},
)
# resp_length is converted here rather than through `dtype=` so that values
# which cannot be parsed as numbers become NaN instead of aborting the load.
df['resp_length'] = pd.to_numeric(df['resp_length'], errors='coerce')
# Uncomment to restrict the analysis to entries from 2020-02-12 onwards.
#df = df['2020-02-12 00:00':]
df



Unnamed: 0_level_0,cod_stop,resp_time,resp_status,resp_length,timeout,connection_error,max_connections,timeout_time
actual_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-02-15 17:46:11.396055,8_06283,0.767585,200,3750.0,False,False,25,60
2020-02-15 17:46:11.394207,8_11647,0.795400,200,7515.0,False,False,25,60
2020-02-15 17:46:11.394387,8_3444,0.834889,200,7407.0,False,False,25,60
2020-02-15 17:46:11.395877,8_06884,1.041245,200,20115.0,False,False,25,60
2020-02-15 17:46:11.393115,8_50001,1.106854,200,2026.0,False,False,25,60
...,...,...,...,...,...,...,...,...
2020-03-17 10:45:42.742852,8_19420,10.532590,200,7419.0,False,False,25,60
2020-03-17 10:45:42.745405,8_08698,10.552473,200,11824.0,False,False,25,60
2020-03-17 10:45:42.745561,8_07505,10.598485,200,5070.0,False,False,25,60
2020-03-17 10:45:42.737376,8_23,10.661054,200,90164.0,False,False,25,60


In [3]:
# Report the time span covered by the log: earliest and latest index timestamp.
print(f'Data from {df.index.values.min()!s} to {df.index.values.max()!s}.')

Data from 2020-02-15T17:46:11.379125000 to 2020-03-17T10:45:42.745561000.


## Response time over time

In [18]:
# Per-30-minute mean of the numeric columns, restricted to successful requests
# (status '200', non-empty response, no timeout).
# Non-numeric columns (e.g. cod_stop, resp_status) are dropped explicitly
# before aggregating: recent pandas versions raise a TypeError when asked for
# the mean of string columns instead of silently discarding them.
minute_mean = (
    df
    .query("resp_length > 0 & resp_status == '200' & timeout == False")
    .select_dtypes(include=['number', 'bool'])
    .resample("30Min")
    .mean()
)


User-defined function verification is still under development in Modin. The function provided is not verified.


`DataFrame.resample` defaulting to pandas implementation.



In [17]:
# Interactive line chart of the binned mean response time.
# The index is converted to Python datetimes for the x axis.
fig = px.line(
    minute_mean,
    x=minute_mean.index.to_pydatetime(),
    y='resp_time',
).update_layout(
    title='Response time of successful requests over time',
    xaxis_title='Date',
    yaxis_title='Response time (s)',
)
fig.show()

## Response status over time

In [24]:
# Distinct response status values; one trace per status is drawn from these.
# Missing statuses are excluded: NaN never compares equal to anything, so a
# missing value could not select any rows and would only add an empty trace.
status_codes = df['resp_status'].dropna().unique()


`Series.unique` defaulting to pandas implementation.



In [23]:
# One line per response status: number of answers in each 30-minute bin.
fig = go.Figure()
for status_code in status_codes:
    # Select rows with a boolean mask rather than formatting the value into a
    # query string (which breaks on values containing quotes), and keep only
    # the column that is actually counted.
    plot_df = (
        df.loc[df['resp_status'] == status_code, ['resp_status']]
        .resample("30Min")
        .count()
    )
    fig.add_trace(go.Scatter(
        x=plot_df.index.to_pydatetime(),
        y=plot_df['resp_status'],
        name=status_code))

# Edit the layout
fig.update_layout(title='Response status over time',
                  xaxis_title='Date',
                  yaxis_title='Number of answers')

fig.show()


User-defined function verification is still under development in Modin. The function provided is not verified.


`DataFrame.resample` defaulting to pandas implementation.



## Timeout ratio over time

In [28]:
# Share of requests that timed out in each 30-minute bin: the sum of the
# `timeout` flags divided by the number of recorded flags in that bin.
timeout_sampled = df['timeout'].resample("30Min")
timeout_ratio_sampled = timeout_sampled.sum().div(timeout_sampled.count())


`Series.resample` defaulting to pandas implementation.



In [37]:
# Interactive line chart of the timeout ratio per time bin.
fig = px.line(
    x=timeout_ratio_sampled.index.to_pydatetime(),
    y=timeout_ratio_sampled.values,
).update_layout(
    title='Timeout ratio over time',
    xaxis_title='Date',
    yaxis_title='Timeout ratio',
)
fig.show()