In [1]:
from __future__ import division
import itertools
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import math
from sklearn import metrics
from random import randint
from matplotlib import style
import seaborn as sns
%matplotlib inline

# Import wrangle
import wrangle

In [2]:
df = wrangle.prep()

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 13978 entries, 2019-04-16 19:34:42+00:00 to 2019-04-19 19:42:41+00:00
Data columns (total 7 columns):
ip                13978 non-null object
request_method    13978 non-null object
status            13978 non-null int64
size              13978 non-null int64
destination       25 non-null object
request_agent     13978 non-null object
size_mb           13978 non-null float64
dtypes: float64(1), int64(2), object(4)
memory usage: 873.6+ KB


In [4]:
df.head()

Unnamed: 0_level_0,ip,request_method,status,size,destination,request_agent,size_mb
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-04-16 19:34:42+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,512495,,python-requests/2.21.0,0.488753
2019-04-16 19:34:42+00:00,97.105.19.58,GET /api/v1/items HTTP/1.1,200,3561,,python-requests/2.21.0,0.003396
2019-04-16 19:34:44+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,510103,,python-requests/2.21.0,0.486472
2019-04-16 19:34:46+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,510003,,python-requests/2.21.0,0.486377
2019-04-16 19:34:48+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,511963,,python-requests/2.21.0,0.488246


In [5]:
print("HTTP Status Codes, Frequency Count")
df.status.value_counts()

HTTP Status Codes, Frequency Count


200    13960
499       16
301        2
Name: status, dtype: int64

In [6]:
df.request_agent.value_counts()

 python-requests/2.21.0                                                                                                        12001
 python-requests/2.20.1                                                                                                         1911
 Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36        34
 Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0                                                8
 Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)                                                                        7
 Slackbot 1.0 (+https://api.slack.com/robots)                                                                                      6
python-requests/2.21.0                                                                                                             4
 Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (

## What do we know so far?
- Most of the request go through just fine
- The vast majority of requests are coming from python scripts using the python-requests library

In [7]:
print("Size in MB, # of Requests")
df.size_mb.value_counts(bins=6)

Size in MB, # of Requests


(0.327, 0.654]       12321
(-0.00296, 0.327]     1655
(1.634, 1.961]           1
(0.981, 1.307]           1
(1.307, 1.634]           0
(0.654, 0.981]           0
Name: size_mb, dtype: int64

In [8]:
# Looks like the vast majority of requests come in on Tuesday with some on Wednesday
df.groupby(df.index.dayofweek).agg(["count"])

Unnamed: 0_level_0,ip,request_method,status,size,destination,request_agent,size_mb
Unnamed: 0_level_1,count,count,count,count,count,count,count
timestamp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,11947,11947,11947,11947,22,11947,11947
2,2028,2028,2028,2028,3,2028,2028
3,1,1,1,1,0,1,1
4,1,1,1,1,0,1,1
6,1,1,1,1,0,1,1


## What else we've learned:
- Most requests between 0.327MB and 0.654MB
- Most traffic is on Tuesday (84%)
- Some traffic on Wednesday (16%)

In [9]:
df.request_method = df.request_method.str.strip()
df.parts = df.request_method.str.split(" ")

In [10]:
# Determine the http method (expecting GET and/or POST)
df["http_method"] = df.parts.apply(lambda x: x[0])

In [11]:
# Isolate the request endpoint
df["endpoints"] = df.parts.apply(lambda x: x[1])

In [12]:
df.endpoints.value_counts()

/api/v1/sales                      12403
/api/v1/items                       1065
/api/v1/stores                       229
/                                    107
/documentation                       100
/favicon.ico                          26
/api/v1//api/v1/items                 11
/api/v1/items/api/v1/items             7
/api/v1/items/next_page                5
/api/v1/                               4
/api/v1/sales/HTTP/1.1                 3
/api/v1/itemsitems                     3
/api/v1/store                          3
/api/v1/sales/                         3
/api/v1items                           2
/api/V1/HiZach!                        1
/api/v1/items&page=0                   1
/api/v1/helloclass!                    1
/api/v1//api/v1/items/next_page        1
/api/v1                                1
/api/v1/items/HTTP/1.1                 1
/api/v1/I_DIDNT_DO_IT!!!!              1
Name: endpoints, dtype: int64