## Discrete Anomaly Detection
### Corey Solitaire
`11.4.2020`

In [1]:
from __future__ import division
import itertools
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import math
from sklearn import metrics
from random import randint
from matplotlib import style
import seaborn as sns
%matplotlib inline

## Wrangle Data:

#### Acquire

In [2]:
# Names for the seven log fields kept by `usecols` below.
colnames = ['ip', 'timestamp', 'request_method', 'status', 'size',
            'destination', 'request_agent']

# Read the raw access log.
# The separator regex splits on a whitespace character only when it is
#   * followed by an even number of double quotes up to the end of the line
#     (i.e. the whitespace is not inside a "quoted" field), and
#   * not followed by a `]` without an intervening `[`
#     (i.e. the whitespace is not inside the [bracketed] timestamp).
df_orig = pd.read_csv(
    'http://python.zach.lol/access.log',
    engine='python',       # multi-character/regex separators require the python engine
    header=None,           # the log has no header row
    index_col=False,
    names=colnames,
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    na_values='"-"',       # a quoted dash is read as a missing value
    usecols=[0, 3, 4, 5, 6, 7, 8],
)

# Hand-made rows added on top of the real log.
# NOTE(review): presumably these are injected as known anomalies (unseen IPs,
# very large sizes, 301 status, malformed request strings) — confirm intent.
new = pd.DataFrame(
    [
        ["95.31.18.119", "[21/Apr/2019:10:02:41+0000]",
         "GET /api/v1/items/HTTP/1.1", 200, 1153005, np.nan,
         "python-requests/2.21.0"],
        ["95.31.16.121", "[17/Apr/2019:19:36:41+0000]",
         "GET /api/v1/sales?page=79/HTTP/1.1", 301, 1005, np.nan,
         "python-requests/2.21.0"],
        ["97.105.15.120", "[18/Apr/2019:19:42:41+0000]",
         "GET /api/v1/sales?page=79/HTTP/1.1", 301, 2560, np.nan,
         "python-requests/2.21.0"],
        ["97.105.19.58", "[19/Apr/2019:19:42:41+0000]",
         "GET /api/v1/sales?page=79/HTTP/1.1", 200, 2056327, np.nan,
         "python-requests/2.21.0"],
    ],
    columns=colnames,
)

# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` is the supported equivalent. The index is intentionally not
# reset, matching the previous behaviour (the extra rows keep labels 0-3).
df = pd.concat([df_orig, new])

In [3]:
# Preview the first rows of the combined frame (values are still raw: bracketed
# timestamps and quoted request/agent strings).
df.head()

Unnamed: 0,ip,timestamp,request_method,status,size,destination,request_agent
0,97.105.19.58,[16/Apr/2019:19:34:42 +0000],"""GET /api/v1/sales?page=81 HTTP/1.1""",200,512495,,"""python-requests/2.21.0"""
1,97.105.19.58,[16/Apr/2019:19:34:42 +0000],"""GET /api/v1/items HTTP/1.1""",200,3561,,"""python-requests/2.21.0"""
2,97.105.19.58,[16/Apr/2019:19:34:44 +0000],"""GET /api/v1/sales?page=82 HTTP/1.1""",200,510103,,"""python-requests/2.21.0"""
3,97.105.19.58,[16/Apr/2019:19:34:46 +0000],"""GET /api/v1/sales?page=83 HTTP/1.1""",200,510003,,"""python-requests/2.21.0"""
4,97.105.19.58,[16/Apr/2019:19:34:48 +0000],"""GET /api/v1/sales?page=84 HTTP/1.1""",200,511963,,"""python-requests/2.21.0"""


In [4]:
# Column dtypes and non-null counts for the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13978 entries, 0 to 3
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ip              13978 non-null  object
 1   timestamp       13978 non-null  object
 2   request_method  13978 non-null  object
 3   status          13978 non-null  int64 
 4   size            13978 non-null  int64 
 5   destination     25 non-null     object
 6   request_agent   13978 non-null  object
dtypes: int64(2), object(5)
memory usage: 873.6+ KB


#### Change Status to Object

In [5]:
# Re-type only the status column as object; every other column is left as is.
# NOTE(review): presumably done so status codes are treated as labels rather
# than numbers — confirm.
df = df.astype({'status': object})
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13978 entries, 0 to 3
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ip              13978 non-null  object
 1   timestamp       13978 non-null  object
 2   request_method  13978 non-null  object
 3   status          13978 non-null  object
 4   size            13978 non-null  int64 
 5   destination     25 non-null     object
 6   request_agent   13978 non-null  object
dtypes: int64(1), object(6)
memory usage: 873.6+ KB


#### Parse Datetime

In [6]:
# Raw values look like "[16/Apr/2019:19:34:42 +0000]"; the hand-made rows have
# no space before the offset ("[21/Apr/2019:10:02:41+0000]").
# Normalise both layouts to "16/Apr/2019:19:34:42+0000" so a single explicit
# format can be used instead of per-element format guessing.
timestamp_text = (
    df['timestamp']
    .str.strip('[]')                                    # drop surrounding brackets
    .str.replace(r'\s+(?=[+-]\d{4}$)', '', regex=True)  # drop whitespace before the offset
)

# An explicit format keeps parsing deterministic (and fast); %z keeps the
# values timezone-aware.
df['timestamp'] = pd.to_datetime(timestamp_text, format='%d/%b/%Y:%H:%M:%S%z')

# Use the parsed timestamp as the index for the rest of the notebook.
df = df.set_index('timestamp')
df.head()

Unnamed: 0_level_0,ip,request_method,status,size,destination,request_agent
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-04-16 19:34:42+00:00,97.105.19.58,"""GET /api/v1/sales?page=81 HTTP/1.1""",200,512495,,"""python-requests/2.21.0"""
2019-04-16 19:34:42+00:00,97.105.19.58,"""GET /api/v1/items HTTP/1.1""",200,3561,,"""python-requests/2.21.0"""
2019-04-16 19:34:44+00:00,97.105.19.58,"""GET /api/v1/sales?page=82 HTTP/1.1""",200,510103,,"""python-requests/2.21.0"""
2019-04-16 19:34:46+00:00,97.105.19.58,"""GET /api/v1/sales?page=83 HTTP/1.1""",200,510003,,"""python-requests/2.21.0"""
2019-04-16 19:34:48+00:00,97.105.19.58,"""GET /api/v1/sales?page=84 HTTP/1.1""",200,511963,,"""python-requests/2.21.0"""


#### Cleanup Text

In [7]:
# Strip every double-quote character from the quoted text columns.
quoted_columns = ('request_method', 'request_agent', 'destination')
df = df.assign(**{name: df[name].str.replace('"', '') for name in quoted_columns})

# Drop the "?page=<number>" query detail so paged requests share one label.
df = df.assign(
    request_method=df['request_method'].str.replace(r'\?page=[0-9]+', '', regex=True)
)

df.head()

Unnamed: 0_level_0,ip,request_method,status,size,destination,request_agent
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-04-16 19:34:42+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,512495,,python-requests/2.21.0
2019-04-16 19:34:42+00:00,97.105.19.58,GET /api/v1/items HTTP/1.1,200,3561,,python-requests/2.21.0
2019-04-16 19:34:44+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,510103,,python-requests/2.21.0
2019-04-16 19:34:46+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,510003,,python-requests/2.21.0
2019-04-16 19:34:48+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,511963,,python-requests/2.21.0


#### Add Variable: Convert bytes to MB

In [8]:
# Response size in megabytes (bytes / 1024 / 1024), computed as a vectorised
# column operation instead of a Python-level loop over the Series.
df['size_mb'] = df['size'] / 1024 / 1024
df.head()

Unnamed: 0_level_0,ip,request_method,status,size,destination,request_agent,size_mb
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-04-16 19:34:42+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,512495,,python-requests/2.21.0,0.488753
2019-04-16 19:34:42+00:00,97.105.19.58,GET /api/v1/items HTTP/1.1,200,3561,,python-requests/2.21.0,0.003396
2019-04-16 19:34:44+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,510103,,python-requests/2.21.0,0.486472
2019-04-16 19:34:46+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,510003,,python-requests/2.21.0,0.486377
2019-04-16 19:34:48+00:00,97.105.19.58,GET /api/v1/sales HTTP/1.1,200,511963,,python-requests/2.21.0,0.488246


***

## Explore:

#### Value Counts by ip address

In [24]:
# Requests per client IP; dropna=False also counts missing IPs.
df.ip.value_counts(dropna=False)

97.105.19.58      11999
173.173.113.51     1059
72.181.113.170      613
72.181.105.81       246
24.26.242.9          21
68.201.219.223       21
70.121.214.34         2
35.175.171.137        2
52.87.230.102         2
52.91.30.150          1
54.145.52.184         1
95.31.16.121          1
34.229.70.250         1
35.174.209.2          1
3.88.129.158          1
45.23.250.16          1
54.172.14.223         1
52.90.165.200         1
95.31.18.119          1
3.92.201.136          1
34.207.64.242         1
97.105.15.120         1
Name: ip, dtype: int64

In [31]:
# Non-null count of every other column, grouped by client IP.
df.groupby('ip').count()

Unnamed: 0_level_0,request_method,status,size,destination,request_agent,size_mb
ip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
173.173.113.51,1059,1059,1059,0,1059,1059
24.26.242.9,21,21,21,0,21,21
3.88.129.158,1,1,1,0,1,1
3.92.201.136,1,1,1,0,1,1
34.207.64.242,1,1,1,0,1,1
34.229.70.250,1,1,1,0,1,1
35.174.209.2,1,1,1,0,1,1
35.175.171.137,2,2,2,0,2,2
45.23.250.16,1,1,1,0,1,1
52.87.230.102,2,2,2,0,2,2


***
***Takeaway:***

`1. Six IP addresses make up the bulk of the observations (each with more than 2 requests)`

`2. 97.105.19.58 is observed in the greatest frequency (11,999)`

<div class="alert alert-block alert-info"><b></b></div>

#### Value Counts by request_method

In [22]:
# Requests per cleaned request string (page numbers were removed earlier);
# dropna=False also counts missing values.
df.request_method.value_counts(dropna=False)

GET /api/v1/sales HTTP/1.1                      12403
GET /api/v1/items HTTP/1.1                       1065
GET /api/v1/stores HTTP/1.1                       229
GET / HTTP/1.1                                    107
GET /documentation HTTP/1.1                       100
GET /favicon.ico HTTP/1.1                          26
GET /api/v1//api/v1/items HTTP/1.1                 11
GET /api/v1/items/api/v1/items HTTP/1.1             7
GET /api/v1/items/next_page HTTP/1.1                5
GET /api/v1/ HTTP/1.1                               4
GET /api/v1/sales/ HTTP/1.1                         3
GET /api/v1/store HTTP/1.1                          3
GET /api/v1/itemsitems HTTP/1.1                     3
GET /api/v1/sales/HTTP/1.1                          3
GET /api/v1items HTTP/1.1                           2
GET /api/v1 HTTP/1.1                                1
GET /api/v1/I_DIDNT_DO_IT!!!! HTTP/1.1              1
GET /api/V1/HiZach! HTTP/1.1                        1
GET /api/v1/items&page=0 HTT

In [30]:
# Non-null count of every other column, grouped by request string.
df.groupby('request_method').count()

Unnamed: 0_level_0,ip,status,size,destination,request_agent,size_mb
request_method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GET / HTTP/1.1,107,107,107,3,107,107
GET /api/V1/HiZach! HTTP/1.1,1,1,1,0,1,1
GET /api/v1 HTTP/1.1,1,1,1,0,1,1
GET /api/v1/ HTTP/1.1,4,4,4,0,4,4
GET /api/v1//api/v1/items HTTP/1.1,11,11,11,1,11,11
GET /api/v1//api/v1/items/next_page HTTP/1.1,1,1,1,0,1,1
GET /api/v1/I_DIDNT_DO_IT!!!! HTTP/1.1,1,1,1,0,1,1
GET /api/v1/helloclass! HTTP/1.1,1,1,1,0,1,1
GET /api/v1/items HTTP/1.1,1065,1065,1065,0,1065,1065
GET /api/v1/items&page=0 HTTP/1.1,1,1,1,0,1,1


***
***Takeaway:***

`1. The most popular endpoints were sales, items, and stores`

`2. Less than 1% of hits visited the documentation page`

<div class="alert alert-block alert-info"><b></b></div>

#### Value Counts by status

In [21]:
# Requests per HTTP status code; dropna=False also counts missing values.
df.status.value_counts(dropna=False)

200    13960
499       16
301        2
Name: status, dtype: int64

In [29]:
# Non-null count of every other column, grouped by HTTP status code.
df.groupby('status').count()

Unnamed: 0_level_0,ip,request_method,size,destination,request_agent,size_mb
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
200,13960,13960,13960,25,13960,13960
301,2,2,2,0,2,2
499,16,16,16,0,16,16


***
***Takeaway:***

`1. Nearly all requests (13,960 of 13,978) returned status 200 (OK); only 16 returned 499 and 2 returned 301`

<div class="alert alert-block alert-info"><b></b></div>

#### Value Counts by destination

In [20]:
# Requests per destination value; dropna=False keeps the NaN bucket, which
# matters here because most rows have no destination.
df.destination.value_counts(dropna=False)

NaN                                                             13953
https://python.zach.lol/                                            4
https://ds.codeup.com/8.3_Acquire/                                  3
https://python.zach.lol/api/v1/stores?page=1                        2
https://python.zach.lol/api/v1/                                     2
https://python.zach.lol/api/v1/items?page=0                         1
https://python.zach.lol/api/v1/items                                1
https://python.zach.lol/api/v1//api/v1/items                        1
https://python.zach.lol/api/V1/HiZach!                              1
http://localhost:8888/notebooks/acquire.ipynb                       1
https://python.zach.lol/api/v1/stores?page=2                        1
http://localhost:8889/notebooks/timeseries_acquisition.ipynb        1
https://python.zach.lol/api/v1/I_DIDNT_DO_IT!!!!                    1
https://python.zach.lol/api/v1//api/v1/items?page=2                 1
https://python.zach.

In [28]:
# Non-null count of every other column, grouped by destination.
# NOTE(review): rows with a missing destination are presumably excluded, since
# groupby drops NaN keys by default — confirm.
df.groupby('destination').count()

Unnamed: 0_level_0,ip,request_method,status,size,request_agent,size_mb
destination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
http://localhost:8888/notebooks/acquire.ipynb,1,1,1,1,1,1
http://localhost:8889/notebooks/timeseries_acquisition.ipynb,1,1,1,1,1,1
https://ds.codeup.com/8.3_Acquire/,3,3,3,3,3,3
https://python.zach.lol/,4,4,4,4,4,4
https://python.zach.lol/api/V1/HiZach!,1,1,1,1,1,1
https://python.zach.lol/api/v1/,2,2,2,2,2,2
https://python.zach.lol/api/v1//api/v1/items,1,1,1,1,1,1
https://python.zach.lol/api/v1//api/v1/items/next_page,1,1,1,1,1,1
https://python.zach.lol/api/v1//api/v1/items?page=2,1,1,1,1,1,1
https://python.zach.lol/api/v1/I_DIDNT_DO_IT!!!!,1,1,1,1,1,1


***
***Takeaway:***

`1. The majority of these requests originated outside of the local host (note that only 25 of the 13,978 requests recorded a destination at all)`

<div class="alert alert-block alert-info"><b></b></div>

#### Value Counts by request_agent

In [25]:
# Requests per user-agent string; dropna=False also counts missing values.
df.request_agent.value_counts(dropna=False)

python-requests/2.21.0                                                                                                       12005
python-requests/2.20.1                                                                                                        1911
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36       34
Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0                                               8
Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)                                                                       7
Slackbot 1.0 (+https://api.slack.com/robots)                                                                                     6
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36        4
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gec

In [27]:
# Non-null count of every other column, grouped by user-agent string.
df.groupby('request_agent').count()

Unnamed: 0_level_0,ip,request_method,status,size,destination,size_mb
request_agent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0,8,8,8,8,0,8
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",2,2,2,2,1,2
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",4,4,4,4,4,4
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",34,34,34,34,20,34
Python-urllib/3.7,1,1,1,1,0,1
Slackbot 1.0 (+https://api.slack.com/robots),6,6,6,6,0,6
Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots),7,7,7,7,0,7
python-requests/2.20.1,1911,1911,1911,1911,0,1911
python-requests/2.21.0,12005,12005,12005,12005,0,12005


***
***Takeaway:***

`1. The most popular request agent was python-requests`   
`2. The second most popular was Mozilla`   
`3. The third was a slackbot`   

<div class="alert alert-block alert-info"><b></b></div>

In [None]:
## 