## Detecting Time Series Anomolies

#### Corey Solitaire

`11.05.2020`

In [1]:
import numpy as np
import pandas as pd
import math
from sklearn import metrics

from scipy.stats import entropy

import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates #to format dates on our plots
%matplotlib inline
import seaborn as sns

# This is to make sure matplotlib doesn't throw the following error:
# The next line fixes "TypeError: float() argument must be a string or a number, not 'Timestamp' matplotlib"
pd.plotting.register_matplotlib_converters()

## Wrangle:

#### Read in df

In [2]:
df= pd.read_csv('anonymized-curriculum-access.txt', sep=" ", header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61


In [3]:
df.shape

(719459, 6)

#### Create date time formatt

In [4]:
df.columns = ['date', 'time', 'page_viewed', 'user_id', 'cohort_id', 'ip']
df["datetime"] = df["date"] + df["time"]
df = df.set_index('datetime')
df.drop(columns=['date', 'time'], inplace = True)
df.head()

Unnamed: 0_level_0,page_viewed,user_id,cohort_id,ip
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-2609:55:03,/,1,8.0,97.105.19.61
2018-01-2609:56:02,java-ii,1,8.0,97.105.19.61
2018-01-2609:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
2018-01-2609:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
2018-01-2609:56:24,javascript-i/conditionals,2,22.0,97.105.19.61


In [5]:
df.describe()

Unnamed: 0,user_id,cohort_id
count,719459.0,674619.0
mean,392.945478,37.17366
std,211.655544,14.971385
min,1.0,1.0
25%,230.0,26.0
50%,417.0,32.0
75%,568.0,53.0
max,787.0,62.0


In [9]:
df.groupby(['user_id','ip']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,page_viewed,cohort_id
user_id,ip,Unnamed: 2_level_1,Unnamed: 3_level_1
1,12.5.63.210,6,6
1,172.56.15.15,8,8
1,172.56.15.203,3,3
1,172.56.15.46,1,1
1,172.56.15.50,2,2
...,...,...,...
784,99.43.143.116,93,93
785,24.173.208.242,1,1
785,72.181.127.233,30,30
786,72.190.25.232,7,7


In [11]:
df.groupby(['cohort_id','ip']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,page_viewed,user_id
cohort_id,ip,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,107.204.49.22,1,1
1.0,107.77.196.213,9,9
1.0,107.77.217.196,3,3
1.0,107.77.217.65,7,7
1.0,107.77.220.6,1,1
...,...,...,...
62.0,98.199.170.30,2,2
62.0,98.39.66.72,24,24
62.0,99.151.201.25,254,254
62.0,99.162.243.68,126,126


***

#### Takeaway

`1. Dataset consists of 720,000 hits on the codeup curriculum.  The data consists of a datetime object, the page that was being viewed, the user id, cohort id, and ip address.`

`2. It appears that students are assigned numbers (user_id) and grouped in to cohorts (cohort_id)`

`3. Students often hit the website from different ip addresses, suggesting hits in a variety of locations`

***

## Explore:

***Research Question:***

`Is it possible to identify possible IP addresses that do not belong to Codeup Students?`