In [1]:
from __future__ import division
import itertools
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from numpy import linspace, loadtxt, ones, convolve
from sklearn.ensemble import IsolationForest
import numpy as np
import pandas as pd
import collections
import math
from sklearn import metrics
from random import randint
from matplotlib import style
import seaborn as sns
# style.use('fivethirtyeight')
%matplotlib inline

pd.plotting.register_matplotlib_converters()

--------

<h2>README</h2>

- The following exercises show the process of utilizing functions to help analyze anomalies by using Exponential Moving Average and Bollinger Bands
- The other goal for this exercise is to add doc-strings and markdown comments to describe the processes that are occurring within the function

------

- Steps to analyze time series anomalies

     - Acquire the data
     - Prep the data with user (observation)
     - Compute the features
         - compute mid-band
         - compute standard deviation
         - compute upper & lower band
         - create a dataframe with metrics
         - compute %b
         - add the user_id to the dataframe
     - Plot
     - Search for anomalies
     
     
- other items to consider
    - Determine your K (also known as your 'weight'). Standard for real world is usually 6 (this is to reduce noise)
    

In [12]:

# Acquire the data 

def acquire(file_name, column_names):
    
    '''This function acquires a file from a csv, uses no header (blank columns) then assigns column names & finishes by calling the columns to be pulled'''
    
    return pd.read_csv(file_name, sep='\s', header=None, names=column_names, usecols=[0, 2, 3, 4, 5])

# Prep the data

def prep(df, user, span, weight):
    
    ''' This function uses the already acquired data frame and then preps the data by doing the following:
         - have an argument where the user can be specified
         - convert the date column to datetime
         - setting the index as the date
         - finally, this function will return a Pandas Series called pages that produces a user id with total pages accessed. '''
    
    df = df[df.user_id == user]
    df.date = pd.to_datetime(df.date)
    df = df.set_index(df.date)
    pages = df['endpoint'].resample('d').count()
    
# Let's compute the b percentage, %b

def compute_pct_b(pages, span, weight, user):
    
    ''' This function calculates the midband, standard deviation, upper & lower bands. It then
       concatanates the upper & lower bands together. After this is complete, the function then concatanates
       the mid-band along with the upper/lower bands and pages which was created back in the prep function. New columns
       are created to hold the 4 new columns that were brought together through concatanation. Finally, two new columns are 
       made which are the %b and user_id. The functions returns a new dataframe. '''
    
    midband = pages.ewm(span=span).mean()
    stdev = pages.ewm(span=span).std()
    ub = midband + stdev*weight
    lb = midband - stdev*weight
    bb = pd.concat([ub, lb], axis=1)
    my_df = pd.concat([pages, midband, bb], axis=1)
    my_df.columns = ['pages', 'midband', 'ub', 'lb']
    my_df['pct_b'] = (my_df['pages'] - my_df['lb'])/(my_df['ub'] - my_df['lb'])
    my_df['user_id'] = user
    
    return my_df

# plot the upper, mid, lower bands for every user

def plt_bands(my_df, user):
    
    ''' This functions will plot the upper, mid, lower bands for every user along with the original page counts'''
    
    fig, ax = plt.subplots(figsize=(12,8))
    ax.plot(my_df.index, my_df.pages, label='Number of Pages, User: '+str(user))
    ax.plot(my_df.index, my_df.midband, label = 'EMA/midband')
    ax.plot(my_df.index, my_df.ub, label = 'Upper Band')
    ax.plot(my_df.index, my_df.lb, label = 'Lower Band')
    ax.legend(loc='best')
    ax.set_ylabel('Number of Pages')
    
    plt.show()
    

# Find anomalies

def find_anomalies(df, user, span, weight):
    
    '''This function imports a clean/prepped data frame and also uses the user_id to calculate the percentage b by inputting the
    span (the number of days for the exponential moving average) and the weight (that is our k which is used to reduce noise. The higher the k
    the less the noise although, too high and no anomalies will be found). Finally, the function will return all instances of the pct_b column
    that are over 1 '''
    
    pages = prep(df, user, span, weight)
    my_df = compute_pct_b(pages, span, weight, user)
    
    
    
    return my_df[my_df.pct_b>1]
    
    

In [19]:
file_name = 'logs_df.csv'
column_names = ['date', 'endpoint', 'user_id', 'cohort_id', 'source_ip']

In [20]:
df = acquire(file_name, column_names)

In [22]:
span = 30
weight = 3.5
user_df = find_anomalies(df, user, span, weight)

anomalies = pd.DataFrame()
for u in list(df.user_id.unique()):
    user_df = find_anomalies(df, u, span, weight)
    anomalies = pd.concat([anomalies, user_df], axis=0)

TypeError: prep() takes 2 positional arguments but 4 were given