In [38]:
import pandas as pd
import matplotlib.pyplot as plt

In [39]:
def acquire(file_name, column_names):
    '''Read the whitespace-delimited access log into a pandas DataFrame.

    Parameters
    ----------
    file_name : str, path object or file-like
        Passed straight to ``pd.read_csv``.
    column_names : list of str
        Names for the five fields that are kept, in file order.

    Returns
    -------
    pandas.DataFrame
        One row per input line holding file fields 0, 2, 3, 4 and 5.
        Field 1 is skipped (presumably the time of day -- confirm against
        the raw file).
    '''
    # Raw string: "\s" inside a normal literal is an invalid escape sequence
    # and warns on recent Python versions.
    # The separator is a regex matching ONE whitespace character, so two
    # adjacent separators produce an empty field (read as NaN) instead of
    # being collapsed.
    # Regex separators are only handled by the python engine; requesting it
    # explicitly avoids the ParserWarning pandas emits when it falls back.
    return pd.read_csv(file_name, sep=r"\s", header=None, names=column_names,
                       usecols=[0, 2, 3, 4, 5], engine="python")

def prep(df, user):
    '''Count one user's page requests per calendar day.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``user_id``, ``date`` and ``endpoint`` columns.
        It is not modified.
    user : scalar
        Value compared against ``df.user_id`` to select the rows to keep.

    Returns
    -------
    pandas.Series
        Daily count of non-null ``endpoint`` values for that user, indexed
        by a datetime index named ``date``. Days between the first and last
        observation that have no rows are reported as 0.
    '''
    # Work on a fresh frame (``assign`` returns a copy) instead of assigning
    # into a filtered slice, which raised SettingWithCopyWarning.
    user_rows = (
        df.loc[df['user_id'] == user, ['date', 'endpoint']]
        .assign(date=lambda frame: pd.to_datetime(frame['date']))
        .set_index('date')
    )
    # 'D' is the canonical daily alias; lowercase 'd' is deprecated in
    # recent pandas releases.
    return user_rows['endpoint'].resample('D').count()

def compute_pct_b(pages, span, weight, user):
    '''Build the band table for one series of page counts.

    Parameters
    ----------
    pages : pandas.Series
        Page counts to analyse.
    span : number
        ``span`` handed to ``Series.ewm`` for both the mean and the std.
    weight : number
        Multiplier applied to the exponentially weighted std to set how far
        the bands sit from the midband.
    user : scalar
        Value written into every row of the ``user_id`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``pages``, ``midband``, ``ub``, ``lb``, ``pct_b``,
        ``user_id`` on the index of ``pages``.
    '''
    weighted = pages.ewm(span=span)
    # exponentially weighted mean is the centre line
    center = weighted.mean()
    # distance from the centre line to either band
    offset = weighted.std() * weight
    # assemble everything in one pass, naming the columns via ``keys``
    bands = pd.concat(
        [pages, center, center + offset, center - offset],
        axis=1,
        keys=['pages', 'midband', 'ub', 'lb'],
    )
    # position of the observation inside the band: 0 at lb, 1 at ub
    band_height = bands['ub'] - bands['lb']
    bands['pct_b'] = (bands['pages'] - bands['lb']) / band_height
    # tag every row with the user it belongs to
    bands['user_id'] = user
    return bands

def plt_bands(my_df, user):
    '''Draw the page counts, the midband and both bands on a single axes.

    Parameters
    ----------
    my_df : pandas.DataFrame
        Frame with ``pages``, ``midband``, ``ub`` and ``lb`` columns; its
        index is used for the x axis.
    user : scalar
        Shown in the legend entry of the page-count line.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    '''
    _, axes = plt.subplots(figsize=(12, 8))
    # (column, legend label) pairs, listed in drawing order
    lines_to_draw = (
        ('pages', 'Number of Pages, User: ' + str(user)),
        ('midband', 'EMA/midband'),
        ('ub', 'Upper Band'),
        ('lb', 'Lower Band'),
    )
    for column, legend_label in lines_to_draw:
        axes.plot(my_df.index, my_df[column], label=legend_label)
    axes.legend(loc='best')
    axes.set_ylabel('Number of Pages')
    plt.show()
    
def find_anomalies(df, user, span, weight):
    '''Run prep and compute_pct_b for one user and keep the flagged rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed on to ``prep``.
    user : scalar
        User to analyse; forwarded to ``prep`` and ``compute_pct_b``.
    span, weight : number
        Forwarded unchanged to ``compute_pct_b``.

    Returns
    -------
    pandas.DataFrame
        The rows of the band table whose ``pct_b`` is greater than 1.
    '''
    # daily page counts for this user
    daily_pages = prep(df, user)
    # band table built from those counts
    bands = compute_pct_b(daily_pages, span, weight, user)
    # plt_bands(bands, user) can be called here to inspect the bands visually
    above_upper_band = bands['pct_b'] > 1
    return bands[above_upper_band]

In [40]:
# Access log to load; a bare file name, so it is resolved against the
# notebook's working directory.
file_name = "anonymized-curriculum-access.txt"
# Names for the five fields acquire() keeps (file fields 0, 2, 3, 4, 5).
column_names = ['date', 'endpoint', 'user_id', 'cohort_id', 'source_ip']
df = acquire(file_name, column_names)
# Preview the first rows to check that the columns lined up as expected.
df.head()

Unnamed: 0,date,endpoint,user_id,cohort_id,source_ip
0,2018-01-26,/,1,8.0,97.105.19.61
1,2018-01-26,java-ii,1,8.0,97.105.19.61
2,2018-01-26,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,2018-01-26,slides/object_oriented_programming,1,8.0,97.105.19.61
4,2018-01-26,javascript-i/conditionals,2,22.0,97.105.19.61


In [41]:
# Single-user trial run with a wide band (weight = 6).
user = 1
span = 30
weight = 6
# One call is enough: the original cell ran find_anomalies twice with the
# same arguments and kept only the second, identical result.
user_df = find_anomalies(df, user, span, weight)

# With a single frame there is nothing to concatenate; copying it avoids
# seeding pd.concat with an empty DataFrame, which is deprecated behaviour.
anomalies = user_df.copy()

In [42]:
span = 30
weight = 3.5

# Collect each user's flagged rows and concatenate once at the end:
# calling pd.concat inside the loop re-copies the accumulated frame on
# every iteration and starts from an empty DataFrame (deprecated behaviour).
per_user_anomalies = []
for u in df.user_id.unique():
    user_df = find_anomalies(df, u, span, weight)
    per_user_anomalies.append(user_df)

# pd.concat raises on an empty list, so fall back to an empty frame to keep
# the result the original loop produced when there are no users at all.
anomalies = (
    pd.concat(per_user_anomalies, axis=0) if per_user_anomalies else pd.DataFrame()
)

In [43]:
# Show the flagged rows for all users; pandas abbreviates long frames in the
# rendered output.
anomalies

Unnamed: 0_level_0,pages,midband,ub,lb,pct_b,user_id
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-01-29,44,3.623334,42.081344,-34.834677,1.024945,1
2019-07-08,101,8.248768,96.507472,-80.009936,1.025451,1
2018-10-15,9,0.581421,8.457414,-7.294571,1.034446,3
2019-01-09,4,0.262470,3.802115,-3.277175,1.027953,3
2019-04-06,2,0.129825,1.889712,-1.630061,1.031334,3
...,...,...,...,...,...,...
2021-01-24,7,0.574393,6.857226,-5.708440,1.011362,817
2021-03-25,59,6.824556,58.415342,-44.766230,1.005666,843
2021-02-23,67,8.229409,66.322904,-49.864086,1.005828,851
2021-04-06,19,1.348943,18.056879,-15.358994,1.028224,854
