# Predict AFK/user based of Date and Time

## INIT

In [44]:
# imports
import pandas as pd, sklearn as sk, numpy as np, matplotlib.pyplot as plt
from scipy import stats
from scipy.signal import find_peaks

In [8]:
# read data to df
df = pd.read_csv('slack_data.csv')
imp_col = ['datetime','user','text']

In [28]:
# convert linux_time to datetime and filter AFK
import datetime
df = df[df['text'].notna()]
ts = lambda x: datetime.datetime.fromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S')
df['datetime'] = pd.to_datetime(df['ts'].apply(ts), format='%Y-%m-%d %H:%M:%S')

t_df = df[imp_col]
t_df = t_df[t_df['text'].str.contains('afk',case=False)]
t_df['text'] = 1
t_df.rename(columns={"text":"text_num"}, inplace=True)
t_df = t_df.sort_values('datetime')

t_df.head(10)

Unnamed: 0,datetime,user,text_num
36673,2018-02-01 13:49:44,U026L9E5X,1
17936,2018-02-01 13:49:44,U026L9E5X,1
55010,2018-02-01 13:49:44,U026L9E5X,1
36640,2018-02-01 14:37:34,U0H5WEF6G,1
17903,2018-02-01 14:37:34,U0H5WEF6G,1
54977,2018-02-01 14:37:34,U0H5WEF6G,1
17882,2018-02-01 14:40:59,U3NTV39QU,1
36619,2018-02-01 14:40:59,U3NTV39QU,1
54956,2018-02-01 14:40:59,U3NTV39QU,1
17879,2018-02-01 14:50:31,U0H5WEF6G,1


## Few random exploratory graphs

## Feature Engineering

### Go through the following blogs
- [Analytics Vidhya - 6 Powerful feature Engineering Techniques Time Series](https://www.analyticsvidhya.com/blog/2019/12/6-powerful-feature-engineering-techniques-time-series/)
- [Towards Data Science - Feature Engineering on Time-Series Data for Human Activity Recognition](https://towardsdatascience.com/feature-engineering-on-time-series-data-transforming-signal-data-of-a-smartphone-accelerometer-for-72cbe34b8a60)
- [Real-World Machine Learning - 7.3. Time-series features](https://livebook.manning.com/book/real-world-machine-learning/chapter-7/110)
- [Dezyre - 7.3. Time-series features](https://www.dezyre.com/article/8-feature-engineering-techniques-for-machine-learning/423)

### Feature Extraction

In [52]:
''' Logistic Time Series Data (helper methods)
- Filter users if required
- Resample data in 15min time solts or interval for time series
- Convert text to binary
'''

def time_bin_binaryfy(df,time_var):
    df.set_index(df.datetime, inplace=True)
    df = df.resample(time_var).sum()
    return df

def user_filter(df, user=None):
    if user is not None:
        df = df[df['user']==user]
    df.drop(['user'], axis=1, inplace=True)
    return time_bin_binaryfy(df,'1H')

# sample test
temp_df = user_filter(t_df.sample(n=100, random_state=1))
temp_df

Unnamed: 0_level_0,text_num
datetime,Unnamed: 1_level_1
2018-02-13 11:00:00,1
2018-02-13 12:00:00,0
2018-02-13 13:00:00,0
2018-02-13 14:00:00,0
2018-02-13 15:00:00,0
...,...
2019-09-09 11:00:00,0
2019-09-09 12:00:00,0
2019-09-09 13:00:00,0
2019-09-09 14:00:00,0


In [53]:
temp_df.max()

text_num    2
dtype: int64

In [54]:
# Date related features
def data_features(df):
    df['year']=df.index.year
    df['month']=df.index.month
    df['day']=df.index.day
    df['dayofweek_num']=df.index.dayofweek
    df['dayofweek_name']=df.index.day_name()
    return df[['year','month','day','dayofweek_num','dayofweek_name']]

data_features(temp_df.sample(n=10, random_state=1))

Unnamed: 0_level_0,year,month,day,dayofweek_num,dayofweek_name
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-08-04 08:00:00,2019,8,4,6,Sunday
2019-04-12 08:00:00,2019,4,12,4,Friday
2019-08-09 18:00:00,2019,8,9,4,Friday
2018-06-04 02:00:00,2018,6,4,0,Monday
2018-04-13 04:00:00,2018,4,13,4,Friday
2018-12-19 08:00:00,2018,12,19,2,Wednesday
2018-08-19 22:00:00,2018,8,19,6,Sunday
2018-09-30 08:00:00,2018,9,30,6,Sunday
2018-03-23 17:00:00,2018,3,23,4,Friday
2018-03-15 15:00:00,2018,3,15,3,Thursday


In [55]:
# Time related features
def time_features(df):
    df['hour'] = df.index.hour
    df['minute'] = df.index.minute
    return df[['hour','minute']]

time_features(temp_df.sample(n=10, random_state=1))

Unnamed: 0_level_0,hour,minute
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-08-04 08:00:00,8,0
2019-04-12 08:00:00,8,0
2019-08-09 18:00:00,18,0
2018-06-04 02:00:00,2,0
2018-04-13 04:00:00,4,0
2018-12-19 08:00:00,8,0
2018-08-19 22:00:00,22,0
2018-09-30 08:00:00,8,0
2018-03-23 17:00:00,17,0
2018-03-15 15:00:00,15,0


In [56]:
# Lag features
def lag_features(df,iter_p=1):
    for i in range(iter_p):
        df[f'lag_{i}'] = df['text_num'].shift(i)
    return df[[f'lag_{i}' for i in range(iter_p)]]
lag_features(temp_df.sample(n=10, random_state=1),7)

Unnamed: 0_level_0,lag_0,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-08-04 08:00:00,0,,,,,,
2019-04-12 08:00:00,0,0.0,,,,,
2019-08-09 18:00:00,0,0.0,0.0,,,,
2018-06-04 02:00:00,0,0.0,0.0,0.0,,,
2018-04-13 04:00:00,0,0.0,0.0,0.0,0.0,,
2018-12-19 08:00:00,0,0.0,0.0,0.0,0.0,0.0,
2018-08-19 22:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0
2018-09-30 08:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0
2018-03-23 17:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0
2018-03-15 15:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
# Rolling Window Feature
def rolling_window_features(df):
    df['rolling_mean'] = df['text_num'].rolling(window=7).mean()
    df['rolling_min'] = df['text_num'].rolling(window=7).min()
    df['rolling_max'] = df['text_num'].rolling(window=7).max()
    return df[['rolling_mean','rolling_min','rolling_max']]

rolling_window_features(temp_df.sample(n=10, random_state=1))

Unnamed: 0_level_0,rolling_mean,rolling_min,rolling_max
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-08-04 08:00:00,,,
2019-04-12 08:00:00,,,
2019-08-09 18:00:00,,,
2018-06-04 02:00:00,,,
2018-04-13 04:00:00,,,
2018-12-19 08:00:00,,,
2018-08-19 22:00:00,0.0,0.0,0.0
2018-09-30 08:00:00,0.0,0.0,0.0
2018-03-23 17:00:00,0.0,0.0,0.0
2018-03-15 15:00:00,0.0,0.0,0.0


In [58]:
# Expanding Window Feature
def expanding_window_features(df,expanding_num=1):
    df['expanding_mean'] = df['text_num'].expanding(expanding_num).mean()
    df['expanding_min'] = df['text_num'].expanding(expanding_num).min()
    df['expanding_max'] = df['text_num'].expanding(expanding_num).max()
    return df[['expanding_mean','expanding_min','expanding_max']]

expanding_window_features(temp_df.sample(n=10, random_state=1))

Unnamed: 0_level_0,expanding_mean,expanding_min,expanding_max
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-08-04 08:00:00,0.0,0.0,0.0
2019-04-12 08:00:00,0.0,0.0,0.0
2019-08-09 18:00:00,0.0,0.0,0.0
2018-06-04 02:00:00,0.0,0.0,0.0
2018-04-13 04:00:00,0.0,0.0,0.0
2018-12-19 08:00:00,0.0,0.0,0.0
2018-08-19 22:00:00,0.0,0.0,0.0
2018-09-30 08:00:00,0.0,0.0,0.0
2018-03-23 17:00:00,0.0,0.0,0.0
2018-03-15 15:00:00,0.0,0.0,0.0


In [59]:
def basic_statistical_features(df,window_size=7,step_size=3):
    val = []
    for i in range(0, df.shape[0] - window_size, step_size):
        val.append(df['text_num'].values[i: i + window_size])
    i_df = pd.DataFrame()
    # mean
    i_df['mean'] = pd.Series(val).apply(lambda x: x.mean())
    # std dev
    i_df['std'] = pd.Series(val).apply(lambda x: x.std())
    # avg absolute diff
    i_df['aad'] = pd.Series(val).apply(lambda x: np.mean(np.absolute(x - np.mean(x))))
    # min
    i_df['min'] = pd.Series(val).apply(lambda x: x.min())
    # max
    i_df['max'] = pd.Series(val).apply(lambda x: x.max())
    # max-min diff
    i_df['minmax_diff'] = i_df['max'] - i_df['min']
    # median
    i_df['median'] = pd.Series(val).apply(lambda x: np.median(x))
    # median abs dev
    i_df['mad'] = pd.Series(val).apply(lambda x: np.median(np.absolute(x - np.median(x))))
    # interquartile range
    i_df['iqr'] = pd.Series(val).apply(lambda x: np.percentile(x, 75) - np.percentile(x, 25))
    # values above mean
    i_df['above_mean'] = pd.Series(val).apply(lambda x: np.sum(x > x.mean()))
    # number of peaks
    i_df['peak_count'] = pd.Series(val).apply(lambda x: len(find_peaks(x)[0]))
    # skewness
    i_df['skewness'] = pd.Series(val).apply(lambda x: stats.skew(x))
    # kurtosis
    i_df['kurtosis'] = pd.Series(val).apply(lambda x: stats.kurtosis(x))
    # energy
    i_df['kurtosis'] = pd.Series(val).apply(lambda x: np.sum(x**2)/100)
    return i_df

basic_statistical_features(temp_df).head(20)

Unnamed: 0,mean,std,aad,min,max,minmax_diff,median,mad,iqr,above_mean,peak_count,skewness,kurtosis
0,0.142857,0.349927,0.244898,0,1,1,0.0,0.0,0.0,1,0,2.041241,0.01
1,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0
2,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0
3,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0
4,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0
5,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0
6,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0
7,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0
8,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0
9,0.0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0


### Feature selection

## Prediction Model


Take this df as argument, and return number of prediction scores run through differnt model pipelines
- Simple logistic regression
- Facebook Prophet
- Moving Average (MA)
- Exponential Smoothing (ES)
- Autoregressive Integrated Moving Average (ARIMA)
- LSTM
- LSTM + NN

## Exposed Endpoint