In [0]:
import pandas as pd #data manipulation library using data frames
import os    #library to join path names
from datetime import datetime #library for dates manipulation

In [0]:
# Global path variables shared by the cells below.

# Folder with the raw activity logs (logon.csv is read from here).
act_path = './drive/My Drive/DatosInsider/r4.2/'

# Final repository folder: employee_data_dates.csv is read from here and the
# monthly summary is written back to it.
dwh_path = './drive/My Drive/DatosInsider/DWH_tables/'

In [0]:
# logon.csv
# * Only the columns date, user and activity are loaded; 'date' is parsed as
#   a timestamp. Later cells filter activity on 'Logon' / 'Logoff'.
logon_csv = os.path.join(act_path, 'logon.csv')
log_df = pd.read_csv(
    logon_csv,
    parse_dates=['date'],
    usecols=['date', 'user', 'activity'],
)

# Departure date per employee: the 'max' column of employee_data_dates.csv,
# renamed so it can be joined to the log data on 'user'.
employee_dates_csv = os.path.join(dwh_path, 'employee_data_dates.csv')
emp_dep_dt_df = pd.read_csv(
    employee_dates_csv,
    usecols=['user_id', 'max'],
    parse_dates=['max'],
).rename(columns={'user_id': 'user', 'max': 'departure_date'})

In [0]:
# Daily logon summary: one row per (user, calendar day) with at least one
# 'Logon' event.
group_log_on = (
    log_df[log_df['activity'] == 'Logon']
    .groupby(['user', pd.Grouper(key='date', freq='D')])
    .agg(
        min_logon=('date', 'min'),        # first logon timestamp of the day
        cnt_logon=('activity', 'count'),  # number of logon events that day
    )
    .reset_index()
)

# Daily logoff summary, same shape as the logon one.
group_log_off = (
    log_df[log_df['activity'] == 'Logoff']
    .groupby(['user', pd.Grouper(key='date', freq='D')])
    .agg(
        max_logoff=('date', 'max'),       # last logoff timestamp of the day
        cnt_logoff=('activity', 'count'), # number of logoff events that day
    )
    .reset_index()
)

# Left joins keep every (user, day) that has a logon. Days without a logoff
# get NaT/NaN in the logoff columns; users missing from emp_dep_dt_df get NaT
# as departure_date.
daily_log_activity = pd.merge(group_log_on, group_log_off, on=['user', 'date'], how='left')
daily_log_activity = pd.merge(daily_log_activity, emp_dep_dt_df, on=['user'], how='left')

# Time between first logon and last logoff of the day, stored as a numeric
# count of nanoseconds (the monthly cell converts it back with
# pd.to_timedelta). Dividing by a 1 ns Timedelta keeps missing values as NaN;
# pd.to_numeric on a timedelta column would instead reinterpret NaT as a huge
# negative integer and corrupt every mean/median computed downstream.
# NOTE: this must run before min_logon / max_logoff are overwritten below.
daily_log_activity['logged_time'] = (
    (daily_log_activity['max_logoff'] - daily_log_activity['min_logon'])
    / pd.Timedelta(1, unit='ns')
)


def _seconds_since_midnight(timestamps):
    """Return the time-of-day of each timestamp as seconds since midnight (NaN for NaT)."""
    return (
        timestamps.dt.hour * 3600
        + timestamps.dt.minute * 60
        + timestamps.dt.second
    )


# Replace the timestamps by their time-of-day so they can be averaged per month.
daily_log_activity['min_logon'] = _seconds_since_midnight(daily_log_activity['min_logon'])
daily_log_activity['max_logoff'] = _seconds_since_midnight(daily_log_activity['max_logoff'])

# Flag days that fall in a calendar month later than the departure month.
# The comparison uses year*12 + month so the year is taken into account
# (comparing .month alone misses January activity after a December departure
# and wrongly flags December activity before a January departure).
# A missing departure_date yields NaN, the comparison is False, the flag is 0.
activity_month = daily_log_activity['date'].dt.year * 12 + daily_log_activity['date'].dt.month
departure_month = (
    daily_log_activity['departure_date'].dt.year * 12
    + daily_log_activity['departure_date'].dt.month
)
daily_log_activity['unauthorized_log'] = (activity_month > departure_month).astype(int)


In [0]:
# Monthly summary: one row per (user, month) aggregated from the daily rows.
# min_logon / max_logoff are seconds since midnight; logged_time is a numeric
# nanosecond count (see the conversion back to timedelta below).
# NOTE: freq="M" (month end) is kept for compatibility with older pandas;
# pandas >= 2.2 deprecates this alias in favour of "ME".
monthly_activity = (
    daily_log_activity
    .groupby(['user', pd.Grouper(key='date', freq="M")])
    .agg(
        # Earliest / typical time of the first logon of the day.
        min_logon=('min_logon', "min"),
        avg_min_logon=('min_logon', "mean"),
        med_min_logon=('min_logon', "median"),

        # Logon events: monthly total and per-active-day statistics.
        cnt_logon=('cnt_logon', "sum"),
        avg_cnt_logon=('cnt_logon', "mean"),
        med_cnt_logon=('cnt_logon', "median"),

        # Latest / typical time of the last logoff of the day.
        max_logoff=('max_logoff', "max"),
        avg_max_logoff=('max_logoff', "mean"),
        med_max_logoff=('max_logoff', "median"),

        # Logoff events: monthly total and per-active-day statistics.
        cnt_logoff=('cnt_logoff', "sum"),
        avg_cnt_logoff=('cnt_logoff', "mean"),
        med_cnt_logoff=('cnt_logoff', "median"),

        # Typical daily span between first logon and last logoff.
        avg_logged_time=('logged_time', "mean"),
        med_logged_time=('logged_time', "median"),

        # Number of days flagged as unauthorized. The flag is 0/1, so it must
        # be summed; "count" would return the number of rows (active days)
        # regardless of the flag value.
        cnt_unauthorized_log=('unauthorized_log', "sum"),
    )
    .reset_index()
)

# Convert the averaged nanosecond counts back to timedeltas (NaN -> NaT).
duration_cols = ['avg_logged_time', 'med_logged_time']
monthly_activity[duration_cols] = monthly_activity[duration_cols].apply(pd.to_timedelta)

# Period key in YYYYMM form derived from the month-end date of each group.
monthly_activity['period'] = monthly_activity['date'].dt.strftime('%Y%m')

In [0]:
# Persist the monthly summary into the final repository folder (no index column).
monthly_csv_path = os.path.join(dwh_path, 'monthly_logged_activity.csv')
monthly_activity.to_csv(monthly_csv_path, index=False)