In [None]:
import pandas as pd
import datetime
import calendar

In [None]:
def read_data(filename):
    """Load an Excel workbook into a DataFrame.

    Parameters
    ----------
    filename : str or path-like
        Location of the workbook; handed straight to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_excel`` returns for ``filename`` with default options.
    """
    frame = pd.read_excel(filename)
    return frame

In [None]:
def generate_from(start, end, interval=3600):
    """Build a one-column frame holding the start time of each interval.

    Parameters
    ----------
    start, end : datetime-like
        Bounds of the range; ``end - start`` must support ``total_seconds()``.
    interval : number, default 3600
        Interval length in seconds.

    Returns
    -------
    pandas.DataFrame
        A single column ``'from'`` whose row ``i`` is ``start + i * interval``
        seconds. The row count is ``int((end - start).total_seconds() / interval)``,
        so a trailing partial interval is dropped.
    """
    step_count = int((end - start).total_seconds() / interval)
    starts = []
    for step in range(step_count):
        starts.append(start + datetime.timedelta(seconds=step * interval))
    return pd.DataFrame({'from': starts})

def generate_to(start, end, interval=3600):
    """Build a one-column frame holding the end time of each interval.

    Parameters
    ----------
    start, end : datetime-like
        Bounds of the range; ``end - start`` must support ``total_seconds()``.
    interval : number, default 3600
        Interval length in seconds.

    Returns
    -------
    pandas.DataFrame
        A single column ``'to'`` whose row ``i`` is ``start + (i + 1) * interval``
        seconds. The row count is ``int((end - start).total_seconds() / interval)``,
        so a trailing partial interval is dropped.
    """
    step_count = int((end - start).total_seconds() / interval)
    ends = []
    # Counting from 1 gives the closing edge of each interval.
    for step in range(1, step_count + 1):
        ends.append(start + datetime.timedelta(seconds=step * interval))
    return pd.DataFrame({'to': ends})

In [None]:
def generate_fields(data):
    """Add calendar and hour-bucket columns derived from the ``active`` column.

    The frame is modified in place and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``'active'`` column of datetime-like values.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with these columns added:

        - ``'date'``: calendar date of ``active``
        - ``'day of week'``: English weekday name of that date
        - ``'from'``: ``active`` truncated to the start of its hour
        - ``'to'``: ``from`` plus one hour
        - ``'time'``: time-of-day part of ``from``
    """
    data['date'] = data['active'].apply(lambda x: datetime.datetime.date(x))
    data['day of week'] = data['date'].apply(lambda x: calendar.day_name[x.weekday()])
    # Zero the microseconds as well: otherwise two events in the same hour with
    # different sub-second parts get different 'from' keys, which splits hourly
    # groups and defeats matching against a generated hour table.
    data['from'] = data['active'].apply(lambda x: x.replace(minute=0, second=0, microsecond=0))
    data['to'] = data['from'].apply(lambda x: x + datetime.timedelta(hours=1))
    data['time'] = data['from'].apply(lambda x: datetime.datetime.time(x))
    return data

In [None]:
def generate_time_table(data, interval=3600):
    """Build a contiguous table of intervals spanning the observed data.

    The table runs from the smallest ``data['from']`` to the largest
    ``data['to']``. If that span is not a whole number of intervals, the end
    is pushed out to the next interval boundary so the last observations are
    still covered. Without this, truncation in the helpers would end the table
    early (for example, a daily table would lose its final day).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'from'`` and ``'to'`` columns of datetime-like values.
    interval : number, default 3600
        Interval length in seconds.

    Returns
    -------
    pandas.DataFrame
        Columns ``'from'``, ``'to'``, ``'date'`` (date of ``from``) and
        ``'day of week'``; a ``'time'`` column (time of ``from``) is included
        only when ``interval`` is exactly 3600.
    """
    start = min(data['from'])
    end = max(data['to'])

    # Round the span up to a whole number of intervals; an already aligned
    # span (the usual hourly case) is left untouched.
    step = datetime.timedelta(seconds=interval)
    overhang = (end - start) % step
    if overhang != datetime.timedelta(0):
        end = end + (step - overhang)

    from_time = generate_from(start, end, interval)
    to_time = generate_to(start, end, interval)
    time_table = pd.concat([from_time, to_time], axis=1)
    time_table['date'] = time_table['from'].apply(lambda x: datetime.datetime.date(x))
    if interval == 3600:
        # Time-of-day is only added for hourly tables.
        time_table['time'] = time_table['from'].apply(lambda x: datetime.datetime.time(x))
    time_table['day of week'] = time_table['date'].apply(lambda x: calendar.day_name[x.weekday()])
    return time_table

In [None]:
def generate_gap(time_template, time):
    """Flag the intervals of ``time_template`` that have no matching observation.

    ``time_template`` is modified in place and also returned.

    Parameters
    ----------
    time_template : pandas.DataFrame
        Table with a ``'from'`` column listing every interval start.
    time : pandas.DataFrame
        Table with a ``'from'`` column listing the interval starts that
        actually occur.

    Returns
    -------
    pandas.DataFrame
        ``time_template`` with a boolean ``'gap'`` column: ``True`` where the
        row's ``'from'`` value does not appear in ``time['from']``.
    """
    # Vectorised membership test instead of scanning a Python list once per row.
    time_template['gap'] = ~time_template['from'].isin(time['from'])
    return time_template

In [None]:
def generate_activities_count(data, time_table, all_flag=True, method='day of week'):
    """Count rows of ``data`` per day, per hour, or per weekday.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the grouping columns for the chosen ``method`` (``'date'`` and
        ``'day of week'``, ``'from'``, or ``'day of week'``), plus ``'name'``
        when ``all_flag`` is false.
    time_table : pandas.DataFrame
        Only used for the pooled hourly count, where the counts are
        left-merged onto it by ``'from'`` and missing values are filled with 0.
    all_flag : bool, default True
        True counts all rows together; False additionally groups by ``'name'``.
    method : str, default 'day of week'
        ``'daily'`` or ``'hourly'``; any other value counts by weekday.

    Returns
    -------
    pandas.DataFrame
        For the pooled hourly case, ``time_table`` with the count column
        appended. Otherwise a frame indexed by the grouping keys with a single
        count column. The column is named ``'<all|ind> <daily|hourly|day of week> count'``.
    """
    # Select the grouping keys and both possible column labels for the method.
    if method == 'daily':
        base_keys = ['date', 'day of week']
        pooled_label, per_name_label = 'all daily count', 'ind daily count'
    elif method == 'hourly':
        base_keys = ['from']
        pooled_label, per_name_label = 'all hourly count', 'ind hourly count'
    else:
        base_keys = ['day of week']
        pooled_label, per_name_label = 'all day of week count', 'ind day of week count'

    if all_flag:
        group_keys, label = base_keys, pooled_label
    else:
        group_keys, label = ['name'] + base_keys, per_name_label

    counts = pd.DataFrame(data.groupby(group_keys).size())
    counts.columns = [label]

    # Only the pooled hourly counts are aligned to the full hour table, so
    # hours with no activity show up with a count of 0.
    if all_flag and method == 'hourly':
        counts = time_table.merge(counts, on=['from'], how='left').fillna(0)
    return counts

In [None]:
def run_data_processing(filename, write_data=True, output_filename='processed_data.xlsx'):
    """Run the full pipeline: load, derive fields, build time tables, count, export.

    Parameters
    ----------
    filename : str or path-like
        Excel workbook to read. The caller's value is used as given.
    write_data : bool, default True
        When true, write every intermediate and summary table to
        ``output_filename``, one sheet per table.
    output_filename : str or path-like, default 'processed_data.xlsx'
        Destination workbook; only used when ``write_data`` is true.

    Returns
    -------
    None
    """
    data = read_data(filename)
    data = generate_fields(data)

    # Hourly template (with gap flags) and daily template covering the data.
    hour_table = generate_time_table(data)
    hour_table = generate_gap(hour_table, data)
    date_table = generate_time_table(data, interval=3600*24)

    # Counts pooled over everyone.
    all_day_of_week_activities = generate_activities_count(data, hour_table, all_flag=True, method='day of week')
    all_daily_activities = generate_activities_count(data, date_table, all_flag=True, method='daily')
    all_hourly_activities = generate_activities_count(data, hour_table, all_flag=True, method='hourly')

    # Counts broken down by the 'name' column.
    ind_day_of_week_activities = generate_activities_count(data, hour_table, all_flag=False, method='day of week')
    ind_daily_activities = generate_activities_count(data, date_table, all_flag=False, method='daily')
    ind_hourly_activities = generate_activities_count(data, hour_table, all_flag=False, method='hourly')

    if write_data:
        with pd.ExcelWriter(output_filename) as writer:
            # Flat tables: the default integer index carries no information.
            data.to_excel(writer, sheet_name='data', index=False)
            hour_table.to_excel(writer, sheet_name='hour_table', index=False)
            date_table.to_excel(writer, sheet_name='date_table', index=False)

            # Grouped counts keep their index because it holds the group keys;
            # the pooled hourly table was merged onto hour_table, so it is flat.
            all_day_of_week_activities.to_excel(writer, sheet_name='all_day_of_week_activities')
            all_daily_activities.to_excel(writer, sheet_name='all_daily_activities')
            all_hourly_activities.to_excel(writer, sheet_name='all_hourly_activities', index=False)

            ind_day_of_week_activities.to_excel(writer, sheet_name='ind_day_of_week_activities')
            ind_daily_activities.to_excel(writer, sheet_name='ind_daily_activities')
            ind_hourly_activities.to_excel(writer, sheet_name='ind_hourly_activities')

#### run scripts

In [None]:
# Notebook entry point: run the pipeline for 'raw_data.xlsx' with output writing enabled.
filename = 'raw_data.xlsx'
run_data_processing(filename, write_data=True)

In [None]:
# filename = 'raw_data.xlsx'
# data = read_data(filename)
# data = generate_fields(data)
# hour_table = generate_time_table(data)
# hour_table = generate_gap(hour_table, data)
# date_table = generate_time_table(data, interval=3600*24)

# all_day_of_week_activities = generate_activities_count(data, hour_table, all_flag=True, method='day of week')
# all_daily_activities = generate_activities_count(data, date_table, all_flag=True, method='daily')
# all_hourly_activities = generate_activities_count(data, hour_table, all_flag=True, method='hourly')

# ind_day_of_week_activities = generate_activities_count(data, hour_table, all_flag=False, method='day of week')
# ind_daily_activities = generate_activities_count(data, date_table, all_flag=False, method='daily')
# ind_hourly_activities = generate_activities_count(data, hour_table, all_flag=False, method='hourly')