In [27]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [28]:
from activity_validator.hetus_data_processing import main
import activity_validator.hetus_data_processing.hetus_columns as col
# Load and process the HETUS 2010 data; the call returns the profile type and the data.
# TODO: only works when the notebook is executed from the project root directory
profile_type, data = main.process_hetus_2010_data()
# Extract the activity data from the loaded data
# (presumably the per-time-slot activity columns MACT1 - MACT144 - TODO confirm).
act_data = col.get_activity_data(data)

2023-11-06 19:10:59 DEBUG    Loading HETUS file for DE


2023-11-06 19:11:05 INFO     Loaded HETUS file for DE with 27143 entries and 1939 columns in 5.4 s


-----------------  -----
Number of diaries  27143


-----------------  -----


2023-11-06 19:11:23 INFO     Created activity types file: data\validation_data\activities\available_activity_types.json
2023-11-06 19:11:23 INFO     Out of 9051 groups on Person level, 0 are inconsistent.
2023-11-06 19:11:25 INFO     Extracted 9051 groups on Person level from 27143 entries in 0.0 s
2023-11-06 19:11:25 INFO     Determined working status for 9046 out of 9051 persons (99.9 %)


-----------------  -----
Number of diaries  27143
Number of persons   9051

-----------------  -----


2023-11-06 19:11:25 DEBUG    Timing: 'determine_work_statuses' took: 0.1998 sec
2023-11-06 19:11:25 DEBUG    Timing: 'get_person_categorization_data' took: 0.1998 sec

Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value 'work' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.

2023-11-06 19:11:26 INFO     Determined day type for 27128 out of 27128 diary entries (100.0 %)
2023-11-06 19:11:26 DEBUG    Timing: 'determine_day_types' took: 0.2778 sec
2023-11-06 19:11:29 DEBUG    Timing: 'get_diary_categorization_data' took: 3.8861 sec
2023-11-06 19:11:29 INFO     Sorted 27128 entries into 20 categories.
2023-11-06 19:11:29 DEBUG    Created DataFrame file data\validation_data\categories\categories.csv


COUNTRY                        DE
sex    work status day type      
female full time   no work   1515
                   work      1581
       part time   no work   2129
                   work      1653
       retired     no work   1972
                   work        42
       student     no work   3086
                   work       130
       unemployed  no work   2320
                   work       152
male   full time   no work   2672
                   work      3550
       part time   no work    207
                   work       189
       retired     no work   2147
                   work        51
       student     no work   2875
                   work       117
       unemployed  no work    677
                   work        63


2023-11-06 19:11:30 DEBUG    Timing: 'categorize' took: 1.2315 sec
2023-11-06 19:11:31 DEBUG    Timing: 'process_hetus_2010_data' took: 31.2307 sec


In [33]:
# functions for sorting the data and see how the plot changes
def sort_by_waking_up(act_data):
    """Sort the rows by the position of their first activity that is not 'sleep'.

    Parameters
    ----------
    act_data : pd.DataFrame
        One row per diary and one column per time slot (columns in
        chronological order); each cell holds an activity name. Missing
        cells (NaN/None) are not counted as an activity.

    Returns
    -------
    pd.DataFrame
        The rows of ``act_data`` reordered so that rows whose first
        non-'sleep' activity occurs earliest come first. Rows with the same
        wake-up slot keep their original relative order, and rows without any
        non-'sleep' activity are placed last. ``act_data`` is not modified.
    """
    # True where a real (non-missing) activity other than 'sleep' is recorded
    awake = ((act_data != "sleep") & act_data.notna()).to_numpy()
    n_slots = awake.shape[1]
    if n_slots == 0:
        # no time slots at all -> there is nothing to sort by
        return act_data.copy()
    # positional index of the first awake slot per row (argmax returns the first True)
    first_awake = awake.argmax(axis=1)
    # rows that never wake up get an index past the last slot, so they sort
    # last regardless of how many time slots there are
    first_awake[~awake.any(axis=1)] = n_slots
    # a stable sort keeps the input order among rows with equal wake-up slots
    order = first_awake.argsort(kind="stable")
    return act_data.iloc[order, :]

def sort_by_working_time(act_data):
    """Placeholder for sorting the diaries by working time (not implemented yet).

    Currently ignores ``act_data`` and returns ``None``.
    """
    # TODO: implement the sorting
    return None

In [34]:
# Order the diaries by their first non-'sleep' activity before plotting.
sorted_data = sort_by_waking_up(act_data)
# For a smaller plot, uncomment the next line to keep only the first 100 rows:
# sorted_data = sorted_data.iloc[:100,:]

In [35]:
# Encode the activity names as integer codes for use as heatmap z values.
# Factorizing the flattened values and reshaping keeps exactly the row and
# column order of sorted_data. A stack/unstack round trip does not: unstack
# re-sorts a MultiIndex row index by its levels, all-NaN rows are dropped and
# duplicate labels raise an error, which would misalign the codes with the
# hover text taken from sorted_data.
codes, uniques = pd.factorize(sorted_data.to_numpy().ravel())
encoded = pd.DataFrame(
    codes.reshape(sorted_data.shape),
    index=sorted_data.index,
    columns=sorted_data.columns,
)
# factorize marks missing values with -1; turn them back into NaN so that
# they are not drawn as an activity
encoded = encoded.where(encoded >= 0)

In [36]:
# Plot the heatmap. Further info: https://plotly.com/python/heatmaps/
title = " - ".join(profile_type.to_tuple())

# colorbar whose ticks show the activity names instead of the int codes
colorbar = go.heatmap.ColorBar(
    title='Activity Groups',
    tickvals=[*range(len(uniques))],
    ticktext=uniques,
)
heatmap = go.Heatmap(
    z=encoded.T,
    colorscale=px.colors.qualitative.Plotly,
    colorbar=colorbar,
)

fig = go.Figure(heatmap)
fig.update_layout(title=title, xaxis_title="Diary Entry", yaxis_title="Time")
# tooltip shows the activity name instead of the encoded int
fig.update_traces(hovertext=sorted_data.T)
# draw the first time slot at the top of the plot
fig.update_yaxes(autorange="reversed")
fig.show()