In [1]:
# library setup
import pandas as pd 
import numpy as np
import time
import re

pd.set_option('future.no_silent_downcasting', True)

In [2]:
# ACT Behavior import
# Folder holding the raw exports; change this one value to run the notebook
# against a different data location.
DATA_DIR = "C:/Users/HELIOS-300/Downloads"

# Event-level behavior/posture export and the per-observation log.
behav_act_df = pd.read_csv(f"{DATA_DIR}/ACT24_behposture_event(in).csv")
behav_act_log_df = pd.read_csv(f"{DATA_DIR}/do_log_final_behavior(in).csv")

In [3]:
# Behavior ACT data cleaning

# a) Remove the absolute-timestamp columns, the event log column and the two
#    auto-named "Unnamed" columns.
#    NOTE(review): "Unnamed: N" is the name pandas gives header-less columns;
#    presumably the export has trailing delimiters -- TODO confirm.
_unused_act_columns = [
    "Date_Time_Absolute_dmy_hmsf",
    "Date_dmy",
    "Time_Absolute_hms",
    "Time_Absolute_f",
    "Unnamed: 17",
    "Unnamed: 18",
    "Event_Log",
]
behav_act_df1 = behav_act_df.drop(columns=_unused_act_columns)

# b) NOTE: still need to add the "start_Time" and time relative columns (confused)
# add "id" and "do" style ID's from LOG into ACT behavior file so we can join
def add_id_do_split(df, source_col='Observation', id_col='id', do_col='do', inplace=True):
    """
    Derive integer `id` and `do` columns from an underscore-separated label.

    The label in `source_col` is split on '_' and the 2nd and 3rd pieces are
    converted to nullable integers (e.g. 'ID_102_01_C' -> id=102, do=1).
    Pieces that are missing or not numeric become <NA> instead of raising.

    Parameters
    ----------
    df : DataFrame containing `source_col`.
    source_col : name of the string column to split.
    id_col, do_col : names of the columns to create (overwritten if present).
    inplace : when True (default) the columns are added to `df` itself and
        `df` is returned; when False a copy is returned and `df` is untouched.

    Returns
    -------
    DataFrame with the two extra columns (dtype Int64).
    """
    parts = df[source_col].str.split('_', expand=True)
    # Guarantee positions 1 and 2 exist even when no label has enough
    # separators; otherwise parts[1] / parts[2] would raise KeyError.
    parts = parts.reindex(columns=range(3))
    id_series = pd.to_numeric(parts[1], errors='coerce').astype('Int64')
    do_series = pd.to_numeric(parts[2], errors='coerce').astype('Int64')

    out = df if inplace else df.copy()
    out[id_col] = id_series
    out[do_col] = do_series
    return out

# add a column from LOG onto Behavior based on "id" and "do"
def add_col_from_other_df_merge(
    left: pd.DataFrame,
    right: pd.DataFrame,
    left_keys: list,
    right_keys: list,
    right_value_col: str,
    new_col_name: str | None = None,
    how: str = 'left',
    validate: str = 'many_to_one'
) -> pd.DataFrame:
    """
    Add a single column from `right` to `left` by joining on two (or more) key columns.
    """
    if new_col_name is None:
        new_col_name = right_value_col

    right_subset = right[right_keys + [right_value_col]].rename(
        columns={right_value_col: new_col_name}
    )
    merged = left.merge(
        right_subset,
        how=how,
        left_on=left_keys,
        right_on=right_keys,
        validate=validate
    )
    return merged

# Work on a copy so behav_act_df1 keeps its original columns and this cell can
# be re-run without hidden state (the helper mutates its input by default).
behav_act_df2 = add_id_do_split(behav_act_df1, inplace=False)

behav_act_df3 = add_col_from_other_df_merge(
    left=behav_act_df2,                 # main df (to put columns into)
    right=behav_act_log_df,               # other df (to pull the columns from)
    left_keys=['id', 'do'],
    right_keys=['id', 'do'],
    right_value_col='start_time',      # column from df_right to bring over
    how='left',
    validate='many_to_one'        # every (id, do) pair must be unique in the log
)

# Move start_time to the front of the frame.
series_temp = behav_act_df3.pop("start_time")
behav_act_df3.insert(0, "start_time", series_temp)

# Keep only "State start" events; .copy() so later column assignments act on
# an independent frame.
behav_act_df4 = behav_act_df3.loc[behav_act_df3["Event_Type"] == "State start"].copy()

# Observation start clock time: try "8:20:19 AM" first, fall back to "8:20 AM".
_clock_text = behav_act_df4['start_time'].astype(str).str.strip()
_clock_with_sec = pd.to_datetime(_clock_text, format='%I:%M:%S %p', errors='coerce')
_clock_no_sec = pd.to_datetime(_clock_text, format='%I:%M %p', errors='coerce')
behav_act_df4['start_time_dt'] = _clock_with_sec.fillna(_clock_no_sec)

# Relative offset: accepts "HH:MM:SS(.f)", "MM:SS(.f)" and "SS(.f)";
# ',' and ';' are treated as decimal separators.
_rel_text = behav_act_df4['Time_Relative_hmsf'].astype(str).str.strip()
for _sep in (',', ';'):
    _rel_text = _rel_text.str.replace(_sep, '.', regex=False)

_colon_count = _rel_text.str.count(':')
_is_hms = _colon_count == 2
_is_ms = _colon_count == 1
_is_sec = _rel_text.str.fullmatch(r'\d+(\.\d+)?')
_is_blank = _rel_text.eq('') | _rel_text.str.lower().isin(['nan', 'none'])

_rel_td = pd.Series(pd.NaT, index=_rel_text.index, dtype='timedelta64[ns]')
_rel_td.loc[_is_hms] = pd.to_timedelta(_rel_text[_is_hms], errors='coerce')
# "MM:SS" needs an hours field before pandas will parse it
_rel_td.loc[_is_ms] = pd.to_timedelta('00:' + _rel_text[_is_ms], errors='coerce')
_rel_td.loc[_is_sec] = pd.to_timedelta(_rel_text[_is_sec].astype(float), unit='s')
_rel_td.loc[_is_blank] = pd.NaT

behav_act_df4['time_relative_td'] = _rel_td

# Event clock time = observation start + relative offset, kept as a
# time-only string (no date) in a fresh working frame.
behav_act_df5 = behav_act_df4.copy()
_event_clock = behav_act_df5['start_time_dt'] + behav_act_df5['time_relative_td']

# The parsed intermediates are not part of the output.
behav_act_df5 = behav_act_df5.drop(columns=['start_time_dt', 'time_relative_td'])
behav_act_df5['start_time_new'] = _event_clock.dt.strftime('%I:%M:%S %p')

# Lead with the three time columns, everything else keeps its order.
first_cols = ['start_time', 'start_time_new', 'Time_Relative_hmsf']
other_cols = [name for name in behav_act_df5.columns if name not in first_cols]
behav_act_df5 = behav_act_df5[first_cols + other_cols]

# now change rows to seconds
# --- 1) Parse Time_Relative_hms to seconds (robust) ---
def _parse_hms_to_seconds(series: pd.Series) -> pd.Series:
    s = series.astype(str).str.strip()
    s = s.str.replace(',', '.', regex=False).str.replace(';', '.', regex=False)

    td = pd.Series(pd.NaT, index=s.index, dtype='timedelta64[ns]')
    mask_hms = s.str.count(':') == 2           # H:M:S(.f)
    mask_ms  = s.str.count(':') == 1           # M:S(.f)
    mask_sec = s.str.fullmatch(r'\d+(\.\d+)?') # seconds only
    mask_blank = s.eq('') | s.str.lower().isin(['nan', 'none'])

    td.loc[mask_hms] = pd.to_timedelta(s[mask_hms], errors='coerce')
    td.loc[mask_ms]  = pd.to_timedelta('00:' + s[mask_ms], errors='coerce')
    # seconds-only → to_timedelta(float, unit='s')
    if mask_sec.any():
        td.loc[mask_sec] = pd.to_timedelta(s[mask_sec].astype(float), unit='s')
    td.loc[mask_blank] = pd.NaT

    return td.dt.total_seconds()

# --- 2) Helper to format seconds as HH:MM:SS or HH:MM:SS.xx (2 decimals) ---
def _format_hms(seconds_float: float, decimals: int = 0) -> str:
    if pd.isna(seconds_float):
        return np.nan
    # Scale to desired fractional units and round once to handle carry
    scale = 10 ** decimals
    total_units = int(round(seconds_float * scale))

    secs = total_units // scale
    frac_units = total_units % scale

    h = secs // 3600
    m = (secs % 3600) // 60
    s = secs % 60

    if decimals == 0:
        return f'{h:02d}:{m:02d}:{s:02d}'
    return f'{h:02d}:{m:02d}:{s:02d}.{frac_units:0{decimals}d}'

# --- 3) Build per-second expansion per Observation (rules as specified) ---
# Each observation is expanded to one row per whole second between its first
# and last event; every generated second carries the most recent event row.
#
# NOTE(review): the forward fill below copies *every* column, including
# start_time_new, so generated seconds repeat the clock time of the last event
# instead of advancing (see the sample output, where the time column is
# constant while the relative time increases). Confirm whether that is intended.
# NOTE(review): only one event is kept per second. Later cells derive both the
# activity and the posture track from the same 'Behavior' column, so an
# activity event and a posture event logged in the same second presumably
# cannot both survive -- verify against the raw export.
df = behav_act_df5.copy()

# Parse seconds from Time_Relative_hms
df['_seconds'] = _parse_hms_to_seconds(df['Time_Relative_hms'])

# Ensure sorting is stable within Observation by original time
df = df.sort_values(['Observation', '_seconds'], kind='mergesort')

out_groups = []

for obs_value, g in df.groupby('Observation', sort=False):
    g = g.copy()

    # Drop rows with no parsed time; if all missing, skip expanding this group
    g = g[~g['_seconds'].isna()]
    if g.empty:
        continue

    # Floor seconds per event (for tie-breaking "last within same second wins")
    g['_event_second'] = np.floor(g['_seconds']).astype(int)

    # Within the same second, keep the last (later) row
    g_last_each_sec = (
        g.sort_values(['_event_second', '_seconds'], kind='mergesort')
         .drop_duplicates(subset=['_event_second'], keep='last')
    )

    min_s = float(g['_seconds'].min())
    max_s = float(g['_seconds'].max())

    # Start rule:
    # - Normally start at 0 if the first event is at 0.
    # - If the first event > 0, don't create any earlier rows; start at ceil(min_s).
    #   Also, we will flag the first displayed time by adding +0.01s.
    if np.isclose(min_s, 0.0):
        start_second = 0
        flag_first = False
    else:
        start_second = int(np.ceil(min_s))
        flag_first = True

    end_second = int(np.floor(max_s))
    if end_second < start_second:
        end_second = start_second

    seconds_grid = np.arange(start_second, end_second + 1, dtype=int)

    # Align values: for each integer second, use "last known at or before"
    base = (
        g_last_each_sec
        .set_index('_event_second')
        .sort_index()
    )

    # Reindex over a full range from floor(min_s) to end_second to enable ffill at the start
    first_index_second = int(np.floor(min_s))
    full_index = np.arange(first_index_second, end_second + 1, dtype=int)
    aligned = base.reindex(full_index).ffill()

    # Now select just our seconds grid
    take = aligned.loc[seconds_grid].copy()
    take.reset_index(drop=False, inplace=True)
    take.rename(columns={'_event_second': '_second'}, inplace=True)

    # Compute the new per-second display time (HH:MM:SS), with special first-row flag if needed
    time_strings = [_format_hms(s, decimals=0) for s in seconds_grid]
    if flag_first and len(time_strings) > 0:
        flagged = min_s + 0.01
        time_strings[0] = _format_hms(flagged, decimals=2)

    take['Time_Relative_hms_new'] = time_strings

    # Event_Type rule:
    # - For seconds that correspond to an original event second, keep existing Event_Type
    # - For newly created seconds, set "State during"
    # (Event_Type always exists here: the frame was filtered on it upstream, so
    # the former "column missing" fallback was unreachable and has been removed.)
    existing_seconds = set(g_last_each_sec['_event_second'].astype(int).tolist())
    is_existing = take['_second'].astype(int).isin(existing_seconds)
    take.loc[~is_existing, 'Event_Type'] = 'State during'

    out_groups.append(take)

# Concatenate all expanded groups. pd.concat raises on an empty list, so fall
# back to an empty frame with the expected columns when nothing was expanded.
if out_groups:
    behav_act_df_6 = pd.concat(out_groups, axis=0, ignore_index=True)
else:
    behav_act_df_6 = df.iloc[0:0].copy()
    behav_act_df_6['Time_Relative_hms_new'] = pd.Series(dtype='object')

# --- 4) Cleanup helper columns ---
behav_act_df_6 = behav_act_df_6.drop(columns=['_seconds', '_second'], errors='ignore')

# --- 5) Insert Time_Relative_hms_new between Time_Relative_hms and Time_Relative_f ---
# The new column is moved directly behind Time_Relative_hms. Nothing happens
# when either column is missing or the new column already precedes the
# original one.
cols = list(behav_act_df_6.columns)
if 'Time_Relative_hms' in cols and 'Time_Relative_hms_new' in cols:
    _anchor = cols.index('Time_Relative_hms')
    if cols.index('Time_Relative_hms_new') > _anchor:
        _reordered = [name for name in cols if name != 'Time_Relative_hms_new']
        _reordered.insert(_anchor + 1, 'Time_Relative_hms_new')
        behav_act_df_6 = behav_act_df_6[_reordered]

subset = behav_act_df_6.iloc[100:110]
subset

Unnamed: 0,start_time,start_time_new,Time_Relative_hmsf,Time_Relative_hms,Time_Relative_hms_new,Time_Relative_f,Time_Relative_sf,Duration_sf,Observation,Behavior,Modifier_1,Modifier_2,Modifier_3,Event_Type,Comment,id,do
100,8:20:19 AM,08:21:54 AM,01:35.6,0:01:35,00:01:40,611.0,95.6115,6.11642,ID_102_01_C,WA- walk,,moderate,SP- Education and Health Services,State during,,102,1
101,8:20:19 AM,08:22:00 AM,01:41.7,0:01:41,00:01:41,728.0,101.728,12.6162,ID_102_01_C,WA- descend stairs,,moderate,SP- Education and Health Services,State start,,102,1
102,8:20:19 AM,08:22:00 AM,01:41.7,0:01:41,00:01:42,728.0,101.728,12.6162,ID_102_01_C,WA- descend stairs,,moderate,SP- Education and Health Services,State during,,102,1
103,8:20:19 AM,08:22:00 AM,01:41.7,0:01:41,00:01:43,728.0,101.728,12.6162,ID_102_01_C,WA- descend stairs,,moderate,SP- Education and Health Services,State during,,102,1
104,8:20:19 AM,08:22:00 AM,01:41.7,0:01:41,00:01:44,728.0,101.728,12.6162,ID_102_01_C,WA- descend stairs,,moderate,SP- Education and Health Services,State during,,102,1
105,8:20:19 AM,08:22:00 AM,01:41.7,0:01:41,00:01:45,728.0,101.728,12.6162,ID_102_01_C,WA- descend stairs,,moderate,SP- Education and Health Services,State during,,102,1
106,8:20:19 AM,08:22:00 AM,01:41.7,0:01:41,00:01:46,728.0,101.728,12.6162,ID_102_01_C,WA- descend stairs,,moderate,SP- Education and Health Services,State during,,102,1
107,8:20:19 AM,08:22:00 AM,01:41.7,0:01:41,00:01:47,728.0,101.728,12.6162,ID_102_01_C,WA- descend stairs,,moderate,SP- Education and Health Services,State during,,102,1
108,8:20:19 AM,08:22:00 AM,01:41.7,0:01:41,00:01:48,728.0,101.728,12.6162,ID_102_01_C,WA- descend stairs,,moderate,SP- Education and Health Services,State during,,102,1
109,8:20:19 AM,08:22:00 AM,01:41.7,0:01:41,00:01:49,728.0,101.728,12.6162,ID_102_01_C,WA- descend stairs,,moderate,SP- Education and Health Services,State during,,102,1


In [4]:
# Observations whose first event is not at time 0 were flagged during the
# expansion with a ".01" suffix on their first relative time (other rows show
# whole seconds only). List those flagged rows.
mask = behav_act_df_6['Time_Relative_hms_new'].str.endswith('.01', na=False)
flagged_rows = behav_act_df_6[mask]

# peek
flagged_rows.head(20)

Unnamed: 0,start_time,start_time_new,Time_Relative_hmsf,Time_Relative_hms,Time_Relative_hms_new,Time_Relative_f,Time_Relative_sf,Duration_sf,Observation,Behavior,Modifier_1,Modifier_2,Modifier_3,Event_Type,Comment,id,do
31669,11:09:47 AM,11:10:12 AM,00:25.6,0:00:25,00:00:25.01,633.0,25.6326,1002.14,ID_116_02_R,SB-sitting,,,,State start,,116,2
123273,10:56:00 AM,10:56:04 AM,00:04.2,0:00:04,00:00:04.01,221.0,4.22078,1469.99,ID_127_02_R,SB-sitting,,,,State start,,127,2


In [5]:
# Create behav_act_df_7 from behav_act_df_6 and encode activity, posture, intensity
# Requires: behav_act_df_6 with columns 'Behavior' and optional 'Modifier_1','Modifier_2','Modifier_3'
import re
import numpy as np
import pandas as pd

# Work on an independent copy of the per-second frame; df_behav_df7 is a
# second name for the same object (not another copy).
behav_act_df_7 = df_behav_df7 = behav_act_df_6.copy()

# Mapping from canonical Activity_Type to (activity_type, broad_domain, waves_domain)
# Keys must match, character for character, the values produced by
# _alias_to_activity_type (including the trailing '**' / '***' markers).
# NOTE(review): most codes are lower case but 'Leisure_Screen', 'Trav_car' and
# 'Trav_public' are capitalised -- confirm this matches the codebook.
# NOTE(review): both TRAV- Passenger labels share the 'trav_pass' code and
# differ only in broad_domain.
_activity_type_to_meta = {
    'SL- Sleep': ('sleep', 'sleep', 'household_personal'),
    'PC- Groom, Health-Related': ('pc_groom', 'personal', 'household_personal'),
    'PC- Other Personal Care': ('pc_other', 'personal', 'household_personal'),
    'HA- Housework': ('ha_housework', 'household', 'household_personal'),
    'HA- Food Prep and Cleanup': ('ha_food', 'household', 'household_personal'),
    'HA- Interior Maintenance, Repair, & Decoration': ('ha_interior', 'maintenance_repair', 'household_personal'),
    'HA- Exterior Maintenance, Repair, & Decoration': ('ha_exterior', 'maintenance_repair', 'household_personal'),
    'HA- Lawn, Garden and Houseplants': ('ha_lawn', 'lawn_garden', 'household_personal'),
    'HA- Animals and Pets': ('ha_pets', 'household', 'household_personal'),
    'HA- Household Management/Other household activities': ('ha_other', 'household', 'household_personal'),
    'CA- Caring for and Helping Children': ('care_children', 'household', 'household_personal'),
    'CA- Caring for and Helping Adults': ('care_adults', 'household', 'household_personal'),
    'WRK- General**': ('work_general', 'work_education', 'work_education'),
    'WRK- Desk/Screen Based': ('work_screen', 'work_education', 'work_education'),
    'EDU- Taking Class, Research, Homework': ('edu_class', 'work_education', 'work_education'),
    'EDU- Extracurricular': ('edu_other', 'work_education', 'work_education'),
    'ORG- Church, Spiritual': ('com_church', 'purchase_other', 'purchase_other'),
    'Volunteer Work (ORG - Volunteer Work)': ('com_volunteer', 'purchase_other', 'purchase_other'),
    'PUR- Purchasing Goods and Services': ('com_purchase', 'purchase_other', 'purchase_other'),
    'EAT- Eating and Drinking, Waiting': ('ha_eat', 'personal', 'household_personal'),
    'LES- Socializing, Communicating, Non-Screen Based': ('les_social', 'leisure', 'leisure'),
    'LES- Screen-Based (TV, Video Game, Computer, Phone)': ('les_screen', 'Leisure_Screen', 'leisure'),
    'EX- Participating in Sport, Exercise or Recreation***': ('ex_sport', 'exercise', 'leisure'),
    'EX- Attending Sport, Exercise Recreation Event, or Performance': ('les_attend', 'leisure', 'leisure'),
    'TRAV- Passenger (Car/Truck/Motorcycle)': ('trav_pass', 'Trav_car', 'transportation'),
    'TRAV- Driver (Car/Truck/Motorcycle)': ('trav_drive', 'Trav_car', 'transportation'),
    'TRAV- Passenger (Bus, Train, Tram, Plane, Boat, Ship)': ('trav_pass', 'Trav_public', 'transportation'),
    'TRAV- Biking': ('trav_bike', 'active_transportation', 'transportation'),
    'TRAV-Walking': ('trav_walk', 'active_transportation', 'transportation'),
    'TRAV- General': ('trav_other', 'transportation', 'transportation'),
    'OTHER- Non-Codable (delete these rows from dataset)': ('non_codable', 'non_codable', 'non_codable'),
}

# Map raw Behavior values to canonical Activity_Type keys above
# Keys are looked up after _normalize_behavior has run, so they must be
# lower case, trimmed, use plain '-' and single spaces. Hyphen spacing is NOT
# normalised, which is why both 'trav- walking' and 'trav-walking' appear.
# Behaviors missing from this table (e.g. the posture codes) map to None.
_alias_to_activity_type = {
    'sl- sleep': 'SL- Sleep',
    'pc- groom, health-related': 'PC- Groom, Health-Related',
    'pc- other personal care': 'PC- Other Personal Care',
    'ha- housework': 'HA- Housework',
    'ha- food prep and cleanup': 'HA- Food Prep and Cleanup',
    'ha- interior maintenance, repair, & decoration': 'HA- Interior Maintenance, Repair, & Decoration',
    'ha- exterior maintenance, repair, & decoration': 'HA- Exterior Maintenance, Repair, & Decoration',
    'ha- lawn, garden and houseplants': 'HA- Lawn, Garden and Houseplants',
    'ha- animals and pets': 'HA- Animals and Pets',
    'ha- household management/other household activities': 'HA- Household Management/Other household activities',
    'ca- caring for and helping children': 'CA- Caring for and Helping Children',
    'ca- caring for and helping adults': 'CA- Caring for and Helping Adults',
    'wrk- general': 'WRK- General**',
    'wrk- screen based': 'WRK- Desk/Screen Based',
    'edu- taking class, research, homework': 'EDU- Taking Class, Research, Homework',
    'edu- extracurricular': 'EDU- Extracurricular',
    'org- church, spiritual': 'ORG- Church, Spiritual',
    'org- volunteer': 'Volunteer Work (ORG - Volunteer Work)',
    'pur- purchasing goods and services': 'PUR- Purchasing Goods and Services',
    'eat- eating and drinking, waiting': 'EAT- Eating and Drinking, Waiting',
    'les- socializing, communicating, leisure time not screen': 'LES- Socializing, Communicating, Non-Screen Based',
    'les- screen based leisure time (tv, video game, computer)': 'LES- Screen-Based (TV, Video Game, Computer, Phone)',
    'les- screen-based (tv, video game, computer, phone)': 'LES- Screen-Based (TV, Video Game, Computer, Phone)',
    'ex- participating in sport, exercise or recreation': 'EX- Participating in Sport, Exercise or Recreation***',
    'ex- attending sport, recreational event, or performance': 'EX- Attending Sport, Exercise Recreation Event, or Performance',
    'trav- passenger (car/truck/motorcycle)': 'TRAV- Passenger (Car/Truck/Motorcycle)',
    'trav- driver (car/truck/motorcycle)': 'TRAV- Driver (Car/Truck/Motorcycle)',
    'trav- passenger (bus, train, tram, plane, boat, ship)': 'TRAV- Passenger (Bus, Train, Tram, Plane, Boat, Ship)',
    'trav- biking': 'TRAV- Biking',
    'trav- walking': 'TRAV-Walking',
    'trav-walking': 'TRAV-Walking',
    'trav- general': 'TRAV- General',
    'other- non codable': 'OTHER- Non-Codable (delete these rows from dataset)',
    'private/not coded': 'OTHER- Non-Codable (delete these rows from dataset)',
}

_def_exercise_prefixes = (
    'sp-',  # sport movement categories → exercise
)

# Normalization helper
_def_ws_re = re.compile(r"\s+")

def _normalize_behavior(value: object) -> str | None:
    if pd.isna(value):
        return None
    s = str(value).strip().lower()
    s = s.replace('–', '-').replace('—', '-')
    s = _def_ws_re.sub(' ', s)
    # keep original hyphen-spacing; just normalize unicode dashes and collapse whitespace
    return s

# Map Behavior → Activity_Type (canonical key)

def _map_behavior_to_activity_type(value: object) -> str | None:
    """
    Translate a raw Behavior label into its canonical Activity_Type key.

    Two prefix rules are tried first so that wording variants of the
    screen-leisure and public-transport labels are still recognised; anything
    else goes through the alias table. Unknown, blank or missing labels give
    None.
    """
    label = _normalize_behavior(value)
    if label is None or label == '':
        return None

    # prefix rules take priority over the exact alias lookup
    prefix_rules = (
        ('les- screen', 'LES- Screen-Based (TV, Video Game, Computer, Phone)'),
        ('trav- passenger (bus', 'TRAV- Passenger (Bus, Train, Tram, Plane, Boat, Ship)'),
    )
    for prefix, canonical in prefix_rules:
        if label.startswith(prefix):
            return canonical

    return _alias_to_activity_type.get(label)

# Build Activity_Type
behav_act_df_7['Activity_Type'] = behav_act_df_7['Behavior'].apply(_map_behavior_to_activity_type)

# EX: replace Activity_Type with Modifier_1 (e.g., EX-hiking) when available
if 'Modifier_1' in behav_act_df_7.columns:
    def _apply_ex_modifier(row):
        """Return 'EX-<modifier slug>' for sport/exercise rows that carry a
        Modifier_1 value; every other row keeps its Activity_Type."""
        current = row['Activity_Type']
        if current != 'EX- Participating in Sport, Exercise or Recreation***':
            return current
        modifier = row['Modifier_1']
        if pd.isna(modifier):
            return current
        # collapse whitespace, lower-case, then turn '/' and ' ' into '-'
        # (EX-hiking, EX-weight-training)
        slug = re.sub(r"\s+", ' ', str(modifier)).strip().lower()
        slug = slug.replace('/', '-').replace(' ', '-')
        return f"EX-{slug}" if slug else current
    behav_act_df_7['Activity_Type'] = behav_act_df_7.apply(_apply_ex_modifier, axis=1)

# work_type_raw from Modifier_3; final work_type applied only when in work labels
work_labels = {'WRK- General**', 'WRK- Desk/Screen Based'}
if 'Modifier_3' in behav_act_df_7.columns:
    def _mk_work_type(x):
        """Turn a Modifier_3 label into a 'work_<slug>' code (NaN when the
        label is missing or empty after cleaning)."""
        if pd.isna(x):
            return np.nan
        # strip leading sector code like 'SP- '
        label = re.sub(r'^\s*sp-\s*', '', str(x).strip(), flags=re.IGNORECASE)
        slug = re.sub(r"\s+", '_', label.lower()).replace('/', '_')
        # known misspelling in the source labels
        slug = slug.replace('hospiltality', 'hospitality')
        if not slug:
            return np.nan
        return f"work_{slug}"
    behav_act_df_7['work_type_raw'] = behav_act_df_7['Modifier_3'].apply(_mk_work_type)
else:
    behav_act_df_7['work_type_raw'] = np.nan

# Expand Activity_Type to three encoded columns
cols = ['activity_type', 'broad_domain', 'waves_domain']

def _activity_meta_lookup(activity_type: object):
    """
    Return the (activity_type, broad_domain, waves_domain) tuple for a
    canonical Activity_Type label, or None when the label is unknown.

    The table is consulted first so labels with their own entry (such as
    'EX- Attending ...') keep their own codes; only labels that are NOT in the
    table and start with 'EX-' (the modifier-derived 'EX-<sport>' labels) fall
    back to the generic sport/exercise codes.
    """
    meta = _activity_type_to_meta.get(activity_type)
    if meta is None and isinstance(activity_type, str) and activity_type.startswith('EX-'):
        return ('ex_sport', 'exercise', 'leisure')
    return meta

def _activity_meta_frame(activity_types: pd.Series, columns: list) -> pd.DataFrame:
    """Build the encoded meta columns for a whole Activity_Type series in one
    DataFrame construction; unknown labels become a row of NaN."""
    rows = [
        meta if isinstance(meta, tuple) else (np.nan, np.nan, np.nan)
        for meta in activity_types.map(_activity_meta_lookup)
    ]
    return pd.DataFrame(rows, index=activity_types.index, columns=columns)

# Forward-fill Activity_Type within each observation so it persists between updates
if 'Observation' in behav_act_df_7.columns:
    _group_cols = ['Observation']
elif {'id','do'}.issubset(behav_act_df_7.columns):
    _group_cols = ['id','do']
else:
    _group_cols = None

if _group_cols is not None:
    behav_act_df_7['Activity_Type'] = behav_act_df_7.groupby(_group_cols)['Activity_Type'].ffill()

# Encode once, after the (optional) forward fill, so the meta columns always
# reflect the final Activity_Type values.
behav_act_df_7[cols] = _activity_meta_frame(behav_act_df_7['Activity_Type'], cols)

# ------------------------------
# Posture / Whole-body movement
# ------------------------------

# Labels recognised by their beginning (checked first, in this order)
_posture_prefix_rules = (
    ('sb-sitting', 'sitting'),
    ('sb-lying', 'lying'),
    ('sb- lying', 'lying'),
    ('la- kneeling', 'kneel_squat'),
)

# Labels that must match exactly after normalisation
_posture_exact_rules = {
    'la- stretching': 'stretch',
    'la- stand and move': 'stand_move',
    'la- stand': 'stand',
    'wa- walk': 'walk',
    'wa- walking': 'walk',
    'trav- walking': 'walk',
    'trav-walking': 'walk',
    'wa-walk with load': 'walk_load',
    'wa- walk with load': 'walk_load',
    'wa- ascend stairs': 'ascend',
    'wa- descend stairs': 'descend',
    'wa- running': 'running',
    'sp- bike': 'biking',
    'sp- other sport movement': 'sport_move',
    'sp- swing': 'sport_move',
    'sp -kick': 'sport_move',
    'sp- jump': 'sport_move',
    'sp- muscle strengthening': 'muscle_strength',
    'private/not coded': 'not_coded',
}

def _map_posture_wbm_from_behavior(value: object) -> str | None:
    """
    Translate a raw Behavior label into a posture / whole-body-movement code.

    Returns None for missing or blank labels and for labels that match none
    of the prefix or exact rules.
    """
    label = _normalize_behavior(value)
    if not label:
        return None
    for prefix, posture in _posture_prefix_rules:
        if label.startswith(prefix):
            return posture
    return _posture_exact_rules.get(label)

# posture_wbm code -> (posture_broad, posture_waves)
_posture_meta = {
    'sitting': ('sedentary', 'sedentary'),
    'lying': ('sedentary', 'sedentary'),
    'kneel_squat': ('sedentary', 'mixed_move'),
    'stretch': ('sport', 'sport'),
    'stand': ('stand_move', 'mixed_move'),
    'stand_move': ('stand_move', 'mixed_move'),
    'walk': ('walk', 'walk'),
    'walk_load': ('mod_walk', 'walk'),
    'ascend': ('mod_walk', 'walk'),
    'descend': ('mod_walk', 'walk'),
    'running': ('running', 'running'),
    'biking': ('biking', 'biking'),
    'sport_move': ('sport', 'sport'),
    'muscle_strength': ('sport', 'sport'),
    'not_coded': ('not_coded', 'not_coded'),
}

# Encode posture from Behavior, then look up the two coarser groupings;
# codes without an entry get a pair of NaN.
_no_posture_meta = (np.nan, np.nan)
behav_act_df_7['posture_wbm'] = behav_act_df_7['Behavior'].apply(_map_posture_wbm_from_behavior)
_broad_waves = behav_act_df_7['posture_wbm'].map(
    lambda code: _posture_meta.get(code, _no_posture_meta)
)
behav_act_df_7[['posture_broad', 'posture_waves']] = pd.DataFrame(
    _broad_waves.tolist(), index=behav_act_df_7.index
)

# Carry the last known posture forward within each observation
if _group_cols is not None:
    _posture_cols = ['posture_wbm', 'posture_broad', 'posture_waves']
    behav_act_df_7[_posture_cols] = behav_act_df_7.groupby(_group_cols)[_posture_cols].ffill()

# waves_sedentary: split driving vs non-driving sitting; lying/kneel_squat are sedentary; else active

def _waves_sed(row) -> str | None:
    pw = row['posture_wbm']
    if pw == 'sitting':
        at = row['activity_type']
        if at in {'trav_drive', 'trav_pass'}:
            return 'sed_drive'
        return 'sedentary'
    if pw in {'lying', 'kneel_squat'}:
        return 'sedentary'
    if pw in {None, np.nan}:
        return None
    return 'active'

behav_act_df_7['waves_sedentary'] = behav_act_df_7.apply(_waves_sed, axis=1)

# ------------------------------
# Activity intensity (4-class and 3-class)
# ------------------------------

def _posture_intensity(value: object) -> str | None:
    """
    Intensity implied by the posture label alone.

    Sitting, lying and kneeling labels are 'sedentary'; standing, stand-and-move
    and stretching are 'light'. Everything else (including missing labels)
    gives None so it can be filled from Modifier_2 later.
    """
    label = _normalize_behavior(value)
    if not label:
        return None
    sedentary_prefixes = ('sb-sitting', 'sb-lying', 'sb- lying', 'la- kneeling')
    if label.startswith(sedentary_prefixes):
        return 'sedentary'
    if label in {'la- stand', 'la- stand and move', 'la- stretching'}:
        return 'light'
    return None

behav_act_df_7['intensity'] = behav_act_df_7['Behavior'].apply(_posture_intensity)

# Fill remaining intensity from Modifier_2 when present
if 'Modifier_2' in behav_act_df_7.columns:
    def _norm_intensity(m) -> str | None:
        """Map a Modifier_2 value onto 'vigorous' / 'moderate' / 'light' /
        'sedentary'; missing, blank or unrecognised values give None."""
        if pd.isna(m):
            return None
        level = str(m).strip().lower()
        if level.startswith('vig'):
            return 'vigorous'
        if level.startswith('mod'):
            return 'moderate'
        return level if level in ('light', 'sedentary') else None
    # only rows that have no posture-derived intensity are touched
    _mask_missing = behav_act_df_7['intensity'].isna()
    behav_act_df_7.loc[_mask_missing, 'intensity'] = behav_act_df_7.loc[_mask_missing, 'Modifier_2'].apply(_norm_intensity)

# waves_intensity: collapse moderate+vigorous → mvpa
_mvpa_levels = {'moderate', 'vigorous'}

def _collapse_mvpa(level):
    """Return 'mvpa' for moderate/vigorous, otherwise the value unchanged."""
    return 'mvpa' if level in _mvpa_levels else level

behav_act_df_7['waves_intensity'] = behav_act_df_7['intensity'].map(_collapse_mvpa)

# Finalize work_type: forward-fill raw within observation, then keep only during work
if 'work_type_raw' in behav_act_df_7.columns:
    if _group_cols is not None:
        _raw_by_obs = behav_act_df_7.groupby(_group_cols)['work_type_raw']
        behav_act_df_7['work_type_raw'] = _raw_by_obs.ffill()
    # rows whose activity is not one of the work labels get NaN
    _is_work_row = behav_act_df_7['Activity_Type'].isin(work_labels)
    behav_act_df_7['work_type'] = np.where(_is_work_row, behav_act_df_7['work_type_raw'], np.nan)
    behav_act_df_7 = behav_act_df_7.drop(columns=['work_type_raw'])

# Drop rows marked non-codable or private/not coded (after encoding)
_is_non_codable_activity = behav_act_df_7['Activity_Type'].eq(
    'OTHER- Non-Codable (delete these rows from dataset)'
)
_is_private_behavior = (
    behav_act_df_7['Behavior'].astype(str).str.strip().str.lower().isin(['private/not coded'])
)
_non_codable_mask = _is_non_codable_activity | _is_private_behavior
behav_act_df_7 = behav_act_df_7.loc[~_non_codable_mask].copy()
# the alias must be re-pointed because behav_act_df_7 is now a new object
df_behav_df7 = behav_act_df_7


In [6]:
# Observation log: one row per (id, do) with the calendar date of the session.
log_df = pd.read_csv("C:/Users/HELIOS-300/Downloads/do_log_final_behavior(in).csv")

# Assemble the date from its three numeric parts; unparsable parts give NaT.
_log_dates = pd.to_datetime({
    'year': pd.to_numeric(log_df['start_year'], errors='coerce'),
    'month': pd.to_numeric(log_df['start_month'], errors='coerce'),
    'day': pd.to_numeric(log_df['start_day'], errors='coerce'),
}, errors='coerce')

# Render as M/D/YYYY without zero padding. This is built from the date parts
# instead of strftime('%#m/%#d/%Y') because the '#' flag is a Windows-only
# extension and yields different text on other platforms.
log_df['date'] = _log_dates.map(
    lambda d: f"{d.month}/{d.day}/{d.year}" if pd.notna(d) else np.nan
)

log_df = log_df.drop(columns=["start_month", "start_day", "start_year"])

# Only the join keys and the date are used downstream. The previous 24-hour
# start_time / date_time columns were built and then dropped again within this
# cell, so they are no longer computed.
log_df2 = log_df.loc[:, ["id", "do", "date"]].rename(columns={"do": "obs"})

log_df2.head()

Unnamed: 0,id,obs,date
0,102,1,7/24/2019
1,102,2,7/25/2019
2,116,1,8/20/2019
3,116,2,8/21/2019
4,117,1,8/20/2019


In [7]:
# Slim per-second view with the column names used by the final table
_final_source_cols = [
    "id", "do", "Time_Relative_hms_new", "activity_type",
    "posture_waves", "intensity", "start_time_new",
]
behav_copy = (
    behav_act_df_7.loc[:, _final_source_cols]
    .rename(columns={"do" : "obs", "Time_Relative_hms_new" : "rel_time"})
)
behav_copy.head()

Unnamed: 0,id,obs,rel_time,activity_type,posture_waves,intensity,start_time_new
0,102,1,00:00:00,work_general,,,
1,102,1,00:00:01,work_general,,,
2,102,1,00:00:02,work_general,,,
3,102,1,00:00:03,work_general,,,
4,102,1,00:00:04,work_general,,,


In [8]:
# final df
# Attach the session date from the log; each (id, obs) pair must be unique there.
joined = behav_copy.merge(
    log_df2.loc[:, ['id', 'obs', 'date']],
    on=['id', 'obs'],
    how='left',
    validate='many_to_one'
)

# Rename posture column
joined = joined.rename(columns={'posture_waves': 'posture', 'start_time_new' : "time"})

# Create date_time = date + ' ' + time. Both parts must be present: a row
# without a matching log date would otherwise produce the text 'nan <time>'.
_has_stamp = joined['date'].notna() & joined['time'].notna()
_stamp_text = joined['date'].astype(str).str.strip() + ' ' + joined['time'].astype(str).str.strip()
joined['date_time'] = _stamp_text.where(_has_stamp, np.nan)

# Reorder columns
joined = joined.loc[:, ['id', 'obs', 'date', 'time', 'date_time', 'rel_time', 'activity_type', 'posture', 'intensity']]

joined.iloc[100:110]

Unnamed: 0,id,obs,date,time,date_time,rel_time,activity_type,posture,intensity
100,102,1,7/24/2019,08:21:54 AM,7/24/2019 08:21:54 AM,00:01:40,trav_walk,walk,moderate
101,102,1,7/24/2019,08:22:00 AM,7/24/2019 08:22:00 AM,00:01:41,trav_walk,walk,moderate
102,102,1,7/24/2019,08:22:00 AM,7/24/2019 08:22:00 AM,00:01:42,trav_walk,walk,moderate
103,102,1,7/24/2019,08:22:00 AM,7/24/2019 08:22:00 AM,00:01:43,trav_walk,walk,moderate
104,102,1,7/24/2019,08:22:00 AM,7/24/2019 08:22:00 AM,00:01:44,trav_walk,walk,moderate
105,102,1,7/24/2019,08:22:00 AM,7/24/2019 08:22:00 AM,00:01:45,trav_walk,walk,moderate
106,102,1,7/24/2019,08:22:00 AM,7/24/2019 08:22:00 AM,00:01:46,trav_walk,walk,moderate
107,102,1,7/24/2019,08:22:00 AM,7/24/2019 08:22:00 AM,00:01:47,trav_walk,walk,moderate
108,102,1,7/24/2019,08:22:00 AM,7/24/2019 08:22:00 AM,00:01:48,trav_walk,walk,moderate
109,102,1,7/24/2019,08:22:00 AM,7/24/2019 08:22:00 AM,00:01:49,trav_walk,walk,moderate


In [9]:
# persist activity and posture tracks within each observation using aligned ffill/bfill
# the filled values are aligned back to the original row index before assignment
# comments are lower case

# fail early with a readable message if the encoding cell has not been run:
# evaluating a bare name raises NameError when it is undefined
try:
	behav_act_df_7
	_activity_meta_lookup
	_posture_meta
except NameError:
	raise RuntimeError("required variables/functions not found. run earlier encoding cells first.")

# detect grouping key
# prefer the observation label; fall back to the (id, do) pair. unlike the
# encoding cell, a frame with neither is treated as an error here.
if 'Observation' in behav_act_df_7.columns:
	_group_cols = ['Observation']
elif {'id','do'}.issubset(behav_act_df_7.columns):
	_group_cols = ['id','do']
else:
	raise RuntimeError("cannot determine observation grouping columns.")

# stabilize activity track
# forward fill carries the last known label; back fill then covers rows that
# precede the first label of an observation.
# note(review): back filling assigns a later state to earlier seconds --
# confirm this is acceptable for the analysis.
_before_act_na = int(behav_act_df_7['Activity_Type'].isna().sum())
# native groupby fills return a series on the original index, so no
# apply/reset_index round trip is needed
_act_by_obs = behav_act_df_7.groupby(_group_cols, sort=False)['Activity_Type']
ff_act = _act_by_obs.ffill()
bf_act = _act_by_obs.bfill()
behav_act_df_7['Activity_Type'] = ff_act.fillna(bf_act)

# recompute activity meta
# one dataframe construction instead of one pd.Series per row
_act_cols = ['activity_type', 'broad_domain', 'waves_domain']
_act_meta_rows = [
    meta if isinstance(meta, tuple) else (np.nan, np.nan, np.nan)
    for meta in behav_act_df_7['Activity_Type'].map(_activity_meta_lookup)
]
behav_act_df_7[_act_cols] = pd.DataFrame(
    _act_meta_rows, index=behav_act_df_7.index, columns=_act_cols
)
_after_act_na = int(behav_act_df_7['Activity_Type'].isna().sum())

# stabilize posture track
_before_pos_na = int(behav_act_df_7['posture_wbm'].isna().sum())
_pos_by_obs = behav_act_df_7.groupby(_group_cols, sort=False)['posture_wbm']
ff_pos = _pos_by_obs.ffill()
bf_pos = _pos_by_obs.bfill()
behav_act_df_7['posture_wbm'] = ff_pos.fillna(bf_pos)

# recompute posture meta
_pw = behav_act_df_7['posture_wbm'].map(lambda k: _posture_meta.get(k, (np.nan, np.nan)))
behav_act_df_7[['posture_broad', 'posture_waves']] = pd.DataFrame(_pw.tolist(), index=behav_act_df_7.index)
_after_pos_na = int(behav_act_df_7['posture_wbm'].isna().sum())

# recompute waves_sedentary
def _waves_sed(row) -> str | None:
	pw = row['posture_wbm']
	if pw == 'sitting':
		at = row['activity_type']
		if at in {'trav_drive', 'trav_pass'}:
			return 'sed_drive'
		return 'sedentary'
	if pw in {'lying', 'kneel_squat'}:
		return 'sedentary'
	if pw in {None, np.nan}:
		return None
	return 'active'

# derive the sedentary label row by row from the stabilized posture/activity columns
behav_act_df_7['waves_sedentary'] = behav_act_df_7.apply(_waves_sed, axis=1)

print(f"activity_type na before/after: {_before_act_na} -> {_after_act_na}")
print(f"posture_wbm na before/after: {_before_pos_na} -> {_after_pos_na}")

# rebuild joined with stabilized tracks (skipped when the log frame is not defined)
try:
	log_df2
except NameError:
	print("skipping joined rebuild (log_df2 not found)")
else:
	behav_copy_stable = behav_act_df_7[[
		"id", "do", "Time_Relative_hms_new", "activity_type", "posture_waves", "intensity", "start_time_new"
	]].rename(columns={"do": "obs", "Time_Relative_hms_new": "rel_time"})

	# attach the date from the log; validate raises if the log repeats an (id, obs) pair
	joined = behav_copy_stable.merge(
		log_df2.loc[:, ['id', 'obs', 'date']],
		on=['id', 'obs'],
		how='left',
		validate='many_to_one'
	)

	# rename returns a new frame, so no extra copy is needed before adding a column
	joined = joined.rename(columns={'posture_waves': 'posture', 'start_time_new': 'time'})
	# build the timestamp only when both parts are present: astype(str) turns a missing
	# value into the text 'nan', so a row without a matched date must be masked out too
	joined['date_time'] = np.where(
		joined['time'].notna() & joined['date'].notna(),
		joined['date'].astype(str).str.strip() + ' ' + joined['time'].astype(str).str.strip(),
		np.nan
	)
	joined = joined.loc[:, ['id', 'obs', 'date', 'time', 'date_time', 'rel_time', 'activity_type', 'posture', 'intensity']]
	print("joined missing activity_type:", int(joined['activity_type'].isna().sum()))
	print("joined missing posture:", int(joined['posture'].isna().sum()))
	print("sample:")
	print(joined.iloc[100:110])

activity_type na before/after: 42529 -> 3535
posture_wbm na before/after: 6729 -> 0
joined missing activity_type: 3535
joined missing posture: 0
sample:
      id  obs       date         time              date_time  rel_time  \
100  102    1  7/24/2019  08:21:54 AM  7/24/2019 08:21:54 AM  00:01:40   
101  102    1  7/24/2019  08:22:00 AM  7/24/2019 08:22:00 AM  00:01:41   
102  102    1  7/24/2019  08:22:00 AM  7/24/2019 08:22:00 AM  00:01:42   
103  102    1  7/24/2019  08:22:00 AM  7/24/2019 08:22:00 AM  00:01:43   
104  102    1  7/24/2019  08:22:00 AM  7/24/2019 08:22:00 AM  00:01:44   
105  102    1  7/24/2019  08:22:00 AM  7/24/2019 08:22:00 AM  00:01:45   
106  102    1  7/24/2019  08:22:00 AM  7/24/2019 08:22:00 AM  00:01:46   
107  102    1  7/24/2019  08:22:00 AM  7/24/2019 08:22:00 AM  00:01:47   
108  102    1  7/24/2019  08:22:00 AM  7/24/2019 08:22:00 AM  00:01:48   
109  102    1  7/24/2019  08:22:00 AM  7/24/2019 08:22:00 AM  00:01:49   

    activity_type posture intens

In [10]:
# missing-value count for each column of the joined table, largest first
cols = [
	'id', 'obs', 'date', 'time', 'date_time',
	'rel_time', 'activity_type', 'posture', 'intensity',
]
print(joined.loc[:, cols].isna().sum().sort_values(ascending=False))

intensity        14055
activity_type     3535
time               416
date_time          416
id                   0
obs                  0
date                 0
rel_time             0
posture              0
dtype: int64


In [11]:
# distinct values recorded in Modifier_1 of the raw behavior file (nan included)
behav_act_df["Modifier_1"].unique()

array([nan, 'other', 'hiking', 'walking', 'jogging', 'swimming',
       'surfing/water sport', 'weight training', 'basketball'],
      dtype=object)

In [12]:
# distinct values recorded in Modifier_2 of the raw behavior file (nan included)
behav_act_df["Modifier_2"].unique()

array([nan, 'moderate', 'light', 'vigorous', 'sedentary'], dtype=object)

In [13]:
# distinct values recorded in Modifier_3 of the raw behavior file (nan included)
behav_act_df["Modifier_3"].unique()

array([nan, 'SP- Education and Health Services',
       'SP- Office (business, professional services, finance, info)',
       'SP- Trade, Retail, Transportation, and Utilities', 'Other',
       'SP- Leisure and Hospiltality'], dtype=object)

In [14]:
# distinct raw Behavior labels in the raw behavior file (nan included)
behav_act_df["Behavior"].unique()

array(['LA- stand', nan, 'WRK- general', 'WA- walk', 'TRAV- walking',
       'WA- descend stairs', 'LA- kneeling/ squatting',
       'PUR- purchasing goods and services', 'LA- stand and move',
       'WA- ascend stairs', 'SB-sitting',
       'EAT- eating and drinking, waiting', 'WRK- screen based',
       'OTHER- non codable', 'private/not coded', 'SP- swing',
       'WA-walk with load', 'SP -kick', 'SB- lying',
       'EX- participating in sport, exercise or recreation', 'SP- bike',
       'PC- groom, health-related',
       'LES- socializing, communicating, leisure time not screen',
       'LES- screen based leisure time (TV, video game, computer)',
       'HA- household management/other household activities',
       'HA- housework', 'TRAV- passenger (car/truck/motorcycle)',
       'HA- interior maintenance, repair, & decoration', 'SL- sleep',
       'HA- animals and pets', 'ORG- volunteer',
       'TRAV- driver (car/truck/motorcycle)',
       'HA- lawn, garden and houseplants', 'HA-

In [15]:
# count 'State start' rows with modifier_2 nan and behavior not in specified postures; then breakdown by behavior

# fail with a clear message when the raw import cell has not been run
try:
	behav_act_df
except NameError:
	raise RuntimeError('no suitable dataframe found (expected behav_act_df).')

# only 'State start' events are examined
behav_test = behav_act_df[behav_act_df["Event_Type"] == "State start"]

# working dataframe that contains raw behavior/modifiers
_df_src = behav_test

required_cols = {'Behavior','Modifier_2'}
missing = required_cols - set(_df_src.columns)
if missing:
	raise RuntimeError(f"missing required columns: {missing}")

# behaviors left out of the count
excluded_behaviors = {
	"SB-sitting",
	"SB- lying",
	"LA- kneeling/ squatting",
	"LA- stand",
	"LA- stretching"
}

# rows with no modifier_2 whose behavior is outside the excluded set
mask = _df_src['Modifier_2'].isna() & (~_df_src['Behavior'].isin(excluded_behaviors))

count_total = int(mask.sum())
print(f"total rows with modifier_2 nan and behavior not in excluded set: {count_total:,}")

# dropna=False keeps rows whose behavior itself is nan in the breakdown
by_behavior = _df_src.loc[mask, 'Behavior'].value_counts(dropna=False)
print("\ncounts by behavior (descending):")
print(by_behavior)

total rows with modifier_2 nan and behavior not in excluded set: 980

counts by behavior (descending):
Behavior
WRK- general                                                 145
private/not coded                                            143
OTHER- non codable                                            90
WRK- screen based                                             87
HA- household management/other household activities           65
TRAV- walking                                                 62
TRAV- driver (car/truck/motorcycle)                           60
EX- participating in sport, exercise or recreation            52
LES- screen based leisure time (TV, video game, computer)     45
LES- socializing, communicating, leisure time not screen      38
HA- food prep and cleanup                                     38
EAT- eating and drinking, waiting                             35
PC- groom, health-related                                     24
PUR- purchasing goods and services         

In [16]:
# distinct event types present in the raw behavior file
behav_act_df["Event_Type"].unique()

array(['State start', 'State point', 'State stop'], dtype=object)

In [17]:
# sanity check: among 'State start' events, count rows whose Behavior is missing
testing_df = behav_act_df_6.loc[behav_act_df_6["Event_Type"].eq("State start")]
testing_df.loc[:, "Behavior"].isna().sum()

0

In [18]:
# preview the first rows of the rebuilt joined table
joined.head()

Unnamed: 0,id,obs,date,time,date_time,rel_time,activity_type,posture,intensity
0,102,1,7/24/2019,,,00:00:00,work_general,walk,
1,102,1,7/24/2019,,,00:00:01,work_general,walk,
2,102,1,7/24/2019,,,00:00:02,work_general,walk,
3,102,1,7/24/2019,,,00:00:03,work_general,walk,
4,102,1,7/24/2019,,,00:00:04,work_general,walk,


In [19]:
# preview the first rows of behav_act_df_7
behav_act_df_7.head()

Unnamed: 0,start_time,start_time_new,Time_Relative_hmsf,Time_Relative_hms,Time_Relative_hms_new,Time_Relative_f,Time_Relative_sf,Duration_sf,Observation,Behavior,...,activity_type,broad_domain,waves_domain,posture_wbm,posture_broad,posture_waves,waves_sedentary,intensity,waves_intensity,work_type
0,8:20:19 AM,,##############################################...,0:00:00,00:00:00,1.0,0.00137,53.7145,ID_102_01_C,WRK- general,...,work_general,work_education,work_education,walk,walk,walk,active,,,work_education_and_health_services
1,8:20:19 AM,,##############################################...,0:00:00,00:00:01,1.0,0.00137,53.7145,ID_102_01_C,WRK- general,...,work_general,work_education,work_education,walk,walk,walk,active,,,work_education_and_health_services
2,8:20:19 AM,,##############################################...,0:00:00,00:00:02,1.0,0.00137,53.7145,ID_102_01_C,WRK- general,...,work_general,work_education,work_education,walk,walk,walk,active,,,work_education_and_health_services
3,8:20:19 AM,,##############################################...,0:00:00,00:00:03,1.0,0.00137,53.7145,ID_102_01_C,WRK- general,...,work_general,work_education,work_education,walk,walk,walk,active,,,work_education_and_health_services
4,8:20:19 AM,,##############################################...,0:00:00,00:00:04,1.0,0.00137,53.7145,ID_102_01_C,WRK- general,...,work_general,work_education,work_education,walk,walk,walk,active,,,work_education_and_health_services
