# Ground Truth and Step Count Merge

## Setup

In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.filterwarnings("ignore")

## Load and Check Data Integrity

**Coding Ground Truth**

In [3]:
# Import coding ground truth
ground_truth = pd.read_csv("ACT24_groundtruth_secbysec_20250415.csv", dtype={"work_type": "string"}) #dtype since missing values were stored as NaN, should be as pd.na

In [4]:
# Display first 5 rows for inspection
ground_truth.head()

Unnamed: 0,id,observation,date,date_time,activity_type,broad_activity_type,work_type,posture,sedentary_not,walking_not,activity_intensity
0,102,1,2019-07-24,2019-07-24 08:20:19,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light
1,102,1,2019-07-24,2019-07-24 08:20:20,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light
2,102,1,2019-07-24,2019-07-24 08:20:21,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light
3,102,1,2019-07-24,2019-07-24 08:20:22,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light
4,102,1,2019-07-24,2019-07-24 08:20:23,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light


In [5]:
# Check for duplicate timestamps
check = ground_truth[["id", "observation", "date_time"]].value_counts()
check[check > 1]

Series([], Name: count, dtype: int64)

**Steps Ground Truth**

In [6]:
# Import seconds ground truth
seconds = pd.read_csv("/Users/hydeet/Library/CloudStorage/OneDrive-CalPoly/StepCount/Eric's Analysis/Required datasets/seconds_ground_truth_20250410.csv")

In [7]:
seconds["Quality"].value_counts()

Quality
Codable         388676
Non-codeable      6108
Name: count, dtype: int64

In [8]:
# Display first 5 rows for inspection
seconds.sort_values(["ID", "Session", "Date", "Time"]).head()

Unnamed: 0,ID,Session,relative_time_steps,Date,Time,Quality,Step
229026,102,1,0:00:01,7/24/2019,08:20:20,Codable,0
229027,102,1,0:00:02,7/24/2019,08:20:21,Codable,0
229028,102,1,0:00:03,7/24/2019,08:20:22,Codable,0
229029,102,1,0:00:04,7/24/2019,08:20:23,Codable,0
229030,102,1,0:00:05,7/24/2019,08:20:24,Codable,1


In [9]:
# Check for duplicate timestamps
check = seconds[["ID", "Session", "Date", "Time"]].value_counts()
# Duplicate timestamps were found, check for specific rows and step counts
check[check > 1].reset_index().merge(right = seconds, on=["ID", "Session", "Date", "Time"])

Unnamed: 0,ID,Session,Date,Time,count,relative_time_steps,Quality,Step
0,135,1,10/15/2019,05:49:58,2,2:18:13,Codable,0
1,135,1,10/15/2019,05:49:58,2,2:18:13,Codable,0
2,135,1,10/15/2019,05:29:08,2,1:57:23,Codable,0
3,135,1,10/15/2019,05:29:08,2,1:57:23,Codable,0
4,135,1,10/15/2019,05:29:00,2,1:57:15,Codable,0
...,...,...,...,...,...,...,...,...
21507,135,1,10/15/2019,04:29:16,2,0:57:31,Codable,0
21508,135,1,10/15/2019,04:29:17,2,0:57:32,Codable,0
21509,135,1,10/15/2019,04:29:17,2,0:57:32,Codable,0
21510,135,1,10/15/2019,04:29:18,2,0:57:33,Codable,0


## Ground Truth Data Cleaning

In [10]:
def quality_agg(srs):
    if "Non-codable" in srs:
        return "Non-codeable"
    return srs.iloc[0]

In [11]:
seconds_2 = seconds.copy()
# Rename columns
seconds_2.rename(columns={"ID": "id", "Session": "observation"}, inplace=True)
# Remove duplicates
# Logic: Get max step count for each timestamp
seconds_2 = seconds_2.groupby(["id", "observation", "Date", "Time", "relative_time_steps"]).agg({'Quality': quality_agg, 'Step': 'max'}).reset_index()
# Create datetime column
seconds_2["date_time"] = pd.to_datetime(seconds_2['Date'] + ' ' + seconds_2['Time'])
seconds_2

Unnamed: 0,id,observation,Date,Time,relative_time_steps,Quality,Step,date_time
0,102,1,7/24/2019,08:20:20,0:00:01,Codable,0,2019-07-24 08:20:20
1,102,1,7/24/2019,08:20:21,0:00:02,Codable,0,2019-07-24 08:20:21
2,102,1,7/24/2019,08:20:22,0:00:03,Codable,0,2019-07-24 08:20:22
3,102,1,7/24/2019,08:20:23,0:00:04,Codable,0,2019-07-24 08:20:23
4,102,1,7/24/2019,08:20:24,0:00:05,Codable,1,2019-07-24 08:20:24
...,...,...,...,...,...,...,...,...
384023,154,2,2/23/2020,01:06:11,0:02:20,Codable,1,2020-02-23 01:06:11
384024,154,2,2/23/2020,01:06:12,0:02:21,Codable,2,2020-02-23 01:06:12
384025,154,2,2/23/2020,01:06:13,0:02:22,Codable,2,2020-02-23 01:06:13
384026,154,2,2/23/2020,01:06:14,0:02:23,Codable,1,2020-02-23 01:06:14


In [12]:
seconds_2["Quality"].value_counts()

Quality
Codable         377920
Non-codeable      6108
Name: count, dtype: int64

In [13]:
# Check for duplicates again
check = seconds_2[["id", "observation", "date_time"]].value_counts()
check[check > 1]

Series([], Name: count, dtype: int64)

In [14]:
ground_truth_2 = ground_truth.copy()
# Create datetime column
ground_truth_2["date_time"] = pd.to_datetime(ground_truth["date_time"])
ground_truth_2

Unnamed: 0,id,observation,date,date_time,activity_type,broad_activity_type,work_type,posture,sedentary_not,walking_not,activity_intensity
0,102,1,2019-07-24,2019-07-24 08:20:19,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light
1,102,1,2019-07-24,2019-07-24 08:20:20,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light
2,102,1,2019-07-24,2019-07-24 08:20:21,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light
3,102,1,2019-07-24,2019-07-24 08:20:22,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light
4,102,1,2019-07-24,2019-07-24 08:20:23,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light
...,...,...,...,...,...,...,...,...,...,...,...
511814,154,2,2020-02-23,2020-02-23 15:55:47,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary
511815,154,2,2020-02-23,2020-02-23 15:55:48,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary
511816,154,2,2020-02-23,2020-02-23 15:55:49,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary
511817,154,2,2020-02-23,2020-02-23 15:55:50,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary


## Check Start and End Times of Both Files

In [15]:
# Function to get start and end info for each id and session
def start_end(GT, id, session): 
    subset = GT[(GT["id"] == id) & (GT["observation"] == session)].reset_index(drop=True)
    start = min(subset["date_time"])
    end = max(subset["date_time"])
    # for i, row in subset.iterrows():
    #     if i != 0:
    #         if row["date_time"] < start:
    #             start = row["date_time"]
    #         elif row["date_time"] > end:
    #             end = row["date_time"]
    return [start, end]
    #print(f"ID = {id}, Session = {session}, Start = {start}, End = {end}")
        

In [16]:
# Function to apply start_end to entire dataframe (ie. each id and session)
def start_time_dataframe(df):
    id_ses = df.groupby(["id", "observation"]).size().reset_index().rename(columns={0: "count"})
    times = {"ID": [], 
                   "Session": [],
                   "Start": [],
                   "End": []}
    for i, row in id_ses.iterrows():
        times["ID"].append(row["id"])
        times["Session"].append(row["observation"])
        ls = start_end(df, row["id"], row["observation"])
        times["Start"].append(ls[0])
        times["End"].append(ls[1])
    return pd.DataFrame(times)

In [17]:
# Get start and end times from both coding and step ground truths
GT_times = start_time_dataframe(ground_truth_2)
seconds_times = start_time_dataframe(seconds_2)

In [18]:
start_end_info = pd.merge(GT_times, seconds_times, on=["ID", "Session"], how="outer", suffixes=("_GT", '_secondsFile'))
# Get differences between seconds/coding ground truth, in seconds
start_end_info["StartDiffSecs(GT-seconds)"] = (start_end_info["Start_GT"] - start_end_info["Start_secondsFile"]).dt.total_seconds()
start_end_info["EndDiffSecs(GT-seconds)"] = (start_end_info["End_GT"] - start_end_info["End_secondsFile"]).dt.total_seconds()
start_end_info

Unnamed: 0,ID,Session,Start_GT,End_GT,Start_secondsFile,End_secondsFile,StartDiffSecs(GT-seconds),EndDiffSecs(GT-seconds)
0,102,1,2019-07-24 08:20:19,2019-07-24 11:21:13,2019-07-24 08:20:20,2019-07-24 11:05:37,-1.0,936.0
1,102,2,2019-07-25 12:42:08,2019-07-25 15:42:00,2019-07-25 12:41:53,2019-07-25 15:28:18,15.0,822.0
2,116,1,2019-08-20 08:08:45,2019-08-20 11:09:00,2019-08-20 08:08:46,2019-08-20 11:08:56,-1.0,4.0
3,116,2,2019-08-21 11:10:12,2019-08-21 13:28:00,2019-08-21 11:09:48,2019-08-21 13:26:58,24.0,62.0
4,117,1,2019-08-20 15:17:46,2019-08-20 18:18:00,2019-08-20 03:17:47,2019-08-20 06:05:55,43199.0,43925.0
5,117,2,2019-08-21 07:11:45,2019-08-21 10:13:00,2019-08-21 07:11:46,2019-08-21 09:41:43,-1.0,1877.0
6,122,1,2019-08-28 10:06:40,2019-08-28 13:06:59,2019-08-28 10:06:41,2019-08-28 12:42:20,-1.0,1479.0
7,122,2,2019-08-29 15:19:51,2019-08-29 18:23:00,2019-08-29 03:19:52,2019-08-29 06:22:41,43199.0,43219.0
8,124,1,2019-08-28 14:59:09,2019-08-28 17:59:05,2019-08-28 02:58:52,2019-08-28 04:49:39,43217.0,47366.0
9,124,2,2019-08-29 09:31:00,2019-08-29 12:31:00,2019-08-29 09:30:40,2019-08-29 12:18:54,20.0,726.0


**Add Issue Flags in Columns**

In [19]:
# Function that flags id/observation pairs if the times between step and coding ground truth are off by around 12 hrs
def flag_military(row):
    if row > 40000:
        return True
    else:
        return False

In [20]:
start_end_info_2 = start_end_info.copy()
start_end_info_2["MilitaryTimeFlag"] = start_end_info_2["StartDiffSecs(GT-seconds)"].apply(flag_military)
start_end_info_2

Unnamed: 0,ID,Session,Start_GT,End_GT,Start_secondsFile,End_secondsFile,StartDiffSecs(GT-seconds),EndDiffSecs(GT-seconds),MilitaryTimeFlag
0,102,1,2019-07-24 08:20:19,2019-07-24 11:21:13,2019-07-24 08:20:20,2019-07-24 11:05:37,-1.0,936.0,False
1,102,2,2019-07-25 12:42:08,2019-07-25 15:42:00,2019-07-25 12:41:53,2019-07-25 15:28:18,15.0,822.0,False
2,116,1,2019-08-20 08:08:45,2019-08-20 11:09:00,2019-08-20 08:08:46,2019-08-20 11:08:56,-1.0,4.0,False
3,116,2,2019-08-21 11:10:12,2019-08-21 13:28:00,2019-08-21 11:09:48,2019-08-21 13:26:58,24.0,62.0,False
4,117,1,2019-08-20 15:17:46,2019-08-20 18:18:00,2019-08-20 03:17:47,2019-08-20 06:05:55,43199.0,43925.0,True
5,117,2,2019-08-21 07:11:45,2019-08-21 10:13:00,2019-08-21 07:11:46,2019-08-21 09:41:43,-1.0,1877.0,False
6,122,1,2019-08-28 10:06:40,2019-08-28 13:06:59,2019-08-28 10:06:41,2019-08-28 12:42:20,-1.0,1479.0,False
7,122,2,2019-08-29 15:19:51,2019-08-29 18:23:00,2019-08-29 03:19:52,2019-08-29 06:22:41,43199.0,43219.0,True
8,124,1,2019-08-28 14:59:09,2019-08-28 17:59:05,2019-08-28 02:58:52,2019-08-28 04:49:39,43217.0,47366.0,True
9,124,2,2019-08-29 09:31:00,2019-08-29 12:31:00,2019-08-29 09:30:40,2019-08-29 12:18:54,20.0,726.0,False


**Issues:** 
* Negative start second differences implies that the ground truth start times are OUTSIDE the seconds file session times
* Positive end second differences implies that the ground truth end times are OUTSIDE the seconds file session times

**This implies that we have tagged times for this (ID, Session) pair in our ground truth file that have no information about steps taken :(**

**1\. Ground Truth File contains military time timestamps but seconds file is in AM/PM**
* MilitaryTimeFlag = True means this row has this problem
* **NOTE**: Fixed.

**2\. (ID, Session) pairs missing in seconds file (9 total)**
* Probably just need to throw these out since we do not have step info for these :(

## Fix AM/PM Disparity

In [21]:
start_end_info_3 = start_end_info_2.copy()
# Fix start AM/PM
start_end_info_3['Start_secondsFile'] = start_end_info_3['Start_secondsFile'] + pd.to_timedelta("12:00:00") * start_end_info_3['MilitaryTimeFlag']
# Fix end AM/PM
start_end_info_3['End_secondsFile'] = start_end_info_3['End_secondsFile'] + pd.to_timedelta("12:00:00") * start_end_info_3['MilitaryTimeFlag']
# Recalculate differences
start_end_info_3["StartDiffSecs(GT-seconds)"] = (start_end_info_3["Start_GT"] - start_end_info_3["Start_secondsFile"]).dt.total_seconds()
start_end_info_3["EndDiffSecs(GT-seconds)"] = (start_end_info_3["End_GT"] - start_end_info_3["End_secondsFile"]).dt.total_seconds()
# Recode military time flag
start_end_info_3['MilitaryTimeFlag'] = start_end_info_3["StartDiffSecs(GT-seconds)"].apply(flag_military)
# We need to determine which rows in the second-by-second data fall within the intersection of the groundtruth time intervals, since they don't match up
# Find the greatest start time
start_end_info_3['max_start'] = start_end_info_3[['Start_GT', 'Start_secondsFile']].max(axis=1)
# Find the smallest end time
start_end_info_3['min_end'] = start_end_info_3[['End_GT', 'End_secondsFile']].min(axis=1)
start_end_info_3

Unnamed: 0,ID,Session,Start_GT,End_GT,Start_secondsFile,End_secondsFile,StartDiffSecs(GT-seconds),EndDiffSecs(GT-seconds),MilitaryTimeFlag,max_start,min_end
0,102,1,2019-07-24 08:20:19,2019-07-24 11:21:13,2019-07-24 08:20:20,2019-07-24 11:05:37,-1.0,936.0,False,2019-07-24 08:20:20,2019-07-24 11:05:37
1,102,2,2019-07-25 12:42:08,2019-07-25 15:42:00,2019-07-25 12:41:53,2019-07-25 15:28:18,15.0,822.0,False,2019-07-25 12:42:08,2019-07-25 15:28:18
2,116,1,2019-08-20 08:08:45,2019-08-20 11:09:00,2019-08-20 08:08:46,2019-08-20 11:08:56,-1.0,4.0,False,2019-08-20 08:08:46,2019-08-20 11:08:56
3,116,2,2019-08-21 11:10:12,2019-08-21 13:28:00,2019-08-21 11:09:48,2019-08-21 13:26:58,24.0,62.0,False,2019-08-21 11:10:12,2019-08-21 13:26:58
4,117,1,2019-08-20 15:17:46,2019-08-20 18:18:00,2019-08-20 15:17:47,2019-08-20 18:05:55,-1.0,725.0,False,2019-08-20 15:17:47,2019-08-20 18:05:55
5,117,2,2019-08-21 07:11:45,2019-08-21 10:13:00,2019-08-21 07:11:46,2019-08-21 09:41:43,-1.0,1877.0,False,2019-08-21 07:11:46,2019-08-21 09:41:43
6,122,1,2019-08-28 10:06:40,2019-08-28 13:06:59,2019-08-28 10:06:41,2019-08-28 12:42:20,-1.0,1479.0,False,2019-08-28 10:06:41,2019-08-28 12:42:20
7,122,2,2019-08-29 15:19:51,2019-08-29 18:23:00,2019-08-29 15:19:52,2019-08-29 18:22:41,-1.0,19.0,False,2019-08-29 15:19:52,2019-08-29 18:22:41
8,124,1,2019-08-28 14:59:09,2019-08-28 17:59:05,2019-08-28 14:58:52,2019-08-28 16:49:39,17.0,4166.0,False,2019-08-28 14:59:09,2019-08-28 16:49:39
9,124,2,2019-08-29 09:31:00,2019-08-29 12:31:00,2019-08-29 09:30:40,2019-08-29 12:18:54,20.0,726.0,False,2019-08-29 09:31:00,2019-08-29 12:18:54


In [22]:
# Check to make sure there are no more AM/PM disparities
start_end_info_3[start_end_info_3['MilitaryTimeFlag']]

Unnamed: 0,ID,Session,Start_GT,End_GT,Start_secondsFile,End_secondsFile,StartDiffSecs(GT-seconds),EndDiffSecs(GT-seconds),MilitaryTimeFlag,max_start,min_end


In [23]:
# Edit steps ground truth to fix AM/PM disparities
seconds_3 = seconds_2.merge(right = start_end_info_2[["ID", "Session", "MilitaryTimeFlag", "StartDiffSecs(GT-seconds)"]], how = 'left',
              left_on = ["id", "observation"], right_on = ["ID", "Session"])
seconds_3["date_time"] = seconds_3["date_time"] + pd.to_timedelta("12:00:00") * seconds_3["MilitaryTimeFlag"]
seconds_3 = seconds_3[["id", "observation", "date_time", "Quality", "Step"]]
seconds_3

Unnamed: 0,id,observation,date_time,Quality,Step
0,102,1,2019-07-24 08:20:20,Codable,0
1,102,1,2019-07-24 08:20:21,Codable,0
2,102,1,2019-07-24 08:20:22,Codable,0
3,102,1,2019-07-24 08:20:23,Codable,0
4,102,1,2019-07-24 08:20:24,Codable,1
...,...,...,...,...,...
384023,154,2,2020-02-23 13:06:11,Codable,1
384024,154,2,2020-02-23 13:06:12,Codable,2
384025,154,2,2020-02-23 13:06:13,Codable,2
384026,154,2,2020-02-23 13:06:14,Codable,1


In [24]:
# Check for duplicate times
check = seconds_3[["id", "observation", "date_time"]].value_counts()
check[check>1]

Series([], Name: count, dtype: int64)

In [25]:
seconds_3["Quality"].value_counts()

Quality
Codable         377920
Non-codeable      6108
Name: count, dtype: int64

## Merge Ground Truth Datasets

In [26]:
# Outer merge coding with steps ground truth
merged = pd.merge(left = ground_truth_2, right = seconds_3, how="outer", on=["id", "observation", "date_time"])
# Add flag for times that are inside the max start and min end interval
merged = merged.merge(right = start_end_info_3[["ID", "Session", "max_start", "min_end"]], how = 'left',
                            left_on = ["id", "observation"], right_on = ["ID", "Session"])
merged["inside_flag"] = (merged["date_time"] >= merged["max_start"]) & (merged["date_time"] <= merged["min_end"])
merged = merged.drop(["ID", "Session", "max_start", "min_end"], axis=1)
merged

Unnamed: 0,id,observation,date,date_time,activity_type,broad_activity_type,work_type,posture,sedentary_not,walking_not,activity_intensity,Quality,Step,inside_flag
0,102,1,2019-07-24,2019-07-24 08:20:19,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,,,False
1,102,1,2019-07-24,2019-07-24 08:20:20,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0.0,True
2,102,1,2019-07-24,2019-07-24 08:20:21,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0.0,True
3,102,1,2019-07-24,2019-07-24 08:20:22,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0.0,True
4,102,1,2019-07-24,2019-07-24 08:20:23,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512175,154,2,2020-02-23,2020-02-23 15:55:47,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,,,False
512176,154,2,2020-02-23,2020-02-23 15:55:48,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,,,False
512177,154,2,2020-02-23,2020-02-23 15:55:49,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,,,False
512178,154,2,2020-02-23,2020-02-23 15:55:50,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,,,False


In [27]:
# Check to make sure that there is only one row per second
check = merged[["id", "observation", "date_time"]].value_counts()
check[check > 1]

Series([], Name: count, dtype: int64)

In [28]:
merged["Quality"].value_counts()

Quality
Codable         377920
Non-codeable      6108
Name: count, dtype: int64

## Check Valid Rows (Intersection of Steps, Coding Tables)

At this point, we know that the rows are invalid if:

1. The ID and Session/observation are missing from the seconds ground truth entirely, not partially
2. OR They fall outside of where the time intervals of the two ground truth files intersect.

We need to know if any more rows are invalid. The only rows that could be invalid at this point are rows that:

1. Have a valid ID/Session
2. AND they fall within the time interval
3. AND they either don't have steps or coding ground truth

We need to check if any such rows exist.

In [29]:
# List of id/session that were found in the seconds file and in ground truth file
id_session_notna = start_end_info_3[["ID", "Session"]][~start_end_info_3["Start_secondsFile"].isna()]
id_session_notna

Unnamed: 0,ID,Session
0,102,1
1,102,2
2,116,1
3,116,2
4,117,1
5,117,2
6,122,1
7,122,2
8,124,1
9,124,2


In [30]:
# Find all valid rows - rows that should have data from both coding and step ground truth
# They should be in the valid id/sessions
merged_valid = id_session_notna.merge(right = merged, left_on = ["ID", "Session"], right_on = ["id", "observation"]).drop(["ID", "Session"], axis=1)
# They should also fall within the time interval between the max start and min end of both coding/step ground truth
merged_valid = merged_valid[merged_valid["inside_flag"] == True]
merged_valid

Unnamed: 0,id,observation,date,date_time,activity_type,broad_activity_type,work_type,posture,sedentary_not,walking_not,activity_intensity,Quality,Step,inside_flag
1,102,1,2019-07-24,2019-07-24 08:20:20,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0.0,True
2,102,1,2019-07-24,2019-07-24 08:20:21,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0.0,True
3,102,1,2019-07-24,2019-07-24 08:20:22,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0.0,True
4,102,1,2019-07-24,2019-07-24 08:20:23,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0.0,True
5,102,1,2019-07-24,2019-07-24 08:20:24,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,1.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416158,154,2,2020-02-23,2020-02-23 13:06:11,"EDU- taking class, research, homework",work_education,,stand and move,not_sedentary,not_walking,light,Codable,1.0,True
416159,154,2,2020-02-23,2020-02-23 13:06:12,"EDU- taking class, research, homework",work_education,,stand and move,not_sedentary,not_walking,light,Codable,2.0,True
416160,154,2,2020-02-23,2020-02-23 13:06:13,"EDU- taking class, research, homework",work_education,,stand and move,not_sedentary,not_walking,light,Codable,2.0,True
416161,154,2,2020-02-23,2020-02-23 13:06:14,"EDU- taking class, research, homework",work_education,,stand and move,not_sedentary,not_walking,light,Codable,1.0,True


In [31]:
# Check if there are any invalid rows remaining - there should be none
merged_valid[(merged_valid["walking_not"].isna()) | (merged_valid["Step"].isna())]

Unnamed: 0,id,observation,date,date_time,activity_type,broad_activity_type,work_type,posture,sedentary_not,walking_not,activity_intensity,Quality,Step,inside_flag


In [32]:
merged_valid["Quality"].value_counts()

Quality
Codable         377559
Non-codeable      6108
Name: count, dtype: int64

Based on our analysis, there are no invalid rows that:

1. Have a valid ID/Session
2. AND they fall within the time interval
3. AND they either don't have steps or coding ground truth

## Analyze Time Interval Disparity

Here we show the rows that have a valid ID/Session but fall outside the time interval intersection of the two ground truth tables. They should match the sum of the difference in seconds of the start/end that we calculated in the start_end_info tables.

In [33]:
# Find the total number of seconds/rows that fall outside the max start/min end interval
total_outside = start_end_info_3['StartDiffSecs(GT-seconds)'].abs().sum() + start_end_info_3['EndDiffSecs(GT-seconds)'].abs().sum()
total_outside

np.float64(42672.0)

In [34]:
merged_outside = merged[merged["inside_flag"] == False]
merged_outside

Unnamed: 0,id,observation,date,date_time,activity_type,broad_activity_type,work_type,posture,sedentary_not,walking_not,activity_intensity,Quality,Step,inside_flag
0,102,1,2019-07-24,2019-07-24 08:20:19,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,,,False
9919,102,1,2019-07-24,2019-07-24 11:05:38,WRK- general,work_education,"SP- Office (business, professional services, f...",walk,not_sedentary,walking,moderate,,,False
9920,102,1,2019-07-24,2019-07-24 11:05:39,WRK- screen based,work_education,"SP- Office (business, professional services, f...",sitting,sedentary,not_walking,sedentary,,,False
9921,102,1,2019-07-24,2019-07-24 11:05:40,WRK- screen based,work_education,"SP- Office (business, professional services, f...",sitting,sedentary,not_walking,sedentary,,,False
9922,102,1,2019-07-24,2019-07-24 11:05:41,WRK- screen based,work_education,"SP- Office (business, professional services, f...",sitting,sedentary,not_walking,sedentary,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512175,154,2,2020-02-23,2020-02-23 15:55:47,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,,,False
512176,154,2,2020-02-23,2020-02-23 15:55:48,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,,,False
512177,154,2,2020-02-23,2020-02-23 15:55:49,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,,,False
512178,154,2,2020-02-23,2020-02-23 15:55:50,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,,,False


The numbers match up. We have the correct number of rows that fall outside the interval. Because they fall outside of the interval intersection, they will only have steps or coding ground truth, not both.

We can further separate the two into rows that are in only coding and rows that are only in step count (that fall outside the common time interval intersection).

In [35]:
# Only in step count
sc_only = merged_outside[merged_outside["activity_type"].isna()]
sc_only

Unnamed: 0,id,observation,date,date_time,activity_type,broad_activity_type,work_type,posture,sedentary_not,walking_not,activity_intensity,Quality,Step,inside_flag
10855,102,2,,2019-07-25 12:41:53,,,,,,,,Codable,0.0,False
10856,102,2,,2019-07-25 12:41:54,,,,,,,,Codable,0.0,False
10857,102,2,,2019-07-25 12:41:55,,,,,,,,Codable,0.0,False
10858,102,2,,2019-07-25 12:41:56,,,,,,,,Codable,0.0,False
10859,102,2,,2019-07-25 12:41:57,,,,,,,,Codable,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501863,154,2,,2020-02-23 13:03:55,,,,,,,,Codable,0.0,False
501864,154,2,,2020-02-23 13:03:56,,,,,,,,Codable,0.0,False
501865,154,2,,2020-02-23 13:03:57,,,,,,,,Codable,0.0,False
501866,154,2,,2020-02-23 13:03:58,,,,,,,,Codable,0.0,False


In [36]:
# Only in coding
gt_only = merged_outside[merged_outside["Step"].isna()]
gt_only

Unnamed: 0,id,observation,date,date_time,activity_type,broad_activity_type,work_type,posture,sedentary_not,walking_not,activity_intensity,Quality,Step,inside_flag
0,102,1,2019-07-24,2019-07-24 08:20:19,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,,,False
9919,102,1,2019-07-24,2019-07-24 11:05:38,WRK- general,work_education,"SP- Office (business, professional services, f...",walk,not_sedentary,walking,moderate,,,False
9920,102,1,2019-07-24,2019-07-24 11:05:39,WRK- screen based,work_education,"SP- Office (business, professional services, f...",sitting,sedentary,not_walking,sedentary,,,False
9921,102,1,2019-07-24,2019-07-24 11:05:40,WRK- screen based,work_education,"SP- Office (business, professional services, f...",sitting,sedentary,not_walking,sedentary,,,False
9922,102,1,2019-07-24,2019-07-24 11:05:41,WRK- screen based,work_education,"SP- Office (business, professional services, f...",sitting,sedentary,not_walking,sedentary,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512175,154,2,2020-02-23,2020-02-23 15:55:47,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,,,False
512176,154,2,2020-02-23,2020-02-23 15:55:48,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,,,False
512177,154,2,2020-02-23,2020-02-23 15:55:49,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,,,False
512178,154,2,2020-02-23,2020-02-23 15:55:50,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,,,False


Some further analysis on the rows that are only in the coding ground truth:

In [37]:
gt_only_summary = gt_only['activity_type'].value_counts()
gt_only_summary = gt_only_summary.reset_index()
gt_only_summary = gt_only_summary.rename(columns = {"activity_type": "category"})
gt_only_summary['column'] = 'activity_type'

for col in ['broad_activity_type', 'work_type', 'posture', 'sedentary_not', 'walking_not', 'Quality']:
    new_summary = gt_only[col].value_counts()
    new_summary = new_summary.reset_index()
    new_summary = new_summary.rename(columns = {col: "category"})
    new_summary['column'] = col
    gt_only_summary = pd.concat([gt_only_summary, new_summary], axis=0)
    
gt_only_summary.groupby(["column", "category"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
column,category,Unnamed: 2_level_1
activity_type,CA- caring for and helping children,418
activity_type,"EAT- eating and drinking, waiting",9520
activity_type,"EDU- taking class, research, homework",11766
activity_type,EX- weight training,1
activity_type,HA- food prep and cleanup,11
activity_type,HA- household management/other household activities,474
activity_type,"HA- lawn, garden and houseplants",19
activity_type,"LES- screen based leisure time (TV, video game, computer)",3163
activity_type,"LES- socializing, communicating, leisure time not screen",3213
activity_type,OTHER- non codable,179


**IMPORTANT NOTE**: Observed, most rows are non-walking, sedentary, sitting. We may re-add these rows to the dataset if we can confirm from Dr. Keadle that we can assume a step count of 0 for these rows.

Some further analysis on the rows only in step count:

In [38]:
sc_only['Step'].value_counts()

Step
0.0    341
1.0     11
2.0      9
Name: count, dtype: int64

## UPDATE: Impute Behavioral-Only with 0s and Re-Add to Data

All behavioral only rows have been determined to be 0 steps. We will impute and add back to the ground truth.

In [39]:
# Impute with 0s
gt_only2 = gt_only.copy()
gt_only2["Step"] = 0
gt_only2["Quality"] = "Codable"

# Add back to dataset
merged_valid2 = pd.concat([merged_valid, gt_only2], ignore_index=True).sort_values(["id", "observation", "date_time"])
merged_valid2

Unnamed: 0,id,observation,date,date_time,activity_type,broad_activity_type,work_type,posture,sedentary_not,walking_not,activity_intensity,Quality,Step,inside_flag
383667,102,1,2019-07-24,2019-07-24 08:20:19,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0.0,False
0,102,1,2019-07-24,2019-07-24 08:20:20,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0.0,True
1,102,1,2019-07-24,2019-07-24 08:20:21,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0.0,True
2,102,1,2019-07-24,2019-07-24 08:20:22,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0.0,True
3,102,1,2019-07-24,2019-07-24 08:20:23,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425973,154,2,2020-02-23,2020-02-23 15:55:47,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,Codable,0.0,False
425974,154,2,2020-02-23,2020-02-23 15:55:48,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,Codable,0.0,False
425975,154,2,2020-02-23,2020-02-23 15:55:49,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,Codable,0.0,False
425976,154,2,2020-02-23,2020-02-23 15:55:50,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,Codable,0.0,False


In [40]:
merged_valid2["Quality"].value_counts()

Quality
Codable         419870
Non-codeable      6108
Name: count, dtype: int64

## Analyze Missing IDs/Sessions from Steps Ground Truth

These rows correspond to coding ground truth ID/Sessions that are missing from the steps ground truth entirely.

In [41]:
# List of id/session that were found in ground truth file but NOT seconds file
id_session_na = start_end_info_3[["ID", "Session"]][start_end_info_3["Start_secondsFile"].isna()]
id_session_na

Unnamed: 0,ID,Session
10,126,1
21,131,2
27,134,2
31,136,2
33,138,2
34,139,1
35,139,2
46,154,1


In [42]:
merged_invalid_idsession = id_session_na.merge(right = merged, left_on = ["ID", "Session"], right_on = ["id", "observation"]).drop(["ID", "Session"], axis=1)
merged_invalid_idsession

Unnamed: 0,id,observation,date,date_time,activity_type,broad_activity_type,work_type,posture,sedentary_not,walking_not,activity_intensity,Quality,Step,inside_flag
0,126,1,2019-09-02,2019-09-02 11:14:51,"LES- socializing, communicating, leisure time ...",leisure,,sitting,sedentary,not_walking,sedentary,,,True
1,126,1,2019-09-02,2019-09-02 11:14:52,"LES- socializing, communicating, leisure time ...",leisure,,sitting,sedentary,not_walking,sedentary,,,True
2,126,1,2019-09-02,2019-09-02 11:14:53,"LES- socializing, communicating, leisure time ...",leisure,,sitting,sedentary,not_walking,sedentary,,,True
3,126,1,2019-09-02,2019-09-02 11:14:54,"LES- socializing, communicating, leisure time ...",leisure,,sitting,sedentary,not_walking,sedentary,,,True
4,126,1,2019-09-02,2019-09-02 11:14:55,"LES- socializing, communicating, leisure time ...",leisure,,sitting,sedentary,not_walking,sedentary,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85836,154,1,2020-02-22,2020-02-22 16:06:55,TRAV- walking,active_transportation,,walk,not_sedentary,walking,moderate,,,True
85837,154,1,2020-02-22,2020-02-22 16:06:56,TRAV- walking,active_transportation,,walk,not_sedentary,walking,moderate,,,True
85838,154,1,2020-02-22,2020-02-22 16:06:57,TRAV- walking,active_transportation,,walk,not_sedentary,walking,moderate,,,True
85839,154,1,2020-02-22,2020-02-22 16:06:58,TRAV- walking,active_transportation,,walk,not_sedentary,walking,moderate,,,True


## New Time Intervals

In [43]:
final_start_end = start_time_dataframe(merged_valid2)
final_start_end

Unnamed: 0,ID,Session,Start,End
0,102,1,2019-07-24 08:20:19,2019-07-24 11:21:13
1,102,2,2019-07-25 12:42:08,2019-07-25 15:42:00
2,116,1,2019-08-20 08:08:45,2019-08-20 11:09:00
3,116,2,2019-08-21 11:10:12,2019-08-21 13:28:00
4,117,1,2019-08-20 15:17:46,2019-08-20 18:18:00
5,117,2,2019-08-21 07:11:45,2019-08-21 10:13:00
6,122,1,2019-08-28 10:06:40,2019-08-28 13:06:59
7,122,2,2019-08-29 15:19:51,2019-08-29 18:23:00
8,124,1,2019-08-28 14:59:09,2019-08-28 17:59:05
9,124,2,2019-08-29 09:31:00,2019-08-29 12:31:00


Compare to log:

In [44]:
log = pd.read_csv("/Users/hydeet/Library/CloudStorage/OneDrive-CalPoly/StepCount/Eric's Analysis/Required datasets/do_log_final_behavior.csv")
log["Start"] = pd.to_datetime(log["start_month"].astype(str) + "/" +
                              log["start_day"].astype(str) + "/" +
                              log["start_year"].astype(str) + "/" + ' ' +
                              log["start_time"])
log["End"] = pd.to_datetime(log["start_month"].astype(str) + "/" +
                              log["start_day"].astype(str) + "/" +
                              log["start_year"].astype(str) + "/" + ' ' +
                              log["stop_time"])
log = log[['id', 'do', 'Start', 'End']].rename(columns = {"do": "Session", "id": "ID"})
log

Unnamed: 0,ID,Session,Start,End
0,102,1,2019-07-24 08:20:19,2019-07-24 11:21:13
1,102,2,2019-07-25 12:42:08,2019-07-25 15:42:00
2,116,1,2019-08-20 08:08:45,2019-08-20 11:09:00
3,116,2,2019-08-21 11:09:47,2019-08-21 13:28:00
4,117,1,2019-08-20 15:17:46,2019-08-20 18:18:00
5,117,2,2019-08-21 07:11:45,2019-08-21 10:13:00
6,122,1,2019-08-28 10:06:40,2019-08-28 13:06:59
7,122,2,2019-08-29 15:19:51,2019-08-29 18:23:00
8,124,1,2019-08-28 14:59:09,2019-08-28 17:59:05
9,124,2,2019-08-29 09:31:00,2019-08-29 12:31:00


In [45]:
log_compare = final_start_end.merge(log, on=["ID", "Session"], how="outer", suffixes=("_final", "_log"))
log_compare["Start_diff"] = (log_compare["Start_final"] - log_compare["Start_log"]).dt.total_seconds()
log_compare["End_diff"] = (log_compare["End_final"] - log_compare["End_log"]).dt.total_seconds()
log_compare

Unnamed: 0,ID,Session,Start_final,End_final,Start_log,End_log,Start_diff,End_diff
0,102,1,2019-07-24 08:20:19,2019-07-24 11:21:13,2019-07-24 08:20:19,2019-07-24 11:21:13,0.0,0.0
1,102,2,2019-07-25 12:42:08,2019-07-25 15:42:00,2019-07-25 12:42:08,2019-07-25 15:42:00,0.0,0.0
2,116,1,2019-08-20 08:08:45,2019-08-20 11:09:00,2019-08-20 08:08:45,2019-08-20 11:09:00,0.0,0.0
3,116,2,2019-08-21 11:10:12,2019-08-21 13:28:00,2019-08-21 11:09:47,2019-08-21 13:28:00,25.0,0.0
4,117,1,2019-08-20 15:17:46,2019-08-20 18:18:00,2019-08-20 15:17:46,2019-08-20 18:18:00,0.0,0.0
5,117,2,2019-08-21 07:11:45,2019-08-21 10:13:00,2019-08-21 07:11:45,2019-08-21 10:13:00,0.0,0.0
6,122,1,2019-08-28 10:06:40,2019-08-28 13:06:59,2019-08-28 10:06:40,2019-08-28 13:06:59,0.0,0.0
7,122,2,2019-08-29 15:19:51,2019-08-29 18:23:00,2019-08-29 15:19:51,2019-08-29 18:23:00,0.0,0.0
8,124,1,2019-08-28 14:59:09,2019-08-28 17:59:05,2019-08-28 14:59:09,2019-08-28 17:59:05,0.0,0.0
9,124,2,2019-08-29 09:31:00,2019-08-29 12:31:00,2019-08-29 09:31:00,2019-08-29 12:31:00,0.0,0.0


In [46]:
# log export for data pull
log_exp = log_compare.copy()
log_exp["Start"] = log_exp["Start_final"].combine_first(log_exp["Start_log"])
log_exp["End"] = log_exp["End_final"].combine_first(log_exp["End_log"])
log_exp[["ID", "Session", "Start", "End"]].to_csv("log-new.csv", index=False)

## Final Table

**Final Ground Truth with Invalid Rows Dropped**

In [47]:
final_gt = merged_valid2.drop(["inside_flag"], axis=1).rename(columns={"Step": "step", "Quality": "quality"}).reset_index(drop=True)
# Zero out non-codeables
final_gt['step'] = final_gt['step'].astype(int) * (final_gt["quality"] == "Codable")
final_gt

Unnamed: 0,id,observation,date,date_time,activity_type,broad_activity_type,work_type,posture,sedentary_not,walking_not,activity_intensity,quality,step
0,102,1,2019-07-24,2019-07-24 08:20:19,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0
1,102,1,2019-07-24,2019-07-24 08:20:20,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0
2,102,1,2019-07-24,2019-07-24 08:20:21,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0
3,102,1,2019-07-24,2019-07-24 08:20:22,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0
4,102,1,2019-07-24,2019-07-24 08:20:23,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
425973,154,2,2020-02-23,2020-02-23 15:55:47,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,Codable,0
425974,154,2,2020-02-23,2020-02-23 15:55:48,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,Codable,0
425975,154,2,2020-02-23,2020-02-23 15:55:49,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,Codable,0
425976,154,2,2020-02-23,2020-02-23 15:55:50,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,Codable,0


In [48]:
# Check non-codeables
final_gt[(final_gt["activity_type"] == "OTHER- non codable") | (final_gt["posture"] == "private/not coded")].groupby(["activity_type", "posture", "quality"])["step"].sum()

activity_type                                              posture            quality     
CA- caring for and helping children                        private/not coded  Codable          2
EAT- eating and drinking, waiting                          private/not coded  Codable          0
EX- surfing/water sport                                    private/not coded  Codable          0
HA- household management/other household activities        private/not coded  Codable          2
                                                                              Non-codeable     0
HA- housework                                              private/not coded  Codable          0
LES- screen based leisure time (TV, video game, computer)  private/not coded  Non-codeable     0
LES- socializing, communicating, leisure time not screen   private/not coded  Codable          0
OTHER- non codable                                         private/not coded  Codable         66
                                    

In [49]:
final_gt["quality"].value_counts()

quality
Codable         419870
Non-codeable      6108
Name: count, dtype: int64

In [50]:
# Create relative time variable at the very end

# Ensure date_time is datetime type
final_gt["date_time"] = pd.to_datetime(final_gt["date_time"])

# Group by id and observation and subtract the min date_time from each row in that group
final_gt["relative_time"] = (
    final_gt.groupby(["id", "observation"])["date_time"]
    .transform(lambda x: x - x.min())
)

# Format timedelta to HH:MM:SS string
final_gt["relative_time"] = final_gt["relative_time"].apply(lambda x: str(x).split()[-1])

# Get list of columns
cols = list(final_gt.columns)

# Find the index of 'date_time'
idx = cols.index("date_time")

# Remove 'relative_time' from its current position
cols.remove("relative_time")

# Insert 'relative_time' right after 'date_time'
cols.insert(idx + 1, "relative_time")

# Reorder the DataFrame
final_gt = final_gt[cols]

final_gt


Unnamed: 0,id,observation,date,date_time,relative_time,activity_type,broad_activity_type,work_type,posture,sedentary_not,walking_not,activity_intensity,quality,step
0,102,1,2019-07-24,2019-07-24 08:20:19,00:00:00,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0
1,102,1,2019-07-24,2019-07-24 08:20:20,00:00:01,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0
2,102,1,2019-07-24,2019-07-24 08:20:21,00:00:02,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0
3,102,1,2019-07-24,2019-07-24 08:20:22,00:00:03,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0
4,102,1,2019-07-24,2019-07-24 08:20:23,00:00:04,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425973,154,2,2020-02-23,2020-02-23 15:55:47,02:51:47,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,Codable,0
425974,154,2,2020-02-23,2020-02-23 15:55:48,02:51:48,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,Codable,0
425975,154,2,2020-02-23,2020-02-23 15:55:49,02:51:49,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,Codable,0
425976,154,2,2020-02-23,2020-02-23 15:55:50,02:51:50,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,Codable,0


In [51]:
# Eric making the manual changes that Sarah emailed about for ID 128 obs 1
# Define filtering mask
mask = (
    (final_gt["id"] == 128) &
    (final_gt["observation"] == 1) &
    (final_gt["date_time"] >= pd.to_datetime("9/6/2019  7:07:40 AM")) &
    (final_gt["date_time"] <= pd.to_datetime("9/6/2019  7:36:14 AM"))
)

# Apply the manual updates
final_gt.loc[mask, "quality"] = "Non-codeable"

#Check that it worked
final_gt.loc[mask, ["id", "observation", "date_time", "quality"]]

Unnamed: 0,id,observation,date_time,quality
138695,128,1,2019-09-06 07:07:40,Non-codeable
138696,128,1,2019-09-06 07:07:41,Non-codeable
138697,128,1,2019-09-06 07:07:42,Non-codeable
138698,128,1,2019-09-06 07:07:43,Non-codeable
138699,128,1,2019-09-06 07:07:44,Non-codeable
...,...,...,...,...
140405,128,1,2019-09-06 07:36:10,Non-codeable
140406,128,1,2019-09-06 07:36:11,Non-codeable
140407,128,1,2019-09-06 07:36:12,Non-codeable
140408,128,1,2019-09-06 07:36:13,Non-codeable


In [52]:
final_gt.to_csv("merged_groundtruth_secbysec_20250415.csv", index=False)

Export with noncodeable omitted to send to Dr. Keadle and Paige

In [53]:
final_with_drops = final_gt[(final_gt["activity_type"] != "OTHER- non codable") & (final_gt["posture"] != "private/not coded") & (final_gt["quality"] == "Codable")]
final_with_drops

Unnamed: 0,id,observation,date,date_time,relative_time,activity_type,broad_activity_type,work_type,posture,sedentary_not,walking_not,activity_intensity,quality,step
0,102,1,2019-07-24,2019-07-24 08:20:19,00:00:00,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0
1,102,1,2019-07-24,2019-07-24 08:20:20,00:00:01,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0
2,102,1,2019-07-24,2019-07-24 08:20:21,00:00:02,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0
3,102,1,2019-07-24,2019-07-24 08:20:22,00:00:03,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0
4,102,1,2019-07-24,2019-07-24 08:20:23,00:00:04,WRK- general,work_education,SP- Education and Health Services,stand,not_sedentary,not_walking,light,Codable,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425973,154,2,2020-02-23,2020-02-23 15:55:47,02:51:47,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,Codable,0
425974,154,2,2020-02-23,2020-02-23 15:55:48,02:51:48,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,Codable,0
425975,154,2,2020-02-23,2020-02-23 15:55:49,02:51:49,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,Codable,0
425976,154,2,2020-02-23,2020-02-23 15:55:50,02:51:50,"EDU- taking class, research, homework",work_education,,sitting,sedentary,not_walking,sedentary,Codable,0


In [54]:
# Eric making the manual changes that Sarah outlined in documenting_changes.docx
# First, ID 116 obs 2
# Define filtering mask
mask = (
    (final_with_drops["id"] == 116) &
    (final_with_drops["observation"] == 2) &
    (final_with_drops["date_time"] >= pd.to_datetime("8/21/2019 11:43:47 AM")) &
    (final_with_drops["date_time"] <= pd.to_datetime("8/21/2019 11:45:57 AM"))
)

# Apply the manual updates
final_with_drops.loc[mask, "posture"] = "stand and move"
final_with_drops.loc[mask, "sedentary_not"] = "not_sedentary"
final_with_drops.loc[mask, "activity_intensity"] = "light"

#Check that it worked
final_with_drops.loc[mask, ["id", "observation", "date_time", "posture", "sedentary_not", "activity_intensity"]]

Unnamed: 0,id,observation,date_time,posture,sedentary_not,activity_intensity
34479,116,2,2019-08-21 11:43:47,stand and move,not_sedentary,light
34480,116,2,2019-08-21 11:43:48,stand and move,not_sedentary,light
34481,116,2,2019-08-21 11:43:49,stand and move,not_sedentary,light
34482,116,2,2019-08-21 11:43:50,stand and move,not_sedentary,light
34483,116,2,2019-08-21 11:43:51,stand and move,not_sedentary,light
...,...,...,...,...,...,...
34605,116,2,2019-08-21 11:45:53,stand and move,not_sedentary,light
34606,116,2,2019-08-21 11:45:54,stand and move,not_sedentary,light
34607,116,2,2019-08-21 11:45:55,stand and move,not_sedentary,light
34608,116,2,2019-08-21 11:45:56,stand and move,not_sedentary,light


In [55]:
# ID 117 obs 1
# Define filtering mask
mask = (
    (final_with_drops["id"] == 117) &
    (final_with_drops["observation"] == 1) &
    (final_with_drops["date_time"] >= pd.to_datetime("8/20/2019  6:05:40 PM")) &
    (final_with_drops["date_time"] <= pd.to_datetime("8/20/2019  6:05:55 PM"))
)

# Apply the manual updates
final_with_drops.loc[mask, "posture"] = "stand and move"
final_with_drops.loc[mask, "sedentary_not"] = "not_sedentary"
final_with_drops.loc[mask, "activity_intensity"] = "light"

#Check that it worked
final_with_drops.loc[mask, ["id", "observation", "date_time", "posture", "sedentary_not", "activity_intensity"]]

Unnamed: 0,id,observation,date_time,posture,sedentary_not,activity_intensity
50807,117,1,2019-08-20 18:05:40,stand and move,not_sedentary,light
50808,117,1,2019-08-20 18:05:41,stand and move,not_sedentary,light
50809,117,1,2019-08-20 18:05:42,stand and move,not_sedentary,light
50810,117,1,2019-08-20 18:05:43,stand and move,not_sedentary,light
50811,117,1,2019-08-20 18:05:44,stand and move,not_sedentary,light
50812,117,1,2019-08-20 18:05:45,stand and move,not_sedentary,light
50813,117,1,2019-08-20 18:05:46,stand and move,not_sedentary,light
50814,117,1,2019-08-20 18:05:47,stand and move,not_sedentary,light
50815,117,1,2019-08-20 18:05:48,stand and move,not_sedentary,light
50816,117,1,2019-08-20 18:05:49,stand and move,not_sedentary,light


In [56]:
# ID 128 obs 2
# Define filtering mask
mask = (
    (final_with_drops["id"] == 128) &
    (final_with_drops["observation"] == 2) &
    (final_with_drops["date_time"] >= pd.to_datetime("9/10/2019  2:40:36 PM")) &
    (final_with_drops["date_time"] <= pd.to_datetime("9/10/2019  2:42:57 PM"))
)

# Apply the manual updates
final_with_drops.loc[mask, "posture"] = "stand and move"
final_with_drops.loc[mask, "sedentary_not"] = "not_sedentary"
final_with_drops.loc[mask, "activity_intensity"] = "light"

#Check that it worked
final_with_drops.loc[mask, ["id", "observation", "date_time", "posture", "sedentary_not", "activity_intensity"]]

Unnamed: 0,id,observation,date_time,posture,sedentary_not,activity_intensity
153811,128,2,2019-09-10 14:40:36,stand and move,not_sedentary,light
153812,128,2,2019-09-10 14:40:37,stand and move,not_sedentary,light
153813,128,2,2019-09-10 14:40:38,stand and move,not_sedentary,light
153814,128,2,2019-09-10 14:40:39,stand and move,not_sedentary,light
153815,128,2,2019-09-10 14:40:40,stand and move,not_sedentary,light
...,...,...,...,...,...,...
153948,128,2,2019-09-10 14:42:53,stand and move,not_sedentary,light
153949,128,2,2019-09-10 14:42:54,stand and move,not_sedentary,light
153950,128,2,2019-09-10 14:42:55,stand and move,not_sedentary,light
153951,128,2,2019-09-10 14:42:56,stand and move,not_sedentary,light


In [57]:
final_with_drops.to_csv(
    "merged_groundtruth_secbysec_clean_20250415.csv",
    index=False,
    date_format="%m/%d/%Y %H:%M:%S")