<font size ="10"> Data Engineering</font>

Recast the task as a multi-class classification problem with the classes "awake", "asleep" and "not wearing"

In [4]:
import pandas as pd
import plotly.express as px
pd.set_option('mode.chained_assignment', None) #Avoids warning
import json
from tqdm.notebook import tqdm
from tqdm.notebook import trange
from pandarallel import pandarallel
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import stumpy
from stumpy.floss import _cac
import gc
import multiprocessing 
from imblearn.under_sampling import RandomUnderSampler
import optuna.integration.lightgbm as olgb
import lightgbm as lgb
import sklearn
from lightgbm import early_stopping
from lightgbm import log_evaluation
from sklearn.model_selection import train_test_split
from nancorrmp.nancorrmp import NaNCorrMp
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [None]:
#Load the labelled training frame and discard its existing 'state' column
#(alternative input kept for reference: "train_compressed.parquet")
train = pd.read_parquet("train_label.parquet").drop(columns=["state"])

<font size ="7"> First work with the events file to build the necessary labels</font>

In [22]:
#Clean the events data from multiple wakeup/onset on the same night
events = pd.read_csv("events_compressed.csv")

night = events["night"]
event_code = events["event"]
next_night = night.shift(-1)
prev_night = night.shift(1)

#logic1: the row repeats the event type of the following row within the same night
logic1 = (event_code == event_code.shift(-1)) & (night == next_night)

#logic2: the row's night matches neither the following nor the preceding row.
#shift() yields NaN where a neighbour does not exist, which compares as "different";
#this replaces the former events.loc[idx - 1] lookup that raised KeyError for the first row.
logic2 = (night != next_night) & (night != prev_night)

faulty = (logic1 | logic2).to_numpy()
if len(faulty) > 0:
    #The last row has no following row to be compared with and is never flagged
    faulty[-1] = False

fault_indeces = events.index[faulty].tolist()
events = events.drop(index=fault_indeces)

In [23]:
#Add label 3 ("not worn") for every night of a series for which no event was recorded
new_user_col = []
new_night_col = []
for i in range(277):
    #Nights of this series that already carry at least one event
    recorded_nights = set(events.loc[events["series_id"] == i, "night"].unique())
    #Number of whole days covered by the recording of this series
    series_timestamps = train.loc[train["series_id"] == i, "timestamp"]
    total_nights = (series_timestamps.max() - series_timestamps.min()).days
    missing_nights = [night_number for night_number in range(1, total_nights + 1)
                      if night_number not in recorded_nights]
    new_night_col += missing_nights
    new_user_col += [i] * len(missing_nights)

#One placeholder row per missing night; step and timestamp are unknown at this point
placeholder_count = len(new_user_col)
df = pd.DataFrame({
    "series_id": new_user_col,
    "night": new_night_col,
    "event": [3] * placeholder_count,
    "step": [None] * placeholder_count,
    "timestamp": [None] * placeholder_count,
})

events_worn = pd.concat([events, df]).sort_values(by=['series_id', 'night'])

In [8]:
#For a given timestamp and delta, looks at all rows of the series in that time frame, computes in a rolling
#window the difference between the largest and the smallest ENMO and returns the earliest/latest point where
#that difference stays below a threshold (taken as the moment the watch was taken off / worn again)
def get_sliding_window(series_id, time1, delta, event, window_size=720, threshold=0.1, threshold_step=0.1):
    """
    Locate the boundary of a motionless period next to a recorded event.

    Reads the module-level `train` frame (columns series_id, timestamp, step, enmo).

    Parameters
    ----------
    series_id : identifier of the series to search.
    time1 : timestamp of the recorded event.
    delta : length of the search span; rows strictly between time1 and time1 + delta
        are searched when event == 2, rows strictly between time1 - delta and time1 otherwise.
    event : event code of the recorded event; 2 searches forward in time, anything else backward.
    window_size : number of consecutive rows that must be motionless. The default of 720 rows
        is one hour at the 5-second sampling the original offset arithmetic assumed.
    threshold : largest ENMO amplitude (max - min inside the window) accepted as motionless.
    threshold_step : amount added to `threshold` until at least one window qualifies.

    Returns
    -------
    (timestamp, step) of the first row of the earliest motionless window when event == 2,
    otherwise of the last row of the latest motionless window.

    Raises
    ------
    ValueError
        If the span holds no complete window of valid ENMO values (previously this case
        looped forever), or if window_size / threshold_step are not positive.
    """
    if window_size < 1 or threshold_step <= 0:
        raise ValueError("window_size must be >= 1 and threshold_step must be > 0")

    row_time = train["timestamp"]
    in_series = train["series_id"] == series_id
    if event == 2:
        in_span = (row_time > time1) & (row_time < time1 + delta)
    else:
        in_span = (row_time < time1) & (row_time > time1 - delta)
    # Row positions are used below, so make sure they follow time order
    df = train.loc[in_series & in_span, ["timestamp", "step", "enmo"]].sort_values("timestamp")

    # The amplitude does not depend on the threshold, so it is computed once
    rolling_enmo = df["enmo"].rolling(window_size)
    amplitude = (rolling_enmo.max() - rolling_enmo.min()).to_numpy(dtype=np.float64)
    complete = ~np.isnan(amplitude)
    if not complete.any():
        raise ValueError(
            f"series {series_id}: no complete window of {window_size} rows around {time1}"
        )
    # Incomplete windows can never qualify
    amplitude = np.where(complete, amplitude, np.inf)

    #Situation where no valid rest period was detected, force the search by relaxing the threshold
    smallest_amplitude = amplitude.min()
    while not smallest_amplitude < threshold:
        threshold += threshold_step

    quiet_positions = np.flatnonzero(amplitude < threshold)
    if event == 2:
        # The rolling result is stored on the right edge of its window: step back to the window's
        # first row by position, which stays correct even if the series has timestamp gaps
        position = quiet_positions[0] - (window_size - 1)
    else:
        position = quiet_positions[-1]

    idx = df["timestamp"].iloc[position]
    step = df["step"].iloc[position]
    return idx, step

In [9]:
#For every recorded event that sits directly next to a "not worn" night (label 3), search for the moment
#the watch was taken off (after a wake-up) or worn again (before an onset) and collect it as a new event.
#NOTE: `events` is rebound here from the events DataFrame to a plain list of event labels.
timestamps = []
events = []
steps = []
nights = []
series_ids = []
for i in trange(277):
    user = events_worn[events_worn["series_id"]==i]
    #Real events whose previous or next row is a "not worn" placeholder
    beside_unworn_night = (user['event'].shift(-1) == 3) | (user['event'].shift(1) == 3)
    filtered_timestamps = user[beside_unworn_night & (user['event'] != 3)]
    filtered_timestamps["timestamp"] = pd.to_datetime(filtered_timestamps["timestamp"], format='%Y-%m-%dT%H:%M:%S%z')
    for j in range(len(filtered_timestamps)):
        boundary = filtered_timestamps.iloc[j]
        is_wakeup = boundary["event"] == 2
        timestamp, step = get_sliding_window(i, boundary["timestamp"], timedelta(hours=24), boundary["event"])
        night = boundary["night"]
        timestamps.append(timestamp)
        #After a wake-up the new event gets label 4 on the following night,
        #otherwise label 5 on the preceding night
        events.append(4 if is_wakeup else 5)
        nights.append(night+1 if is_wakeup else night-1)
        series_ids.append(i)
        steps.append(step)

  0%|          | 0/277 [00:00<?, ?it/s]

In [10]:
#Merge the detected events into the labels, drop the placeholder label 3 and renumber 4 -> 3, 5 -> 4
new_labels = dict(series_id=series_ids, night=nights, event=events, step=steps, timestamp=timestamps)
new_df = pd.DataFrame(new_labels)
concatenated_df = pd.concat([events_worn, new_df], ignore_index=True)
concatenated_df['timestamp'] = pd.to_datetime(concatenated_df['timestamp'], format='%Y-%m-%dT%H:%M:%S%z')
sorted_df = (
    concatenated_df
    .sort_values(by=['series_id','timestamp'])
    .loc[lambda frame: frame['event'] != 3]
    .assign(event=lambda frame: frame['event'].replace({4:3, 5:4}))
)
print(sorted_df.head(10))

       series_id  night  event   step           timestamp
0              0      1      1   4992 2018-08-14 22:26:00
1              0      1      2  10932 2018-08-15 06:41:00
2              0      2      1  20244 2018-08-15 19:37:00
3              0      2      2  27492 2018-08-16 05:41:00
4              0      3      1  39996 2018-08-16 23:03:00
5              0      3      2  44400 2018-08-17 05:10:00
6              0      4      1  57240 2018-08-17 23:00:00
7              0      4      2  62856 2018-08-18 06:48:00
12084          0      5      3  74000 2018-08-18 22:16:40
12085          0      5      4  81772 2018-08-19 09:04:20


In [11]:
#Persist the extended event table; the labelling cell below reads it back
sorted_df.to_csv(path_or_buf="events_modified.csv", index=False)

In [3]:
#Label the training data from events_modified: state 0 = awake, 1 = sleeping, 2 = not wearing
events = pd.read_csv(filepath_or_buffer="events_modified.csv")

def process_user(chunk):
    """
    Assign a 'state' label to every row of one series.

    Reads the module-level `events` table. Event codes used there: 1 = sleep onset,
    2 = wake-up, 3 = watch taken off, 4 = watch worn again.
    States written: 0 = awake, 1 = sleeping, 2 = not wearing.

    Parameters
    ----------
    chunk : DataFrame holding the rows of a single series (needs 'series_id' and 'timestamp').
        The 'state' column is written into this frame.

    Returns
    -------
    The same frame with 'state' as uint8. An empty chunk is returned unchanged.

    Raises
    ------
    ValueError
        If some rows could not be labelled (single event, or an event pair that matches
        none of the known transitions; the latter is also printed).
    """
    if len(chunk) == 0:
        # Nothing to label; chunk.iloc[0] would raise on an empty frame
        return chunk

    i = chunk["series_id"].iloc[0]
    user = events[events["series_id"] == i] #Concurrent reading, shouldn't be a problem
    if len(user) == 0:
        chunk['state'] = 2  # No data recording-> Never worn the bracelet
    else:
        if 'state' not in chunk.columns:
            chunk['state'] = np.nan
        event_codes = user["event"].tolist()
        event_times = user["timestamp"].tolist()
        row_time = chunk["timestamp"]
        last_pair = len(user) - 2

        for j in range(len(user) - 1):
            event, timestamp = event_codes[j], event_times[j]
            next_event, next_timestamp = event_codes[j + 1], event_times[j + 1]

            if j == last_pair:  # corner case where we are in the last entry
                # After a final wake-up the person is awake (0); this used to be labelled 1 (sleeping).
                # After any other final event the watch is treated as not worn.
                chunk.loc[row_time > next_timestamp, 'state'] = 0 if next_event == 2 else 2
            if j == 0:  # First case, need to check what is the first event
                # Awake before a first onset, otherwise not wearing before the first event
                chunk.loc[row_time < timestamp, 'state'] = 0 if event == 1 else 2

            # Other cases: label the closed interval between the two events
            between = (row_time >= timestamp) & (row_time <= next_timestamp)
            if event == 1 and next_event == 2:  # Sleeping case
                chunk.loc[between, 'state'] = 1
            elif (event, next_event) in {(2, 1), (2, 3), (4, 1)}:  # Awake case
                chunk.loc[between, 'state'] = 0
            elif event == 3 and next_event == 4:  # Not Wearing case
                chunk.loc[between, 'state'] = 2
            else:  # Check in case of mistakes
                print(user)
                print(timestamp)

        unlabelled = chunk['state'].isna()
        if unlabelled.any():
            # Fail with a readable message instead of the integer-cast error on NaN
            raise ValueError(
                f"series {i}: {int(unlabelled.sum())} rows could not be assigned a state"
            )

    chunk["state"] = chunk["state"].astype(np.uint8)
    return chunk

# One chunk per series_id so that every series can be labelled independently
chunks = [train[train['series_id'] == i] for i in range(277)]

# Label the series in parallel, one worker per CPU core.
# The context manager tears the pool down even if a worker raises, so no processes are leaked.
num_processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=num_processes) as pool:
    # imap_unordered yields results as they finish; tqdm reports the progress
    results = list(tqdm(pool.imap_unordered(process_user, chunks), total=len(chunks)))
    # Every result has been collected: let the workers exit normally
    pool.close()
    pool.join()

# Results arrive in arbitrary order, so restore the series/step order after concatenating once
train = pd.concat(results).sort_values(by=["series_id","step"]) #Avoid fragmentation

  0%|          | 0/277 [00:00<?, ?it/s]

<font size ="7"> Now adding features to the train/test data (With Pandas, use polars for faster results)</font>

Features List (Rows):
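signal_awake = Wave that represents the awake period probability (float 32)
signal_onset = Wave that represents the sleep-onset period probability (float 32)
lids = Locomotor Inactivity During Sleep: 100 / (1 + 10-minute rolling sum of max(0, enmo - 0.02)), smoothed with a 30-minute rolling mean (float 32)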
[mean,median,max,min] apply these rolling windows (make some different sizes) operations plus others on the columns ['enmo', 'anglez', 'anglez_x_enmo', 'anglezabs', 'anglez_diff', "anglez_diffabs"]
anglez_diff = Difference between previous entry and new entry (float 32)
anglez_diffabs = Absolute difference between previous entry and new entry (float 32)
anglez_x_enmo = Anglez * Enmo (float 32)
[std] on 'anglez' and 'anglez_diffabs'
[month,day,hour,minute]
mp_enmo = Matrix Profile
mp_angle = Matrix Profile

In [1]:
def is_datetime64_ns_dtype(dataframe, column_name):
    """Return True when the named column of `dataframe` has a datetime64[ns] dtype."""
    return pd.api.types.is_datetime64_ns_dtype(dataframe[column_name].dtype)
    
def reduce_mem_usage(df):
    """
    Downcast the numeric columns of a dataframe to reduce memory usage.

    Integer columns are converted to the smallest unsigned type that holds their values
    (smallest signed type when negative values are present); float columns are converted
    to float32 when pandas can do so without changing the values beyond its tolerance.
    Every other column (bool, datetime, category, object, ...) is left untouched.

    The previous condition ended in `and not 'category'`, which is always False, so the
    function never changed anything. Floats are no longer forced to float16: that type
    overflows above 65504 and keeps only about three significant digits.

    Parameters
    ----------
    df : DataFrame to shrink; its columns are replaced in place.

    Returns
    -------
    The same DataFrame object.
    """
    for col in df.columns:
        column = df[col]
        col_type = column.dtype

        if pd.api.types.is_integer_dtype(col_type):
            smallest = column.min()
            if pd.isna(smallest):
                # Empty or all-missing column: nothing to size the target type on
                continue
            target = "unsigned" if smallest >= 0 else "integer"
            df[col] = pd.to_numeric(column, downcast=target)
        elif pd.api.types.is_float_dtype(col_type):
            df[col] = pd.to_numeric(column, downcast="float")

    return df

In [2]:
def feat_eng(df):
    """
    Build LIDS-based rolling features for the rows of one series.

    Parameters
    ----------
    df : DataFrame with at least 'timestamp' (datetime), 'enmo' and 'anglez'.
        It is not modified; the features are built on a time-sorted copy.

    Returns
    -------
    A new DataFrame sorted by timestamp holding the input columns, 'lids', six derived
    base signals and, for each of them, the 51-row difference plus median / mean / max / min /
    MAD / amplitude features over a centred 255-second window. Remaining gaps are back- and
    forward-filled, rows that still contain NaN are dropped, and the result is passed
    through reduce_mem_usage.
    """
    # sort_values / set_index return new frames, so the caller's chunk is left untouched
    df = df.sort_values(['timestamp']).set_index('timestamp')

    #BASE SIGNALS
    # lids: activity above 0.02 summed over a centred 600 s window, inverted, then
    # smoothed with a centred 1800 s mean
    df['lids'] = np.maximum(0., df['enmo'] - 0.02)
    df['lids'] = df['lids'].rolling(f'{120*5}s', center=True, min_periods=1).agg('sum')
    df['lids'] = 100 / (df['lids'] + 1)
    df['lids'] = df['lids'].rolling(f'{360*5}s', center=True, min_periods=1).agg('mean').astype(np.float32)

    df["enmo_diff"] = df["enmo"].diff().astype(np.float32)
    df["enmo_diffabs"] = abs(df["enmo_diff"]).astype(np.float32)
    df['anglez_x_lids'] = (df['anglez'] * df['lids']).astype(np.float32)
    df["lids_diff"] = df["lids"].diff().astype(np.float32)
    df["lids_diffabs"] = abs(df["lids_diff"]).astype(np.float32)
    df['enmo_x_lids'] = (df['enmo'] * df['lids']).astype(np.float32)

    for col in ["enmo_diff", "enmo_diffabs", 'anglez_x_lids', "lids_diff", "lids_diffabs", 'enmo_x_lids']:

        for n in [51]:
            df[f'{col}_diff_{n}'] = df[col].diff(periods=n).astype(np.float32)

            rol_args = {'window': f'{n*5}s', 'min_periods': 1, 'center': True}

            for agg in ['median', 'mean', 'max', 'min']:
                df[f'{col}_{agg}_{n}'] = df[col].rolling(**rol_args).agg(agg).astype(np.float32).values

            # Median absolute deviation around the rolling median
            df[f'{col}_mad_{n}'] = (df[col] - df[f'{col}_median_{n}']).abs().rolling(**rol_args).median().astype(np.float32)

            df[f'{col}_amplit_{n}'] = df[f'{col}_max_{n}'] - df[f'{col}_min_{n}']
            # NOTE(review): despite its name this is the rolling max of '{col}_max_{n}',
            # not of '{col}_diff_{n}' - confirm that this is intended
            df[f'{col}_diff_{n}_max'] = df[f'{col}_max_{n}'].rolling(**rol_args).max().astype(np.float32)

            # The former std branch for 'anglez' / 'anglez_diffabs' was unreachable (neither
            # column is in the list above) and would have overwritten '{col}_min_{n}'; removed.

            gc.collect()

    df = df.reset_index()
    df = df.bfill().ffill().dropna()

    return reduce_mem_usage(df)

In [5]:
#Daily prior curves: one value per minute of the day (0..1439), a sine bump raised to the 24th power.
#The two phase shifts (0.208 and 0.555, in units of pi) are hyperparameters that might be modified.
_minute_grid = np.linspace(0, np.pi, 1440)
signal_awake = dict(enumerate(np.sin(_minute_grid + 0.208 * np.pi) ** 24))
signal_onset = dict(enumerate(np.sin(_minute_grid + 0.555 * np.pi) ** 24))

def _steps_around_movement(is_high):
    """
    Count, for every row, the rows until the next and since the previous movement row.

    is_high is a 1-D boolean array marking the movement rows. Returns two integer arrays
    (steps_to_next, steps_to_prev) of the same length:
    - steps_to_next is the number of rows strictly between a row and the next movement row;
      rows after the last movement count down to half the length of that final stretch.
    - steps_to_prev is the distance to the last movement row at or before a row (0 on the
      movement row itself); rows before the first movement start at half the length of that
      initial stretch.
    With no movement row at all the whole array is treated as one initial stretch.
    """
    is_high = np.asarray(is_high, dtype=bool)
    n_rows = len(is_high)
    position = np.arange(n_rows)
    high_position = np.flatnonzero(is_high)

    if high_position.size == 0:
        return n_rows - 1 - position, position + n_rows // 2

    # Number of movement rows at or before each row == index of the first movement row after it
    next_index = np.searchsorted(high_position, position, side="right")
    has_next = next_index < high_position.size
    tail_offset = (n_rows - high_position[-1]) // 2
    steps_to_next = np.where(
        has_next,
        high_position[np.minimum(next_index, high_position.size - 1)] - position - 1,
        n_rows - 1 - position + tail_offset,
    )

    prev_index = next_index - 1
    has_prev = prev_index >= 0
    head_offset = high_position[0] // 2
    steps_to_prev = np.where(
        has_prev,
        position - high_position[np.maximum(prev_index, 0)],
        position + head_offset,
    )
    return steps_to_next, steps_to_prev


def custom_feat_eng(df):
    """
    Build the custom feature set for the rows of one series.

    Parameters
    ----------
    df : DataFrame with 'timestamp' (datetime), 'enmo' and 'anglez'.
        It is not modified; the features are built on a time-sorted copy.

    Returns
    -------
    A new DataFrame sorted by timestamp with calendar features, the daily 'signal_onset' /
    'signal_awake' priors, 'lids', rolling statistics, histogram-based entropy terms and
    movement-distance features. Gaps are forward- and back-filled, rows that still contain
    NaN are dropped, and the result is passed through reduce_mem_usage.

    Uses the module-level `signal_onset` and `signal_awake` lookup tables.
    """
    # sort_values returns a new frame, so the caller's chunk is left untouched
    df = df.sort_values(['timestamp'])

    #Time related features
    df['month'] = df['timestamp'].dt.month.astype(np.uint8)
    df['day'] = df['timestamp'].dt.day.astype(np.uint8)
    df['hour'] = df['timestamp'].dt.hour.astype(np.uint8)
    df['minute'] = df['timestamp'].dt.minute.astype(np.uint8)
    df['day_of_week'] = df['timestamp'].dt.dayofweek.astype(np.uint8)

    minute_of_day = df['timestamp'].dt.hour * 60 + df['timestamp'].dt.minute
    df['signal_onset'] = minute_of_day.map(signal_onset).astype(np.float32)
    df['signal_awake'] = minute_of_day.map(signal_awake).astype(np.float32)

    #BASE SIGNALS
    df = df.set_index('timestamp')
    # lids: activity above 0.02 summed over a centred 600 s window, inverted, then
    # smoothed with a centred 1800 s mean
    df['lids'] = np.maximum(0., df['enmo'] - 0.02)
    df['lids'] = df['lids'].rolling(f'{120*5}s', center=True, min_periods=1).agg('sum')
    df['lids'] = 100 / (df['lids'] + 1)
    df['lids'] = df['lids'].rolling(f'{360*5}s', center=True, min_periods=1).agg('mean').astype(np.float32)

    #Additional Features
    for col in ['enmo', 'anglez', 'lids']:
        #Base new features over a centred 60 s window
        n = 12
        rol_args = {'window': f'{n*5}s', 'min_periods': 1, 'center': True}
        for agg in ['median', 'mean', 'max', 'min', 'std']:
            df[f'{col}_{agg}_{n*5}'] = df[col].rolling(**rol_args).agg(agg).astype(np.float32)
        #Median absolute deviation
        df[f'{col}_mad_{n*5}'] = (df[col] - df[f'{col}_median_{n*5}']).abs().rolling(**rol_args).median().astype(np.float32)
        df[f'{col}_amplit_{n*5}'] = df[f'{col}_max_{n*5}'] - df[f'{col}_min_{n*5}']
        # The rolling median is only needed for the MAD
        df = df.drop(columns=[f'{col}_median_{n*5}'])

        #Mean differences across different intervals
        # NOTE(review): the diff periods use n (12), i.e. +13 and -11 rows, for every window
        # size i - confirm that they were not meant to depend on i
        for i in [12, 24, 120]:
            rol_args = {'window': f'{i*5}s', 'min_periods': 1, 'center': True}
            df[f'{col}_next_{i*5}_diff'] = df[col].rolling(**rol_args).mean().diff(periods=n+1)
            df[f'{col}_prev_{i*5}_diff'] = df[col].rolling(**rol_args).mean().diff(periods=-n+1)
            gc.collect()

        #Entropy measure: -p*log(p), p being the share of rows in the histogram bin of the value
        for b in [20, 200]:
            hist, bin_edges = np.histogram(df[col], bins=b)
            probabilities = hist / np.sum(hist)
            bin_index = np.digitize(df[col], bin_edges[:-1]) - 1
            bin_probability = probabilities[bin_index]
            df[f'entropy_{b}_{col}'] = -bin_probability * np.log(bin_probability)
            gc.collect()

    #Rows until/since the next/previous significant movement, and how many movements fall in a
    #rolling window; "significant" means enmo above the given percentile
    for p in [0.6, 0.9, 0.99]:
        quantile = df['enmo'].quantile(p)
        high_mask = df['enmo'] > quantile

        # The previous code assigned the return value of list.extend() (always None), which
        # turned both columns into constants. Distances are now counted on row positions,
        # so they no longer rely on 'step' starting at 0.
        steps_to_next, steps_to_prev = _steps_around_movement(high_mask.to_numpy())
        df[f'steps_to_next_{p}'] = steps_to_next.astype(np.uint32)
        df[f'steps_to_prev_{p}'] = steps_to_prev.astype(np.uint32)

        df[f'log_steps_to_next_{p}'] = np.log1p(df[f'steps_to_next_{p}']).astype(np.float32)
        df[f'log_steps_to_prev_{p}'] = np.log1p(df[f'steps_to_prev_{p}']).astype(np.float32)

        for i in [24, 120, 400]:
            rol_args = {'window': f'{i*5}s', 'min_periods': 1, 'center': True}
            df[f'amount_of_movement_{p}_in_last{i*5}'] = high_mask
            df[f'amount_of_movement_{p}_in_last{i*5}'] = df[f'amount_of_movement_{p}_in_last{i*5}'].rolling(**rol_args).sum().astype(np.float32)
            df[f'amount_of_movement_{p}_in_last{i*5}'] = np.log1p(df[f'amount_of_movement_{p}_in_last{i*5}']).astype(np.float32)

    # A matrix-profile feature (stumpy.scrump on 'enmo' and 'anglez', m=5760, i.e. an 8-hour
    # window, stored as float16 profile and index columns) was tried here and dropped as
    # computationally too expensive.

    df = df.reset_index()
    df = df.ffill().bfill().dropna()

    return reduce_mem_usage(df)

Other feature engineering approaches:
https://www.kaggle.com/code/zotovaa/cmi-gradient-boosting-new-parameters
https://www.kaggle.com/code/lccburk/feature-engineering-and-random-forest-prediction
https://www.kaggle.com/code/zhukovoleksiy/detect-sleep-states-starter-notebook-ensemble
https://www.kaggle.com/code/renatoreggiani/feature-eng-ideas-and-lightgbm-cmi

In [6]:
#Reload the labelled training frame from disk
train = pd.read_parquet(path="train_label.parquet")

In [7]:
# Split the training frame into one chunk per series_id (0..276), as in the previous session
series_column = train['series_id']
chunks = [train[series_column == series_id] for series_id in range(277)]

In [None]:
# Run the feature engineering for every series in parallel, one worker per CPU core.
# The context manager tears the pool down even if a worker raises, so no processes are leaked.
num_processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=num_processes) as pool:
    # imap_unordered yields results as they finish; tqdm reports the progress
    results = list(tqdm(pool.imap_unordered(feat_eng, chunks), total=len(chunks)))
    # Every result has been collected: let the workers exit normally
    pool.close()
    pool.join()

  0%|          | 0/277 [00:00<?, ?it/s]

In [29]:
# Free the per-series inputs before building the combined frame
del chunks, train
gc.collect()
# The parallel results arrive in arbitrary order: concatenate once, then restore series/step order
train_mod = pd.concat(results).sort_values(by=["series_id","step"])
# The individual result frames are no longer needed once they are concatenated
del results
gc.collect()

0

In [30]:
#Persist the engineered training frame
train_mod.to_parquet(path="train3.parquet")

In [31]:
#Show the dtype of every column of the engineered frame
train_mod.dtypes

timestamp                  datetime64[ns]
series_id                          uint16
step                               uint32
anglez                            float32
enmo                              float32
                                ...      
enmo_x_lids_diff_51_max           float32
entropy_20_enmo                   float64
entropy_200_enmo                  float64
entropy_20_anglez                 float64
entropy_200_anglez                float64
Length: 65, dtype: object

In [32]:
#List the names of all columns of the engineered frame
print(train_mod.columns)

Index(['timestamp', 'series_id', 'step', 'anglez', 'enmo', 'state', 'lids',
       'enmo_diff', 'enmo_diffabs', 'anglez_x_lids', 'lids_diff',
       'lids_diffabs', 'enmo_x_lids', 'enmo_diff_diff_51',
       'enmo_diff_median_51', 'enmo_diff_mean_51', 'enmo_diff_max_51',
       'enmo_diff_min_51', 'enmo_diff_mad_51', 'enmo_diff_amplit_51',
       'enmo_diff_diff_51_max', 'enmo_diffabs_diff_51',
       'enmo_diffabs_median_51', 'enmo_diffabs_mean_51', 'enmo_diffabs_max_51',
       'enmo_diffabs_min_51', 'enmo_diffabs_mad_51', 'enmo_diffabs_amplit_51',
       'enmo_diffabs_diff_51_max', 'anglez_x_lids_diff_51',
       'anglez_x_lids_median_51', 'anglez_x_lids_mean_51',
       'anglez_x_lids_max_51', 'anglez_x_lids_min_51', 'anglez_x_lids_mad_51',
       'anglez_x_lids_amplit_51', 'anglez_x_lids_diff_51_max',
       'lids_diff_diff_51', 'lids_diff_median_51', 'lids_diff_mean_51',
       'lids_diff_max_51', 'lids_diff_min_51', 'lids_diff_mad_51',
       'lids_diff_amplit_51', 'lids_diff_di