In [1]:
import pandas as pd
import numpy as np
import json
import polars as pl
import gc
from tqdm import tqdm
import multiprocessing
import joblib
import lightgbm as lgb

# Names of the id / time / event / score columns.
# NOTE(review): presumably consumed by event-detection scoring code; nothing in this
# part of the notebook reads them — confirm before removing.
column_names = dict(
    series_id_column_name='series_id',
    time_column_name='step',
    event_column_name='event',
    score_column_name='score',
)

# Matching tolerances per event type. Each event type gets its own list object,
# so mutating one entry never affects the other.
tolerances = {
    event_type: [12, 36, 60, 90, 120, 150, 180, 240, 300, 360]
    for event_type in ('onset', 'wakeup')
}

In [2]:
# For a quantile p and a dataframe, return per-row distances (in rows) to the next and to
# the previous sample whose 'enmo' lies above that quantile.
def return_dist(df, p: float, len_df: int):
    """Build "steps to next" / "steps since previous" high-`enmo` lists for one frame.

    A row counts as "high" when its `enmo` is greater than the `p`-quantile of `enmo`.

    Args:
        df: frame with `enmo` and `step` columns. Assumes `step` is the row position
            (0 .. len_df-1) within a single series — TODO confirm against the caller.
        p: quantile passed to `pl.quantile`.
        len_df: number of rows; both returned series are padded up to this length.

    Returns:
        `(steps_to_next, steps_to_prev, last_next, last_prev)`: two `pl.Series` plus the
        two integers the tail padding started from (100 when the corresponding list was
        still empty after the main loop).
    """
    # 'step' values of all rows above the quantile.
    steps_high = df.filter(pl.col('enmo') > pl.quantile('enmo',p)).select('step')
    steps_high = steps_high.to_numpy().flatten()
    # Add sentinels 0 and len_df so the diffs below are the gap lengths between
    # consecutive high samples, including the leading and trailing gap.
    steps_high = np.append((np.insert(steps_high,0,0)),len_df)
    to_next = np.diff(steps_high).tolist()
    steps_to_next = []
    steps_to_prev = []
    # NOTE(review): iteration starts at to_next[1:], so the gap between row 0 and the
    # first high sample emits nothing. The lists end up shorter than len_df by that gap
    # and the missing rows are padded at the *tail* below, which looks like every value
    # is shifted toward the start of the series. Confirm whether this is intended.
    for i, val in enumerate(to_next[1:]):
        val = int(val)
        to_extend = range(0, val)
        if i == 0:
            # First emitted gap: "next" counts down val-1 .. 0; "prev" has no real
            # predecessor, so it is offset by half the gap length instead of starting at 0.
            half_val = int(val / 2)
            steps_to_next.extend(list(reversed(to_extend)))
            steps_to_prev.extend(list(range(0+half_val, val+half_val)))
            continue
        if i == len(to_next)-2:
            # Last gap (runs up to the len_df sentinel): "next" has no real successor,
            # so it is offset by half the gap length. Not reached when there is only one
            # emitted gap, because the `i == 0` branch above `continue`s first.
            half_val = int(val / 2)
            steps_to_next.extend(list(reversed(range(0+half_val, val+half_val))))
            steps_to_prev.extend(list(to_extend))
            break
        # Interior gap: plain countdown to the next / count-up from the previous sample.
        steps_to_next.extend(list(reversed(to_extend)))
        steps_to_prev.extend(list(to_extend))
    
    # Take the last emitted value of each list (default 100 if the list is empty).
    # The lists only ever hold ints, so `is not None` is true on the first element seen
    # and each loop stops after one iteration.
    last_next = 100
    last_prev = 100
    for item in reversed(steps_to_next):
        if item is not None:
            last_next = item
            break
    for item in reversed(steps_to_prev):
        if item is not None:
            last_prev = item    
            break
            
    # Pad both lists to len_df rows, continuing from the last emitted value
    # (descending for "next", ascending for "prev"). A non-positive deficit adds nothing.
    steps_to_next.extend(reversed(range(last_next,last_next+(len_df-len(steps_to_next)))))
    steps_to_prev.extend(range(last_prev,last_prev+(len_df-len(steps_to_prev))))
    
    return pl.Series(values = steps_to_next), pl.Series(values = steps_to_prev), last_next, last_prev

In [3]:
# Minute-of-day lookup tables (keys 0..1439). Each is a sin(...)**24 bump sampled at 1440
# points over [0, pi], phase-shifted by 0.208*pi ("awake") or 0.555*pi ("onset").
_day_phase = np.linspace(0, np.pi, 1440)
signal_awake = dict(enumerate(np.sin(_day_phase + 0.208 * np.pi) ** 24))
signal_onset = dict(enumerate(np.sin(_day_phase + 0.555 * np.pi) ** 24))

def custom_feat_eng(df:pl.DataFrame, to_keep:list[str])->pl.LazyFrame:
    """Engineer all features for one frame and return the requested columns lazily.

    Adds a smoothed `lids` activity signal, hour/minute and time-of-day lookups, rolling
    statistics, per-sample entropy terms, distances to high-`enmo` samples and rolling
    counts of high-`enmo` samples, then fills nulls and selects `to_keep`.

    Args:
        df: eager frame (`len(df)` and `get_column` are used). The code reads `enmo`,
            `anglez`, `timestamp` and, through `return_dist`, `step`. `timestamp` must
            already have a datetime dtype because the `.dt` accessors are applied to it
            directly. Presumably one series per call — verify against the caller.
        to_keep: names of the columns to select from the engineered frame.

    Returns:
        A lazy frame: the frame is switched to lazy mode before the feature expressions
        are attached, so the caller has to `.collect()` the result.
    """
    len_df = len(df) # Row count; used by return_dist and to normalise the bin counts below
    # Collectors, one per feature family (expressions or already-computed Series)
    Prefab = []
    Entropy = []
    Distance = []
    Movement = []
    
    
    # 'lids': enmo above 0.02 summed over a centred 120-row window, transformed as
    # 100/(sum+1) and smoothed with a centred 360-row rolling mean. Done eagerly because
    # the entropy section below reads the 'lids' column from df.
    lids = np.maximum(pl.col('enmo')-0.02, 0.).rolling_sum(window_size = 120, center=True, min_periods=1)
    df = df.with_columns((100/(lids+1)).rolling_mean(window_size = 360, center=True, min_periods=1).cast(pl.Float32).alias('lids'))
    
    
    # Time related features
    Time_columns = [
    # String-to-datetime parsing is disabled here, so 'timestamp' must arrive parsed:
    #pl.col('timestamp').str.to_datetime(),  
    pl.col('timestamp').dt.hour().cast(pl.UInt8).alias('hour'),
    pl.col('timestamp').dt.minute().cast(pl.UInt8).alias('minute')
    ]
    
    # Not included: 'lids_diffabs', 'lids_diff', 'enmo_diffabs'
    Base_signals = [
        # Minute of day (hour*60 + minute) looked up in the module-level signal tables
        (pl.col('hour') * 60 + pl.col('minute')).cast(pl.UInt16).map_dict(signal_awake).cast(pl.Float32).alias('signal_awake'),
        (pl.col('hour') * 60 + pl.col('minute')).cast(pl.UInt16).map_dict(signal_onset).cast(pl.Float32).alias('signal_onset'),
        pl.col('anglez').abs().cast(pl.Float32).alias('anglezabs'),
        pl.col('anglez').diff().cast(pl.Float32).alias('anglez_diff'), 
        pl.col('enmo').diff().cast(pl.Float32).alias('enmo_diff'), 
        pl.col('anglez').diff().abs().cast(pl.Float32).alias('anglez_diffabs'), 
        (pl.col('enmo')*pl.col('lids')).cast(pl.Float32).alias('enmo_x_lids'),
        (pl.col('anglez')*pl.col('enmo')).cast(pl.Float32).alias('anglez_x_enmo'),
        (pl.col('lids')*pl.col('anglez')).cast(pl.Float32).alias('anglez_x_lids'),
    ]
    
    
    # Centred rolling windows of 51 and 60 rows; min_periods=1 keeps the edges defined
    rol_args = {'window_size':51, 'min_periods':1, 'center':True}
    rol_args_2 = {'window_size':60, 'min_periods':1, 'center':True}
    
    for column in ['anglezabs','anglez_diff','enmo_diff','anglez_diffabs','enmo_x_lids',
                   'anglez_x_lids','anglez_x_enmo','enmo', 'anglez', 'lids']:
        n = 51
        # Various rolling statistics (modified Kaggle code), ten features per column.
        # NOTE(review): '{column}_diff_{n}_max' is built from the same expression as
        # '{column}_max_{n}' (a plain rolling_max), so the two columns are identical;
        # possibly a rolling max of the n-step diff was intended — confirm.
        # NOTE(review): '{column}_amplit_{n}' is rolling min minus rolling max, i.e. <= 0.
        # The '_mad_*' features are a rolling median of |rolling median - value|.
        Prefab.extend([pl.col(column).diff(n).cast(pl.Float32).alias(f'{column}_diff_{n}'),
                    pl.col(column).rolling_max(**rol_args).cast(pl.Float32).alias(f'{column}_diff_{n}_max'),
                    pl.col(column).rolling_min(**rol_args).cast(pl.Float32).alias(f'{column}_min_{n}'),
                    pl.col(column).rolling_max(**rol_args).cast(pl.Float32).alias(f'{column}_max_{n}'),
                    pl.col(column).rolling_std(**rol_args).cast(pl.Float32).alias(f'{column}_std_{n}'),
                    pl.col(column).rolling_mean(**rol_args).cast(pl.Float32).alias(f'{column}_mean_{n}'),
                    pl.col(column).rolling_median(**rol_args).cast(pl.Float32).alias(f'{column}_median_{n}'),
                    (pl.col(column).rolling_median(**rol_args)-pl.col(column)).abs().rolling_median(**rol_args).cast(pl.Float32).alias(f'{column}_mad_{n}'),
                    (pl.col(column).rolling_min(**rol_args)-pl.col(column).rolling_max(**rol_args)).cast(pl.Float32).alias(f'{column}_amplit_{n}'),
                    (pl.col(column).rolling_median(**rol_args_2)-pl.col(column)).abs().rolling_median(**rol_args_2).cast(pl.Float32).alias(f'{column}_mad_60')])
    
    # Other columns added manually.
    # NOTE(review): the '*_prev_600_diff' features apply diff(-119) to a centred 120-row
    # rolling mean; as I read polars' diff, a negative n compares against a *later* row,
    # which does not match the "prev" in the name — confirm.
    Prefab.extend([pl.col('anglez').rolling_mean(window_size = 120, center=True, min_periods=1).diff(-120+1).cast(pl.Float32).alias('anglez_prev_600_diff'),
            pl.col('anglez').rolling_std(**rol_args_2).cast(pl.Float32).alias(f'anglez_std_60'),
            pl.col('enmo').rolling_mean(window_size = 120, center=True, min_periods=1).diff(-120+1).cast(pl.Float32).alias('enmo_prev_600_diff'),
            pl.col('enmo').rolling_mean(**rol_args_2).cast(pl.Float32).alias(f'enmo_mean_60'),
            pl.col('enmo').rolling_min(**rol_args_2).cast(pl.Float32).alias(f'enmo_min_60')])

    
    # Entropy terms: bin each signal into b bins over its own min..max range and give
    # every row the value -p*log(p), where p is the fraction of rows in that row's bin.
    for col in ['enmo', 'anglez', 'lids']:
        col_max = df.select(col).max() 
        col_min = df.select(col).min() 
        for b in [20,200]:
            if df.get_column(col).n_unique() < b: # Corner case: fewer distinct values than bins
                # Fall back to a numpy histogram with one bin per distinct value.
                hist, bin_edges = np.histogram(df.get_column(col), bins=df.get_column(col).n_unique())
                probabilities = hist / np.sum(hist)
                # Last edge is dropped before digitize so the maximum maps to the last bin.
                digitized = np.digitize(df.get_column(col), bin_edges[:-1])-1
                entropy_values = np.array([-probabilities[x]*np.log(probabilities[x]) for x in digitized]).astype(np.float32)
                Entropy.append(pl.Series(values = entropy_values).alias(f'entropy_{b}_{col}'))  
                continue
            # b-1 interior break points and b labels '0'..'b-1'; the label is turned back into the bin index.
            digitized = pl.col(col).cut(np.linspace(col_min,col_max,b,False)[1:], labels = [str(x) for x in range(b)]).cast(pl.Utf8).str.parse_int(10).cast(pl.UInt32).alias('digitized') #Need to cast cat dtype first to str and then int
            digitized = df.select(digitized).to_series(0) 
            counted = digitized.value_counts(parallel = False)
            # Replace each bin index by that bin's row count, then normalise by the row count.
            probabilities = (digitized.map_dict(dict(zip(counted.to_series(0),counted.to_series(1)))))/len_df 
            probabilities = -probabilities*probabilities.log() 
            Entropy.append(probabilities.alias(f'entropy_{b}_{col}'))

    
    for p in [0.6, 0.9, 0.99]:
        # Distances to the next / previous sample above the p-quantile of enmo (eager Series)
        steps_to_next, steps_to_prev, last_next, last_prev = return_dist(df,p,len_df) 
        # fill_nan presumably guards the unsigned cast; return_dist builds these Series from Python ints.
        Distance.extend([steps_to_next.fill_nan(last_next).cast(pl.UInt32).alias(f'steps_to_next_{p}'), #Make sure that for loops don't break lazy eval
                         steps_to_prev.fill_nan(last_prev).cast(pl.UInt32).alias(f'steps_to_prev_{p}'),
                         steps_to_next.log1p().cast(pl.Float32).alias(f'log_steps_to_next_{p}'),
                         steps_to_prev.log1p().cast(pl.Float32).alias(f'log_steps_to_prev_{p}')])
        
        # Rolling count of rows above the p-quantile for three window sizes.
        # Rebinding rol_args here does not affect the Prefab expressions above: their
        # keyword arguments were already unpacked when those expressions were created.
        # NOTE(review): the alias uses i*5 (presumably seconds at 5 s per step — TODO
        # confirm) and says "in_last", but the window is centred, not trailing.
        for i in [24,120,400]:   
                rol_args = {'window_size':i, 'min_periods':1, 'center':True}
                mask = pl.when(pl.col('enmo') > pl.quantile('enmo',p)).then(1).otherwise(0)
                Movement.append(mask.rolling_sum(**rol_args).cast(pl.Float32).alias(f'amount_of_movement_{p}_in_last_{i*5}'))
    
    # All eager computations are done; switch to lazy so the remaining steps form one query.
    # Order matters: Base_signals needs 'hour'/'minute', Prefab needs the base signals.
    df = df.lazy()
    df = df.with_columns(Time_columns)
    df = df.with_columns(Base_signals)      
    df = df.with_columns(Prefab)  
    df = df.with_columns(Distance)  
    df = df.with_columns(Movement) 
    df = df.with_columns(Entropy)
    
    # Fill nulls (e.g. from diff at the edges) forwards, then backwards.
    df = df.select(pl.all().forward_fill()) 
    df = df.select(pl.all().backward_fill()) 
    # NOTE(review): drop_nans is applied per column; if only some columns contained NaN
    # the columns would end up with different lengths — confirm this cannot happen.
    df = df.select(pl.all().drop_nans()) 

    return df.select(to_keep)

In [4]:
# Identifier columns and the label column carried alongside the features.
id_cols = ['series_id', 'step', 'timestamp']
target_col = ['state']

# Catalogue of engineered feature names, grouped by family. The order is significant:
# the halves taken below depend on it.
feature_cols = np.array([
    # raw signals and time-of-day features
    'anglez', 'enmo', 'lids', 'hour', 'minute', 'signal_awake', 'signal_onset',
    # derived base signals
    'anglezabs', 'anglez_diff', 'enmo_diff', 'anglez_diffabs', 'enmo_x_lids',
    'anglez_x_enmo', 'anglez_x_lids',
    # rolling statistics: anglezabs
    'anglezabs_diff_51', 'anglezabs_diff_51_max', 'anglezabs_min_51', 'anglezabs_max_51',
    'anglezabs_std_51', 'anglezabs_mean_51', 'anglezabs_median_51', 'anglezabs_mad_51',
    'anglezabs_amplit_51', 'anglezabs_mad_60',
    # rolling statistics: anglez_diff
    'anglez_diff_diff_51', 'anglez_diff_diff_51_max', 'anglez_diff_min_51', 'anglez_diff_max_51',
    'anglez_diff_std_51', 'anglez_diff_mean_51', 'anglez_diff_median_51', 'anglez_diff_mad_51',
    'anglez_diff_amplit_51', 'anglez_diff_mad_60',
    # rolling statistics: enmo_diff
    'enmo_diff_diff_51', 'enmo_diff_diff_51_max', 'enmo_diff_min_51', 'enmo_diff_max_51',
    'enmo_diff_std_51', 'enmo_diff_mean_51', 'enmo_diff_median_51', 'enmo_diff_mad_51',
    'enmo_diff_amplit_51', 'enmo_diff_mad_60',
    # rolling statistics: anglez_diffabs
    'anglez_diffabs_diff_51', 'anglez_diffabs_diff_51_max', 'anglez_diffabs_min_51',
    'anglez_diffabs_max_51', 'anglez_diffabs_std_51', 'anglez_diffabs_mean_51',
    'anglez_diffabs_median_51', 'anglez_diffabs_mad_51', 'anglez_diffabs_amplit_51',
    'anglez_diffabs_mad_60',
    # rolling statistics: enmo_x_lids
    'enmo_x_lids_diff_51', 'enmo_x_lids_diff_51_max', 'enmo_x_lids_min_51', 'enmo_x_lids_max_51',
    'enmo_x_lids_std_51', 'enmo_x_lids_mean_51', 'enmo_x_lids_median_51', 'enmo_x_lids_mad_51',
    'enmo_x_lids_amplit_51', 'enmo_x_lids_mad_60',
    # rolling statistics: anglez_x_lids
    'anglez_x_lids_diff_51', 'anglez_x_lids_diff_51_max', 'anglez_x_lids_min_51',
    'anglez_x_lids_max_51', 'anglez_x_lids_std_51', 'anglez_x_lids_mean_51',
    'anglez_x_lids_median_51', 'anglez_x_lids_mad_51', 'anglez_x_lids_amplit_51',
    'anglez_x_lids_mad_60',
    # rolling statistics: anglez_x_enmo
    'anglez_x_enmo_diff_51', 'anglez_x_enmo_diff_51_max', 'anglez_x_enmo_min_51',
    'anglez_x_enmo_max_51', 'anglez_x_enmo_std_51', 'anglez_x_enmo_mean_51',
    'anglez_x_enmo_median_51', 'anglez_x_enmo_mad_51', 'anglez_x_enmo_amplit_51',
    'anglez_x_enmo_mad_60',
    # rolling statistics: enmo
    'enmo_diff_51', 'enmo_diff_51_max', 'enmo_min_51', 'enmo_max_51', 'enmo_std_51',
    'enmo_mean_51', 'enmo_median_51', 'enmo_mad_51', 'enmo_amplit_51', 'enmo_mad_60',
    # rolling statistics: anglez
    'anglez_diff_51', 'anglez_diff_51_max', 'anglez_min_51', 'anglez_max_51', 'anglez_std_51',
    'anglez_mean_51', 'anglez_median_51', 'anglez_mad_51', 'anglez_amplit_51', 'anglez_mad_60',
    # rolling statistics: lids
    'lids_diff_51', 'lids_diff_51_max', 'lids_min_51', 'lids_max_51', 'lids_std_51',
    'lids_mean_51', 'lids_median_51', 'lids_mad_51', 'lids_amplit_51', 'lids_mad_60',
    # manually added rolling features
    'anglez_prev_600_diff', 'anglez_std_60', 'enmo_prev_600_diff', 'enmo_mean_60', 'enmo_min_60',
    # distances to samples above the 0.6 / 0.9 / 0.99 enmo quantiles
    'steps_to_next_0.6', 'steps_to_prev_0.6', 'log_steps_to_next_0.6', 'log_steps_to_prev_0.6',
    'steps_to_next_0.9', 'steps_to_prev_0.9', 'log_steps_to_next_0.9', 'log_steps_to_prev_0.9',
    'steps_to_next_0.99', 'steps_to_prev_0.99', 'log_steps_to_next_0.99', 'log_steps_to_prev_0.99',
    # rolling counts of samples above those quantiles
    'amount_of_movement_0.6_in_last_120', 'amount_of_movement_0.6_in_last_600',
    'amount_of_movement_0.6_in_last_2000',
    'amount_of_movement_0.9_in_last_120', 'amount_of_movement_0.9_in_last_600',
    'amount_of_movement_0.9_in_last_2000',
    'amount_of_movement_0.99_in_last_120', 'amount_of_movement_0.99_in_last_600',
    'amount_of_movement_0.99_in_last_2000',
    # entropy terms
    'entropy_20_enmo', 'entropy_200_enmo', 'entropy_20_anglez', 'entropy_200_anglez',
    'entropy_20_lids', 'entropy_200_lids',
])

# Bucket every feature by the signal it derives from. 'enmo' is tested first, so a name
# containing both 'enmo' and 'anglez' lands in the enmo bucket; everything that mentions
# neither goes to the lids bucket.
enmo_feat, anglez_feat, lids_feat = [], [], []
for feat in feature_cols:
    if 'enmo' in feat:
        bucket = enmo_feat
    elif 'anglez' in feat:
        bucket = anglez_feat
    else:
        bucket = lids_feat
    bucket.append(feat)

# Each bucket is cut into a first and second half; every half also carries the target column.
splits = {}
for prefix, bucket in (('enmo', enmo_feat), ('anglez', anglez_feat), ('lids', lids_feat)):
    midpoint = len(bucket)//2
    splits[f'{prefix}1'] = bucket[:midpoint]+target_col
    splits[f'{prefix}2'] = bucket[midpoint:]+target_col

In [5]:
# Feature subset that is actually exported. The list order becomes the column order of
# the written frames, so it is kept exactly as is.
to_keep = [
    'entropy_200_enmo', 'steps_to_next_0.99', 'steps_to_next_0.9',
    'entropy_20_lids', 'entropy_20_anglez', 'hour',
    'log_steps_to_next_0.99', 'log_steps_to_next_0.9', 'lids',
    'signal_onset', 'signal_awake', 'log_steps_to_prev_0.99',
    'entropy_200_lids', 'entropy_200_anglez', 'amount_of_movement_0.9_in_last_2000',
    'anglez_diff_diff_51_max', 'anglezabs_max_51', 'enmo_x_lids_min_51',
    'anglez_diff_mad_51', 'anglez_x_enmo_diff_51_max', 'amount_of_movement_0.99_in_last_2000',
    'anglez_x_enmo_median_51', 'anglezabs', 'anglez',
    'anglezabs_min_51', 'lids_mad_60',
]

In [16]:
import pandas as pd

# Count the event rows per series (value_counts sorts by descending count) and keep the
# ids of the five series with the most events.
events = pd.read_csv('events_compressed.csv')
counted_series = events['series_id'].value_counts().reset_index()
# Name the columns explicitly instead of relying on what reset_index() produced.
counted_series.columns = ['series_id', 'count']
most = counted_series['series_id'].iloc[:5].tolist()
print(most)

[174, 275, 250, 41, 22]


In [20]:
# The 'app' split exports the selected features together with the id and target columns.
splits = {'app': [*to_keep, *id_cols, *target_col]}

for split, cols in splits.items():
    print(f'Current split: {split}')
    # Lazy scan: only the rows of the requested series are materialised in the loop below.
    train = pl.scan_parquet(source='train_label.parquet')  # original note: length of 127.946.340
    chuncks = []
    for idt in tqdm(most):
        same_series = pl.col('series_id') == idt
        id_df = train.filter(same_series).collect()
        engineered = custom_feat_eng(id_df, cols)
        chuncks.append(engineered)
    # custom_feat_eng hands back lazy frames, hence the collect() after concatenating.
    train = pl.concat(chuncks, how='vertical').collect()
    out_path = f"train_{split}.parquet"
    train.write_parquet(out_path, compression='zstd', compression_level=10)

Current split: app


100%|██████████| 5/5 [00:17<00:00,  3.60s/it]


Create the time windows around each event that are used to train the fine segmentation

In [6]:
import random
def get_timeframe(df:pl.DataFrame, timeframe:int, event:pl.DataFrame, series_id: int, len_df: int)->tuple[pl.DataFrame, list[int]]:
    """Keep only the rows inside a randomly shifted window around each event of a series.

    For every event step of `series_id` a window of `2 * (timeframe // 2)` rows is placed
    around the step and shifted by a random offset. A window is used only if it lies
    inside `[0, len_df)` and starts at or after the end of the previously accepted
    window. The overlap check is only meaningful if the events arrive in ascending
    step order (assumed — verify against the events file).

    Args:
        df: frame to filter; row `i` is assumed to correspond to step `i` (TODO confirm).
            Anything whose `.filter` accepts a boolean `pl.Series` works; the caller in
            this notebook passes the output of `custom_feat_eng`.
        timeframe: window length in rows.
        event: events frame with `series_id` and `step` columns.
        series_id: series whose events are used.
        len_df: number of rows of `df`; the length of the boolean mask.

    Returns:
        `(filtered, target)`: `df` restricted to the accepted windows, and one integer
        per accepted window equal to `timeframe // 2 + offset`.

    NOTE(review): `timeframe // 2 + offset` equals `end_idx - event_step`, whereas the
    event's row position counted from the window *start* is `timeframe // 2 - offset`.
    Confirm which convention the consumer of the targets expects. Left unchanged here.
    """
    half_window = timeframe//2
    event_steps = event.filter(pl.col('series_id') == series_id).select('step').to_numpy()
    # Boolean row mask filled by slice assignment, so no per-row index list is built.
    keep_row = np.zeros(len_df, dtype=bool)
    target = []
    covered_until = None  # exclusive end of the last accepted window
    for event_step in event_steps:
        # Small offset so the event is not in the dead centre of the window. Drawn for
        # every event, accepted or not, so the random sequence matches the event order.
        offset = random.randint(-timeframe//3,timeframe//3)
        start_idx = event_step[0]-half_window+offset
        end_idx = event_step[0]+half_window+offset
        # A NaN step fails this comparison and is skipped.
        if not (start_idx >= 0 and end_idx < len_df):
            continue
        # Avoid overlapping windows.
        if covered_until is not None and start_idx < covered_until:
            continue
        # Cast so that steps that arrive as floats can still be used as slice bounds.
        start_idx, end_idx = int(start_idx), int(end_idx)
        keep_row[start_idx:end_idx] = True
        covered_until = end_idx
        target.append(half_window+offset)
    return df.filter(pl.Series(keep_row)), target

In [8]:
splits = {'fine': to_keep+target_col}
timeframe = 1440 #2 hours timeframe
event = pl.read_csv('events_compressed.csv')
# get_timeframe draws its window offsets from the global `random` generator. Seed it so
# that re-running this cell regenerates the same windows and the same targets.
random.seed(42)

for split,cols in splits.items():
    print(f'Current split: {split}')
    # Lazy scan: only the rows of the requested series are materialised in the loop below.
    train = pl.scan_parquet(source = 'train_label.parquet',) #length of 127.946.340
    chuncks = []
    targets = []
    for idt in tqdm(range(140,277)):
        id_df = train.filter(pl.col('series_id') == idt).collect()
        eng_df = custom_feat_eng(id_df,cols)
        # Keep only the rows inside the windows around this series' events; `target`
        # holds one value per accepted window.
        eng_df, target = get_timeframe(eng_df, timeframe, event, idt, len(id_df))
        chuncks.append(eng_df)
        targets += target
    train = pl.concat(chuncks, how = 'vertical').collect()
    # NOTE(review): absolute output path, unlike the relative path used for the 'app' split.
    train.write_parquet(f"/sleep/train_{split}_{timeframe}_271.parquet")
    # NOTE(review): the parquet file is overwritten on every run, while the targets are
    # *appended*. That suits accumulating targets over several series batches, but a
    # plain re-run of this cell duplicates them — clear target.txt first in that case.
    with open("target.txt", "a") as file:
        for item in targets:
            file.write(str(item) + "\n")

Current split: fine


100%|██████████| 137/137 [08:41<00:00,  3.81s/it]
