In [1]:
import numpy as np
import pandas as pd
from itertools import groupby
from sklearn.model_selection import train_test_split
from pandas.api.types import is_datetime64_ns_dtype

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
from imblearn.under_sampling import RandomUnderSampler
from joblib import Parallel, delayed
import gc

import warnings
warnings.filterwarnings("ignore")

In [4]:
# One-off conversion: load the generated test CSV and write it back as parquet
# in the same directory.
# NOTE(review): get_events() reads 'test_series.parquet' by default, not the
# file written here — confirm which test file is intended.
test = pd.read_csv("../data/raw/Generated_Test_Data.csv")
test.to_parquet("../data/raw/Generated_Test_Data.parquet")

In [9]:
test

Unnamed: 0,series_id,step,timestamp,anglez,enmo
0,test_series,0,2024-03-12 12:00:00,43.479698,0.070968
1,test_series,1,2024-03-12 12:00:05,-69.227322,0.082818
2,test_series,2,2024-03-12 12:00:10,19.877962,0.058755
3,test_series,3,2024-03-12 12:00:15,-71.799299,0.02582
4,test_series,4,2024-03-12 12:00:20,-6.56579,0.040585
5,test_series,5,2024-03-12 12:00:25,-66.885441,0.025394
6,test_series,6,2024-03-12 12:00:30,-61.769739,0.03545
7,test_series,7,2024-03-12 12:00:35,-53.89836,0.077288
8,test_series,8,2024-03-12 12:00:40,-60.90766,0.021552
9,test_series,9,2024-03-12 12:00:45,-58.406604,0.098169


In [6]:
# Raw training series. The name `train` is rebound further down to the
# engineered feature table, so this frame is only used for the inspection below.
train = pd.read_parquet("../data/raw/train_series.parquet")

In [8]:
# Ad-hoc inspection: every row whose `step` equals 592375 (one row per series
# that is at least that long). The step value itself is arbitrary as far as
# this notebook shows.
train_filtered = train[train["step"] == 592375]
print(train_filtered)


              series_id    step                 timestamp     anglez    enmo
982255     03d92c9f6f8a  592375  2018-07-04T18:44:35-0400 -48.561501  0.0131
2103655    04f547b8017d  592375  2019-01-01T18:44:35-0500  19.493999  0.0786
3584515    062dbd4c95e6  592375  2018-09-25T18:59:35-0400 -10.802900  0.0885
5430595    0ce74d6d2106  592375  2017-09-24T23:14:35-0400  -0.524900  0.0151
10161175   12d01911d509  592375  2019-02-12T17:44:35-0500  -1.289600  0.0886
10905655   1319a1935f48  592375  2018-02-01T18:29:35-0500  -4.849200  0.0628
15719755   18a0ca03431d  592375  2018-04-03T22:44:35-0400 -16.542801  0.0038
17879755   1d4569cbac0f  592375  2018-11-20T18:29:35-0500  11.799400  0.0233
19993675   25e2b3dd9c3b  592375  2019-01-17T17:14:35-0500  25.237801  0.0288
20668315   2654a87be968  592375  2018-11-19T16:59:35-0500  10.324400  0.0291
21687295   280e08693c6d  592375  2019-04-22T17:59:35-0400 -14.314400  0.0415
24170035   2b8d87addea9  592375  2018-12-05T22:59:35-0500 -18.587700  0.0505

In [2]:
def feat_eng(df):
    """Add rolling-window features to one accelerometer series.

    The frame is modified IN PLACE and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``series_id``, ``timestamp``, ``enmo`` and ``anglez``.
        ``timestamp`` may be strings or datetimes; any UTC offset is stripped
        so the local wall-clock time is kept.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``timestamp`` back as a column, the added feature
        columns, and every row containing a NaN removed. The index is not
        renumbered after the NaN rows are dropped.

    Added columns
    -------------
    ``hour``, ``lids``, ``anglezdiffabs`` and, for each of ``enmo``, ``anglez``
    and ``anglezdiffabs`` and each window in ``periods``: median / mean / max /
    min / var, the max-min amplitude and its rolling minimum, and the rolling
    max / mean of the absolute change of the rolling max. A rolling median
    absolute deviation is added for the longest window only.
    """
    # Window lengths in seconds; hoisted because they do not depend on the column.
    periods = (60, 360, 720, 3600)
    longest = max(periods)
    aggregations = ('median', 'mean', 'max', 'min', 'var')

    df['series_id'] = df['series_id'].astype('category')
    # Element-wise tz stripping is kept from the original because it also works
    # when parsing yields mixed UTC offsets. NOTE: it is slow on long series.
    df['timestamp'] = pd.to_datetime(df['timestamp']).apply(lambda t: t.tz_localize(None))
    df['hour'] = df['timestamp'].dt.hour

    # Stable sort: wall-clock times can repeat once offsets are stripped, and a
    # non-stable sort may reorder those ties, which changes the diff() features.
    df.sort_values(['timestamp'], kind='mergesort', inplace=True)
    df.set_index('timestamp', inplace=True)

    # 'lids': enmo above a 0.02 threshold, summed over 600 s, inverted, then
    # smoothed over 1800 s. Computed before enmo is rescaled to integers below.
    activity = np.maximum(0., df['enmo'] - 0.02)
    activity = activity.rolling(f'{120*5}s', center=True, min_periods=1).agg('sum')
    df['lids'] = 100 / (activity + 1)
    df['lids'] = df['lids'].rolling(f'{360*5}s', center=True, min_periods=1).agg('mean').astype(np.float32)

    df['enmo'] = (df['enmo'] * 1000).astype(np.int16)
    df['anglez'] = df['anglez'].astype(np.int16)
    df['anglezdiffabs'] = df['anglez'].diff().abs().astype(np.float32)

    for col in ('enmo', 'anglez', 'anglezdiffabs'):

        signal = df[col]

        for n in periods:

            # Centred time window of n+5 seconds; needs at least 10 observations.
            rol_args = {'window': f'{n+5}s', 'min_periods': 10, 'center': True}
            windowed = signal.rolling(**rol_args)

            for agg in aggregations:
                df[f'{col}_{agg}_{n}'] = windowed.agg(agg).astype(np.float32).values

            if n == longest:
                # Rolling median of the absolute deviation from the rolling median.
                deviation = (signal - df[f'{col}_median_{n}']).abs()
                df[f'{col}_mad_{n}'] = deviation.rolling(**rol_args).median().astype(np.float32).values

            amplitude = df[f'{col}_max_{n}'] - df[f'{col}_min_{n}']
            df[f'{col}_amplit_{n}'] = amplitude
            df[f'{col}_amplit_{n}_min'] = amplitude.rolling(**rol_args).min().astype(np.float32).values

            # Absolute step-to-step change of the rolling max, computed once and
            # reused for both summaries.
            max_change = df[f'{col}_max_{n}'].diff().abs().rolling(**rol_args)
            df[f'{col}_diff_{n}_max'] = max_change.max().astype(np.float32).values
            df[f'{col}_diff_{n}_mean'] = max_change.mean().astype(np.float32).values

            # One collection per window length; a full collection after every
            # single aggregation only added overhead.
            gc.collect()

    df.reset_index(inplace=True)
    df.dropna(inplace=True)

    return df

In [3]:
# Merged training table; read as a module-level global by feat_eng_by_id()
# and by the series-id listing cell.
file = "../data/processed/merged_data.parquet"


In [4]:
def feat_eng_by_id(idx):
    """Load a single series from the merged parquet file and engineer its features.

    Only rows whose ``series_id`` equals ``idx`` are read from the module-level
    ``file`` path. ``event`` is stored as int8 and the frame is then passed to
    ``feat_eng``, whose result is returned.
    """
    # Installed inside the function rather than once at module level —
    # presumably so each joblib worker applies it; confirm.
    import warnings
    warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

    series_frame = pd.read_parquet(file, filters=[('series_id', '=', idx)])
    series_frame['event'] = series_frame['event'].astype(np.int8)

    return feat_eng(series_frame)


In [5]:
# DEV=True switches the notebook to a fast run on a subset of the series.
DEV = False

# Distinct series ids present in the merged training file.
series_id = pd.read_parquet(file, columns=['series_id'])['series_id'].unique()

print(len(series_id))

# In DEV mode keep every 10th series only.
if DEV:
    series_id = series_id[::10]

35


In [6]:
%%time

# Engineer features for each series with 6 joblib workers, then stack the
# per-series frames into one table. `train` is first the list of frames and
# then the concatenated DataFrame; it no longer refers to the raw series.
train = Parallel(n_jobs=6)(delayed(feat_eng_by_id)(i) for i in series_id)
train = pd.concat(train, ignore_index=True)



CPU times: user 9.91 s, sys: 35.8 s, total: 45.7 s
Wall time: 5min 27s


In [7]:
train

Unnamed: 0,timestamp,series_id,step,anglez,enmo,event,hour,lids,anglezdiffabs,enmo_median_60,...,anglezdiffabs_median_3600,anglezdiffabs_mean_3600,anglezdiffabs_max_3600,anglezdiffabs_min_3600,anglezdiffabs_var_3600,anglezdiffabs_mad_3600,anglezdiffabs_amplit_3600,anglezdiffabs_amplit_3600_min,anglezdiffabs_diff_3600_max,anglezdiffabs_diff_3600_mean
0,2018-11-05 10:00:40,08db4255286f,8,-32,48,1,10,14.109544,2.0,58.0,...,1.0,1.855978,10.0,0.0,2.510536,1.0,10.0,10.0,0.0,0.0
1,2018-11-05 10:00:45,08db4255286f,9,-34,46,1,10,14.099998,2.0,58.0,...,1.0,1.856369,10.0,0.0,2.503771,1.0,10.0,10.0,0.0,0.0
2,2018-11-05 10:00:50,08db4255286f,10,-32,79,1,10,14.090455,2.0,58.0,...,1.0,1.856757,10.0,0.0,2.497041,1.0,10.0,10.0,0.0,0.0
3,2018-11-05 10:00:55,08db4255286f,11,-31,71,1,10,14.081097,1.0,51.0,...,1.0,1.851752,10.0,0.0,2.499585,1.0,10.0,10.0,0.0,0.0
4,2018-11-05 10:01:00,08db4255286f,12,-29,58,1,10,14.071933,2.0,58.0,...,1.0,1.852151,10.0,0.0,2.492906,1.0,10.0,10.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13165052,2017-12-02 15:44:05,d5e47b94477e,395809,-18,95,1,15,13.379234,5.0,97.0,...,2.0,2.711590,11.0,0.0,5.097676,1.0,11.0,11.0,0.0,0.0
13165053,2017-12-02 15:44:10,d5e47b94477e,395810,-19,57,1,15,13.356959,1.0,106.0,...,2.0,2.718919,11.0,0.0,5.091511,1.0,11.0,11.0,0.0,0.0
13165054,2017-12-02 15:44:15,d5e47b94477e,395811,-14,66,1,15,13.334811,5.0,113.0,...,2.0,2.707317,11.0,0.0,5.055408,1.0,11.0,11.0,0.0,0.0
13165055,2017-12-02 15:44:20,d5e47b94477e,395812,-19,114,1,15,13.312504,5.0,113.0,...,2.0,2.709239,11.0,0.0,5.067816,1.0,11.0,11.0,0.0,0.0


In [8]:
# Thin the training rows: keep every `step`-th row, i.e. 1 row in 60 for the
# full run and 1 in 400 in DEV mode (far more than halving). The slice runs
# over the concatenated table, so it is not restarted per series.
step=400 if DEV else 60
train = train.iloc[::step]
step

60

In [10]:
# Identifier columns that are never used as model features; also read by get_events().
drop_cols = ['series_id', 'step', 'timestamp']

# Label is `event`; features are everything except the identifiers and the label.
y = train['event']
X = train.drop(columns=[*drop_cols, 'event'])

gc.collect()


584

In [11]:
# X and y have been built, so the feature table can be released on a full run;
# in DEV mode it is kept around.
if not DEV:
    del train
    gc.collect()

In [13]:
class EnsembleAvgProba():
    """Average the outputs of several classifiers fitted on the same data.

    Parameters
    ----------
    classifiers : list
        Estimator objects exposing ``fit(X, y)``, ``predict(X)`` and
        ``predict_proba(X)``. The list is stored as given (not copied or
        cloned), so the passed-in objects themselves are fitted.
    """

    def __init__(self, classifiers):

        self.classifiers = classifiers

    def fit(self, X, y):
        """Fit every member on ``(X, y)`` in list order and return ``self``.

        Returning ``self`` allows ``EnsembleAvgProba(...).fit(X, y)`` to be
        used as an expression; callers that ignored the previous ``None``
        return value are unaffected.
        """
        for classifier in self.classifiers:
            classifier.fit(X, y)
            # Collect after each member before the next one is trained.
            gc.collect()

        return self

    def predict_proba(self, X):
        """Return the element-wise mean of the members' ``predict_proba(X)``.

        All members must return arrays of the same shape, otherwise
        ``np.stack`` raises ``ValueError`` (also raised for an empty list).
        """
        member_probs = np.stack([m.predict_proba(X) for m in self.classifiers])

        return np.mean(member_probs, axis=0)

    def predict(self, X):
        """Return the rounded mean of the members' hard predictions.

        The result is a float array. Rounding is NumPy's round-half-to-even,
        so an exact 0.5 average (a tie between 0 and 1 votes) becomes 0.0.
        NOTE(review): this is a vote over labels, not an argmax of
        ``predict_proba``, and it only makes sense for numeric 0/1 labels —
        confirm that is the intent.
        """
        member_votes = np.stack([m.predict(X) for m in self.classifiers])

        return np.mean(member_votes, axis=0).round()

In [22]:
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
import xgboost as xgb

# Hyper-parameters for the six ensemble members.
# NOTE(review): every member is limited to a single tree (`n_estimators=1`;
# `iterations=2` for CatBoost). This looks like a smoke-test configuration —
# confirm the values before a real training run.
lgb_params1 = {    
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'max_depth': 6,
    'learning_rate': 0.03,
    'n_estimators': 1,
    'subsample_for_bin': 200000,
    'min_child_weight': 0.001,
    'min_child_samples': 20,
    'subsample': 0.9,
    'colsample_bytree': 0.7,  # fraction of feature columns sampled for each tree
    'reg_alpha': 0.05,
    'reg_lambda': 0.05,
}

xgb_params = {
    'n_estimators': 1,
    'objective': "binary:logistic",
    'learning_rate': 0.02,
    'max_depth': 7,
    'subsample': 0.9,
    'colsample_bytree': 0.7,
    'random_state': 42
}

cat_params = {
    'iterations': 2,
    'learning_rate': 0.03,
    'depth': 6,
    'random_state': 42,
    'verbose': 0  # 0 silences CatBoost's training log; raise it to see progress
}

# Members are fitted in this order by EnsembleAvgProba.fit; all use seed 42.
model = EnsembleAvgProba(classifiers=[
    lgb.LGBMClassifier(random_state=42, **lgb_params1),
    GradientBoostingClassifier(n_estimators=1, max_depth=5, min_samples_leaf=300, random_state=42),
    RandomForestClassifier(n_estimators=1, min_samples_leaf=300, random_state=42, n_jobs=-1),
    xgb.XGBClassifier(**xgb_params),
    CatBoostClassifier(**cat_params),
    ExtraTreesClassifier(n_estimators=1, min_samples_leaf=300, random_state=42, n_jobs=-1)
])



In [23]:
# Fit every ensemble member on the thinned training matrix.
model.fit(X, y)


[LightGBM] [Info] Number of positive: 144339, number of negative: 75079
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.205231 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22150
[LightGBM] [Info] Number of data points in the train set: 219418, number of used features: 115
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.657827 -> initscore=0.653624
[LightGBM] [Info] Start training from score 0.653624


In [32]:
def get_events(idx, classifier, file='test_series.parquet') :
    """Predict state changes for one series and return the selected events.

    Parameters
    ----------
    idx : value compared against ``series_id`` in the parquet row filter.
    classifier : object with ``predict(X)`` and ``predict_proba(X)``.
    file : str
        File name inside ``../data/raw/``.

    Returns
    -------
    pandas.DataFrame
        One row per selected event, sorted by ``timestamp``: for each calendar
        date the first 'wakeup' (smoothed prediction going up by 1) and the
        last 'onset' (going down by 1). ``score`` is a centred rolling mean of
        the probability assigned to the predicted class.

    Notes
    -----
    Relies on the module-level ``drop_cols`` list and on ``feat_eng``.
    """
    smooth_rows = 360 + 1       # centred median window, in rows, for the hard predictions
    score_rows = 60 * 12 * 5    # centred mean window, in rows, for the score

    test = pd.read_parquet(f'../data/raw/{file}',
                           filters=[('series_id', '=', idx)])
    test = feat_eng(test)
    X_test = test.drop(columns=drop_cols)
    # Explicit copy: columns are added below, and assigning into a frame
    # obtained by selection triggers pandas' SettingWithCopyWarning.
    test = test[drop_cols].copy()

    preds, probs = classifier.predict(X_test), classifier.predict_proba(X_test)[:, 1]

    test['prediction'] = preds
    # No min_periods given, so rows without a full window get NaN here.
    test['prediction'] = test['prediction'].rolling(smooth_rows, center=True).median()
    test['probability'] = probs

    # Copy again for the same reason: the filtered frame is written to below.
    test = test[test['prediction'] != 2].copy()

    # Turn P(class 1) into the probability of whichever class was predicted.
    predicted_zero = test['prediction'] == 0
    test.loc[predicted_zero, 'probability'] = 1 - test.loc[predicted_zero, 'probability']
    test['score'] = test['probability'].rolling(score_rows, center=True, min_periods=10).mean().bfill().ffill()

    # +1 / -1 between consecutive rows marks a change of the smoothed label.
    test['pred_diff'] = test['prediction'].diff()

    test['event'] = test['pred_diff'].replace({1:'wakeup', -1:'onset', 0:np.nan})

    # The grouping key is taken from the full frame and aligned on the index.
    by_date = test['timestamp'].dt.date
    test_wakeup = test[test['event'] == 'wakeup'].groupby(by_date).agg('first')
    test_onset = test[test['event'] == 'onset'].groupby(by_date).agg('last')
    test = pd.concat([test_wakeup, test_onset], ignore_index=True).sort_values('timestamp')

    return test

In [33]:
# Columns kept from each per-series events frame.
cols_sub = ['series_id','step','event','score']

# `series_id` is rebound here: from the training ids to the ids in the test file.
series_id  = pd.read_parquet('../data/raw/test_series.parquet', columns=['series_id'])
series_id = series_id.series_id.unique()

tests = []

# Run event detection one series at a time with the fitted ensemble.
for idx in series_id: 

    test = get_events(idx, model)
    tests.append(test[cols_sub])

In [35]:
# Stack the per-series events; the fresh 0..n-1 index becomes the `row_id` column.
submission = pd.concat(tests, ignore_index=True).reset_index(names='row_id')


In [36]:
submission

Unnamed: 0,row_id,series_id,step,event,score
