In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet
/kaggle/input/child-mind-institute-detect-sleep-states/sample_submission.csv
/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv
/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet
/kaggle/input/model-gru/model_best.pth


In [2]:
import os
from functools import wraps
import time
import pandas as pd
import numpy as np
from sklearn import preprocessing
import gc
from tqdm.auto import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset

# Cap PyTorch's CPU thread pools (inter-op and intra-op) at 4 threads each.
torch.set_num_interop_threads(4)
torch.set_num_threads(4)

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Bare expression so the notebook shows the selected device as the cell output.
device



'cuda'

In [3]:
def track_time(f):
    """Decorator that reports how long each call to ``f`` takes.

    The wrapped function is called with the caller's arguments unchanged; after
    it returns, one line with the function name and the elapsed wall-clock time
    (seconds, from ``time.time()``) is printed and the original return value is
    handed back. Nothing is printed if ``f`` raises.
    """
    @wraps(f)
    def timed(*args, **kw):
        started = time.time()
        outcome = f(*args, **kw)
        elapsed = time.time() - started
        print('func:%r took: %2.4f sec' % (f.__name__, elapsed))
        return outcome

    return timed

In [4]:
# Raw sensor columns that are min-max scaled in DataParser._transform.
columns_to_scale = ["anglez", "enmo"]
# Single shared scaler instance; it is re-fit on every frame passed to _transform.
scaler = preprocessing.MinMaxScaler()
class DataParser:
    """Read competition files from ``data_dir`` and apply basic cleaning/feature steps.

    Parquet files go through ``_clean`` and ``_transform``; every other
    ``file_type`` is read as CSV and only goes through ``_clean``.
    """

    def __init__(self, data_dir: str = "/kaggle/input/child-mind-institute-detect-sleep-states") -> None:
        # Directory that file names given to `load_data` are resolved against.
        self.data_dir = data_dir

    @track_time
    def _clean(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` without the rows whose ``step`` or ``timestamp`` is missing.

        Raises:
            KeyError: if either the ``step`` or the ``timestamp`` column is absent.
        """
        if "step" not in df.columns or "timestamp" not in df.columns:
            raise KeyError("Missing columns: either `step` or `timestamp` not exist.")
        return df.dropna(subset=["step", "timestamp"])

    @track_time
    def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Cast dtypes and add the derived columns used by the model.

        The frame is modified in place (columns are assigned on ``df``) and the
        same object is returned. Each step runs only when its input columns exist:

        * ``night`` is cast to int16;
        * ``step`` is cast to int32 and ``timestamp`` parsed to UTC datetimes;
        * ``anglez_norm`` / ``enmo_norm`` hold the min-max scaled sensor columns
          (the module-level ``scaler`` is re-fit on this frame);
        * ``hour`` holds the hour of ``timestamp``.
        """
        if "night" in df.columns:
            df["night"] = df["night"].astype(np.int16)

        if "step" in df.columns and "timestamp" in df.columns:
            df["step"] = df["step"].astype(np.int32)
            df["timestamp"] = pd.to_datetime(df["timestamp"], format="%Y-%m-%dT%H:%M:%S%z", utc=True)

        # Every column to scale must be present. The previous condition,
        # `"anglez" and "enmo" in df.columns`, only tested for "enmo" because a
        # non-empty string literal is always truthy.
        if all(column in df.columns for column in columns_to_scale):
            normalized_data = scaler.fit_transform(df[columns_to_scale])
            df['anglez_norm'] = normalized_data[:, 0]
            df['enmo_norm'] = normalized_data[:, 1]

        # Guarded like the other steps so a frame without timestamps is passed through.
        if "timestamp" in df.columns:
            df['hour'] = df['timestamp'].dt.hour

        return df

    def load_data(self, file_name: str, file_type: str) -> pd.DataFrame:
        """Load ``file_name`` from ``data_dir`` and return the processed frame.

        Args:
            file_name: file name relative to ``data_dir``.
            file_type: ``"parquet"`` reads parquet and runs clean + transform;
                any other value reads the file as CSV and runs clean only.

        Raises:
            KeyError: propagated from ``_clean`` when ``step`` or ``timestamp``
                is missing from the file.
        """
        path = os.path.join(self.data_dir, file_name)
        if file_type == "parquet":
            df = pd.read_parquet(path)
            df = self._clean(df)
            df = self._transform(df)
        else:
            df = pd.read_csv(path)
            df = self._clean(df)

        return df

In [5]:
# Read and preprocess the test signals.
parser = DataParser()
test_series = parser.load_data(file_name="test_series.parquet", file_type="parquet")
# Series identifiers in order of first appearance; used later to label predictions.
ids = test_series["series_id"].unique()
gc.collect()

func:'_clean' took: 0.0035 sec
func:'_transform' took: 0.0181 sec


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


0

In [6]:
class ResidualBiGRU(nn.Module):
    """A GRU followed by a two-layer feed-forward head, wrapped in a skip connection.

    The GRU maps ``hidden_size`` features to ``hidden_size`` features per
    direction. Its output is widened by a factor of two (``fc1`` + ``ln1``),
    projected back to ``hidden_size`` (``fc2`` + ``ln2``), with a ReLU after each
    layer norm, and finally added to the block input.

    Args:
        hidden_size: feature width of both the input and the output.
        n_layers: number of stacked GRU layers inside this block.
        bidir: whether the GRU is bidirectional.
    """

    def __init__(self, hidden_size, n_layers=1, bidir=True):
        super().__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # Width of the GRU output: doubled when both directions are concatenated.
        gru_out_width = hidden_size * (2 if bidir else 1)
        expanded_width = gru_out_width * 2

        self.gru = nn.GRU(
            hidden_size,
            hidden_size,
            n_layers,
            batch_first=True,
            bidirectional=bidir,
        )
        self.fc1 = nn.Linear(gru_out_width, expanded_width)
        self.ln1 = nn.LayerNorm(expanded_width)
        self.fc2 = nn.Linear(expanded_width, hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)

    def forward(self, x, h=None):
        """Run the block.

        Args:
            x: batch-first input whose last dimension is ``hidden_size``.
            h: optional initial GRU hidden state, passed straight to ``nn.GRU``.

        Returns:
            ``(out, new_h)``: the residual output (same shape as ``x``) and the
            GRU's final hidden state.
        """
        gru_out, new_h = self.gru(x, h)

        widened = nn.functional.relu(self.ln1(self.fc1(gru_out)))
        projected = nn.functional.relu(self.ln2(self.fc2(widened)))

        # Skip connection around the whole GRU + feed-forward stack.
        return projected + x, new_h

class MultiResidualBiGRU(nn.Module):
    """Two parallel 1-D convolutions, a stack of ``ResidualBiGRU`` blocks and a linear head.

    The raw input channels are concatenated with the outputs of two convolution
    branches (kernel sizes ``kernel_size`` and 5), each producing
    ``(hidden_size - input_size) // 2`` channels. The concatenation is fed to
    GRU blocks built for ``hidden_size`` features, so
    ``input_size + 2 * ((hidden_size - input_size) // 2)`` has to equal
    ``hidden_size``.

    Args:
        input_size: number of input features per time step.
        hidden_size: feature width used by the residual GRU blocks.
        out_size: number of output values per time step.
        n_layers: number of ``ResidualBiGRU`` blocks.
        kernel_size: kernel size of the first convolution branch.
        bidir: whether the GRU blocks are bidirectional.
    """

    def __init__(self, input_size, hidden_size, out_size, n_layers, kernel_size=3, bidir=True):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.out_size = out_size
        self.n_layers = n_layers

        # Channels contributed by each convolution branch.
        branch_channels = (hidden_size - input_size) // 2

        self.conv1d = nn.Conv1d(
            in_channels=input_size,
            out_channels=branch_channels,
            kernel_size=kernel_size,
            padding=kernel_size // 2,
        )
        self.conv_gn = nn.GroupNorm(1, branch_channels)

        self.conv1d_5 = nn.Conv1d(
            in_channels=input_size,
            out_channels=branch_channels,
            kernel_size=5,
            padding=2,
        )
        self.conv1d_gn_5 = nn.GroupNorm(1, branch_channels)

        gru_blocks = [
            ResidualBiGRU(hidden_size, n_layers=1, bidir=bidir)
            for _ in range(n_layers)
        ]
        self.res_bigrus = nn.ModuleList(gru_blocks)

        self.fc_out = nn.Linear(hidden_size, out_size)

    def forward(self, x, h=None):
        """Run the network.

        Args:
            x: batch-first input whose last dimension is ``input_size``.
            h: optional sequence with one hidden state (or ``None``) per GRU block.

        Returns:
            ``(out, new_h)``: per-time-step outputs with ``out_size`` features and
            the list of final hidden states, one entry per GRU block.
        """
        # Conv1d expects channels before time, so swap the last two axes.
        channels_first = x.transpose(1, 2)

        branch_k = nn.functional.relu(self.conv_gn(self.conv1d(channels_first)))
        branch_5 = nn.functional.relu(self.conv1d_gn_5(self.conv1d_5(channels_first)))

        stacked = torch.cat((channels_first, branch_k, branch_5), dim=1)
        features = stacked.transpose(1, 2)

        states_in = [None for _ in range(self.n_layers)] if h is None else h

        states_out = []
        for idx, block in enumerate(self.res_bigrus):
            features, block_state = block(features, states_in[idx])
            states_out.append(block_state)

        return self.fc_out(features), states_out

In [7]:
# Number of consecutive raw samples that are pooled into one model time step.
SAMPLE_FREQ = 12


class SleepDataset(Dataset):
    """One item per series: pooled-window features built from the preprocessed signals.

    ``self.data`` holds one DataFrame per requested series id; ``__getitem__``
    turns the ``anglez_norm``, ``enmo_norm`` and ``hour`` columns of that frame
    into an 8-column feature tensor.
    """

    def __init__(
        self,
        series_ids,
        series,
    ):
        """Split ``series`` into one frame per id in ``series_ids`` (order preserved)."""
        flat = series.reset_index()
        self.data = []

        for viz_id in tqdm(series_ids):
            rows = flat.loc[flat["series_id"] == viz_id]
            self.data.append(rows.copy().reset_index())

    def downsample_seq_generate_features(self, feat, downsample_factor=SAMPLE_FREQ, std_only=False, is_hour=False):
        """Pool a 1-D signal into windows of ``downsample_factor`` samples.

        If the length is not a multiple of ``downsample_factor`` the signal is
        padded at the end by repeating its last value.

        Args:
            feat: 1-D array of samples.
            downsample_factor: window length.
            std_only: return only the per-window standard deviation.
            is_hour: treat the values as hours and return sin/cos of the
                per-window maximum on a 24-hour circle; takes precedence over
                ``std_only``.

        Returns:
            2-D array with one row per window: 2 columns for ``is_hour``,
            1 column for ``std_only``, otherwise
            ``[mean, std, median, max, min]``.
        """
        remainder = len(feat) % downsample_factor
        if remainder != 0:
            tail = np.zeros(downsample_factor - remainder) + feat[-1]
            feat = np.concatenate([feat, tail])

        windows = np.reshape(feat, (-1, downsample_factor))

        if is_hour:
            angle = np.max(windows, 1) * (2 * np.pi / 24)
            return np.stack([np.sin(angle), np.cos(angle)], axis=1)

        window_std = np.std(windows, 1)
        if std_only:
            return np.stack([window_std], axis=1)

        stats = [
            np.mean(windows, 1),
            window_std,
            np.median(windows, 1),
            np.max(windows, 1),
            np.min(windows, 1),
        ]
        return np.stack(stats, axis=1)

    def __len__(self):
        """Number of series held by the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the feature tensor of series ``index``: 1 anglez + 5 enmo + 2 hour columns."""
        signals = self.data[index][["anglez_norm", "enmo_norm", "hour"]].values

        anglez_feats = self.downsample_seq_generate_features(signals[:, 0], SAMPLE_FREQ, std_only=True)
        enmo_feats = self.downsample_seq_generate_features(signals[:, 1], SAMPLE_FREQ)
        hour_feats = self.downsample_seq_generate_features(signals[:, 2], SAMPLE_FREQ, is_hour=True)

        combined = np.concatenate([anglez_feats, enmo_feats, hour_feats], -1)
        return torch.from_numpy(combined)
    
# Build the per-series dataset, then release the flat frame it was split from.
test_ds = SleepDataset(series_ids=test_series["series_id"].unique(), series=test_series)
del test_series
gc.collect()

  0%|          | 0/3 [00:00<?, ?it/s]

47

In [8]:
# Largest number of time steps passed to the model in a single forward call
# (24 * 60 * 60); longer sequences are processed chunk by chunk.
max_chunk_size = 86_400

In [9]:
def filter_close_candidates(onset_candidates, wakeup_candidates, min_interval=30):
    """Drop every candidate that lies too close to a candidate of the other kind.

    An onset is kept only if its distance to every wakeup candidate is at least
    ``min_interval``; wakeups are filtered against the onsets in the same way.
    Both filters look at the original (unfiltered) candidates of the other kind.

    Args:
        onset_candidates: iterable of onset positions.
        wakeup_candidates: iterable of wakeup positions.
        min_interval: minimum allowed distance between an onset and a wakeup.

    Returns:
        ``(kept_onsets, kept_wakeups)`` as two Python lists, each in input order.
    """
    def is_isolated(position, opposite_positions):
        # True when no opposite-kind candidate is nearer than `min_interval`.
        return all(abs(position - other) >= min_interval for other in opposite_positions)

    unique_onsets = set(onset_candidates)
    unique_wakeups = set(wakeup_candidates)

    kept_onsets = [pos for pos in onset_candidates if is_isolated(pos, unique_wakeups)]
    kept_wakeups = [pos for pos in wakeup_candidates if is_isolated(pos, unique_onsets)]

    return kept_onsets, kept_wakeups

In [10]:
# --- Model setup ---------------------------------------------------------------
# input_size=8 matches SleepDataset features: 1 anglez std + 5 enmo stats + 2 hour sin/cos.
model = MultiResidualBiGRU(input_size=8, hidden_size=64, out_size=2, n_layers=5).to(device)
# NOTE(review): the directory listing printed by the first cell shows
# `/kaggle/input/model-gru/model_best.pth` — confirm this checkpoint path.
model.load_state_dict(torch.load('/kaggle/input/model-gru/gru-model_best.pth', map_location=device))
model.eval()  # inference only

# Half-width of the local-maximum window, in downsampled steps.
min_interval = 30

# --- Per-series prediction and event extraction ----------------------------------
per_series_events = []
for i in range(len(test_ds)):
    X = test_ds[i].unsqueeze(0)  # add the batch dimension
    seq_len = X.shape[1]
    X = X.to(device)
    h = None
    pred = torch.zeros((seq_len, 2)).half()

    # Feed the sequence chunk by chunk, carrying the hidden states across chunks.
    with torch.no_grad():
        for j in range(0, seq_len, max_chunk_size):
            y_pred, h = model(X[:, j: j + max_chunk_size, :].float(), h)
            pred[j: j + max_chunk_size] = y_pred[0]
            del y_pred

    pred = pred.cpu().numpy()
    series_id = ids[i]

    # Length of the series in days; 17280 raw steps per day is assumed — TODO confirm.
    days = len(pred) / (17280 / SAMPLE_FREQ)

    # A position keeps its prediction as score only when it is the maximum of the
    # window [index - min_interval, index + min_interval); all others stay 0.
    # Column 0 is used for onsets, column 1 for wakeups.
    scores0 = np.zeros(len(pred), dtype=np.float16)
    scores1 = np.zeros(len(pred), dtype=np.float16)
    for index in range(len(pred)):
        window_max = pred[max(0, index - min_interval): index + min_interval].max(axis=0)
        if pred[index, 0] == window_max[0]:
            scores0[index] = window_max[0]
        if pred[index, 1] == window_max[1]:
            scores1[index] = window_max[1]

    # Keep the top-scoring positions: one per day, at least one per series.
    n_candidates = max(1, round(days))
    candidates_onset = np.argsort(scores0)[-n_candidates:]
    candidates_wakeup = np.argsort(scores1)[-n_candidates:]

    candidates_onset, candidates_wakeup = filter_close_candidates(candidates_onset, candidates_wakeup, min_interval=30)
    # filter_close_candidates returns Python lists. Convert them back to integer
    # arrays: `list * SAMPLE_FREQ` would repeat the list instead of scaling the
    # indices, and a list cannot be used to fancy-index the score arrays reliably.
    candidates_onset = np.asarray(candidates_onset, dtype=np.int64)
    candidates_wakeup = np.asarray(candidates_wakeup, dtype=np.int64)

    # Map downsampled positions back to raw rows, clipped to the series length.
    n_rows = len(test_ds.data[i])
    step_column = test_ds.data[i][['step']]

    onset = step_column.iloc[np.clip(candidates_onset * SAMPLE_FREQ, 0, n_rows - 1)].astype(np.int32)
    onset['event'] = 'onset'
    onset['series_id'] = series_id
    onset['score'] = scores0[candidates_onset]

    wakeup = step_column.iloc[np.clip(candidates_wakeup * SAMPLE_FREQ, 0, n_rows - 1)].astype(np.int32)
    wakeup['event'] = 'wakeup'
    wakeup['series_id'] = series_id
    wakeup['score'] = scores1[candidates_wakeup]

    per_series_events.extend([onset, wakeup])
    del onset, wakeup, candidates_onset, candidates_wakeup, scores0, scores1, pred, series_id, step_column
    gc.collect()

# --- Assemble and write the submission ----------------------------------------------
# Concatenate once instead of growing the frame inside the loop.
if per_series_events:
    submission = pd.concat(per_series_events, axis=0)
else:
    submission = pd.DataFrame(columns=['step', 'event', 'series_id', 'score'])

submission = submission.sort_values(['series_id', 'step']).reset_index(drop=True)
submission['row_id'] = submission.index.astype(int)
submission['score'] = submission['score'].fillna(submission['score'].mean())
submission = submission[['row_id', 'series_id', 'step', 'event', 'score']]
submission.to_csv('submission.csv', index=False)

In [11]:
# Display the assembled submission table as the cell output.
submission

Unnamed: 0,row_id,series_id,step,event,score
