In [None]:
import pandas as pd
import numpy as np
import time
import json
from datetime import datetime
import joblib
import random
import math
import pyarrow as pa 
import ctypes
from tqdm.auto import tqdm 
from scipy.interpolate import interp1d
from math import pi, sqrt, exp
import sklearn,sklearn.model_selection
import torch
from torch import nn,Tensor
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler
from sklearn.metrics import average_precision_score
from timm.scheduler import CosineLRScheduler
from pyarrow.parquet import ParquetFile
import gc
import matplotlib.pyplot as plt
plt.style.use("ggplot")

In [None]:
# Root folder of the competition input data and the test-series parquet file inside it.
MAIN_DIR = "/kaggle/input/child-mind-institute-detect-sleep-states/"
TEST_SERIES = f"{MAIN_DIR}test_series.parquet"

In [None]:
class data_reader:
    """Load an input table by name, drop rows without a timestamp and shrink its dtypes.

    ``names_mapping`` maps each supported table name to its file path and to a
    flag telling whether the file is parquet (anything else is read as CSV).
    """

    def __init__(self):
        super().__init__()
        self.names_mapping = {
            "test_series" : {"path" : TEST_SERIES, "is_parquet" : True, "has_timestamp" : True}
        }
        self.valid_names = ["test_series"]

    def cleaning(self, data):
        """Drop the rows whose ``timestamp`` is missing and print how many were removed.

        Args:
            data: DataFrame with a ``timestamp`` column.

        Returns:
            A new DataFrame without the rows that had a missing timestamp.
        """
        before_cleaning = len(data)
        print("Number of missing timestamps : ", int(data["timestamp"].isna().sum()))
        data = data.dropna(subset=["timestamp"])
        after_cleaning = len(data)
        # An empty input would otherwise divide by zero in the percentage below.
        removed_pct = 100 * (before_cleaning - after_cleaning) / before_cleaning if before_cleaning else 0.0
        print("Percentage of removed steps : {:.1f}%".format(removed_pct))
        return data

    @staticmethod
    def reduce_memory_usage(data):
        """Downcast the columns of ``data`` in place to smaller dtypes and return it.

        * object columns become ``category``;
        * signed / unsigned integer columns become the smallest integer type of
          the same signedness that holds their min and max, so values are exact;
        * float columns become float16 or float32 when their range fits,
          float64 otherwise (float16 keeps only about 3 significant digits);
        * every other dtype (bool, datetime, categorical, pandas extension
          dtypes) is left untouched.

        Args:
            data: DataFrame to optimise; its columns are reassigned in place.

        Returns:
            The same DataFrame object.
        """
        start_mem = data.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

        signed_types = (np.int8, np.int16, np.int32, np.int64)
        unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

        for col in data.columns:
            col_type = data[col].dtype

            if col_type == object:
                data[col] = data[col].astype('category')
                continue

            # Only plain numpy numeric columns can be range-checked safely;
            # min/max comparisons on datetimes or categoricals would raise.
            if not isinstance(col_type, np.dtype) or col_type.kind not in "iuf":
                continue

            if col_type.kind in "iu":
                if len(data[col]) == 0:
                    continue
                # Python ints compare exactly against the iinfo bounds.
                c_min = int(data[col].min())
                c_max = int(data[col].max())
                # Unsigned columns (e.g. a uint32 step counter) stay integers:
                # routing them through float16/float32 would round large values.
                candidates = signed_types if col_type.kind == "i" else unsigned_types
                for candidate in candidates:
                    bounds = np.iinfo(candidate)
                    if bounds.min <= c_min and c_max <= bounds.max:
                        data[col] = data[col].astype(candidate)
                        break
            else:
                c_min = data[col].min()
                c_max = data[col].max()
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    data[col] = data[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    data[col] = data[col].astype(np.float32)
                else:
                    data[col] = data[col].astype(np.float64)

        end_mem = data.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # Guard the ratio: a frame reporting zero bytes would divide by zero.
        decrease_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Decreased by {:.1f}%'.format(decrease_pct))
        return data

    def load_data(self, data_name):
        """Read the table registered under ``data_name``, clean it and shrink its dtypes.

        Args:
            data_name: Key of ``names_mapping`` (see ``valid_names``).

        Returns:
            The cleaned, memory-optimised DataFrame.

        Raises:
            KeyError: If ``data_name`` is not a registered table name.
        """
        if data_name not in self.names_mapping:
            raise KeyError(
                "Unknown data name {!r}; expected one of {}".format(data_name, self.valid_names)
            )
        data_props = self.names_mapping[data_name]
        if data_props["is_parquet"]:
            data = pd.read_parquet(data_props["path"])
        else:
            data = pd.read_csv(data_props["path"])

        gc.collect()
        print('cleaning')
        data = self.cleaning(data)
        gc.collect()
        data = self.reduce_memory_usage(data)
        return data

In [None]:
# Read, clean and downcast the test series, then list the distinct series ids
# in their order of first appearance.
reader = data_reader()
test_series = reader.load_data("test_series")
ids = test_series["series_id"].unique()
gc.collect()

In [None]:
class ResidualBiGRU(nn.Module):
    """GRU followed by a two-layer feed-forward head and a residual connection.

    The GRU maps ``hidden_size`` features to ``hidden_size`` per direction.
    The head widens that output by a factor of two, then projects it back to
    ``hidden_size`` so it can be added to the block input.
    """

    def __init__(self, hidden_size, n_layers=1, bidir=True):
        super().__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # Width of the GRU output: doubled when both directions are concatenated.
        gru_width = hidden_size * (2 if bidir else 1)
        wide_width = gru_width * 2

        self.gru = nn.GRU(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=bidir,
        )
        self.fc1 = nn.Linear(gru_width, wide_width)
        self.ln1 = nn.LayerNorm(wide_width)
        self.fc2 = nn.Linear(wide_width, hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)

    def forward(self, x, h=None):
        """Run the block on ``x``, optionally starting from GRU hidden state ``h``.

        Returns:
            Tuple ``(output, new_h)``: ``output`` has the same shape as ``x``
            and ``new_h`` is the hidden state returned by the GRU.
        """
        gru_out, new_h = self.gru(x, h)

        # Linear -> LayerNorm -> ReLU, applied twice.
        hidden = F.relu(self.ln1(self.fc1(gru_out)))
        hidden = F.relu(self.ln2(self.fc2(hidden)))

        # Residual (skip) connection around the whole block.
        return hidden + x, new_h

In [None]:
class MultiResidualBiGRU(nn.Module):
    """Stack of ``n_layers`` ResidualBiGRU blocks between an input and an output projection.

    The input is projected from ``input_size`` to ``hidden_size`` (Linear ->
    LayerNorm -> ReLU), passed through the residual blocks in order, then
    projected to ``out_size`` by a final linear layer.
    """

    def __init__(self, input_size, hidden_size, out_size, n_layers, bidir=True):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.out_size = out_size
        self.n_layers = n_layers

        self.fc_in = nn.Linear(input_size, hidden_size)
        self.ln = nn.LayerNorm(hidden_size)
        blocks = [
            ResidualBiGRU(hidden_size, n_layers=1, bidir=bidir)
            for _ in range(n_layers)
        ]
        self.res_bigrus = nn.ModuleList(blocks)
        self.fc_out = nn.Linear(hidden_size, out_size)

    def forward(self, x, h=None):
        """Run the network on ``x``.

        Args:
            x: Input features whose last dimension is ``input_size``.
            h: Optional list with one hidden state per residual block; ``None``
               starts every block from its default initial state.

        Returns:
            Tuple ``(y, new_h)``: ``y`` is the raw output of ``fc_out`` and
            ``new_h`` is the list of hidden states returned by the blocks.
        """
        # No carried-over state: let every block start from scratch.
        if h is None:
            h = [None] * self.n_layers

        x = nn.functional.relu(self.ln(self.fc_in(x)))

        new_h = []
        for idx, block in enumerate(self.res_bigrus):
            x, block_state = block(x, h[idx])
            new_h.append(block_state)

        return self.fc_out(x), new_h

In [None]:
# Constants used by the inference loop.
SAMPLE_FREQ = 12
# Maximum number of sequence positions fed to the model in one forward call.
max_chunk_size = 12 * 60 * 24
# Half-width, in sequence positions, of the window used for the local-maximum search.
min_interval = 30

In [None]:
class CHIDataset(Dataset):
    """Dataset that yields one multi-scale feature tensor per series id.

    ``data[k]`` holds the rows of ``series`` whose ``series_id`` equals
    ``series_ids[k]``, with a fresh positional index.
    """

    def __init__(self, series_ids, series):
        self.series_ids = series_ids
        self.series = series.reset_index()

        # One independent, re-indexed frame per series id.
        self.data = [
            series.loc[series.series_id == viz_id].copy().reset_index()
            for viz_id in tqdm(series_ids)
        ]

    def downsample_seq_generate_features(self, feat, window_size):
        """Summarise ``feat`` over consecutive windows of ``window_size`` samples.

        When the length is not a multiple of ``window_size`` the sequence is
        first extended by repeating its last value.

        Returns:
            Array of shape (n_windows, 2) with the mean and the standard
            deviation of each window.
        """
        remainder = len(feat) % window_size
        if remainder != 0:
            tail = np.zeros(window_size - remainder) + feat[-1]
            feat = np.concatenate([feat, tail])

        windows = np.reshape(feat, (-1, window_size))
        window_mean = np.mean(windows, axis=1)
        window_std = np.std(windows, axis=1)
        return np.vstack([window_mean, window_std]).T

    def __len__(self):
        """Number of series held by the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Build the feature tensor of series number ``index``.

        For each window size in (12, 360, 720) and each signal (|anglez|, enmo)
        the per-window mean and std are computed. The coarser blocks are
        zero-padded at the end up to the length of the longest block, and all
        blocks are joined column-wise, giving 12 columns.
        """
        frame = self.data[index]
        signals = frame[['anglez', 'enmo']].values.astype(np.float32)

        # Only the magnitude of the anglez signal is used.
        signals[:, 0] = np.abs(signals[:, 0])

        # Window sizes in the outer loop, signals in the inner loop.
        blocks = [
            self.downsample_seq_generate_features(signals[:, channel], window_size)
            for window_size in [12, 360, 720]
            for channel in range(signals.shape[1])
        ]
        max_len = max(0, *(block.shape[0] for block in blocks))

        # Bring every block to the same number of rows with trailing zeros.
        aligned = []
        for block in blocks:
            missing = max_len - block.shape[0]
            if missing > 0:
                filler = np.zeros((missing, block.shape[1]))
                block = np.vstack((block, filler))
            aligned.append(block)

        return torch.from_numpy(np.concatenate(aligned, axis=1))

In [None]:
# Create the dataset for inference.
# The dataset class defined in this notebook is CHIDataset; the previous name
# (SleepDataset) is not defined anywhere here and raised a NameError.
test_ds = CHIDataset(test_series.series_id.unique(), test_series)
# Clear memory
gc.collect()

In [None]:
# `device` was used below without ever being defined in this notebook:
# run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# input_size=12 matches the 12 feature columns built by CHIDataset.__getitem__
# (3 window sizes x 2 signals x mean/std).
model = MultiResidualBiGRU(input_size=12, hidden_size=64, out_size=2, n_layers=5).to(device).eval()
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a trusted source (consider weights_only=True if the
# installed torch version supports it).
model.load_state_dict(torch.load('/kaggle/input/chi-train-residual-bi-gru/model_best.pth', map_location=device))

In [None]:
def _local_peak_scores(values, half_window):
    """Keep ``values[i]`` where it is the maximum of its neighbourhood, 0 elsewhere.

    The neighbourhood of position ``i`` is ``values[max(0, i - half_window) :
    i + half_window]`` (``half_window`` positions before, ``half_window - 1``
    after). ``half_window`` must be at least 1.

    Returns:
        float16 array of the same length as ``values``.
    """
    if len(values) == 0:
        return np.zeros(0, dtype=np.float16)
    # Pad with -inf so that windows clipped at either end ignore the padding.
    padded = np.pad(
        values.astype(np.float32),
        (half_window, half_window - 1),
        constant_values=-np.inf,
    )
    # Window k of the padded array covers original positions [k - half_window, k + half_window).
    window_max = np.lib.stride_tricks.sliding_window_view(padded, 2 * half_window).max(axis=1)
    return np.where(values == window_max, values, 0).astype(np.float16)


def _events_frame(series_frame, positions, scores, event, series_id):
    """Build the submission rows of one event type for one series.

    ``positions`` index the downsampled sequence; they are mapped back to rows
    of ``series_frame`` by multiplying by SAMPLE_FREQ (clipped to the last row).
    """
    rows = np.clip(positions * SAMPLE_FREQ, 0, len(series_frame) - 1)
    events = series_frame[['step']].iloc[rows].astype(np.int32)
    events['event'] = event
    events['series_id'] = series_id
    events['score'] = scores[positions]
    return events


# 17280 (= 24 * 60 * 12) raw steps are counted as one day; after downsampling by
# SAMPLE_FREQ (which must equal the finest window used by CHIDataset) this is
# the number of model positions per day.
positions_per_day = 17280 / SAMPLE_FREQ
# Use the device the model actually lives on, so inputs always match it.
infer_device = next(model.parameters()).device

# Collect the per-series frames and concatenate once at the end instead of
# growing `submission` inside the loop (which re-copies all rows every time).
event_frames = []

for i in range(len(test_ds)):
    X = test_ds[i].half().to(infer_device)
    seq_len = X.shape[0]
    pred = torch.zeros((seq_len, 2), dtype=torch.float16, device=infer_device)

    # Feed the sequence chunk by chunk, carrying the hidden state across chunks.
    # no_grad: inference only, so no autograd graph is built or kept alive.
    h = None
    with torch.no_grad():
        for j in range(0, seq_len, max_chunk_size):
            y_pred, h = model(X[j: j + max_chunk_size].float(), h)
            pred[j: j + max_chunk_size] = y_pred
    del h, X

    # Move pred back to CPU for numpy operations
    pred = pred.cpu().numpy()

    series_id = test_ds.series_ids[i]
    series_frame = test_ds.data[i]

    # One onset and one wakeup candidate per day of data, at least one of each.
    n_events = max(1, round(len(pred) / positions_per_day))

    # Column 0 scores onsets, column 1 scores wakeups.
    scores0 = _local_peak_scores(pred[:, 0], min_interval)
    scores1 = _local_peak_scores(pred[:, 1], min_interval)

    candidates_onset = np.argsort(scores0)[-n_events:]
    candidates_wakeup = np.argsort(scores1)[-n_events:]

    event_frames.append(_events_frame(series_frame, candidates_onset, scores0, 'onset', series_id))
    event_frames.append(_events_frame(series_frame, candidates_wakeup, scores1, 'wakeup', series_id))

    # Clean up
    del candidates_onset, candidates_wakeup, scores0, scores1, pred, series_id, series_frame
    gc.collect()

if event_frames:
    submission = pd.concat(event_frames, axis=0)
else:
    # Keep the expected columns so the following cell still works with no series.
    submission = pd.DataFrame(columns=['step', 'event', 'series_id', 'score'])

In [None]:
# Order the events by series and step, number the rows, replace missing scores
# with the mean score, and write the file in the expected column order.
submission = submission.sort_values(['series_id', 'step']).reset_index(drop=True)
submission = submission.assign(
    row_id=submission.index.astype(int),
    score=submission['score'].fillna(submission['score'].mean()),
)[['row_id', 'series_id', 'step', 'event', 'score']]
submission.to_csv('submission.csv', index=False)
submission