In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for input_path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/testmodel/transformer_epoch_1.pth
/kaggle/input/testmodel/model_best.pth
/kaggle/input/testmodel/model_train_1.pth
/kaggle/input/testmodel/model_valid_5.pth
/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet
/kaggle/input/child-mind-institute-detect-sleep-states/sample_submission.csv
/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv
/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet


In [2]:
import pandas as pd
import numpy as np
import gc
import math
import torch
from torch import nn
from torch.utils.data import Dataset
from pyarrow.parquet import ParquetFile
import pyarrow as pa
# Use the GPU when torch reports one as available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'



In [3]:
class PATHS:
    """File locations of the competition inputs, all rooted at ``MAIN_DIR``."""

    MAIN_DIR = "/kaggle/input/child-mind-institute-detect-sleep-states/"
    SUBMISSION = f"{MAIN_DIR}sample_submission.csv"
    TRAIN_EVENTS = f"{MAIN_DIR}train_events.csv"
    TRAIN_SERIES = f"{MAIN_DIR}train_series.parquet"
    TEST_SERIES = f"{MAIN_DIR}test_series.parquet"
class CFG:
    """Notebook-level configuration flags."""
    # Demo-mode switch. NOTE(review): the reader created further down is built with an
    # explicit demo_mode=False and does not read this value - confirm which is intended.
    DEMO_MODE = True
class data_reader:
    """Load the competition CSV / parquet files as memory-reduced DataFrames.

    Each known dataset name maps to a path plus two flags: whether the file is
    parquet and whether it has a ``timestamp`` column that should be cleaned.
    """

    # Number of rows read from a file when demo mode is enabled.
    DEMO_ROWS = 20_000

    def __init__(self, demo_mode):
        """Create a reader.

        Args:
            demo_mode: when truthy, only the first ``DEMO_ROWS`` rows of a file
                are loaded; otherwise the whole file is read.
        """
        self.names_mapping = {
            "submission" : {"path" : PATHS.SUBMISSION, "is_parquet" : False, "has_timestamp" : False},
            "train_events" : {"path" : PATHS.TRAIN_EVENTS, "is_parquet" : False, "has_timestamp" : True},
            "train_series" : {"path" : PATHS.TRAIN_SERIES, "is_parquet" : True, "has_timestamp" : True},
            "test_series" : {"path" : PATHS.TEST_SERIES, "is_parquet" : True, "has_timestamp" : True}
        }
        # Derived from the mapping so the two can never drift apart.
        self.valid_names = list(self.names_mapping)
        self.demo_mode = demo_mode

    def verify(self, data_name):
        """Check that ``data_name`` is a known dataset name.

        Raises:
            KeyError: if ``data_name`` is not one of ``self.valid_names``. The
                lookup in ``load_data`` raised a bare KeyError for such names
                anyway; raising here attaches the list of valid names.
        """
        if data_name not in self.valid_names:
            raise KeyError(
                f"PLEASE ENTER A VALID DATASET NAME, VALID NAMES ARE : {self.valid_names}"
            )

    def cleaning(self, data):
        """Return ``data`` without the rows whose ``timestamp`` is missing."""
        data = data.dropna(subset=["timestamp"])
        return data

    @staticmethod
    def _smallest_int_dtype(c_min, c_max, unsigned):
        """Return the narrowest (un)signed integer type holding [c_min, c_max], or None."""
        if unsigned:
            candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
        else:
            candidates = (np.int8, np.int16, np.int32, np.int64)
        low, high = int(c_min), int(c_max)
        for candidate in candidates:
            limits = np.iinfo(candidate)
            # Inclusive bounds: a value equal to the type's min/max still fits.
            if limits.min <= low and high <= limits.max:
                return candidate
        return None

    @staticmethod
    def reduce_memory_usage(data):
        """Downcast the columns of ``data`` in place and return it.

        * object columns become ``category``;
        * signed / unsigned integer columns get the narrowest integer type of
          the same signedness that holds their min and max;
        * float columns get float16, float32 or float64 depending on range;
        * every other dtype (bool, datetime, category, pandas extension
          dtypes) is left untouched.

        Unsigned integers are handled as integers; routing them through the
        float branch could turn them into float16, which cannot represent
        every integer above 2048.
        """
        for col in data.columns:
            col_type = data[col].dtype
            if col_type == object:
                data[col] = data[col].astype('category')
                continue
            if not isinstance(col_type, np.dtype):
                # pandas extension dtypes: no safe numpy downcast to apply.
                continue

            if col_type.kind in "iu":
                c_min = data[col].min()
                c_max = data[col].max()
                if pd.isna(c_min) or pd.isna(c_max):
                    # Empty column: nothing to measure, keep the dtype.
                    continue
                target = data_reader._smallest_int_dtype(c_min, c_max, col_type.kind == "u")
                if target is not None:
                    data[col] = data[col].astype(target)
            elif col_type.kind == "f":
                c_min = data[col].min()
                c_max = data[col].max()
                # NOTE(review): float16 keeps only about three significant decimal
                # digits; the range-based choice is kept exactly as before so the
                # values fed downstream do not change - confirm this is acceptable.
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    data[col] = data[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    data[col] = data[col].astype(np.float32)
                else:
                    data[col] = data[col].astype(np.float64)

        return data

    def load_data(self, data_name):
        """Load one dataset by name, clean it and reduce its memory footprint.

        Args:
            data_name: one of ``self.valid_names``.

        Returns:
            The loaded DataFrame. Rows with a missing ``timestamp`` are dropped
            for datasets flagged ``has_timestamp``; dtypes are downcast by
            ``reduce_memory_usage``.

        Raises:
            KeyError: if ``data_name`` is not a known dataset name.
        """
        self.verify(data_name)
        data_props = self.names_mapping[data_name]
        path = data_props["path"]
        if data_props["is_parquet"]:
            if self.demo_mode:
                # Read only the first record batch instead of the whole file.
                parquet_file = ParquetFile(path)
                first_batch = next(parquet_file.iter_batches(batch_size=self.DEMO_ROWS))
                data = pa.Table.from_batches([first_batch]).to_pandas()
            else:
                data = pd.read_parquet(path)
        else:
            # nrows=None makes read_csv read the complete file.
            data = pd.read_csv(path, nrows=self.DEMO_ROWS if self.demo_mode else None)

        gc.collect()
        if data_props["has_timestamp"]:
            data = self.cleaning(data)
            gc.collect()
        data = self.reduce_memory_usage(data)
        return data

In [4]:
# Read the complete test series (demo mode explicitly off; CFG.DEMO_MODE is not
# consulted here) and keep the distinct series ids in order of first appearance.
reader = data_reader(demo_mode=False)
test_series = reader.load_data("test_series")
ids = test_series["series_id"].unique()

In [5]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    The table is stored as a buffer ``pe`` of shape ``(max_len, 1, d_model)``.
    ``forward`` slices it by ``x.size(0)``, so positions run along dim 0 of the
    input and the table broadcasts over dim 1 of a 3-D input.
    NOTE(review): for a 2-D ``(seq_len, d_model)`` input the same addition
    broadcasts to ``(seq_len, seq_len, d_model)`` - confirm callers rely on that.

    Args:
        d_model: size of the last (feature) dimension. Odd values are supported.
        dropout: dropout probability applied after the addition.
        max_len: number of positions in the table.
    """

    def __init__(self, d_model, dropout=0.3, max_len=24*60):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns, so
        # trim the angles; for even d_model this slice keeps every column.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model)
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class Transformer(nn.Module):
    """Linear projection -> LayerNorm -> ReLU -> positional encoding -> encoder stack -> linear head.

    Args:
        input_size: number of input features per position.
        hidden_size: width of the projection and of the encoder layers.
        out_size: number of outputs per position.
        n_layers: number of stacked encoder layers (each built with 2 heads).
    """

    def __init__(self, input_size, hidden_size, out_size, n_layers):
        super().__init__()
        self.input_size, self.hidden_size = input_size, hidden_size
        self.out_size, self.n_layers = out_size, n_layers

        self.fc_in = nn.Linear(input_size, hidden_size)
        self.ln = nn.LayerNorm(hidden_size)

        layer = nn.TransformerEncoderLayer(d_model=hidden_size, nhead=2)
        # Only the attention sub-module's flag is switched; the layer itself is
        # built with its default batch_first. Kept exactly as in the original.
        # NOTE(review): confirm this mixed setting is what the checkpoint was trained with.
        layer.self_attn.batch_first = True

        self.transformer_encoder = nn.TransformerEncoder(layer, num_layers=n_layers)

        self.fc_out = nn.Linear(hidden_size, out_size)
        self.pos_encoder = PositionalEncoding(hidden_size)

    def forward(self, x):
        """Run the full pipeline on ``x`` and return the output of ``fc_out``."""
        projected = nn.functional.relu(self.ln(self.fc_in(x)))
        encoded = self.transformer_encoder(self.pos_encoder(projected))
        return self.fc_out(encoded)

In [6]:
class SleepDataset(Dataset):
    """Dataset yielding one downsampled feature matrix per series.

    Item ``index`` is built from ``/kaggle/working/{index}.csv``; the object
    passed to the constructor is only used for ``len()``.
    """

    def __init__(
        self, series_id
    ):
        self.data = series_id

    def downsample_seq_generate_features(self, feat, target, downsample_factor = 12):
        """Collapse a 1-D signal into one feature row per ``downsample_factor`` samples.

        ``target == 0`` (anglez): the signal is first replaced by the absolute
        value of its first difference, then summarised per window as
        mean, std, median, max, min (5 columns).
        ``target == 1`` (enmo): each window is summed and mapped to
        ``100 / (sum + 1)`` (1 column).
        Any other ``target`` gets the 5 statistics of the unmodified signal.
        A final partial window is filled up by repeating the last sample.
        """
        if target == 0:
            # anglez only: work on the magnitude of the step-to-step change;
            # the first difference is duplicated so the length is preserved.
            deltas = np.diff(feat)
            feat = np.abs(np.insert(deltas, 0, deltas[0]))

        leftover = len(feat) % downsample_factor
        if leftover:
            tail = np.zeros(downsample_factor - leftover) + feat[-1]
            feat = np.concatenate([feat, tail])
        windows = feat.reshape(-1, downsample_factor)

        if target == 1:
            # enmo: scale and invert so that small window sums get the larger values.
            inverted = 100 / (windows.sum(axis=1) + 1)
            return np.column_stack([inverted])

        summaries = (np.mean, np.std, np.median, np.max, np.min)
        return np.column_stack([summarise(windows, 1) for summarise in summaries])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Read the CSV for ``index`` and return its float32 feature tensor."""
        frame = pd.read_csv(f"/kaggle/working/{index}.csv")
        signals = frame[['anglez','enmo']].values.astype(np.float32)
        per_channel = []
        for channel in range(signals.shape[1]):
            # The column position doubles as the `target` selector (0 = anglez, 1 = enmo).
            per_channel.append(
                self.downsample_seq_generate_features(signals[:, channel], channel, 12)
            )
        features = np.concatenate(per_channel, -1)
        return torch.from_numpy(features).to(torch.float32)

In [7]:
# Write every series to its own CSV, named after the series' position in the
# order returned by unique(); the position is printed as a progress marker.
for series_pos, series_key in enumerate(test_series.series_id.unique()):
    series_rows = test_series.loc[test_series.series_id == series_key]
    series_rows.copy().reset_index().to_csv(f"/kaggle/working/{series_pos}.csv", index=False)
    print(series_pos)

0
1
2


In [8]:
# ---- Inference: score every test series and write the submission file ----

test_ds = SleepDataset(test_series.series_id.unique())
del test_series
gc.collect()

# One feature row covers this many raw rows; must match the factor that
# SleepDataset.__getitem__ passes to downsample_seq_generate_features.
DOWNSAMPLE_FACTOR = 12
# Outer chunk length, counted in feature rows.
max_chunk_size = 24 * 60 * 12
# Inner chunk length fed to the model in one call (1440 feature rows); the same
# number is the divisor that turns a prediction count into "days" below.
rows_per_day = max_chunk_size // 12
# Half-width, in feature rows, of the window used for local-maximum detection.
min_interval = 30


def _local_peak_scores(values, half_window):
    """Return a float16 array keeping values[i] only where it is a local maximum.

    values[i] is kept when it equals the maximum of
    values[max(0, i - half_window) : i + half_window]; other entries are 0.
    """
    peak_scores = np.zeros(len(values), dtype=np.float16)
    for idx in range(len(values)):
        # Slicing already stops at the end of the array; compute the max once.
        window_max = values[max(0, idx - half_window):idx + half_window].max()
        if values[idx] == window_max:
            peak_scores[idx] = window_max
    return peak_scores


def _events_frame(event, local_idx, peak_scores, chunk_start, steps, series_id):
    """Build the submission rows of one event type for one outer chunk.

    ``local_idx`` are positions inside the chunk (they index ``peak_scores``
    directly); the matching raw rows are found by adding ``chunk_start`` and
    scaling by the downsample factor, clipped to the raw series length.
    """
    raw_rows = np.clip((chunk_start + local_idx) * DOWNSAMPLE_FACTOR, 0, len(steps) - 1)
    return pd.DataFrame({
        'step': steps[raw_rows].astype(np.int32),
        'event': event,
        'series_id': series_id,
        'score': peak_scores[local_idx],
    })


model = Transformer(input_size=6,hidden_size=64,out_size=2,n_layers=5).to(device).eval()
# torch.load unpickles the file: only load checkpoints from a trusted source.
# NOTE(review): this path is not among the files listed by the first cell's output
# (which shows /kaggle/input/testmodel/...) - confirm the dataset is attached.
model.load_state_dict(torch.load('/kaggle/input/model/tr-model_best.pth',map_location=device))

event_frames = []
for i in range(len(ids)):
    batch = test_ds[i]
    # Only the raw step numbers are needed to translate feature rows back to steps.
    steps = pd.read_csv(f"/kaggle/working/{i}.csv", usecols=['step'])['step'].to_numpy()
    data_length = len(batch)
    series_id = ids[i]
    for chunk_start in range(0, data_length, max_chunk_size):
        chunk_end = min(chunk_start + max_chunk_size, data_length)

        pred_parts = []
        for sub_start in range(chunk_start, chunk_end, rows_per_day):
            sub_end = min(chunk_end, sub_start + rows_per_day)
            with torch.no_grad():
                chunk = batch[sub_start:sub_end].to(device)
                output = model(chunk)
                # The output is indexed as 3-D; keep the last position along dim 1.
                # Slicing before .cpu() avoids copying the full tensor off the device.
                pred_parts.append(output[:, -1, :].half().cpu().numpy())
            del chunk, output
            gc.collect()
            torch.cuda.empty_cache()
        pred = np.concatenate(pred_parts, axis=0)
        del pred_parts

        scores0 = _local_peak_scores(pred[:, 0], min_interval)
        scores1 = _local_peak_scores(pred[:, 1], min_interval)
        # Keep one candidate per "day" of predictions in this chunk, at least one.
        n_events = max(1, round(len(pred) / rows_per_day))
        del pred

        # argsort positions are local to this chunk: use them as-is to read the
        # scores and add chunk_start only when mapping back to raw rows.
        top_onset = np.argsort(scores0)[-n_events:]
        top_wakeup = np.argsort(scores1)[-n_events:]
        event_frames.append(_events_frame('onset', top_onset, scores0, chunk_start, steps, series_id))
        event_frames.append(_events_frame('wakeup', top_wakeup, scores1, chunk_start, steps, series_id))
        del scores0, scores1, top_onset, top_wakeup
        gc.collect()
        torch.cuda.empty_cache()

# Concatenate once instead of growing the frame inside the loop.
if event_frames:
    submission = pd.concat(event_frames, axis=0)
else:
    submission = pd.DataFrame({
        'step': pd.Series(dtype=np.int32),
        'event': pd.Series(dtype=object),
        'series_id': pd.Series(dtype=object),
        'score': pd.Series(dtype=np.float32),
    })
submission = submission.sort_values(['series_id', 'step']).reset_index(drop=True)
submission['row_id'] = submission.index.astype(int)
submission['score'] = submission['score'].fillna(submission['score'].mean())
submission = submission[['row_id','series_id','step','event','score']]
submission.to_csv('/kaggle/working/submission.csv',index=False)

In [9]:
# Display the final submission frame as the cell output.
submission

Unnamed: 0,row_id,series_id,step,event,score
0,0,038441c925bb,12,onset,0.453613
1,1,038441c925bb,12,wakeup,0.214355
2,2,03d92c9f6f8a,12,onset,0.453125
3,3,03d92c9f6f8a,12,wakeup,0.213379
4,4,0402a003dae9,12,onset,0.54834
5,5,0402a003dae9,12,wakeup,0.337646
