In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

### About the Competition

The goal of [this competition](https://www.kaggle.com/competitions/child-mind-institute-detect-sleep-states/overview) is to detect sleep onset and wake. You will develop a model trained on wrist-worn accelerometer data in order to determine a person's sleep state.



## Loading & EDA

In [None]:
# Training accelerometer series, stored as parquet in the competition input directory.
train = pd.read_parquet('/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet')
# Event annotations that accompany the training series (CSV).
train_events = pd.read_csv('/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv')
# Test series, read from the same input directory.
test = pd.read_parquet('/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet')


# Preview the first 10 rows of the training series.
train.head(10)

In [None]:
from pandas.api.types import is_datetime64_ns_dtype

def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to smaller dtypes to save memory.

    Only columns backed by a plain NumPy integer or floating dtype are
    touched. Object, string, boolean, datetime, categorical and pandas
    extension-typed columns are left exactly as they are.

    * Signed integers are narrowed to the smallest of int8 / int16 / int32
      whose range (bounds inclusive) contains every value in the column.
    * Unsigned integers are narrowed the same way to uint8 / uint16 / uint32,
      so they keep their exact values instead of being turned into floats.
    * Floats are narrowed to float16 when the column's min and max lie inside
      float16's finite range, otherwise to float32 when they fit there.
      Narrowing floats is lossy (float16 keeps roughly three significant
      decimal digits); the range check only guarantees that finite values do
      not overflow to ``inf``.

    A column is never widened. A column whose min/max cannot be compared with
    the candidate ranges (empty, all-NaN, or containing ``inf``) is left
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Candidate target dtypes per NumPy kind code, ordered narrowest first,
    # paired with the function that reports a candidate's representable range.
    narrowing_plan = {
        'i': ((np.int8, np.int16, np.int32), np.iinfo),
        'u': ((np.uint8, np.uint16, np.uint32), np.iinfo),
        'f': ((np.float16, np.float32), np.finfo),
    }

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable integers, strings, ...) are not
        # np.dtype instances; bool / datetime / object kinds have no plan.
        plan = narrowing_plan.get(col_type.kind) if isinstance(col_type, np.dtype) else None
        if plan is None:
            continue
        candidates, limits_of = plan

        # pandas skips NaN here; an empty or all-NaN column yields NaN, which
        # fails every range comparison below and so leaves the column as is.
        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            # Candidates are ordered by size: once one is not strictly smaller
            # than the current dtype, nothing further can save memory.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break
    return df

In [None]:
# reduce_mem_usage returns the frame it was given, so each name is rebound to the same object.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)
train_events = reduce_mem_usage(train_events)

In [None]:
# Show the first 10 rows again, after the reduce_mem_usage pass.
train.head(10)

In [None]:
# Row counts of the train and test frames, displayed as a (train, test) tuple.
len(train), len(test)

The train dataset has about 127 million rows but only a few features. Let's check for missing entries in the dataset.

In [None]:
# Number of null entries in each column of the series frame.
train.isnull().sum()

In [None]:
# Number of null entries in each column of the events frame.
train_events.isnull().sum()

The `series_id` feature repeats across many observations. According to the dataset description, `series_id` is a unique identifier for each accelerometer series. Let's look at this feature more closely.

In [None]:
# Random sample of 20 rows from a single series; the fixed seed makes the
# displayed rows identical on every Restart & Run All.
train[train['series_id'] == '038441c925bb'].sample(20, random_state=42)

In [None]:
# Count the matching rows straight from the boolean mask; this avoids building
# a filtered copy of the large frame only to take its length. int() keeps the
# displayed value a plain Python integer, as len() produced.
int((train['series_id'] == '038441c925bb').sum())

It appears that each accelerometer series identifier has a large number of observations in the dataset. The `step` column indicates how many times data has been sampled from the accelerometer within that series.

In [None]:
# Number of rows recorded for each series_id.
train['series_id'].value_counts()

Now let's look at the `train_events` dataframe.

In [None]:
# First 10 rows of the events table.
train_events.head(10)

In [None]:
# How many rows carry each event label.
train_events['event'].value_counts()

The `event` column is the target to predict in this competition. `onset` marks the beginning of sleep and `wakeup` marks its end, as detected by the accelerometer.

In [None]:
# Bar chart of the number of rows per event label, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title('Distribution of Events in the train data', fontsize=14, fontweight='bold')

sns.countplot(data=train_events, x='event', ax=ax)

For more interesting visualizations of the data, see [this notebook](https://www.kaggle.com/code/jocelyndumlao/sleep-event-detection-eda).

In [None]:
# Integer codes for the two event labels.
event_mapping = {'onset': 0, 'wakeup': 1}

# Encode the labels as integers. Values that are not keys of event_mapping
# (including the already-encoded 0/1 and NaN) are passed through unchanged, so
# re-running this cell does not wipe the column out to NaN the way a plain
# dict-based .map() would on its second execution.
train_events['event'] = train_events['event'].map(
    lambda label: event_mapping.get(label, label)
)

In [None]:
# Counts per encoded event value, after the integer mapping applied in the previous cell.
train_events['event'].value_counts()

In [None]:
def missing_values_table(df):
    """Summarise the missing entries of ``df``, one row per affected column.

    Prints the total number of columns in ``df`` and how many of them have
    missing values, then returns a dataframe indexed by column name with the
    columns 'Missing Values' (count) and '% of Missing Values' (share of rows).
    Columns whose percentage is exactly zero are omitted; the remaining rows
    are ordered by percentage, highest first, and rounded to one decimal.
    """
    null_counts = df.isnull().sum()
    null_percent = null_counts * 100 / len(df)

    # Place count and percentage side by side, then label the two columns.
    report = pd.concat([null_counts, null_percent], axis=1)
    report.columns = ['Missing Values', '% of Missing Values']

    # Keep only the columns that have missing values, sorted by their share.
    has_missing = report['% of Missing Values'] != 0
    report = (
        report.loc[has_missing]
        .sort_values(by='% of Missing Values', ascending=False)
        .round(1)
    )

    # Print some summary information.
    total_columns = df.shape[1]
    affected_columns = report.shape[0]
    print(f'The dataframe has {total_columns} columns.\nThere are {affected_columns} columns with missing values')
    return report

In [None]:
# Prints a short summary and displays the per-column missing-value table for the events frame.
missing_values_table(train_events)

###  References

1. https://www.kaggle.com/code/marcinstasko/zzz-tutorial-on-code-profiling-and-pipelinin