<a href="https://www.kaggle.com/code/adelinmil/cmi-eda-chunked-ds-memory-reduction?scriptVersionId=142860977" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## 0-Imports

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import timedelta

import random

from tqdm import tqdm
import os

import warnings
warnings.filterwarnings('ignore')

## 1-Fixing Problems With train_events (thorough EDA)

In [None]:
# Load the annotated sleep events. `night` is read as object (strings) rather than a
# numeric type; later cells compare it against string literals such as '3'.
sleep_events = pd.read_csv('/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv',
                          dtype = {'night': object})

In [None]:
# Preview the first rows of the raw events table.
sleep_events.head()

In [None]:
# Summary statistics restricted to the object-dtype columns.
sleep_events.describe(include = ['O'])

In [None]:
# Column dtypes, non-null counts and memory footprint.
sleep_events.info()

In [None]:
# Per-column missing-value summary. The counts are computed once and the share is
# scaled by 100 so the '% of null values' column really is a percentage.
null_counts = sleep_events.isna().sum()
pd.DataFrame({'# of null values': null_counts, '% of null values': 100 * null_counts / len(sleep_events)})\
.style.background_gradient(subset = ['% of null values'])

In [None]:
# Remove, in place, every row that has at least one missing value,
# then display the remaining per-column null counts.
sleep_events.dropna(inplace = True)
sleep_events.isna().sum()

In [None]:
# Renumber the rows 0..n-1 in place and discard the old index labels,
# which have gaps after the rows with missing values were dropped.
sleep_events.reset_index(drop = True, inplace = True)

In [None]:
# Night labels that occur an odd number of times across the whole table.
night_counts = sleep_events.night.value_counts()
night_counts[night_counts % 2 != 0]

In [None]:
# Example for logic1: show the night-'3' rows of series 655f19eabf1e
sleep_events[sleep_events['night'] == '3'].groupby('series_id').get_group('655f19eabf1e')

In [None]:
# Example for logic2: show the night-'20' rows of series 0ce74d6d2106
sleep_events[sleep_events['night'] == '20'].groupby('series_id').get_group('0ce74d6d2106')

In [None]:
# Flag rows of `sleep_events` that break the expected alternation of events within a night.
#
#   logic1: the row and the one after it carry the same event AND the same night.
#   logic2: the row's night differs from both the following and the preceding row.
#
# The comparisons are done with shifted columns instead of a per-row `.loc` loop.
# A row without a neighbour is compared against NaN, which never equals a night
# label, so the very first row no longer triggers a lookup of label -1.
# NOTE(review): only `event` and `night` are compared, not `series_id` — confirm
# that adjacent rows from different series cannot be confused here.
night = sleep_events['night']
event_kind = sleep_events['event']

next_night = night.shift(-1)
prev_night = night.shift(1)

logic1 = (event_kind == event_kind.shift(-1)) & (night == next_night)
logic2 = (night != next_night) & (night != prev_night)

# As before, the last row is never flagged: it has no following row to compare with.
is_fault = (logic1 | logic2).to_numpy()[:-1]
fault_indecies = sleep_events.index[:-1][is_fault].tolist()

fault_indecies  # NOTE: after dropping na the index is reset

In [None]:
# Remove, in place, the rows whose index labels were collected in `fault_indecies`.
sleep_events.drop(fault_indecies, axis = 0, inplace = True)

In [None]:
# Preview the events table after the faulty rows were removed.
sleep_events.head()

## 2- Memory Reduction

In [None]:
# Load the full training series from parquet and preview the first rows.
ts = pd.read_parquet('/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet')
ts.head()

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes and return ``df``.

    The frame is modified in place, column by column:

    * object / pandas string columns become ``category``;
    * NumPy unsigned and signed integer columns are cast to the narrowest
      integer type of the same signedness whose range (bounds included)
      contains the column's min and max;
    * NumPy float columns wider than 32 bits are cast to ``float32`` when
      their min and max lie inside the ``float32`` range;
    * every other dtype (bool, datetime, categorical, nullable extension
      types, ...) is left untouched.

    Memory usage before and after, and the relative saving, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is mutated.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes have no plain NumPy min/max semantics; skip them.
        if not isinstance(col_type, np.dtype):
            continue

        # An empty column has no min/max to size the target dtype with.
        if df[col].empty:
            continue

        if col_type.kind in 'ui':
            c_min = int(df[col].min())
            c_max = int(df[col].max())
            candidates = unsigned_types if col_type.kind == 'u' else signed_types
            # Candidates go from narrowest to widest and the current dtype
            # always fits, so the first match can never be an upcast.
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f' and col_type.itemsize > 4:
            c_min = df[col].min()
            c_max = df[col].max()
            limits = np.finfo(np.float32)
            # NaN or infinite extremes fail these comparisons and keep the column as is.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


# Shrink the column dtypes of the full series table; the function returns the frame it was given.
ts = reduce_mem_usage(ts)

In [None]:
# Series that appear both in the sensor data and in the cleaned events table.
ids_with_signal = set(ts.series_id.unique())
ids_with_events = set(sleep_events.series_id.unique())
series_ids = ids_with_signal & ids_with_events

## 3-Visualization Of One Series

In [None]:
# Sensor readings and annotated events of one example series. Explicit copies are
# taken because the next cell assigns new columns on these frames; without them the
# assignments would target filtered slices of `ts` / `sleep_events`.
group = ts[ts['series_id'] == '03d92c9f6f8a'].copy()
event = sleep_events[sleep_events['series_id'] == '03d92c9f6f8a'].copy()

In [None]:
# Parse the timestamp columns and strip the timezone from each value one by one,
# so both frames hold timezone-naive timestamps that can be merged on.
# NOTE(review): presumably the raw strings carry UTC offsets, in which case
# tz_localize(None) keeps the local wall-clock time — confirm against the data.
group['timestamp'] = pd.to_datetime(group['timestamp']).apply(lambda t: t.tz_localize(None))
event['timestamp'] = pd.to_datetime(event['timestamp']).apply(lambda t: t.tz_localize(None))

In [None]:
# Join the sensor readings with the annotated events on series and timestamp.
combined = group.merge(event, on=['series_id', 'timestamp'], how='outer')

# Encode the event column numerically: no event -> 0, onset -> -1, wakeup -> 1.
event_codes = {np.nan: 0, 'onset': -1, 'wakeup': 1}
combined['event'] = combined['event'].map(event_codes)

# Keep the step counter of the sensor frame and discard the one from the events frame.
combined.rename(columns={'step_x': 'step'}, inplace=True)
combined.drop(columns='step_y', inplace=True)

# Spread each wakeup's night label over the window reaching from 16 hours before
# to 8 hours after the wakeup (both ends included). Wakeups are processed in row
# order, so where windows overlap the later wakeup's label wins.
wakeup_rows = combined.index[combined['event'] == 1]
for wakeup_row in wakeup_rows:
    wakeup_time = combined.timestamp.iloc[wakeup_row]
    in_window = combined.timestamp.between(
        wakeup_time - timedelta(hours=16),
        wakeup_time + timedelta(hours=8),
    )
    combined.loc[in_window, 'night'] = combined.night.loc[wakeup_row]

# Rows that still contain a missing value (e.g. no night label) are discarded.
combined.dropna(axis=0, inplace=True)

In [None]:
# Renumber the remaining rows 0..n-1 in place and preview the result.
combined.reset_index(drop = True, inplace = True)
combined.head()

In [None]:
# One subplot per annotated night: anglez and enmo scaled by their per-night
# maximum, with the event code (-1 onset, 1 wakeup, 0 otherwise) overlaid in red.
nights = event.night.unique()

# squeeze=False keeps `axs` a 2-D array even when there is a single night;
# otherwise plt.subplots returns a bare Axes object and `.ravel()` fails.
f, axs = plt.subplots(nrows=len(nights), ncols=1, squeeze=False)
f.set_figheight(55)
f.set_figwidth(15)

plt.subplots_adjust(
    hspace = 0.55
)

for night_label, ax in zip(nights, axs.ravel()):

    night = combined[combined.night == night_label].set_index('timestamp')
    ax.set_title(f'Night {night_label}', fontsize=12, weight = 'bold')

    (night.anglez / night.anglez.max()).plot(color = 'black', ax = ax) # normalized
    (night.enmo / night.enmo.max()).plot(color = '#808000', ax = ax) # normalized
    night.event.plot(color = 'red', ax = ax)

    ax.legend()

In [None]:
# anglez over the whole series, with a dashed vertical line at every event row:
# red where event == 1, green where event == -1.
anglez_ax = combined.anglez.plot(figsize=(35, 7), color='gray')
anglez_ax.set_title(f'Series (03d92c9f6f8a) - anglez', fontsize=25)

for event_code, line_color in ((1, 'r'), (-1, 'g')):
    for row in combined.index[combined['event'] == event_code]:
        plt.axvline(x=row, color=line_color, linestyle='dashed')

In [None]:
# enmo over the whole series, with a dashed vertical line at every event row:
# red where event == 1, green where event == -1.
enmo_ax = combined.enmo.plot(figsize=(35, 7), color='black')
enmo_ax.set_title(f'Series (03d92c9f6f8a) - enmo', fontsize=25)

for event_code, line_color in ((1, 'r'), (-1, 'g')):
    for row in combined.index[combined['event'] == event_code]:
        plt.axvline(x=row, color=line_color, linestyle='dashed')

In [None]:
# 'onset' is encoded as -1 when `combined` is built, so the hour at which sleep
# starts is taken from the rows with event == -1 (event == 1 marks wakeups).
combined[combined.event == -1].timestamp.dt.hour.plot(kind = 'hist', figsize = (12, 4), bins = 25).set_title(f'distribution of sleep hour - series (03d92c9f6f8a)')

In [None]:
# 'wakeup' is encoded as 1 when `combined` is built, so the wakeup hour is taken
# from the rows with event == 1 (event == -1 marks sleep onsets).
combined[combined.event == 1].timestamp.dt.hour.plot(kind = 'hist', figsize = (12, 4), bins = 55).set_title(f'distribution of wakeup hour - series (03d92c9f6f8a)')

## 4-Create Chunked Dataset

In [None]:
def is_asleep(combined):
    """Turn the sparse ``event`` column into a per-row ``asleep`` flag.

    Rows are read in order. An ``event`` of -1 (onset) switches the state to
    asleep and an ``event`` of 1 (wakeup) switches it back. Every row visited
    while asleep, including the onset row itself, is set to 1; a wakeup row
    keeps its value 1; all other values are left as they are. The column is
    then renamed to ``asleep`` and cast to ``uint16``.

    The state is derived with array operations instead of a per-row ``.loc``
    loop, which gives the same result without one Python-level read and write
    per sample.

    Parameters
    ----------
    combined : pandas.DataFrame
        Frame with an ``event`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``combined`` object, with ``event`` replaced by ``asleep``.
    """
    events = combined['event'].to_numpy()
    positions = np.arange(len(events))

    # Position of the most recent onset/wakeup at or before each row (-1 if none yet).
    is_marker = (events == -1) | (events == 1)
    last_marker = np.maximum.accumulate(np.where(is_marker, positions, -1))

    # A row is inside a sleep period when that most recent marker is an onset.
    # np.maximum(..., 0) only keeps the lookup in range for rows with no marker
    # yet; those rows are excluded by the `last_marker >= 0` test.
    asleep = (last_marker >= 0) & (events[np.maximum(last_marker, 0)] == -1)

    combined.loc[asleep, 'event'] = 1

    combined.rename(columns = {'event': 'asleep'}, inplace = True)
    combined['asleep'] = combined['asleep'].astype(np.uint16)

    return combined

In [None]:
def to_parquet(group, event, series_id):
    """Label one series with its sleep state and write it to a parquet file.

    The sensor rows in ``group`` and the annotated events in ``event`` are
    outer-merged on ``series_id`` and ``timestamp``; the events are encoded
    numerically and expanded into a per-row ``asleep`` flag by ``is_asleep``.
    The result is written to ``<split>/<series_id>.parquet``, where the split
    is drawn at random: ``'train'`` with weight 80, ``'validation'`` with
    weight 20.

    Parameters
    ----------
    group : pandas.DataFrame
        Sensor rows of a single series. Not modified.
    event : pandas.DataFrame
        Event rows of the same series. Not modified.
    series_id : str
        Identifier used as the output file name.
    """
    # Parse the timestamps and strip the timezone from each value. assign() returns
    # new frames, so the frames owned by the caller are never written to.
    group = group.assign(
        timestamp = pd.to_datetime(group['timestamp']).apply(lambda t: t.tz_localize(None))
    )
    event = event.assign(
        timestamp = pd.to_datetime(event['timestamp']).apply(lambda t: t.tz_localize(None))
    )

    combined = pd.merge(group, event, on=['series_id', 'timestamp'], how = 'outer')

    # no event -> 0, onset -> -1, wakeup -> 1
    combined['event'] = combined['event'].map({np.nan: 0, 'onset': -1, 'wakeup': 1}).astype(np.int16)
    combined.rename(columns = {'step_x': 'step'}, inplace = True)
    combined.drop(['series_id', 'night', 'step_y'], axis = 1, inplace = True)

    combined = is_asleep(combined)

    # random.choices returns a one-element list; take the drawn name itself.
    directory = random.choices(['train', 'validation'], weights = (80,20))[0]

    # Make sure the target directory exists so the write cannot fail on a missing folder.
    os.makedirs(directory, exist_ok = True)
    combined.to_parquet(os.path.join(directory, series_id + '.parquet'), index = False)

In [None]:
# Create the two output directories unless they already exist.
for split_dir in ("train", "validation"):
    os.makedirs(split_dir, exist_ok=True)

In [None]:
# Export every series that has both sensor data and events, one parquet file each.
for s_id in tqdm(series_ids):
    # Rows of the current series in the sensor table and in the events table.
    group = ts.loc[ts['series_id'] == s_id]
    event = sleep_events.loc[sleep_events['series_id'] == s_id]
    to_parquet(group, event, s_id)