In [1]:
import polars as pl
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from pandas.api.types import is_datetime64_any_dtype, is_integer_dtype, is_float_dtype
import gc
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the sleep-event annotations for the training series from the Kaggle input directory.
train_events = pd.read_csv("/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv")

In [3]:
# Count the missing values in each column of the events table.
train_events.isna().sum()

series_id       0
night           0
event           0
step         4923
timestamp    4923
dtype: int64

In [4]:
# Flag every series that has at least one event with a missing `step`.
# The null mask is computed once for the whole column and then reduced per
# series, instead of invoking a Python lambda for every group.
series_has_NaN = train_events['step'].isna().groupby(train_events['series_id']).any()
# How many series do / do not contain missing steps.
series_has_NaN.value_counts()

step
True     240
False     37
Name: count, dtype: int64

In [5]:
# Collect the ids of the series whose flag is False, i.e. no missing steps.
no_NaN_series = [sid for sid, has_nan in series_has_NaN.items() if not has_nan]
no_NaN_series

['08db4255286f',
 '0a96f4993bd7',
 '0cfc06c129cc',
 '1087d7b0ff2e',
 '10f8bc1f7b07',
 '18b61dd5aae8',
 '29c75c018220',
 '31011ade7c0a',
 '3452b878e596',
 '349c5562ee2c',
 '3664fe9233f9',
 '483d6545417f',
 '55a47ff9dc8a',
 '5acc9d63b5fd',
 '5f94bb3e1bed',
 '655f19eabf1e',
 '67f5fc60e494',
 '72bbd1ac3edf',
 '76237b9406d5',
 '7822ee8fe3ec',
 '89bd631d1769',
 '8e32047cbc1f',
 '939932f1822d',
 '9ee455e4770d',
 'a596ad0b82aa',
 'a9a2f7fac455',
 'a9e5f5314bcb',
 'af91d9a50547',
 'b364205aba43',
 'c535634d7dcd',
 'c6788e579967',
 'c68260cc9e8f',
 'ca730dbf521d',
 'd150801f3145',
 'd25e479ecbb7',
 'd515236bdeec',
 'd5e47b94477e']

In [6]:
# also drop these two "truncated" events series seen in EDA (incomplete events data).
# The list is rebuilt with a filter rather than mutated with .remove(), so this
# cell can be re-run safely: a second .remove() of the same id raises ValueError.
truncated_series = {'31011ade7c0a', 'a596ad0b82aa'}
no_NaN_series = [sid for sid in no_NaN_series if sid not in truncated_series]

In [7]:
import polars as pl

def get_train_series_polars(series, parquet_path="/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet"):
    """Load a single series from the training parquet as a pandas DataFrame.

    The parquet file is scanned lazily so that the ``series_id`` filter is
    applied while reading. This function is called once per series, so an
    eager ``read_parquet`` would load the whole file into memory on every call.

    Parameters
    ----------
    series : str
        Value of ``series_id`` to keep.
    parquet_path : str, optional
        Location of the parquet file. Defaults to the Kaggle input path.

    Returns
    -------
    pandas.DataFrame
        Rows of the requested series, with ``timestamp`` parsed into a
        datetime column and an extra ``date`` column derived from it.
    """
    # Lazy scan + filter: only the rows of the requested series are materialised.
    train_series = (
        pl.scan_parquet(parquet_path)
        .filter(pl.col('series_id') == series)
        .collect()
    )

    # Parse the offset-aware timestamp strings into a datetime column.
    train_series = train_series.with_columns(
        pl.col("timestamp").str.strptime(pl.Datetime, format='%Y-%m-%dT%H:%M:%S%z').alias("timestamp")
    )

    # Calendar date taken from the parsed timestamp.
    # NOTE(review): polars presumably normalises "%z" timestamps to UTC, which
    # would make this the UTC date rather than the local one - confirm.
    train_series = train_series.with_columns(
        pl.col("timestamp").dt.date().alias("date")
    )

    # Downstream cells work with pandas.
    return train_series.to_pandas()

In [8]:
# Load each retained series separately and keep the resulting frames in a list.
smaller_train_data = []

for series_id in no_NaN_series:
    smaller_train_data.append(get_train_series_polars(series_id))
    # Run a collection pass after every load.
    gc.collect()

In [9]:
# Stack the per-series frames into one frame with a fresh 0..n-1 index.
all_train = pd.concat(smaller_train_data, ignore_index=True)
# Number of distinct series in the combined frame.
all_train.series_id.nunique()

35

In [10]:
def extract_full_day_data(df, steps_per_full_day=17280):
    """Keep only the rows that belong to complete (series_id, date) groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``series_id``, ``date`` and ``step``.
    steps_per_full_day : int, optional
        Number of non-null ``step`` values a (series_id, date) group needs in
        order to be kept. Defaults to 17280.
        NOTE(review): presumably 24 h of 5-second samples - confirm against
        the data description.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` from qualifying groups, in their original order,
        with the same columns and a fresh 0..n-1 index.
    """
    day_keys = ['series_id', 'date']

    # Broadcast each group's step count back onto its rows, so the filter is a
    # plain boolean mask and no join against the original frame is needed.
    steps_in_day = df.groupby(day_keys)['step'].transform('count')

    return df.loc[steps_in_day == steps_per_full_day].reset_index(drop=True)

In [11]:
# Keep only the rows that belong to complete (series_id, date) groups.
all_data = extract_full_day_data(all_train)

In [12]:
from tqdm.auto import tqdm

# For every series, collect the (onset_row, wakeup_row) pairs and the matching
# sensor frame. Row numbers are positions within that series' own frame.
targets = []
data = []
ids = all_data.series_id.unique()

for viz_id in tqdm(ids):
    viz_targets = []
    viz_events = train_events[train_events.series_id == viz_id]
    # drop=True restarts the row positions at 0 for this series without
    # carrying the previous index along as an extra column.
    viz_series = all_data.loc[all_data.series_id == viz_id].reset_index(drop=True)

    for i in range(len(viz_events) - 1):
        first, second = viz_events.iloc[i], viz_events.iloc[i + 1]
        # Only an onset directly followed by a wakeup of the same night forms a pair.
        if not (first.event == 'onset' and second.event == 'wakeup' and first.night == second.night):
            continue

        start, end = first.timestamp, second.timestamp

        # Positions of the rows whose timestamp equals the event timestamps.
        start_rows = np.flatnonzero((viz_series.timestamp == start).to_numpy())
        end_rows = np.flatnonzero((viz_series.timestamp == end).to_numpy())

        if len(start_rows) > 0 and len(end_rows) > 0:
            viz_targets.append((start_rows[0], end_rows[0]))
        else:
            # At least one of the two timestamps is not present in this series' frame.
            print(f"No match found for start timestamp: {start} or end timestamp: {end}")

    targets.append(viz_targets)
    data.append(viz_series[['anglez', 'enmo', 'step']])

  0%|          | 0/35 [00:00<?, ?it/s]

No match found for start timestamp: 2018-05-17T21:11:00-0400 or end timestamp: 2018-05-18T06:25:00-0400
No match found for start timestamp: 2018-04-05T23:03:00-0400 or end timestamp: 2018-04-06T08:42:00-0400
No match found for start timestamp: 2017-09-29T22:46:00-0400 or end timestamp: 2017-09-30T05:31:00-0400
No match found for start timestamp: 2018-01-20T00:07:00-0500 or end timestamp: 2018-01-20T08:04:00-0500
No match found for start timestamp: 2019-06-26T23:22:00-0400 or end timestamp: 2019-06-27T06:36:00-0400
No match found for start timestamp: 2019-02-09T22:00:00-0500 or end timestamp: 2019-02-10T03:28:00-0500
No match found for start timestamp: 2018-02-27T01:00:00-0500 or end timestamp: 2018-02-27T09:06:00-0500
No match found for start timestamp: 2019-02-20T21:35:00-0500 or end timestamp: 2019-02-21T01:56:00-0500
No match found for start timestamp: 2017-12-10T01:03:00-0500 or end timestamp: 2017-12-10T06:33:00-0500
No match found for start timestamp: 2017-11-27T21:37:00-0500 or 

In [13]:
import joblib
# Persist the (targets, data, ids) tuple to train_data.pkl in the working directory.
joblib.dump((targets, data, ids), 'train_data.pkl')
# Number of per-series frames that were collected.
len(data)

35