In [1]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import polars as pl
import numpy as np
import pandas as pd
from datetime import datetime

from pandas.api.types import is_datetime64_any_dtype, is_integer_dtype, is_float_dtype
import gc
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Build the string -> datetime parse expression once and derive every calendar
# field from it, instead of repeating the parse in each entry.
# NOTE(review): the parsed column is reported as UTC by info(); if the raw strings
# carry local offsets, date/time/hour below are UTC values, not local ones — confirm.
parsed_ts = pl.col('timestamp').str.to_datetime()
dt_transforms = [
    parsed_ts,                              # replaces the string column in place
    parsed_ts.dt.date().alias('date'),
    parsed_ts.dt.time().alias('time'),
    parsed_ts.dt.weekday().alias("weekday"),
    parsed_ts.dt.hour().alias("hour"),
    parsed_ts.dt.minute().alias("minute"),
]

# Load the train series lazily, attach the datetime-derived columns, then
# materialize the result and hand it over to pandas.
lazy_series = pl.scan_parquet("/kaggle/input/chi-clean-dataset/Zzzs_train.parquet")
train_series = lazy_series.with_columns(dt_transforms).collect().to_pandas()

In [4]:
# Print the column dtypes and the memory footprint of the loaded frame
train_series.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13165560 entries, 0 to 13165559
Data columns (total 11 columns):
 #   Column     Dtype              
---  ------     -----              
 0   series_id  object             
 1   step       uint32             
 2   timestamp  datetime64[us, UTC]
 3   anglez     float32            
 4   enmo       float32            
 5   awake      int64              
 6   date       datetime64[ms]     
 7   time       object             
 8   weekday    int8               
 9   hour       int8               
 10  minute     int8               
dtypes: datetime64[ms](1), datetime64[us, UTC](1), float32(2), int64(1), int8(3), object(2), uint32(1)
memory usage: 690.6+ MB


In [5]:
# Preview the first rows to sanity-check the parsed timestamp and derived columns
train_series.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo,awake,date,time,weekday,hour,minute
0,08db4255286f,0,2018-11-05 14:00:00+00:00,-30.845301,0.0447,1,2018-11-05,14:00:00,1,14,0
1,08db4255286f,1,2018-11-05 14:00:05+00:00,-34.181801,0.0443,1,2018-11-05,14:00:05,1,14,0
2,08db4255286f,2,2018-11-05 14:00:10+00:00,-33.877102,0.0483,1,2018-11-05,14:00:10,1,14,0
3,08db4255286f,3,2018-11-05 14:00:15+00:00,-34.282101,0.068,1,2018-11-05,14:00:15,1,14,0
4,08db4255286f,4,2018-11-05 14:00:20+00:00,-34.385799,0.0768,1,2018-11-05,14:00:20,1,14,0


In [6]:
# Count missing values per column
train_series.isna().sum()

series_id    0
step         0
timestamp    0
anglez       0
enmo         0
awake        0
date         0
time         0
weekday      0
hour         0
minute       0
dtype: int64

In [7]:
def optimize_dataframe(df, use_float16=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds their values.

    The frame is modified in place and the same object is returned, so callers
    that need the original dtypes must pass a copy.

    Rules:
      * Only plain NumPy integer and float columns are touched. Datetime,
        object, boolean and pandas extension columns are skipped without
        computing their min/max (which is costly on large object columns and
        can fail on unorderable values).
      * Integer columns move to the narrowest *signed* type among
        int8/int16/int32 whose inclusive range contains the column's
        min and max.
      * Float columns move to float16 (only if ``use_float16``) or float32
        when their min and max fit the target's finite range.
      * A column is never converted to a dtype of the same or larger size,
        so unsigned columns are not widened and float columns are not upcast.
      * Empty columns, and float columns whose min/max is NaN or infinite,
        are left unchanged.

    Args:
        df: DataFrame to optimize (mutated in place).
        use_float16: When True (default, the historical behavior) float columns
            may be stored as float16. The check only looks at the value range;
            float16 keeps roughly three significant decimal digits, so pass
            False when precision matters.

    Returns:
        The same ``df`` object with downcast columns.
    """
    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    floats = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        values = df[col]
        current = values.dtype

        # Skip anything that is not a plain NumPy int/uint/float column
        # (datetimes, objects, booleans, extension dtypes).
        if not isinstance(current, np.dtype) or current.kind not in 'iuf':
            continue
        if values.empty:
            continue  # min()/max() of an empty column is NaN; nothing to gain

        if current.kind == 'f':
            to_scalar, limits, candidates = float, np.finfo, floats
        else:
            to_scalar, limits, candidates = int, np.iinfo, signed_ints

        # Convert to Python scalars so the range comparisons below are exact
        # and independent of NumPy's scalar promotion rules.
        c_min = to_scalar(values.min())
        c_max = to_scalar(values.max())

        # min()/max() skip NaN, so a non-finite result means all-NaN or +/-inf data.
        if current.kind == 'f' and not (np.isfinite(c_min) and np.isfinite(c_max)):
            continue

        for candidate in candidates:
            # Candidates are ordered by size: once one is not strictly smaller
            # than the current dtype there is nothing left to save.
            if np.dtype(candidate).itemsize >= current.itemsize:
                break
            bounds = limits(candidate)
            if to_scalar(bounds.min) <= c_min and c_max <= to_scalar(bounds.max):
                df[col] = values.astype(candidate)
                break
    return df

In [8]:
# Every column except the raw timestamp is carried forward.
all_columns = train_series.columns
columns_to_select = all_columns[all_columns != 'timestamp'].tolist()

# Take that subset of the loaded frame ...
f_train_data = train_series[columns_to_select]

# ... and downcast its numeric columns to reduce memory.
optimized_train_data = optimize_dataframe(f_train_data)

In [9]:
def extract_full_day_data(df, steps_per_day=17280):
    """Keep only the rows of (series_id, date) groups that contain a complete day.

    A day is complete when its number of non-null ``step`` values equals
    ``steps_per_day``.

    Args:
        df: DataFrame with at least ``series_id``, ``date`` and ``step`` columns.
            It is not modified.
        steps_per_day: Number of samples a complete day must contain. The
            default 17280 corresponds to one sample every 5 seconds for
            24 hours (86400 / 5).

    Returns:
        A new DataFrame with the rows of complete days, in their original
        order, with the original columns and a fresh 0..n-1 index.
    """
    # Broadcast each group's step count back onto its own rows. This selects
    # the same rows as merging against the list of valid days, without
    # building a second full-size frame through a join.
    steps_in_day = df.groupby(['series_id', 'date'])['step'].transform('count')

    # Rows in groups with a missing key get NaN here and are dropped, exactly
    # as an inner merge would drop them.
    filtered_df = df[steps_in_day == steps_per_day].reset_index(drop=True)

    return filtered_df

# Restrict the optimized training frame to (series_id, date) groups with a complete day of steps
all_data = extract_full_day_data(optimized_train_data)

In [10]:
# The intermediate frames are no longer needed once all_data exists:
# drop the names and ask the garbage collector to reclaim the memory.
del train_series, f_train_data, optimized_train_data
gc.collect()

0

In [11]:
# Replace anglez with its absolute value. This overwrites the signed column
# (no lag is involved); every later anglez feature is built on the magnitude.
all_data['anglez'] = all_data['anglez'].abs()

In [12]:
# Absolute first difference of each signal within a series. The first row of
# every series has no predecessor, so its NaN is replaced by 0.
# NOTE(review): incomplete days were filtered out earlier, so neighbouring rows of a
# series may be separated by a time gap — confirm differences across gaps are acceptable.
for source, target in (('anglez', 'anglez_diff'), ('enmo', 'enmo_diff')):
    all_data[target] = all_data.groupby('series_id')[source].diff().abs().fillna(0)

In [13]:
# Centered rolling standard deviation over 12 samples (1 minute at the 5-second
# spacing seen in the data), computed separately for each series.
for source, target in (('anglez', 'anglez_roll_std_1m'), ('enmo', 'enmo_roll_std_1m')):
    windowed = all_data.groupby('series_id')[source].rolling(12, center=True, min_periods=1)
    # Drop the series_id index level so the result aligns with all_data's row index.
    all_data[target] = windowed.std().reset_index(level=0, drop=True)

In [14]:
# Centered rolling mean over 12 samples (1 minute at the 5-second spacing seen
# in the data), computed separately for each series.
for source, target in (('anglez', 'anglez_roll_mean_1m'), ('enmo', 'enmo_roll_mean_1m')):
    windowed = all_data.groupby('series_id')[source].rolling(12, center=True, min_periods=1)
    # Drop the series_id index level so the result aligns with all_data's row index.
    all_data[target] = windowed.mean().reset_index(level=0, drop=True)

In [15]:
# Centered rolling standard deviation over 360 samples (30 minutes at the
# 5-second spacing seen in the data), computed separately for each series.
for source, target in (('anglez', 'anglez_roll_std_30m'), ('enmo', 'enmo_roll_std_30m')):
    windowed = all_data.groupby('series_id')[source].rolling(360, center=True, min_periods=1)
    # Drop the series_id index level so the result aligns with all_data's row index.
    all_data[target] = windowed.std().reset_index(level=0, drop=True)

In [16]:
# Centered rolling mean over 360 samples (30 minutes at the 5-second spacing
# seen in the data), computed separately for each series.
for source, target in (('anglez', 'anglez_roll_mean_30m'), ('enmo', 'enmo_roll_mean_30m')):
    windowed = all_data.groupby('series_id')[source].rolling(360, center=True, min_periods=1)
    # Drop the series_id index level so the result aligns with all_data's row index.
    all_data[target] = windowed.mean().reset_index(level=0, drop=True)

In [17]:
# Centered rolling standard deviation over 720 samples (1 hour at the 5-second
# spacing seen in the data), computed separately for each series.
for source, target in (('anglez', 'anglez_roll_std_1h'), ('enmo', 'enmo_roll_std_1h')):
    windowed = all_data.groupby('series_id')[source].rolling(720, center=True, min_periods=1)
    # Drop the series_id index level so the result aligns with all_data's row index.
    all_data[target] = windowed.std().reset_index(level=0, drop=True)

In [18]:
# Centered rolling mean over 720 samples (1 hour at the 5-second spacing seen
# in the data), computed separately for each series.
for source, target in (('anglez', 'anglez_roll_mean_1h'), ('enmo', 'enmo_roll_mean_1h')):
    windowed = all_data.groupby('series_id')[source].rolling(720, center=True, min_periods=1)
    # Drop the series_id index level so the result aligns with all_data's row index.
    all_data[target] = windowed.mean().reset_index(level=0, drop=True)

In [19]:
# Cyclical encoding of the hour of day: map the hour onto an angle with a
# 24-hour period and store its sine and cosine.
radians_per_hour = 2 * np.pi / 24
all_data['sin_hour'] = np.sin(all_data['hour'] * radians_per_hour)
all_data['cos_hour'] = np.cos(all_data['hour'] * radians_per_hour)

In [20]:
# Cyclical encoding of the minute of the hour: map the minute onto an angle
# with a 60-minute period and store its sine and cosine.
radians_per_minute = 2 * np.pi / 60
all_data['sin_minute'] = np.sin(all_data['minute'] * radians_per_minute)
all_data['cos_minute'] = np.cos(all_data['minute'] * radians_per_minute)

In [21]:
# The raw hour/minute columns are superseded by their sine/cosine encodings.
all_data = all_data.drop(['hour', 'minute'], axis=1)

In [22]:
gc.collect()
# Downcast the numeric columns again now that the engineered features exist.
# optimize_dataframe modifies the frame it is given and returns that same frame.
all_data = optimize_dataframe(all_data)

In [23]:
# For every series: does its step column contain at least one missing value?
# NOTE(review): only `step` is inspected here; NaNs in the engineered feature
# columns would not be reported by this check — confirm that is intended.
series_has_NaN = all_data['step'].isnull().groupby(all_data['series_id']).any()
series_has_NaN.value_counts()

step
False    35
Name: count, dtype: int64

In [24]:
# Display the final feature frame (pandas shows a truncated preview)
all_data

Unnamed: 0,series_id,step,anglez,enmo,awake,date,time,weekday,anglez_diff,enmo_diff,...,anglez_roll_mean_30m,enmo_roll_mean_30m,anglez_roll_std_1h,enmo_roll_std_1h,anglez_roll_mean_1h,enmo_roll_mean_1h,sin_hour,cos_hour,sin_minute,cos_minute
0,08db4255286f,7200,8.765625,0.007801,1,2018-11-06,00:00:00,2,0.000000,0.000000,...,24.375000,0.051208,17.953125,0.079102,26.296875,0.059235,0.000000,1.00000,0.000000,1.000000
1,08db4255286f,7201,2.109375,0.003500,1,2018-11-06,00:00:05,2,6.656250,0.004303,...,24.406250,0.050995,17.953125,0.079102,26.250000,0.059448,0.000000,1.00000,0.000000,1.000000
2,08db4255286f,7202,1.858398,0.004299,1,2018-11-06,00:00:10,2,0.250977,0.000799,...,24.453125,0.050873,17.984375,0.078979,26.187500,0.059479,0.000000,1.00000,0.000000,1.000000
3,08db4255286f,7203,1.626953,0.000200,1,2018-11-06,00:00:15,2,0.231445,0.004101,...,24.484375,0.050690,18.000000,0.079041,26.109375,0.059723,0.000000,1.00000,0.000000,1.000000
4,08db4255286f,7204,2.388672,0.000400,1,2018-11-06,00:00:20,2,0.761719,0.000200,...,24.515625,0.050507,18.000000,0.079651,26.062500,0.060272,0.000000,1.00000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12700795,d5e47b94477e,380875,37.500000,0.037811,1,2017-12-01,23:59:35,5,4.968750,0.011780,...,21.265625,0.047699,14.296875,0.049042,20.125000,0.042755,-0.258789,0.96582,-0.104553,0.994629
12700796,d5e47b94477e,380876,32.593750,0.033905,1,2017-12-01,23:59:40,5,4.906250,0.003906,...,21.359375,0.047943,14.312500,0.049103,20.109375,0.042694,-0.258789,0.96582,-0.104553,0.994629
12700797,d5e47b94477e,380877,27.671875,0.022400,1,2017-12-01,23:59:45,5,4.921875,0.011505,...,21.468750,0.048187,14.328125,0.049164,20.125000,0.042725,-0.258789,0.96582,-0.104553,0.994629
12700798,d5e47b94477e,380878,24.750000,0.023605,1,2017-12-01,23:59:50,5,2.921875,0.001205,...,21.484375,0.048401,14.335938,0.049225,20.156250,0.042786,-0.258789,0.96582,-0.104553,0.994629


In [25]:
# Persist the engineered training set; the pandas index is not written.
# NOTE(review): optimize_dataframe may have produced float16 columns — confirm the
# installed Parquet engine can write them.
all_data.to_parquet("train_dataset.parquet", index=False)