In [1]:
import pandas as pd
import numpy as np
import gc
import time
import json
from datetime import datetime
import matplotlib.pyplot as plt
import os
import joblib
import random
import math
from tqdm.auto import tqdm 
from scipy.interpolate import interp1d

from math import pi, sqrt, exp
import sklearn,sklearn.model_selection
import torch
from torch import nn,Tensor
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler
from sklearn.metrics import average_precision_score
from timm.scheduler import CosineLRScheduler
plt.style.use("ggplot")

from pyarrow.parquet import ParquetFile
import pyarrow as pa 
import ctypes



In [2]:
class PATHS:
    MAIN_DIR = "/kaggle/input/child-mind-institute-detect-sleep-states/"
    # CSV FILES : 
    SUBMISSION = MAIN_DIR + "sample_submission.csv"
    TRAIN_EVENTS = MAIN_DIR + "train_events.csv"
    # PARQUET FILES:
    TRAIN_SERIES = MAIN_DIR + "train_series.parquet"
    TEST_SERIES = MAIN_DIR + "test_series.parquet"

In [3]:
class data_reader:
    def __init__(self, demo_mode):
        super().__init__()
        # MAPPING FOR DATA LOADING :
        self.names_mapping = {
            "submission" : {"path" : PATHS.SUBMISSION, "is_parquet" : False, "has_timestamp" : False}, 
            "train_events" : {"path" : PATHS.TRAIN_EVENTS, "is_parquet" : False, "has_timestamp" : True},
            "train_series" : {"path" : PATHS.TRAIN_SERIES, "is_parquet" : True, "has_timestamp" : True},
            "test_series" : {"path" : PATHS.TEST_SERIES, "is_parquet" : True, "has_timestamp" : True}
        }
        self.valid_names = ["submission", "train_events", "train_series", "test_series"]
        self.demo_mode = demo_mode
    
    def verify(self, data_name):
        if data_name not in self.valid_names:
            print("PLEASE ENTER A VALID DATASET NAME, VALID NAMES ARE : ", valid_names)
        return
    
    def cleaning(self, data):
        "cleaning function : drop na values"
        before_cleaning = len(data)
        print("Number of missing timestamps : ", len(data[data["timestamp"].isna()]))
        data = data.dropna(subset=["timestamp"])
        after_cleaning = len(data)
        print("Percentage of removed steps : {:.1f}%".format(100 * (before_cleaning - after_cleaning) / before_cleaning) )
#         print(data.isna().any())
#         data = data.bfill()
        return data

    @staticmethod
    def reduce_memory_usage(data):
        start_mem = data.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
        for col in data.columns:
            col_type = data[col].dtype    
            if col_type != object:
                c_min = data[col].min()
                c_max = data[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        data[col] = data[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        data[col] = data[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        data[col] = data[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        data[col] = data[col].astype(np.int64)  
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        data[col] = data[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        data[col] = data[col].astype(np.float32)
                    else:
                        data[col] = data[col].astype(np.float64)
            else:
                data[col] = data[col].astype('category')

        end_mem = data.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
        return data
    
    def load_data(self, data_name):
        self.verify(data_name)
        data_props = self.names_mapping[data_name]
        if data_props["is_parquet"]:
            data = pd.read_parquet(data_props["path"])
        else:
            data = pd.read_csv(data_props["path"])
        gc.collect()
        if data_props["has_timestamp"]:
            print('cleaning')
            data = self.cleaning(data)
            gc.collect()
        data = self.reduce_memory_usage(data)
        return data

In [4]:
reader = data_reader(demo_mode=False)
train_series = reader.load_data(data_name="train_series")
events = reader.load_data(data_name="train_events")

cleaning
Number of missing timestamps :  0
Percentage of removed steps : 0.0%
Memory usage of dataframe is 3416.54 MB
Memory usage after optimization is: 2059.05 MB
Decreased by 39.7%
cleaning
Number of missing timestamps :  4923
Percentage of removed steps : 33.9%
Memory usage of dataframe is 0.44 MB
Memory usage after optimization is: 0.50 MB
Decreased by -13.5%


In [5]:
def drop_nulls(df):
    NAN_SERIES_IDS = [
    '0f9e60a8e56d',
    '390b487231ce',
    '2fc653ca75c7',
    'c7b1283bb7eb',
    '89c7daa72eee',
    'e11b9d69f856',
    'c5d08fc3e040',
    'a3e59c2ce3f6',
    ]
    df = df[~df['series_id'].isin(NAN_SERIES_IDS)]
    return df

In [6]:
series = drop_nulls(train_series)

In [7]:
targets = []
data = []
ids = series.series_id.unique()

for viz_id in tqdm(ids):
    viz_targets = []
    viz_events = events[events.series_id == viz_id]
    v_series = series.loc[(series.series_id == viz_id)].copy().reset_index()
    v_series['dt'] = pd.to_datetime(v_series.timestamp, format='%Y-%m-%dT%H:%M:%S%z').astype("datetime64[ns, UTC-04:00]")
    v_series['date'] = v_series['dt'].dt.date
    steps_per_day = v_series.groupby(['date'], as_index=False)['step'].count()
    valid_days = steps_per_day[steps_per_day['step'] == 17280]
    viz_series = pd.merge(v_series, valid_days[['date']], on=['date'], how='inner')
    
    for i in range(len(viz_events) - 1):
        if viz_events.iloc[i].event == 'onset' and viz_events.iloc[i + 1].event == 'wakeup' and viz_events.iloc[i].night == viz_events.iloc[i + 1].night:
            start, end = viz_events.timestamp.iloc[i], viz_events.timestamp.iloc[i + 1]

            matching_start_rows = viz_series.loc[viz_series.timestamp == start]
            matching_end_rows = viz_series.loc[viz_series.timestamp == end]
            
            if not matching_start_rows.empty and not matching_end_rows.empty:
                start_id = matching_start_rows.index.values[0]
                end_id = matching_end_rows.index.values[0]
                viz_targets.append((start_id, end_id))
            else:
                print(f"No match found for start timestamp: {start} or end timestamp: {end}")
                continue  # Skip this iteration if no match is found
    
    targets.append(viz_targets)
    data.append(viz_series[['anglez', 'enmo', 'step']])
    
joblib.dump((targets, data, ids), 'train_data.pkl')
len(data)


  0%|          | 0/269 [00:00<?, ?it/s]

No match found for start timestamp: 2018-08-14T22:26:00-0400 or end timestamp: 2018-08-15T06:41:00-0400
No match found for start timestamp: 2018-05-31T20:14:00-0400 or end timestamp: 2018-06-01T06:47:00-0400
No match found for start timestamp: 2018-08-22T23:11:00-0400 or end timestamp: 2018-08-23T08:22:00-0400
No match found for start timestamp: 2018-11-30T05:36:00-0500 or end timestamp: 2018-11-30T13:37:00-0500
No match found for start timestamp: 2018-05-03T21:58:00-0400 or end timestamp: 2018-05-04T07:22:00-0400
No match found for start timestamp: 2018-05-17T21:11:00-0400 or end timestamp: 2018-05-18T06:25:00-0400
No match found for start timestamp: 2017-12-08T22:37:00-0500 or end timestamp: 2017-12-09T04:06:00-0500
No match found for start timestamp: 2017-08-21T23:11:00-0400 or end timestamp: 2017-08-22T06:52:00-0400
No match found for start timestamp: 2018-12-17T21:56:00-0500 or end timestamp: 2018-12-18T07:14:00-0500
No match found for start timestamp: 2018-11-13T22:18:00-0500 or 

269