Import libraries

In [1]:
import pandas as pd
import json
from tqdm.notebook import tqdm
from pandarallel import pandarallel
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

<font size ="10"> Data Compression (Input Data) </font>

In [2]:
# Read the raw training series from disk and preview its first ten rows.
data = pd.read_parquet(path="train_series.parquet")
data.head(n=10)

Unnamed: 0,series_id,step,timestamp,anglez,enmo
0,038441c925bb,0,2018-08-14T15:30:00-0400,2.6367,0.0217
1,038441c925bb,1,2018-08-14T15:30:05-0400,2.6368,0.0215
2,038441c925bb,2,2018-08-14T15:30:10-0400,2.637,0.0216
3,038441c925bb,3,2018-08-14T15:30:15-0400,2.6368,0.0213
4,038441c925bb,4,2018-08-14T15:30:20-0400,2.6368,0.0215
5,038441c925bb,5,2018-08-14T15:30:25-0400,2.6367,0.0217
6,038441c925bb,6,2018-08-14T15:30:30-0400,2.6367,0.0217
7,038441c925bb,7,2018-08-14T15:30:35-0400,2.6367,0.0218
8,038441c925bb,8,2018-08-14T15:30:40-0400,2.798,0.0223
9,038441c925bb,9,2018-08-14T15:30:45-0400,3.0847,0.0217


In [3]:
# Show every column's dtype plus the frame's memory footprint as loaded, before any conversion.
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127946340 entries, 0 to 127946339
Data columns (total 5 columns):
 #   Column     Dtype  
---  ------     -----  
 0   series_id  object 
 1   step       uint32 
 2   timestamp  object 
 3   anglez     float32
 4   enmo       float32
dtypes: float32(2), object(2), uint32(1)
memory usage: 3.3+ GB


Map each unique series_id to an integer label (0, 1, 2, … in order of first appearance) and save the mapping to JSON



In [4]:
# Label-encode `series_id`: each distinct id gets an integer (0, 1, 2, ...) in order of first
# appearance, and the id -> label dictionary is persisted for the test/events preprocessing.

# Guard against re-running this cell on an already-encoded frame: the mapping would then be
# rebuilt from the integer labels and opening LabelMap.json in "w" mode would destroy the
# original string-id mapping that the later cells load.
if pd.api.types.is_integer_dtype(data["series_id"]):
    raise RuntimeError(
        "series_id is already label-encoded; reload train_series.parquet before "
        "re-running this cell, otherwise LabelMap.json would be overwritten."
    )

unique_values = data["series_id"].unique()
value_to_label_mapping = {value: index for index, value in enumerate(unique_values)}
data["series_id"] = data["series_id"].map(value_to_label_mapping)

# Dump the dictionary to the JSON file
with open("LabelMap.json", "w") as json_file:
    json.dump(value_to_label_mapping, json_file)

Convert timestamp strings to timezone-naive datetimes (the UTC offset is dropped and the local wall-clock time is kept)

In [3]:
def convert_to_datetime(value):
    """Parse a timestamp written as ``YYYY-MM-DDTHH:MM:SS`` followed by a UTC offset.

    Args:
        value: Timestamp text such as ``"2018-08-14T15:30:00-0400"``.

    Returns:
        The result of ``pd.to_datetime`` for ``value`` under that exact format;
        the parsed value keeps the UTC offset found in the text.
    """
    iso_with_offset = '%Y-%m-%dT%H:%M:%S%z'
    parsed = pd.to_datetime(value, format=iso_with_offset)
    return parsed

def to_localize(t):
    """Return ``t`` with its timezone removed.

    Calls ``t.tz_localize(None)``, which drops the timezone information while
    keeping the local wall-clock reading unchanged.

    Args:
        t: An object exposing ``tz_localize`` (the notebook passes the values
            produced by ``convert_to_datetime``).

    Returns:
        The timezone-naive counterpart of ``t``.
    """
    naive = t.tz_localize(tz=None)
    return naive

In [None]:
# Kept so that any later cell calling `parallel_apply` still finds pandarallel initialised.
pandarallel.initialize(progress_bar=True)

# Convert the timestamp strings to timezone-naive datetimes in one vectorised pass instead of
# two element-wise `parallel_apply` passes over every row. Dropping the trailing numeric UTC
# offset (e.g. "-0400") from the text and parsing the remainder yields the same wall-clock
# value as parsing with %z and then calling tz_localize(None). A string whose offset is not
# of the form +HHMM / -HHMM is left untouched by the replace and makes to_datetime raise,
# so malformed input fails loudly rather than being silently mis-parsed.
# The dtype check keeps the cell safe to re-run once the column has been converted.
if not pd.api.types.is_datetime64_any_dtype(data['timestamp']):
    timestamp_without_offset = data['timestamp'].str.replace(r'[+-]\d{4}$', '', regex=True)
    data['timestamp'] = pd.to_datetime(timestamp_without_offset, format='%Y-%m-%dT%H:%M:%S')
    del timestamp_without_offset  # release the temporary string column

In [6]:
# Report the earliest and the latest timestamp in the converted column, one per line.
first_ts = data['timestamp'].min()
last_ts = data['timestamp'].max()
print(first_ts, last_ts, sep="\n")

2017-08-04 17:30:00
2019-07-09 09:44:55


In [15]:
# Downcast the integer series labels to uint16 to save memory.
# astype() wraps silently on overflow, so check the range first: a label above 65535 would
# otherwise be turned into a different, smaller id without any warning.
max_label = data["series_id"].max()
if max_label > np.iinfo(np.uint16).max:
    raise ValueError(
        f"series_id label {max_label} does not fit in uint16; use a wider integer dtype."
    )
data["series_id"] = data["series_id"].astype(np.uint16)

In [16]:
# Re-check dtypes and memory usage after the series_id and timestamp conversions.
data.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127946340 entries, 0 to 127946339
Data columns (total 5 columns):
 #   Column     Dtype         
---  ------     -----         
 0   series_id  uint16        
 1   step       uint32        
 2   timestamp  datetime64[ns]
 3   anglez     float32       
 4   enmo       float32       
dtypes: datetime64[ns](1), float32(2), uint16(1), uint32(1)
memory usage: 2.6 GB


In [17]:
# Preview the first ten rows of the converted frame.
data.head(10)

Unnamed: 0,series_id,step,timestamp,anglez,enmo
0,0,0,2018-08-14 15:30:00,2.6367,0.0217
1,0,1,2018-08-14 15:30:05,2.6368,0.0215
2,0,2,2018-08-14 15:30:10,2.637,0.0216
3,0,3,2018-08-14 15:30:15,2.6368,0.0213
4,0,4,2018-08-14 15:30:20,2.6368,0.0215
5,0,5,2018-08-14 15:30:25,2.6367,0.0217
6,0,6,2018-08-14 15:30:30,2.6367,0.0217
7,0,7,2018-08-14 15:30:35,2.6367,0.0218
8,0,8,2018-08-14 15:30:40,2.798,0.0223
9,0,9,2018-08-14 15:30:45,3.0847,0.0217


In [18]:
# Persist the converted training frame; index = False keeps the DataFrame index out of the file.
data.to_parquet("train_prep.parquet", index = False)

Test Preprocessing

In [25]:
# Apply the training-time preprocessing to the test series: integer-encode `series_id` with the
# saved label map, convert timestamps to timezone-naive datetimes, and write the result.
test = pd.read_parquet("test_series.parquet")
with open("LabelMap.json", 'r') as json_file:
    data_dict = json.load(json_file)

# Ids missing from the label map become NaN after .map(); casting those to uint16 fails with
# an opaque error, so detect them up front and report which ids are unknown.
test_labels = test["series_id"].map(data_dict)
unknown_ids = test.loc[test_labels.isna(), "series_id"].unique()
if len(unknown_ids) > 0:
    raise ValueError(
        f"{len(unknown_ids)} series_id value(s) in test_series.parquet are missing from "
        f"LabelMap.json, e.g. {list(unknown_ids[:10])}"
    )

test["series_id"] = test_labels.astype(np.uint16)
test["timestamp"] = test["timestamp"].apply(convert_to_datetime).apply(to_localize)
test.to_parquet("test_prep.parquet", index = False)

Events Preprocessing

In [4]:
# Load the training events, drop incomplete rows, encode/downcast every column and save as CSV.
events = pd.read_csv("train_events.csv")
with open("LabelMap.json", 'r') as json_file:
    data_dict = json.load(json_file)

events = (
    events
    # Rows with any missing value are discarded before the integer casts below.
    .dropna()
    .assign(
        # Same id -> label dictionary that was saved while encoding the training series.
        series_id=lambda frame: frame["series_id"].map(data_dict).astype(np.uint16),
        timestamp=lambda frame: frame["timestamp"].apply(convert_to_datetime).apply(to_localize),
        night=lambda frame: frame["night"].astype(np.uint16),
        step=lambda frame: frame["step"].astype(np.uint32),
        # Event names become small integer codes.
        event=lambda frame: frame["event"].map({"onset":1,"wakeup":2}).astype(np.uint8),
    )
)
events.to_csv("events_compressed.csv", index = False)

<font size ="10"> Data Compression (Modified Data) </font>

In [None]:
# Group events by 'series_id'
# NOTE(review): pandas and numpy are already imported in the notebook's first cell, and
# `from tqdm import tqdm` rebinds the `tqdm` name that the first cell imported from
# `tqdm.notebook` — confirm the plain-text progress bar is intended here.
import pandas as pd
import numpy as np
from tqdm import tqdm

# NOTE(review): "events_modified.csv" and "train_compressed.parquet" are not written by any
# cell visible above (those write "events_compressed.csv" and "train_prep.parquet") — confirm
# where these inputs are produced.
events = pd.read_csv("events_modified.csv")
grouped = events.groupby('series_id')
train = pd.read_parquet("train_compressed.parquet")