In [1]:
#############################################################
# 1. Libraries

import pandas as pd
import numpy as np 
import os
import glob
from tqdm import tqdm

import matplotlib.pyplot as plt
import gc

#############################################################

In [2]:
#############################################################
# 2. Paths & Global Variables

## 2.1 Paths

# Root data folder (relative path); every other location below is built from it.
path = '../01_Data/'

# Metadata tables: the training table and the sample submission.
df_train = pd.read_csv(path + 'train.csv')
df_sample_submission = pd.read_csv(path + 'sample_submission.csv')

# Everything found directly under the train/ and test/ folders.
train_paths = glob.glob(path + 'train/*')
test_paths = glob.glob(path + 'test/*')

# De-duplicated segment ids per split (train ids from train.csv,
# test ids from the sample submission).
unique_segments_id_train = set(df_train['segment_id'])
unique_segments_id_test = set(df_sample_submission['segment_id'])

# segment id -> position, enumerated over the TRAIN ids.
dict_unique_segments_id = dict(
    (segment_id, position)
    for position, segment_id in enumerate(unique_segments_id_train)
)
# position -> segment id, enumerated over the TEST ids.
# NOTE(review): despite the `_inv` suffix this is not the inverse of
# `dict_unique_segments_id` (that one is built from the train ids) --
# confirm this is intended before relying on it.
dict_unique_segments_id_inv = dict(enumerate(unique_segments_id_test))

## 2.2 Global Variables

# Maximum number of trailing rows kept per segment CSV by buildSequences.
SEQ_LENGTH = 60_001

#############################################################

In [3]:
#############################################################
# 3. Global Functions

def buildSequences(df, dict_segment_paths, path_output, mask_value=-1.0):
    """Convert every segment CSV into a ``.npy`` file.

    For each ``segment -> csv_path`` entry the CSV is read, missing values
    are replaced by ``mask_value``, at most the last ``SEQ_LENGTH`` rows are
    kept and the resulting array is written to
    ``path_output + str(segment) + '.npy'``. Files with fewer than
    ``SEQ_LENGTH`` rows are saved with their actual length (no padding).

    Parameters
    ----------
    df : any
        Unused; kept so existing calls keep working.
    dict_segment_paths : dict
        Maps a segment id to the path of its CSV file.
    path_output : str
        Prefix prepended to ``<segment>.npy``. Its directory part is created
        if it does not exist yet.
    mask_value : float, default -1.0
        Value substituted for missing entries in the CSV.

    Returns
    -------
    numpy.ndarray
        A zero-filled array of shape ``(1, SEQ_LENGTH, 10)``. It is never
        populated with data; it is returned only to stay compatible with
        callers that assign the result. The real output is the files on disk.
    """
    # Allocate the placeholder once, up front: the original rebuilt it on
    # every iteration and raised UnboundLocalError for an empty mapping.
    x = np.zeros((1, SEQ_LENGTH, 10))

    # np.save does not create missing folders, so make sure the target exists.
    output_dir = os.path.dirname(path_output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for segment, segment_path in tqdm(dict_segment_paths.items(),
                                      total=len(dict_segment_paths),
                                      position=0):
        # Negative slice keeps the trailing SEQ_LENGTH rows of the file.
        values = pd.read_csv(segment_path).fillna(mask_value).values[-SEQ_LENGTH:]
        np.save(path_output + str(segment) + '.npy', values)

    return x

def scale(x, mean_, std_):
    """Standardize ``x``: subtract ``mean_``, then divide by ``std_``."""
    centered = x - mean_
    return centered / std_


def unscale(x, mean_, std_):
    """Undo ``scale``: multiply ``x`` by ``std_``, then add ``mean_``."""
    stretched = x * std_
    return stretched + mean_

#############################################################

In [4]:
#############################################################
# 4. Preprocess

# CSV location of every training segment, keyed by segment id.
dict_segment_paths_train = {
    segment_id: f'{path}train/{segment_id}.csv'
    for segment_id in unique_segments_id_train
}

# Same layout for the test segments (ids taken from the sample submission).
dict_segment_paths_test = {
    segment_id: f'{path}test/{segment_id}.csv'
    for segment_id in unique_segments_id_test
}

#############################################################

In [6]:
#############################################################
# 5. Build Sequences

# NOTE(review): `save` is assigned here but nothing in the visible cells
# reads it -- confirm whether it is still needed.
save = True

# Value written in place of missing CSV entries (passed to buildSequences).
mask_value = 0.0

# Each call writes one .npy file per segment under the given output folder.
# As written, buildSequences returns an all-zero placeholder array, so
# X_train / X_test do not hold the sequences themselves.
X_train = buildSequences(
    df=df_train,
    dict_segment_paths=dict_segment_paths_train,
    path_output=path + '01_GeneratedSequences/train/',
    mask_value=mask_value,
)

X_test = buildSequences(
    df=None,
    dict_segment_paths=dict_segment_paths_test,
    path_output=path + '01_GeneratedSequences/test/',
    mask_value=mask_value,
)
    
#############################################################

100%|██████████████████████████████████████████████████████████████████████████████| 4431/4431 [04:52<00:00, 15.14it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 4520/4520 [04:41<00:00, 16.06it/s]
