# Preparation

In [3]:
import pandas as pd
from datetime import timedelta
import os

import sys
sys.path.append( '../../src/' )

from ml.preprocessing import *

In [4]:
# Directory locations are kept as lists of components and expanded into
# os.path.join at the point of use.
path = [ '..', '..', 'datasets' ]
out_path = [ *path, 'resolutions' ]

# Read ds-gym-1sec.csv with `date` parsed as datetimes, then discard the `alt` column.
csv_file = os.path.join( *path, 'ds-gym-1sec.csv' )
df = pd.read_csv( csv_file, parse_dates = [ 'date' ] )
df = df.drop( columns = [ 'alt' ] )

## Removing null values

In [5]:
# Show the row count, then the number of missing values in each column.
row_count = df.shape[ 0 ]
print( 'Legnth: ', row_count )
print( 'Null values:' )
df.isna().sum()

Legnth:  10129
Null values:


date    0
pre     0
hum     4
tem     0
occ     0
dtype: int64

In [6]:
# Keep only rows where both `occ` and `hum` are present.
df = df.dropna( subset = [ 'occ', 'hum' ], how = 'any' )

In [7]:
# Same report as before the clean-up: row count, then missing values per column.
row_count = df.shape[ 0 ]
print( 'Legnth: ', row_count )
print( 'Null values:' )
df.isna().sum()

Legnth:  10125
Null values:


date    0
pre     0
hum     0
tem     0
occ     0
dtype: int64

In [8]:
# Move every row whose date lies between '2019-01-01' and the '2019-02' bound
# (both ends inclusive) forward by 264 days.
# NOTE(review): the reason for the 264-day offset is not stated in this notebook — confirm.

in_window = ( df.date >= '2019-01-01' ) & ( df.date <= '2019-02' )
df.loc[ in_window, 'date' ] += timedelta( days = 264 )

## Generating other resolutions

In [9]:
from sklearn.model_selection import train_test_split 
import os

# Same directory components as used when loading the dataset; the output
# directory is the `resolutions` sub-folder of the dataset directory.
path = [ '..', '..', 'datasets' ]
out_path = [ *path, 'resolutions' ]

def split_and_save( df, out_path, resolution ):
    """
    Randomly split `df` into train and test partitions and write each to CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns plus an `occ` column, which is used
        as the target of the split.
    out_path : sequence of str
        Components of the output directory; they are joined with `os.path.join`.
        The directory is created if it does not exist yet.
    resolution : str
        Label inserted into the file names: `ds-<resolution>-train.csv` and
        `ds-<resolution>-test.csv`.

    Notes
    -----
    The split is 80/20 with `random_state = 0`, so repeated runs on the same
    input give the same partition. Both files are written with the frame's
    index included (default `to_csv` behaviour), and `occ` is the last column.
    """
    # Select the features by name rather than by position, so the target is
    # separated correctly even when `occ` is not the last column of `df`.
    features = df.drop( columns = [ 'occ' ] )
    x_train, x_test, y_train, y_test = train_test_split( features, df.occ, test_size = 0.20, random_state = 0 )

    # `assign` builds new frames (aligned on the index) instead of adding a
    # column to the split results in place.
    train = x_train.assign( occ = y_train )
    test = x_test.assign( occ = y_test )

    # Make sure the target directory exists before writing.
    out_dir = os.path.join( *out_path )
    os.makedirs( out_dir, exist_ok = True )

    train.to_csv( os.path.join( out_dir, 'ds-%s-train.csv' % ( resolution ) ) )
    test.to_csv( os.path.join( out_dir, 'ds-%s-test.csv' % ( resolution ) ) )
    
# Add a per-second offset to the timestamps and use them as the index.
# Rows are numbered 0, 1, 2, ... within each one-minute bucket of `date`, and
# that running number is added to `date` as seconds.
# NOTE(review): presumably the raw timestamps only carry minute resolution — confirm.

second_offsets = df.groupby( pd.Grouper( key = 'date', freq = 'min' ) ).cumcount()
stamped = df.assign( date = df.date + pd.to_timedelta( second_offsets, unit = 's' ) )
df = stamped.set_index( 'date' )

### 10 seconds (sampled and averaged)

In [10]:
# 10-second resolution. `resample_df` and `resample_df_avg` come from
# ml.preprocessing (not shown here); presumably one samples and the other
# averages each window — confirm against that module.
sampled_10s = resample_df( df, '10S' )
split_and_save( sampled_10s, out_path, 'gym-10sec' )

# The averaged variant is rounded to 2 decimals before saving.
averaged_10s = resample_df_avg( df, '10S' ).round( 2 )
split_and_save( averaged_10s, out_path, 'gym-10sec-avg' )

### 30 seconds (sampled and averaged)

In [11]:
# 30-second resolution. `resample_df` and `resample_df_avg` come from
# ml.preprocessing (not shown here); presumably one samples and the other
# averages each window — confirm against that module.
sampled_30s = resample_df( df, '30S' )
split_and_save( sampled_30s, out_path, 'gym-30sec' )

# The averaged variant is rounded to 2 decimals before saving.
averaged_30s = resample_df_avg( df, '30S' ).round( 2 )
split_and_save( averaged_30s, out_path, 'gym-30sec-avg' )

### 1 minute (sampled and averaged)

In [12]:
# 1-minute resolution. `resample_df` and `resample_df_avg` come from
# ml.preprocessing (not shown here); presumably one samples and the other
# averages each window — confirm against that module.
sampled_1min = resample_df( df, '1min' )
split_and_save( sampled_1min, out_path, 'gym-1min' )

# The averaged variant is rounded to 2 decimals before saving.
averaged_1min = resample_df_avg( df, '1min' ).round( 2 )
split_and_save( averaged_1min, out_path, 'gym-1min-avg' )

### 5 minutes (sampled and averaged)

In [13]:
# 5-minute resolution. `resample_df` and `resample_df_avg` come from
# ml.preprocessing (not shown here); presumably one samples and the other
# averages each window — confirm against that module.
sampled_5min = resample_df( df, '5min' )
split_and_save( sampled_5min, out_path, 'gym-5min' )

# The averaged variant is rounded to 2 decimals before saving.
averaged_5min = resample_df_avg( df, '5min' ).round( 2 )
split_and_save( averaged_5min, out_path, 'gym-5min-avg' )