In [53]:
import numpy as np
import pandas as pd

from datetime import date
from sklearn.preprocessing import FunctionTransformer
# from sklego.preprocessing import RepeatingBasisFunction

import seaborn as sns
import matplotlib.pyplot as plt

# Read in data 

In [54]:
dfcpu = pd.read_csv('../data/raw/cpu4.csv')
dfcpu = dfcpu.sort_values(by='timestamp', ascending=True)

## Feature Engineering

    1. Extract date parts from datetime
    2. Create lags for 5 timestamps
    3. Check for missing data
    4. Encode hour Information into feature for ML Models
       a. approach #1 - dummy variables
       b. approach #2 - cyclical encoding with sine/cosine transformation 

In [55]:
dfcpu['datetime'] = pd.to_datetime(dfcpu['timestamp'],unit='s')
dfcpu['date'] = dfcpu['datetime'].dt.date
dfcpu['time'] = dfcpu['datetime'].dt.time
dfcpu['month'] = dfcpu['datetime'].dt.month_name()
dfcpu['day'] = dfcpu['datetime'].dt.day_name()
dfcpu['hour'] = dfcpu['datetime'].dt.hour
dfcpu['minute'] = dfcpu['datetime'].dt.minute
dfcpu['value(t-1)'] =  dfcpu['value'].shift(1)
dfcpu['value(t-2)'] =  dfcpu['value'].shift(2)
dfcpu['value(t-3)'] =  dfcpu['value'].shift(3)
dfcpu['value(t-4)'] =  dfcpu['value'].shift(4)
dfcpu['value(t-5)'] =  dfcpu['value'].shift(5)
dfcpu.set_index('datetime', inplace = True)
# dfcpu = dfcpu.replace({'label': {0.0: False, 1.0: True}})

In [56]:
# Check for missing dates

mindt = dfcpu.index.min()
maxdt = dfcpu.index.max()
# print(mindt, maxdt)
date_range = pd.DataFrame(pd.date_range(mindt, maxdt, freq='5Min')).set_index(0)
# date_range
missing_dates  = date_range[~date_range.index.isin(dfcpu.index)]
missing_dates

##### No Missing Data

In [57]:
## dummy Encoding
dfdummyhour = pd.DataFrame(data=pd.get_dummies(dfcpu.hour, drop_first=True, prefix="hour"))
dfdummyhour

Unnamed: 0_level_0,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,hour_7,hour_8,hour_9,hour_10,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-07-24 16:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2016-07-24 16:05:00,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2016-07-24 16:10:00,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2016-07-24 16:15:00,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2016-07-24 16:20:00,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-09-23 15:35:00,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2016-09-23 15:40:00,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2016-09-23 15:45:00,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2016-09-23 15:50:00,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [58]:
def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))

def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))

In [59]:
## cyclical encoding with sine/cosine transformation
dfcyclical = dfcpu[['hour']].copy()
dfcyclical["hour_sin"] = sin_transformer(24).fit_transform(dfcyclical)["hour"]
dfcyclical["hour_cos"] = cos_transformer(24).fit_transform(dfcyclical)["hour"]
dfcyclical

Unnamed: 0_level_0,hour,hour_sin,hour_cos
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-07-24 16:00:00,16,-0.866025,-0.500000
2016-07-24 16:05:00,16,-0.866025,-0.500000
2016-07-24 16:10:00,16,-0.866025,-0.500000
2016-07-24 16:15:00,16,-0.866025,-0.500000
2016-07-24 16:20:00,16,-0.866025,-0.500000
...,...,...,...
2016-09-23 15:35:00,15,-0.707107,-0.707107
2016-09-23 15:40:00,15,-0.707107,-0.707107
2016-09-23 15:45:00,15,-0.707107,-0.707107
2016-09-23 15:50:00,15,-0.707107,-0.707107


In [60]:
# Merge all the features into one dataframe
dfcpu = pd.concat([dfcpu,dfdummyhour, dfcyclical], axis = 1)
dfcpu.head()

Unnamed: 0_level_0,timestamp,value,label,date,time,month,day,hour,minute,value(t-1),...,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,hour,hour_sin,hour_cos
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-07-24 16:00:00,1469376000,0.8473,0,2016-07-24,16:00:00,July,Sunday,16,0,,...,0,0,0,0,0,0,0,16,-0.866025,-0.5
2016-07-24 16:05:00,1469376300,-0.036137,0,2016-07-24,16:05:00,July,Sunday,16,5,0.8473,...,0,0,0,0,0,0,0,16,-0.866025,-0.5
2016-07-24 16:10:00,1469376600,0.074292,0,2016-07-24,16:10:00,July,Sunday,16,10,-0.036137,...,0,0,0,0,0,0,0,16,-0.866025,-0.5
2016-07-24 16:15:00,1469376900,0.074292,0,2016-07-24,16:15:00,July,Sunday,16,15,0.074292,...,0,0,0,0,0,0,0,16,-0.866025,-0.5
2016-07-24 16:20:00,1469377200,-0.036137,0,2016-07-24,16:20:00,July,Sunday,16,20,0.074292,...,0,0,0,0,0,0,0,16,-0.866025,-0.5


In [62]:
# Export the dataframe into new csv.
dfcpu.to_csv('../data/processed/cpu4.csv', index= True)