# DateTime Encoding with SciKitLearn

The following code explores several different approaches to encoding time-series data

In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# 1. One-Hot Encoding of DateTime Strings

Treats the datetime values as nominal categorical data creating a new feature column for each distinct datetime value.

Pros:
- Fascilitates identification of patterns where the datetime values precisely match eachother.

Cons:
- Does not fascilitate identification of patterns related to the ordinal nature of datetime values.
- If there are a a very large number of unique datetime values in the dataset then this approach will result in a large number of feature columns.

In [51]:
# Create a sample pandas DataFrame
data = {
    'DateTime': ['2023-01-29', '2023-01-29', '2021-11-29'],
    'Person_ID': [0, 1, 2]
}

df = pd.DataFrame(data)
df

Unnamed: 0,DateTime,Person_ID
0,2023-01-29,0
1,2023-01-29,1
2,2021-11-29,2


In [52]:
# Use Pandas get dummies to perform category encoding
df = pd.get_dummies(df)
df


Unnamed: 0,Person_ID,DateTime_2021-11-29,DateTime_2023-01-29
0,0,0,1
1,1,0,1
2,2,1,0


# 2. Ordinal Encoding of DateTime Values

Treats the datetime values as ordinal categorical data creating a new feature column for each distinct datetime value.

Pros:
- Fascilitates identification of patterns where the datetime values precisely match eachother.
- Fascilitates identification of patterns related to the ordinal nature of datetime values.

Cons:
- Does not fascilitate identification of patterns related to individual datetime components.

In [54]:
# Create a sample pandas DataFrame
data = {
    'DateTime': ['2023-01-29 10:05:53', '2022-04-29 15:30:17', '2021-11-29 20:45:34'],
    'Person_ID': [0, 1, 2]
}

df = pd.DataFrame(data)
df

Unnamed: 0,DateTime,Person_ID
0,2023-01-29 10:05:53,0
1,2022-04-29 15:30:17,1
2,2021-11-29 20:45:34,2


In [55]:
# Transform datetime strings into pandas datetime objects
df['DateTime'] = pd.to_datetime(df['DateTime'])
df

Unnamed: 0,DateTime,Person_ID
0,2023-01-29 10:05:53,0
1,2022-04-29 15:30:17,1
2,2021-11-29 20:45:34,2


In [56]:
df['DateTime_Ord'] = df['DateTime'].apply(lambda x: x.toordinal())
df

Unnamed: 0,DateTime,Person_ID,DateTime_Ord
0,2023-01-29 10:05:53,0,738549
1,2022-04-29 15:30:17,1,738274
2,2021-11-29 20:45:34,2,738123


In [57]:
# Delete original DateTime column
df = df.drop(columns=['DateTime'])
df

Unnamed: 0,Person_ID,DateTime_Ord
0,0,738549
1,1,738274
2,2,738123


# 3. Ordinal Encoding via Extraction of DateTime components

Creates feature columns based on the components of datetime values (Year, Month, Day, Hour, Minute, Second).

Pros:
- Fascilitates identification of patterns where the datetime values precisely match eachother.
- Fascilitates identification of patterns related to the ordinal nature of datetime values.
- Fascilitate identification of patterns related to individual datetime components.

Cons:
- Does not fascilitate identification of patterns related to the cyclical nature of date time values (eg. Jan 1st 2023 is 'closer' to Dec 31st 2022 than it is to Jan 3rd 2023.)

In [59]:
# Create a sample pandas DataFrame
data = {
    'DateTime': ['2023-01-29 10:05:53', '2022-04-29 15:30:17', '2021-11-29 20:45:34'],
    'Person_ID': [0, 1, 2]
}

df = pd.DataFrame(data)
df

Unnamed: 0,DateTime,Person_ID
0,2023-01-29 10:05:53,0
1,2022-04-29 15:30:17,1
2,2021-11-29 20:45:34,2


In [60]:
# Transform datetime strings into pandas datetime objects
df['DateTime'] = pd.to_datetime(df['DateTime'])
df

Unnamed: 0,DateTime,Person_ID
0,2023-01-29 10:05:53,0
1,2022-04-29 15:30:17,1
2,2021-11-29 20:45:34,2


In [61]:
# Explode datetime objects into features of interest (Year, Month, Day, Hour, Minute, Second)
df['DateTime_Year'] = df['DateTime'].dt.year
df['DateTime_Month'] = df['DateTime'].dt.month
df['DateTime_Day'] = df['DateTime'].dt.day
df['DateTime_Hour'] = df['DateTime'].dt.hour
df['DateTime_Minute'] = df['DateTime'].dt.minute
df['DateTime_Second'] = df['DateTime'].dt.second
df

Unnamed: 0,DateTime,Person_ID,DateTime_Year,DateTime_Month,DateTime_Day,DateTime_Hour,DateTime_Minute,DateTime_Second
0,2023-01-29 10:05:53,0,2023,1,29,10,5,53
1,2022-04-29 15:30:17,1,2022,4,29,15,30,17
2,2021-11-29 20:45:34,2,2021,11,29,20,45,34


In [62]:
# Delete original DateTime column
df = df.drop(columns=['DateTime'])
df

Unnamed: 0,Person_ID,DateTime_Year,DateTime_Month,DateTime_Day,DateTime_Hour,DateTime_Minute,DateTime_Second
0,0,2023,1,29,10,5,53
1,1,2022,4,29,15,30,17
2,2,2021,11,29,20,45,34


## 4. Cyclical Encoding of DateTime Components

Improves on the previous approach by utilising continuous mathematical functions to fascilitate the identification of patterns related to the cyclical nature of datetime components.

Each feature column (Year, Month, Day, Hour, Minute, Second) will be used to create an additional two columns recording it's corresponding cosine and sine values.

The reason it is necessary to represent feature columns using two seperate continuous mathematical functions (cosine and sine) is to mitigate issues related to 'similarity collisions' wherein two different values acted apon by a mathematical function produce the same output.

In [64]:
def cyclic_encode(value, max_value):
    angle = 2 * np.pi * value / max_value
    cos_value = np.cos(angle)
    sin_value = np.sin(angle)
    return cos_value, sin_value

# Encode years
max_year = df['DateTime_Year'].max()
df['DateTime_Year_Cos'], df['DateTime_Year_Sin'] = zip(*df['DateTime_Year'].apply(cyclic_encode, args=(max_year,)))

# Encode months
max_month = 12
df['DateTime_Month_Cos'], df['DateTime_Month_Sin'] = zip(*df['DateTime_Month'].apply(cyclic_encode, args=(max_month,)))

# Encode days
max_day = 31
df['DateTime_Day_Cos'], df['DateTime_Day_Sin'] = zip(*df['DateTime_Day'].apply(cyclic_encode, args=(max_day,)))

# Encode hours
max_hour = 23
df['DateTime_Hour_Cos'], df['DateTime_Hour_Sin'] = zip(*df['DateTime_Hour'].apply(cyclic_encode, args=(max_hour,)))

# Encode minutes
max_minute = 59
df['DateTime_Minute_Cos'], df['DateTime_Minute_Sin'] = zip(*df['DateTime_Minute'].apply(cyclic_encode, args=(max_hour,)))

# Encode Seconds
max_second = 59
df['DateTime_Second_Cos'], df['DateTime_Second_Sin'] = zip(*df['DateTime_Second'].apply(cyclic_encode, args=(max_second,)))

df

Unnamed: 0,Person_ID,DateTime_Year,DateTime_Month,DateTime_Day,DateTime_Hour,DateTime_Minute,DateTime_Second,DateTime_Year_Cos,DateTime_Year_Sin,DateTime_Month_Cos,DateTime_Month_Sin,DateTime_Day_Cos,DateTime_Day_Sin,DateTime_Hour_Cos,DateTime_Hour_Sin,DateTime_Minute_Cos,DateTime_Minute_Sin,DateTime_Second_Cos,DateTime_Second_Sin
0,0,2023,1,29,10,5,53,1.0,-2.449294e-16,0.866025,0.5,0.918958,-0.394356,-0.917211,0.398401,0.203456,0.979084,0.802712,-0.596367
1,1,2022,4,29,15,30,17,0.999995,-0.00310587,-0.5,0.866025,0.918958,-0.394356,-0.57668,-0.81697,-0.33488,0.942261,-0.237327,0.97143
2,2,2021,11,29,20,45,34,0.999981,-0.00621171,0.866025,-0.5,0.918958,-0.394356,0.682553,-0.730836,0.962917,-0.269797,-0.887352,-0.461093
