In [4]:
import pandas as pd
import numpy as np
import datetime as dt

In [15]:
from sklearn.preprocessing import StandardScaler 

In [5]:
from disc.dataset_conditioner import _timestamp_transformer

In [6]:
# Load in example train set dataframe
df = pd.read_csv(filepath_or_buffer='demo-dataset/dataset.csv', index_col=0, parse_dates=True)

In [51]:
def weekend_flagger(timestamp):
    
    '''Given a timestamp, returns 1 if it is a Saturday or Sunday, 0 otherwise'''
    
    return timestamp.weekday() == 6 or timestamp.weekday() == 7


def _timestamp_transformer(timestamps, time_of_day_in='seconds', year_normalised=True):
    
    '''
    Function which takes in a Pandas Series of timestamps and returns useful features derived from the timestamp:
    - Time of day 
    - Day of week
    - Month of year
    - Year

    All but the year column are cyclical, so are further decomposed into sin and cos transforms of the original, \
    so that e.g. 11.59pm is considered close to 00.00am, and Sunday and Monday, and December and January \
    are considered close together

    Parameters
    ----------
    timestamps : pandas.Series
        Series of timestamp data
    time_of_day_in : str
        One of ['seconds', 'hours'] - specifies whether time of day is computed in hours (24) or seconds (24*60*60)
    year_normalised : bool
        Specifies whether the non-cyclical `year` column should be demeaned and rescaled or not

    Returns
    -------
    timestamps_transformed : pandas.DataFrame
        df of the above encodings of the timestamps Series passed in
        
    '''
    
    # Ensure input timestamps are in timestamp/datetime format and not string format
    timestamps = pd.to_datetime(timestamps)
    
    # Split timestamps in timestamps series into a pandas DataFrame of component timestamp parts
    timestamps_transformed = timestamps.apply(
        lambda x: {
            'day_of_week': x.weekday(),
            'day_of_month': x.day, 
            'month_of_year': x.month, 
            'year': x.year, 
            'hour_of_day': x.hour, 
            'minute_of_hour': x.minute, 
            'second_of_minute': x.second,
            'is_weekend': weekend_flagger(x)}
    )

    timestamps_transformed = pd.DataFrame(list(timestamps_transformed))

    # Get second of day 
    timestamps_transformed['second_of_day'] = timestamps_transformed['hour_of_day']*60*60 + timestamps_transformed['minute_of_hour']*60 + timestamps_transformed['second_of_minute']

    # Define constants
    seconds_in_day = 24*60*60
    weekdays_in_week = 7
    months_in_year = 12
    hours_in_day = 24

    # Circular transform of second of day
    timestamps_transformed['sin_second_of_day'] = timestamps_transformed['second_of_day'].apply(lambda x: np.sin(2*np.pi*x / seconds_in_day))
    timestamps_transformed['cos_second_of_day'] = timestamps_transformed['second_of_day'].apply(lambda x: np.cos(2*np.pi*x / seconds_in_day))

    # Circular transform of hour of day
    timestamps_transformed['sin_hour_of_day'] = timestamps_transformed['hour_of_day'].apply(lambda x: np.sin(2*np.pi*x / hours_in_day))
    timestamps_transformed['cos_hour_of_day'] =  timestamps_transformed['hour_of_day'].apply(lambda x: np.cos(2*np.pi*x / hours_in_day))

    # Circular transform of day of week
    timestamps_transformed['sin_day_of_week'] = timestamps_transformed['day_of_week'].apply(lambda x: np.sin(2*np.pi*x / weekdays_in_week))
    timestamps_transformed['cos_day_of_week'] = timestamps_transformed['day_of_week'].apply(lambda x: np.cos(2*np.pi*x / weekdays_in_week))

    # Circular transform of month of year
    timestamps_transformed['sin_month_of_year'] = timestamps_transformed['month_of_year'].apply(lambda x: np.sin(2*np.pi*x / months_in_year))
    timestamps_transformed['cos_month_of_year'] = timestamps_transformed['month_of_year'].apply(lambda x: np.cos(2*np.pi*x / months_in_year))

    # Determine list of output columns based on `time_of_day_in` parameter
    if time_of_day_in == 'seconds':
        output_cols = [
            'sin_second_of_day',
            'cos_second_of_day',
            'sin_day_of_week',
            'cos_day_of_week',
            'sin_month_of_year',
            'cos_month_of_year',
            'year',
            'is_weekend',
        ]
    elif time_of_day_in == 'hours':
            output_cols = [
            'sin_hour_of_day',
            'cos_hour_of_day',
            'sin_day_of_week',
            'cos_day_of_week',
            'sin_month_of_year',
            'cos_month_of_year',
            'year',
            'is_weekend',
        ]
    else:
        raise ValueError('`time_of_day_in` should be one of [\'seconds\', \'hours\']')

    # If specified that the `year` column should be normalised (default), use SkLearn 
    # demean and rescaling on this column
    if year_normalised == True:

        # Instantiate sklearn standard scaler (demeans and rescales)
        scaler = StandardScaler()

        # Fit sklearn standard scaler
        rescaled_years = scaler.fit_transform(timestamps_transformed['year'].values.reshape(-1, 1))

        # Update `year` column with rescaled version
        timestamps_transformed['year'] = rescaled_years

    elif year_normalised != False:
        raise ValueError('`year_normalised` should be boolean - `True` for normalising the `year` column,         and `False` otherwise')

    # Keep only desired output columns
    timestamps_transformed = timestamps_transformed[output_cols]
    
    # Append original Series name to all column names of dataframe 
    # (so that, in the case of multiple timestamps Series being transformed, they can be distinguished and 
    # concatenated)
    
    # Get name of timestamps Series
    name = str(timestamps.name)
    
    # Append original Series name to all column strings
    renamed_cols = []
    columns = timestamps_transformed.columns
    for column in columns:
        renamed_cols.append(name + '_' + str(column))   
    
    # Get the dictionary for renaming the columns
    mapper = dict(zip(columns, renamed_cols))
    
    # Rename the output DataFrame columns
    timestamps_transformed = timestamps_transformed.rename(mapper=mapper, axis='columns')

    return timestamps_transformed


In [52]:
_timestamp_transformer(df.datetimes_1)



Unnamed: 0,datetimes_1_sin_second_of_day,datetimes_1_cos_second_of_day,datetimes_1_sin_day_of_week,datetimes_1_cos_day_of_week,datetimes_1_sin_month_of_year,datetimes_1_cos_month_of_year,datetimes_1_year,datetimes_1_is_weekend
0,-0.483028,-0.875605,0.974928,-0.222521,-1.0,-1.83697e-16,-0.816497,False
1,-0.969302,-0.245871,0.0,1.0,-0.5,-0.8660254,-0.816497,False
2,-0.043837,0.999039,0.781831,0.62349,1.224647e-16,-1.0,-0.816497,False
3,0.309017,-0.951057,-0.433884,-0.900969,-2.449294e-16,1.0,-0.816497,False
4,0.955472,-0.295083,0.781831,0.62349,-0.8660254,0.5,-0.816497,False
5,0.231677,0.972793,-0.974928,-0.222521,1.0,6.123234000000001e-17,1.224745,False
6,-0.357214,-0.934023,-0.433884,-0.900969,0.5,-0.8660254,-0.816497,False
7,-0.599548,-0.800339,0.0,1.0,0.8660254,0.5,1.224745,False
8,0.905445,-0.424463,-0.781831,0.62349,1.0,6.123234000000001e-17,1.224745,True
9,0.868451,0.495774,0.433884,-0.900969,0.5,0.8660254,1.224745,False


In [46]:
temp = tk.iloc[8]

In [47]:
temp.weekday()

6

In [48]:
temp.weekday() == 6 or temp.weekday() == 7

True

In [32]:
temp

Timestamp('2019-07-08 17:03:04.331191')

0 = Monday
2 = Wednesday

In [None]:
def weekend_flagger(timestamp):
    
    '''Given a timestamp, returns 1 if it is a Saturday or Sunday, 0 otherwise'''
    
    return timestamp.weekday() == 6 or timestamp.weekday() == 7