### Import libraries

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
# Demean and rescale numerical columns
from sklearn.preprocessing import StandardScaler

In [7]:
# Package which performs all required encoding of dataset variables for data science projects
#     - Fills in missing values
#     - Encodes numeric variables (demeans and scales to unit variance, unless specified)
#     - Encodes nominal variables (one-hot encodes)
#     - Encodes timestamp variables (generates a set of cyclical features)
#     - Is robust to intended boolean features being read in as strings or ints

#     - Stores important details of train set encodings (means, variances, categories) for use in transforming
#       test set
from hermes_ml.dataset_conditioner import FullEncoder

### Load in example dataset

In [15]:
# Read the example train set from CSV, using the first column as the index.
# NOTE: parse_dates=True only asks pandas to try parsing the *index* as dates;
# the datetime feature columns are converted separately further down.
df = pd.read_csv('demo-dataset/dataset.csv', index_col=0, parse_dates=True)

In [16]:
# Read the example test set from CSV, using the first column as the index.
# NOTE: parse_dates=True only asks pandas to try parsing the *index* as dates;
# the datetime feature columns are converted separately further down.
df_test = pd.read_csv('demo-dataset/dataset_test.csv', index_col=0, parse_dates=True)

#### Temporary - convert intended datetime columns (currently strings) to datetime 

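`pandas.read_csv` is reading the timestamp features in as strings, because `parse_dates=True` only attempts to parse the index rather than the feature columns (this doesn't seem to be a problem when loading via SQLAlchemy/Redshift)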

In the future, this should be rolled into the `timestamp` encoder to make it more robust.

In [17]:
# Columns that hold timestamps but are currently stored as strings
datetime_cols = ['datetimes_1', 'datetimes_2']

# Apply the same string -> datetime conversion to the train and test frames
for frame in (df, df_test):
    for datetime_col in datetime_cols:
        frame[datetime_col] = pd.to_datetime(frame[datetime_col])

### Inspect dataset

In [18]:
# Preview the first five rows of the train set
df.head(n=5)

Unnamed: 0,datetimes_1,datetimes_2,numeric_1,numeric_2,numeric_3,boolean_like_1,boolean_like_2,boolean_like_3,boolean_like_4,boolean,nominal,ordinal_1
0,2019-09-25 13:55:32.331191,2002-11-27 04:14:51,46.552984,13.739006,,True,True,False,False,True,Dog,agree
1,2019-07-08 17:03:04.331191,2012-12-20 03:31:36,25.588787,,,False,False,True,False,False,Dog,strongly agree
2,2019-06-18 23:49:57.331191,2008-12-10 18:12:54,25.487432,6.182592,,False,True,False,False,True,Dog,disagree
3,2019-12-13 10:48:00.331191,2000-03-21 22:02:23,4.535463,15.181189,9.776362,False,False,False,True,False,Dog,strongly disagree
4,2019-10-15 07:08:39.331191,2010-12-15 22:52:15,24.660904,1.107595,,True,False,False,False,False,Lizard,neither agree nor disagree


In [19]:
# Preview the first five rows of the test set
df_test.head(n=5)

Unnamed: 0,datetimes_1,datetimes_2,numeric_1,numeric_2,numeric_3,boolean_like_1,boolean_like_2,boolean_like_3,boolean_like_4,boolean,nominal,ordinal_1
0,2022-12-05 00:00:19.201864,2060-06-16 11:48:44,1085.363773,,,False,True,False,False,False,Mouse,neither agree nor disagree
1,2019-11-07 02:32:40.201864,2014-07-25 09:19:08,924.379152,42.91531,,False,False,True,True,True,Mouse,strongly disagree
2,2019-07-31 12:27:05.201864,2055-09-05 08:03:09,566.488961,34.995786,,True,True,False,True,True,Dog,disagree
3,2023-07-29 14:37:43.201864,2090-02-05 15:55:21,822.036241,,18.199072,True,False,True,True,False,Cat,strongly agree
4,2020-08-09 03:36:18.201864,2087-03-25 23:16:00,679.786375,,,False,False,True,True,True,Iguana,strongly disagree


### Specify input lookup table

The hermes-ml `FullEncoder` takes a lookup table specifying {`feature`, `dtype`, `fillna` (the missing value fill method)} for each feature

In [20]:
# Lookup table handed to the encoder: one row per feature, giving the feature
# name, the dtype label to encode it as, and the missing-value fill method.
useful_cols = pd.DataFrame(
    data=[
        # timestamp features
        ('datetimes_1', 'timestamp', 'skip'),
        ('datetimes_2', 'timestamp', 'skip'),
        # numeric features
        ('numeric_1', 'numeric', 'mean'),
        ('numeric_2', 'numeric', 'mean'),
        ('numeric_3', 'numeric', 'zeros'),
        # boolean features
        ('boolean_like_1', 'bool', 'skip'),
        ('boolean_like_2', 'bool', 'skip'),
        ('boolean_like_3', 'bool', 'skip'),
        ('boolean_like_4', 'bool', 'skip'),
        ('boolean', 'bool', 'skip'),
        # categorical features
        ('nominal', 'nominal', 'skip'),
        ('ordinal_1', 'ordinal', 'skip'),
    ],
    columns=['feature', 'dtype', 'fillna'],
)

#### Visualise the resulting lookup table

In [21]:
# Display the lookup table built in the previous cell
useful_cols

Unnamed: 0,feature,dtype,fillna
0,datetimes_1,timestamp,skip
1,datetimes_2,timestamp,skip
2,numeric_1,numeric,mean
3,numeric_2,numeric,mean
4,numeric_3,numeric,zeros
5,boolean_like_1,bool,skip
6,boolean_like_2,bool,skip
7,boolean_like_3,bool,skip
8,boolean_like_4,bool,skip
9,boolean,bool,skip


## Encoder - train set

Run the `FullEncoder.fit_transform` method on the train set `df` to encode features and store means, variances, categorical columns etc. for future use on the test set

In [23]:
# Instantiate the encoder object (no constructor arguments are passed, so
# whatever defaults FullEncoder defines are used)
enc = FullEncoder()

In [24]:
# Fit the encoder on the training set and transform it in one call.
# Inputs: the raw train dataframe and the feature lookup table.
# Per the log printed below, this fills missing values and then encodes the
# numeric, nominal, timestamp and boolean features in turn.
features_encoded = enc.fit_transform(df, useful_cols)

Filling in missing values...
Missing values filled
Encoding numeric features...
Numeric features encoded
Encoding nominal features...
Nominal features encoded
Encoding timestamp features...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  column.loc[idx_nans[name]] = means[name]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


Timestamp features encoded
Encoding boolean features...
Boolean features encoded




Have a look at the means stored from the train set and at the resulting encoded dataframe

In [25]:
# Means stored on the encoder after fitting; judging by the output below this
# appears to cover the numeric features whose fill method is 'mean' — TODO confirm
enc.means_

numeric_1    25.033286
numeric_2     9.954886
dtype: float64

In [26]:
# Preview the first five rows of the encoded train set
features_encoded.head(n=5)

Unnamed: 0,numeric_1,numeric_2,numeric_3,nominal_Cat,nominal_Dog,nominal_Lizard,datetimes_1_sin_second_of_day,datetimes_1_cos_second_of_day,datetimes_1_sin_day_of_week,datetimes_1_cos_day_of_week,...,datetimes_2_sin_day_of_week,datetimes_2_cos_day_of_week,datetimes_2_sin_month_of_year,datetimes_2_cos_month_of_year,datetimes_2_year,boolean_like_1,boolean_like_2,boolean_like_3,boolean_like_4,boolean
0,1.725953,0.797663,-0.395583,0,1,0,-0.483028,-0.875605,0.974928,-0.222521,...,0.974928,-0.222521,-0.5,0.8660254,-1.043281,True,True,False,False,True
1,0.044553,0.0,-0.395583,0,1,0,-0.969302,-0.245871,0.0,1.0,...,0.433884,-0.900969,-2.449294e-16,1.0,1.564922,False,False,True,False,False
2,0.036424,-0.79517,-0.395583,0,1,0,-0.043837,0.999039,0.781831,0.62349,...,0.974928,-0.222521,-2.449294e-16,1.0,0.521641,False,True,False,False,True
3,-1.643995,1.101664,3.141981,0,1,0,0.309017,-0.951057,-0.433884,-0.900969,...,0.781831,0.62349,1.0,6.123234000000001e-17,-1.564922,False,False,False,True,False
4,-0.029866,-1.864939,-0.395583,0,0,1,0.955472,-0.295083,0.781831,0.62349,...,0.974928,-0.222521,-2.449294e-16,1.0,1.043281,True,False,False,False,False


## Encoder - test set

Run the `FullEncoder.transform` method on the test set `df_test` to encode features using the means, variances, categorical columns etc. generated on the train set

In [27]:
# Transform the test set using the encoding attributes learnt on the train set
# (means, variances, categories).
# Per the log printed below, nominal categories seen in training but absent
# from the test set are reported as missing_test_cols, categories only present
# in the test set are reported as extra_test_cols, and the resulting columns
# are checked to be in the same order as the train set.
features_encoded_test = enc.transform(df_test, useful_cols)

Filling in missing values...
Missing values filled
Encoding numeric features...
Numeric features encoded
Encoding nominal features...
missing_test_cols: {'nominal_Lizard'}
extra_test_cols: {'nominal_Mouse', 'nominal_Iguana'}
set difference after sorting: set()
Test set cols in same order as train set: True
Nominal features encoded
Encoding timestamp features...
Timestamp features encoded
Encoding boolean features...
Boolean features encoded


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  column.loc[idx_nans[name]] = means[name]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [28]:
# Preview the first five rows of the encoded test set
features_encoded_test.head(n=5)

Unnamed: 0,numeric_1,numeric_2,numeric_3,nominal_Cat,nominal_Dog,nominal_Lizard,datetimes_1_sin_second_of_day,datetimes_1_cos_second_of_day,datetimes_1_sin_day_of_week,datetimes_1_cos_day_of_week,...,datetimes_2_sin_day_of_week,datetimes_2_cos_day_of_week,datetimes_2_sin_month_of_year,datetimes_2_cos_month_of_year,datetimes_2_year,boolean_like_1,boolean_like_2,boolean_like_3,boolean_like_4,boolean
0,85.04212,0.0,-0.395583,0,0,0,0.001382,0.999999,0.0,1.0,...,0.974928,-0.222521,1.224647e-16,-1.0,0.468768,False,True,False,False,False
1,72.130605,6.947798,-0.395583,0,0,0,0.617951,0.786217,0.433884,-0.900969,...,-0.433884,-0.900969,-0.5,-0.8660254,-1.19764,False,False,True,True,True
2,43.426591,5.278425,-0.395583,0,1,0,-0.117898,-0.993026,0.974928,-0.222521,...,-0.781831,0.62349,-1.0,-1.83697e-16,0.287637,True,True,False,True,True
3,63.922354,0.0,6.189728,1,0,0,-0.635124,-0.77241,-0.974928,-0.222521,...,-0.781831,0.62349,0.8660254,0.5,1.555556,True,False,True,True,False
4,52.51343,0.0,-0.395583,0,0,0,0.809786,0.586726,-0.781831,0.62349,...,0.781831,0.62349,1.0,6.123234000000001e-17,1.446877,False,False,True,True,True


### Save/load encoder to file

In [29]:
# Persist the fitted encoder under the name 'demo_encoding'
# (file format and location are decided by hermes_ml — not visible here)
enc.save_encoder('demo_encoding')

Previous dataset encodings can be loaded from file

In [32]:
from hermes_ml.dataset_conditioner import load_encoder

In [33]:
# Restore the encoder saved above, using the same 'demo_encoding' name
enc_copy = load_encoder('demo_encoding')

In [34]:
# Re-encode the test set with the reloaded encoder so the result can be
# compared against the output of the original in-memory encoder
features_encoded_test_after_reload = enc_copy.transform(df_test, useful_cols)

Filling in missing values...
Missing values filled
Encoding numeric features...
Numeric features encoded
Encoding nominal features...
missing_test_cols: {'nominal_Lizard'}
extra_test_cols: {'nominal_Mouse', 'nominal_Iguana'}
set difference after sorting: set()
Test set cols in same order as train set: True
Nominal features encoded
Encoding timestamp features...
Timestamp features encoded
Encoding boolean features...
Boolean features encoded


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  column.loc[idx_nans[name]] = means[name]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [35]:
# First three rows encoded by the original encoder, for comparison
features_encoded_test.head(n=3)

Unnamed: 0,numeric_1,numeric_2,numeric_3,nominal_Cat,nominal_Dog,nominal_Lizard,datetimes_1_sin_second_of_day,datetimes_1_cos_second_of_day,datetimes_1_sin_day_of_week,datetimes_1_cos_day_of_week,...,datetimes_2_sin_day_of_week,datetimes_2_cos_day_of_week,datetimes_2_sin_month_of_year,datetimes_2_cos_month_of_year,datetimes_2_year,boolean_like_1,boolean_like_2,boolean_like_3,boolean_like_4,boolean
0,85.04212,0.0,-0.395583,0,0,0,0.001382,0.999999,0.0,1.0,...,0.974928,-0.222521,1.224647e-16,-1.0,0.468768,False,True,False,False,False
1,72.130605,6.947798,-0.395583,0,0,0,0.617951,0.786217,0.433884,-0.900969,...,-0.433884,-0.900969,-0.5,-0.8660254,-1.19764,False,False,True,True,True
2,43.426591,5.278425,-0.395583,0,1,0,-0.117898,-0.993026,0.974928,-0.222521,...,-0.781831,0.62349,-1.0,-1.83697e-16,0.287637,True,True,False,True,True


In [36]:
# First three rows encoded by the reloaded encoder, for comparison
features_encoded_test_after_reload.head(n=3)

Unnamed: 0,numeric_1,numeric_2,numeric_3,nominal_Cat,nominal_Dog,nominal_Lizard,datetimes_1_sin_second_of_day,datetimes_1_cos_second_of_day,datetimes_1_sin_day_of_week,datetimes_1_cos_day_of_week,...,datetimes_2_sin_day_of_week,datetimes_2_cos_day_of_week,datetimes_2_sin_month_of_year,datetimes_2_cos_month_of_year,datetimes_2_year,boolean_like_1,boolean_like_2,boolean_like_3,boolean_like_4,boolean
0,85.04212,0.0,-0.395583,0,0,0,0.001382,0.999999,0.0,1.0,...,0.974928,-0.222521,1.224647e-16,-1.0,0.468768,False,True,False,False,False
1,72.130605,6.947798,-0.395583,0,0,0,0.617951,0.786217,0.433884,-0.900969,...,-0.433884,-0.900969,-0.5,-0.8660254,-1.19764,False,False,True,True,True
2,43.426591,5.278425,-0.395583,0,1,0,-0.117898,-0.993026,0.974928,-0.222521,...,-0.781831,0.62349,-1.0,-1.83697e-16,0.287637,True,True,False,True,True
