# Automunge under automation

Automunge is available now for pip install:

In [1]:
# !pip install Automunge

Or to upgrade (we currently roll out upgrades pretty frequently):

In [2]:
# !pip install Automunge --upgrade

Once installed, run this in a local session to initialize:

In [3]:
# Import the AutoMunge class and create the instance whose automunge(.) and
# postmunge(.) methods are called throughout this notebook.
from Automunge import AutoMunge
am = AutoMunge()

Under automation, the automunge(.) function will: 
- normalize numeric features
- binarize bounded categoric features
- hash unbounded categoric features
- encode date-time entries

To demonstrate, let's encode the Titanic set, a well known benchmark:

In [4]:
import pandas as pd

# Titanic set: read the train and test splits from CSV files located
# relative to the current working directory.
df_train = pd.read_csv(filepath_or_buffer='train.csv')
df_test = pd.read_csv(filepath_or_buffer='test.csv')

Here is what the data looks like in a raw form.

In [5]:
# Preview the first rows of the raw (unprocessed) training dataframe.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


We'll need to designate to automunge any columns that are to be treated as labels or ID sets.

In [6]:
# Titanic set: name the column holding the labels and the column to be
# carved out as an ID set, both passed to automunge(.) below.
labels_column, trainID_column = 'Survived', 'PassengerId'

We can then pass these dataframes to the automunge(.) function for processing.

Note that the function call returns 17 sets, some of which may be empty based on parameter configurations. It's an unusual convention, but we find that having one return configuration for all scenarios keeps things simple.

In [7]:
# Process the training data under automation. All 17 returned sets are
# unpacked; the parenthesized target list avoids backslash continuations.
(train, trainID, labels,
 validation1, validationID1, validationlabels1,
 validation2, validationID2, validationlabels2,
 test, testID, testlabels,
 labelsencoding_dict, finalcolumns_train, finalcolumns_test,
 featureimportance, postprocess_dict) = am.automunge(
    df_train,
    labels_column=labels_column,
    trainID_column=trainID_column,
)

_______________
Begin Automunge processing

evaluating column:  Pclass
processing column:  Pclass
    root category:  text
 returned columns:
['Pclass_NArw', 'Pclass_1.0', 'Pclass_2.0', 'Pclass_3.0']

evaluating column:  Name
processing column:  Name
    root category:  hash
 returned columns:
['Name_NArw', 'Name_hash_0', 'Name_hash_1', 'Name_hash_2', 'Name_hash_3', 'Name_hash_4', 'Name_hash_5', 'Name_hash_6', 'Name_hash_7', 'Name_hash_8', 'Name_hash_9', 'Name_hash_10', 'Name_hash_11', 'Name_hash_12', 'Name_hash_13']

evaluating column:  Sex
processing column:  Sex
    root category:  bnry
 returned columns:
['Sex_bnry', 'Sex_NArw']

evaluating column:  Age
processing column:  Age
    root category:  nmbr
 returned columns:
['Age_nmbr', 'Age_NArw']

evaluating column:  SibSp
processing column:  SibSp
    root category:  nmbr
 returned columns:
['SibSp_nmbr', 'SibSp_NArw']

evaluating column:  Parch
processing column:  Parch
    root category:  nmbr
 returned columns:
['Parch_nmbr', 'Pa

The returned data can be accessed in the sets:
train, trainID, labels

In [8]:
# Preview the first rows of the processed training features.
train.head()

Unnamed: 0,Sex_bnry,Age_nmbr,SibSp_nmbr,Parch_nmbr,Fare_nmbr,Cabin_hash,Pclass_NArw,Pclass_1.0,Pclass_2.0,Pclass_3.0,...,Parch_NArw,Ticket_NArw,Ticket_hash_0,Ticket_hash_1,Ticket_hash_2,Fare_NArw,Cabin_NArw,Embarked_NArw,Embarked_1010_0,Embarked_1010_1
691,0,-1.976549,-0.474279,0.767199,-0.378068,75,0,0,0,1,...,0,0,523,0,0,0,1,0,0,0
256,0,0.766103,-0.474279,-0.473408,0.945714,75,0,1,0,0,...,0,0,228,898,0,0,1,0,0,0
479,0,-2.130371,-0.474279,0.767199,-0.400792,75,0,0,0,1,...,0,0,84,0,0,0,1,0,1,0
684,1,2.330476,0.43255,0.767199,0.136754,75,0,0,1,0,...,0,0,922,0,0,0,1,0,1,0
369,0,-0.438326,-0.474279,-0.473408,0.746493,93,0,1,0,0,...,0,0,228,782,0,0,0,0,0,0


Note that the column headers of the returned data are different, now including suffix appenders logging the applied transformations.

Any carved out ID columns are returned in the trainID set, along with an added Automunge_index column of index numbers (since the function by default shuffles training data).

In [9]:
# Preview the first rows of the ID set carved out of the training data.
trainID.head()

Unnamed: 0,PassengerId,Automunge_index
691,692,691
256,257,256
479,480,479
684,685,684
369,370,369


And labels:

In [10]:
# Preview the first rows of the processed labels.
labels.head()

Unnamed: 0,Survived_0.0,Survived_1.0
691,0,1
256,0,1
479,0,1
684,1,0
369,0,1


# A few more common parameters

A few options that might come up often:
- if we have test data available at the same time as train data, we can also pass a test set with the df_test parameter
- if we want to carve out a validation set processed on the train set basis, we can designate a ratio with the valpercent1 and/or valpercent2 parameters
- if we want to turn off printouts, we can pass printstatus = False
- if we want to return numpy arrays instead of dataframes, we can pass pandasoutput = False
- to include with the transformations a marker for entries that were subject to infill, we can pass NArw_marker = True
- for auto ML derived missing data infill, we can pass MLinfill = True

In [11]:
# Same call as before, now also passing a test set, a validation ratio of 0.2,
# printouts turned off, numpy array output, ML infill, and NArw infill markers.
(train, trainID, labels,
 validation1, validationID1, validationlabels1,
 validation2, validationID2, validationlabels2,
 test, testID, testlabels,
 labelsencoding_dict, finalcolumns_train, finalcolumns_test,
 featureimportance, postprocess_dict) = am.automunge(
    df_train,
    df_test=df_test,
    labels_column=labels_column,
    trainID_column=trainID_column,
    valpercent1=0.2,
    printstatus=False,
    pandasoutput=False,
    MLinfill=True,
    NArw_marker=True,
)

In [12]:
# The processed test data is returned in test, testID, and testlabels.
# Here `test` is a numpy array rather than a dataframe because
# pandasoutput = False was passed to automunge(.) above.
test

array([[ 1.        ,  0.37381986, -0.47511193, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.        ,  1.3347664 ,  0.4150978 , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        ,  2.4879022 , -0.47511193, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.        ,  0.68132275, -0.47511193, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        , -0.17199777, -0.47511193, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        , -1.9339893 ,  0.4150978 , ...,  0.        ,
         0.        ,  0.        ]], dtype=float32)

# Processing additional test data

Of the various returned sets, an important one is the final object we call the postprocess_dict. Think of this as a key to processing additional data on the original train set basis. If you intend to productionize a model, we recommend saving it externally, such as with the pickle library. Once we have additional data we want to process, we can pass it with the postprocess_dict to the postmunge(.) function.

In [13]:
# Process additional data on the train set basis by passing it to
# postmunge(.) together with the postprocess_dict returned from automunge(.).
(test, testID, testlabels,
 labelsencoding_dict, postreports_dict) = am.postmunge(postprocess_dict, df_test)

_______________
Begin Postmunge processing

______

processing column:  Pclass
    root category:  text

 returned columns:
['Pclass_NArw', 'Pclass_1.0', 'Pclass_2.0', 'Pclass_3.0']

______

processing column:  Name
    root category:  hash

 returned columns:
['Name_NArw', 'Name_hash_0', 'Name_hash_1', 'Name_hash_2', 'Name_hash_3', 'Name_hash_4', 'Name_hash_5', 'Name_hash_6', 'Name_hash_7', 'Name_hash_8', 'Name_hash_9', 'Name_hash_10', 'Name_hash_11', 'Name_hash_12', 'Name_hash_13']

______

processing column:  Sex
    root category:  bnry

 returned columns:
['Sex_bnry', 'Sex_NArw']

______

processing column:  Age
    root category:  nmbr

 returned columns:
['Age_nmbr', 'Age_NArw']

______

processing column:  SibSp
    root category:  nmbr

 returned columns:
['SibSp_nmbr', 'SibSp_NArw']

______

processing column:  Parch
    root category:  nmbr

 returned columns:
['Parch_nmbr', 'Parch_NArw']

______

processing column:  Ticket
    root category:  hash

 returned columns:
['Tick

In [14]:
# Preview the first rows of the test data processed by postmunge(.).
test.head()

Unnamed: 0,Sex_bnry,Age_nmbr,SibSp_nmbr,Parch_nmbr,Fare_nmbr,Pclass_NArw,Pclass_1.0,Pclass_2.0,Pclass_3.0,Name_NArw,...,Cabin_1010_0,Cabin_1010_1,Cabin_1010_2,Cabin_1010_3,Cabin_1010_4,Cabin_1010_5,Cabin_1010_6,Embarked_NArw,Embarked_1010_0,Embarked_1010_1
0,1,0.37382,-0.475112,-0.484924,-0.495827,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,0,1.334766,0.415098,-0.484924,-0.51329,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,1,2.487902,-0.475112,-0.484924,-0.456691,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,-0.202748,-0.475112,-0.484924,-0.478278,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,-0.587127,0.415098,0.739422,-0.401935,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


# Custom transformations

Automunge has a library of transformations (documented in the read me). In general, each of these transformations is fit to properties of the train set to enable processing on a consistent basis of additional data.

Each transformation in the library has a distinct 4 character string identifier, generally aligned with the suffix appender on the returned set.

We can designate our assignments in the assigncat parameter as follows:

In [15]:
# Designate min-max scaling ('mnmx') for the 'Fare' column: keys are
# transformation identifiers, values are lists of the columns they apply to.
assigncat = dict(mnmx=['Fare'])

# Re-run automunge(.) with the custom category assignments, printouts off.
(train, trainID, labels,
 validation1, validationID1, validationlabels1,
 validation2, validationID2, validationlabels2,
 test, testID, testlabels,
 labelsencoding_dict, finalcolumns_train, finalcolumns_test,
 featureimportance, postprocess_dict) = am.automunge(
    df_train,
    labels_column=labels_column,
    trainID_column=trainID_column,
    assigncat=assigncat,
    printstatus=False,
)

To view the columns returned from a specific input column, we can use the column map stored in the postprocess_dict.

In [16]:
# Use the column map stored in postprocess_dict to select the returned
# columns that were derived from the 'Fare' input column, then preview them.
train[postprocess_dict['column_map']['Fare']].head()

Unnamed: 0,Fare_mnmx,Fare_NArw
86,0.067096,0
74,0.110272,0
890,0.015127,0
707,0.05131,0
640,0.01533,0


# Missing data infill

We noted earlier that the MLinfill parameter activates an autoML method for missing data imputation. Let's take a look at this in action. Here we'll turn on ML infill as well as markers for entries subject to infill with the NArw_marker parameter.

In [17]:
# Run automunge(.) with ML infill enabled and NArw markers flagging the
# entries that were subject to infill; printouts are turned off.
(train, trainID, labels,
 validation1, validationID1, validationlabels1,
 validation2, validationID2, validationlabels2,
 test, testID, testlabels,
 labelsencoding_dict, finalcolumns_train, finalcolumns_test,
 featureimportance, postprocess_dict) = am.automunge(
    df_train,
    labels_column=labels_column,
    trainID_column=trainID_column,
    MLinfill=True,
    NArw_marker=True,
    printstatus=False,
)

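By inspection, it appears that one of the entries in the Age column was subject to infill (entries subject to infill are flagged in the Age_NArw column; because the training data is shuffled by default, the rows shown by head() will differ between runs):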

In [18]:
# Use the column map stored in postprocess_dict to select the returned
# columns that were derived from the 'Age' input column, then preview them.
train[postprocess_dict['column_map']['Age']].head()

Unnamed: 0,Age_nmbr,Age_NArw
305,-2.213435,0
222,1.638276,0
512,0.484608,0
865,0.946075,0
337,0.869164,0


It appears the ML infill is assuming for the first row's imputation that this is a very young passenger (remember that this is normalized data, which is the reason for the negative value).

Note that the trained models for each feature are saved in the postprocess_dict to enable a consistent imputation basis for subsequent data.

ML infill isn't the only imputation option. Other options like mode, adjacent cell, 0/1, mean, etc. can be designated to distinct columns with the assigninfill parameter.

Here we'll demonstrate applying a few different approaches to different columns.

In [19]:
# Assign a different infill method to each column: keys are infill method
# names, values are lists of the columns each method applies to.
assigninfill = dict(
    MLinfill=['Pclass'],
    adjinfill=['Age'],
    modeinfill=['Fare'],
)

# Re-run automunge(.) with the per-column infill assignments, printouts off.
(train, trainID, labels,
 validation1, validationID1, validationlabels1,
 validation2, validationID2, validationlabels2,
 test, testID, testlabels,
 labelsencoding_dict, finalcolumns_train, finalcolumns_test,
 featureimportance, postprocess_dict) = am.automunge(
    df_train,
    labels_column=labels_column,
    trainID_column=trainID_column,
    assigninfill=assigninfill,
    printstatus=False,
)

In [20]:
# Preview the first rows of the training features processed with the
# assigned infill methods.
train.head()

Unnamed: 0,Sex_bnry,Age_nmbr,SibSp_nmbr,Parch_nmbr,Fare_nmbr,Cabin_hash,Pclass_NArw,Pclass_1.0,Pclass_2.0,Pclass_3.0,...,Parch_NArw,Ticket_NArw,Ticket_hash_0,Ticket_hash_1,Ticket_hash_2,Fare_NArw,Cabin_NArw,Embarked_NArw,Embarked_1010_0,Embarked_1010_1
53,0,-0.05377,0.43255,-0.473408,-0.12485,75,0,0,1,0,...,0,0,616,0,0,0,1,0,1,0
167,0,1.176808,0.43255,4.489019,-0.086615,75,0,0,0,1,...,0,0,64,0,0,0,1,0,1,0
650,1,-0.515237,-0.474279,-0.473408,-0.489167,75,0,0,0,1,...,0,0,527,0,0,0,1,0,1,0
190,0,0.176964,-0.474279,-0.473408,-0.386454,75,0,0,1,0,...,0,0,746,0,0,0,1,0,1,0
45,1,-0.822881,-0.474279,-0.473408,-0.486064,75,0,0,0,1,...,0,0,69,607,0,0,1,0,1,0


# Conclusion

In closing, as an explanation, the whole point of conducting all of the transformations in a single function is that this application serves to populate a dictionary (the "postprocess_dict") fit to properties of the train data, capturing all of the steps and parameters of transformations, potentially including methods for ML derived missing data imputation, dimensionality reductions, and other various encodings available in the library. This returned dictionary can then be passed to the postmunge(.) function with subsequent data for fully consistent processing on the train set basis.

In [21]:
# Pass the latest postprocess_dict and the test dataframe to postmunge(.)
# so the test data is processed on the same basis as the train set.
(test, testID, testlabels,
 labelsencoding_dict, postreports_dict) = am.postmunge(postprocess_dict, df_test)

_______________
Begin Postmunge processing

______

processing column:  Pclass
    root category:  text

 returned columns:
['Pclass_NArw', 'Pclass_1.0', 'Pclass_2.0', 'Pclass_3.0']

______

processing column:  Name
    root category:  hash

 returned columns:
['Name_NArw', 'Name_hash_0', 'Name_hash_1', 'Name_hash_2', 'Name_hash_3', 'Name_hash_4', 'Name_hash_5', 'Name_hash_6', 'Name_hash_7', 'Name_hash_8', 'Name_hash_9', 'Name_hash_10', 'Name_hash_11', 'Name_hash_12', 'Name_hash_13']

______

processing column:  Sex
    root category:  bnry

 returned columns:
['Sex_bnry', 'Sex_NArw']

______

processing column:  Age
    root category:  nmbr

 returned columns:
['Age_nmbr', 'Age_NArw']

______

processing column:  SibSp
    root category:  nmbr

 returned columns:
['SibSp_nmbr', 'SibSp_NArw']

______

processing column:  Parch
    root category:  nmbr

 returned columns:
['Parch_nmbr', 'Parch_NArw']

______

processing column:  Ticket
    root category:  hash

 returned columns:
['Tick

In [22]:
# Preview the first rows of the test data returned by postmunge(.).
test.head()

Unnamed: 0,Sex_bnry,Age_nmbr,SibSp_nmbr,Parch_nmbr,Fare_nmbr,Cabin_hash,Pclass_NArw,Pclass_1.0,Pclass_2.0,Pclass_3.0,...,Parch_NArw,Ticket_NArw,Ticket_hash_0,Ticket_hash_1,Ticket_hash_2,Fare_NArw,Cabin_NArw,Embarked_NArw,Embarked_1010_0,Embarked_1010_1
0,1,0.369241,-0.474279,-0.473408,-0.490508,75,0,0,0,1,...,0,0,1021,0,0,0,1,0,0,1
1,0,1.330631,0.43255,-0.473408,-0.507194,75,0,0,0,1,...,0,0,214,0,0,0,1,0,1,0
2,1,2.484298,-0.474279,-0.473408,-0.453112,75,0,0,1,0,...,0,0,568,0,0,0,1,0,0,1
3,1,-0.207592,-0.474279,-0.473408,-0.473739,75,0,0,0,1,...,0,0,345,0,0,0,1,0,1,0
4,0,-0.592148,0.43255,0.767199,-0.400792,75,0,0,0,1,...,0,0,84,0,0,0,1,0,1,0
