### Requirements to encode dataset
- category-encoders (pip install category-encoders)
- numpy
- pandas

In [1]:
import json
import random

import category_encoders as ce
import numpy as np
import pandas as pd

In [2]:
base_path = '../..'
all_data_file = f"{base_path}/datasets/german_credit_eval_multiclass.csv"
df = pd.read_csv(all_data_file)

label_column = 'outcome'

# Separate outcome
y = df[label_column]
X = df.drop(label_column, axis=1)

# define feature data types
one_hot_encode_cat_cols = [
             'checkingstatus',
             'others',
             'age',
             'job',
             'employ',
             'property',
             'foreign',
             'history',
             'savings',
             'purpose',
             'housing'
           ]
target_encode_cat_cols = [
            'status', 
            'telephone',
            'otherplans',
]

In [3]:
# target encode 'status', 'telephone' cols
te= ce.TargetEncoder(cols=target_encode_cat_cols, handle_unknown="ignore").fit(X, y)
X = te.transform(X)

In [4]:
# one-hot encode dataset
encoded_df = pd.get_dummies(X, columns=one_hot_encode_cat_cols)
encoded_df[label_column] = y
encoded_df.head()

Unnamed: 0,duration,amount,installment,status,residence,otherplans,cards,liable,telephone,checkingstatus_... < 0 DM,...,purpose_education,purpose_furniture/equipment,purpose_purpose - others,purpose_radio/television,purpose_repairs,purpose_retraining,housing_for free,housing_own,housing_rent,outcome
0,6,1343,1,1.476277,4,1.515971,2,2,1.590604,0,...,0,0,0,0,0,0,0,1,0,1
1,28,4006,3,1.476277,2,1.515971,1,1,1.590604,1,...,0,0,0,0,0,0,0,1,0,2
2,24,2284,4,1.476277,2,1.515971,1,1,1.420792,0,...,0,0,0,1,0,0,0,1,0,1
3,24,1533,4,1.6,3,1.551227,1,1,1.420792,0,...,0,0,0,1,0,0,0,1,0,1
4,12,1101,3,1.510878,2,1.515971,2,1,1.420792,0,...,0,0,0,0,0,0,0,1,0,1


In [5]:
# construct one-hot value mappings
cat_value_mappings = {}
for feature in one_hot_encode_cat_cols:
    one_hot_col_name_prefix = f"{feature}_"
    mapping = {}
    for ec in encoded_df.columns:
        if ec.startswith(one_hot_col_name_prefix):
            value = ec[len(one_hot_col_name_prefix):]
            mapping[ec] = value
    cat_value_mappings[feature] = mapping
    print(f"Feature value -> column mappings for categorical feature '{feature}':")
    for col, val in cat_value_mappings[feature].items():
        print(f"\t{col} -> {val}")

# Save the encoded dataset
preprocessed_dataset_file = "german_credit_eval_multiclass_encoded.csv"
encoded_df.to_csv(f'{base_path}/datasets/{preprocessed_dataset_file}', index=False)

Feature value -> column mappings for categorical feature 'checkingstatus':
	checkingstatus_... < 0 DM -> ... < 0 DM
	checkingstatus_... >= 200 DM / salary assignments for at least 1 year -> ... >= 200 DM / salary assignments for at least 1 year
	checkingstatus_0 <= ... < 200 DM -> 0 <= ... < 200 DM
	checkingstatus_no checking account -> no checking account
Feature value -> column mappings for categorical feature 'others':
	others_co-applicant -> co-applicant
	others_guarantor -> guarantor
	others_others - none -> others - none
Feature value -> column mappings for categorical feature 'age':
	age_<= 25 years -> <= 25 years
	age_> 25 years -> > 25 years
Feature value -> column mappings for categorical feature 'job':
	job_management/ self-employed/highly qualified employee/ officer -> management/ self-employed/highly qualified employee/ officer
	job_skilled employee / official -> skilled employee / official
	job_unemployed/ unskilled - non-resident -> unemployed/ unskilled - non-resident
	

In [6]:
# Construct target encoder mappings
target_mappings = {}
for feature in target_encode_cat_cols:
    feature_mappings = {}
    for ordinal_mapping in te.ordinal_encoder.category_mapping:
        if ordinal_mapping['col'] == feature:
            mapping = ordinal_mapping['mapping']
            for idx, ordinal in enumerate(mapping):
                label = mapping.index[idx]
                if not (isinstance(label, float) and np.isnan(label)):
                    feature_mappings[label] = te.mapping[feature][ordinal]
            break
    target_mappings[feature] = feature_mappings
target_mappings    

{'status': {'male : single': 1.4762773722627738,
  'female : divorced/separated/married': 1.59999999999998,
  'male : married/widowed': 1.5108778688426723,
  'male : divorced/separated': 1.5581978168192525},
 'telephone': {'phone - none': 1.5906040268456376,
  'phone - yes, registered under the customers name': 1.4207920792079207},
 'otherplans': {'none': 1.515970515970516,
  'stores': 1.5512272565961795,
  'bank': 1.5467624217811897}}

In [7]:
# dump the mappings to disk for inverse mappings
import json
mappings={
    "one_hot_encoded_mappings": cat_value_mappings,
    "target_encoded_mappings": target_mappings
}
with open('cat_mappings.json', 'w') as fl:
    json.dump(mappings, fl)