# DecisionTreeEncoder
The DecisionTreeEncoder() encodes categorical variables with predictions of a decision tree model.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from feature_engine.encoding import DecisionTreeEncoder

In [2]:
def load_titanic():
    # Load dataset from OpenML
    data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')
    
    # Replace '?' with NaN
    data = data.replace('?', np.nan)
    
    # Extract first letter of cabin
    data['cabin'] = data['cabin'].astype(str).str[0]
    
    # Convert 'pclass' to categorical
    data['pclass'] = data['pclass'].astype('O')
    
    # Ensure 'age' contains only numeric data by coercing errors to NaN
    data['age'] = pd.to_numeric(data['age'], errors='coerce')
    
    # Fill missing values in 'age' with the median age
    data['age'].fillna(data['age'].median(), inplace=True)
    
    # Ensure 'fare' contains only numeric data by coercing errors to NaN
    data['fare'] = pd.to_numeric(data['fare'], errors='coerce')
    
    # Fill missing values in 'fare' with the median fare
    data['fare'].fillna(data['fare'].median(), inplace=True)
    
    # Fill missing values in 'embarked' with 'C'
    data['embarked'].fillna('C', inplace=True)
    
    # Drop irrelevant columns
    data.drop(labels=['boat', 'body', 'home.dest'], axis=1, inplace=True)
    
    return data

# Load the data
data = load_titanic()

# Display the first few rows of the cleaned data
data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['age'].fillna(data['age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['fare'].fillna(data['fare'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C,S


In [3]:
X = data.drop(['survived', 'name', 'ticket'], axis=1)
y = data.survived

# we will encode the below variables, they have no missing values
X[['cabin', 'pclass', 'embarked']].isnull().sum()

cabin       0
pclass      0
embarked    0
dtype: int64

In [4]:
''' Make sure that the variables are type (object).
if not, cast it as object , otherwise the transformer will either send an error (if we pass it as argument) 
or not pick it up (if we leave variables=None). '''

X[['cabin', 'pclass', 'embarked']].dtypes

cabin       object
pclass      object
embarked    object
dtype: object

In [5]:
# let's separate into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

X_train.shape, X_test.shape

((916, 8), (393, 8))

The categorical variable will be first encoded into integers with the OrdinalEncoder(). The integers can be assigned arbitrarily to the categories or following the mean value of the target in each category.

Then a decision tree will be fit using the resulting numerical variable to predict the target variable. Finally, the original categorical variable values will be replaced by the predictions of the decision tree.

In [6]:
'''
Parameters
    ----------

    encoding_method: str, default='arbitrary'
        The categorical encoding method that will be used to encode the original
        categories to numerical values.

        'ordered': the categories are numbered in ascending order according to
        the target mean value per category.

        'arbitrary' : categories are numbered arbitrarily.

    cv : int, default=3
        Desired number of cross-validation fold to be used to fit the decision
        tree.

    scoring: str, default='neg_mean_squared_error'
        Desired metric to optimise the performance for the tree. Comes from
        sklearn metrics. See the DecisionTreeRegressor or DecisionTreeClassifier
        model evaluation documentation for more options:
        https://scikit-learn.org/stable/modules/model_evaluation.html

    regression : boolean, default=True
        Indicates whether the encoder should train a regression or a classification
        decision tree.

    param_grid : dictionary, default=None
        The list of parameters over which the decision tree should be optimised
        during the grid search. The param_grid can contain any of the permitted
        parameters for Scikit-learn's DecisionTreeRegressor() or
        DecisionTreeClassifier().

        If None, then param_grid = {'max_depth': [1, 2, 3, 4]}.

    random_state : int, default=None
        The random_state to initialise the training of the decision tree. It is one
        of the parameters of the Scikit-learn's DecisionTreeRegressor() or
        DecisionTreeClassifier(). For reproducibility it is recommended to set
        the random_state to an integer.

    variables : list, default=None
        The list of categorical variables that will be encoded. If None, the
        encoder will find and select all object type variables.
'''

"\nParameters\n    ----------\n\n    encoding_method: str, default='arbitrary'\n        The categorical encoding method that will be used to encode the original\n        categories to numerical values.\n\n        'ordered': the categories are numbered in ascending order according to\n        the target mean value per category.\n\n        'arbitrary' : categories are numbered arbitrarily.\n\n    cv : int, default=3\n        Desired number of cross-validation fold to be used to fit the decision\n        tree.\n\n    scoring: str, default='neg_mean_squared_error'\n        Desired metric to optimise the performance for the tree. Comes from\n        sklearn metrics. See the DecisionTreeRegressor or DecisionTreeClassifier\n        model evaluation documentation for more options:\n        https://scikit-learn.org/stable/modules/model_evaluation.html\n\n    regression : boolean, default=True\n        Indicates whether the encoder should train a regression or a classification\n        decision 

In [7]:
tree_enc = DecisionTreeEncoder(encoding_method='arbitrary',
                               cv=3,
                               scoring = 'roc_auc',
                               param_grid = {'max_depth': [1, 2, 3, 4]},
                               regression = False,
                               variables=['cabin', 'pclass', 'embarked']
                              )

tree_enc.fit(X_train,y_train) # to fit you need to pass the target y

In [8]:
tree_enc.encoder_dict_

{'cabin': {'n': 0.30484330484330485,
  'E': 0.6116504854368932,
  'C': 0.6116504854368932,
  'D': 0.6981132075471698,
  'B': 0.6981132075471698,
  'A': 0.6981132075471698,
  'F': 0.6981132075471698,
  'T': 0.0,
  'G': 0.5},
 'pclass': {2: 0.43617021276595747,
  3: 0.25903614457831325,
  1: 0.6173913043478261},
 'embarked': {'S': 0.3389570552147239,
  'C': 0.5580110497237569,
  'Q': 0.37349397590361444}}

In [9]:
# transform and visualise the data

train_t = tree_enc.transform(X_train)
test_t = tree_enc.transform(X_test)

test_t.sample(5)

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
322,0.617391,female,36.0,0,0,135.6333,0.61165,0.558011
299,0.617391,male,40.0,0,0,27.7208,0.304843,0.558011
1103,0.259036,male,2.0,4,1,39.6875,0.304843,0.338957
124,0.617391,female,48.0,1,1,79.2,0.698113,0.558011
717,0.259036,male,18.0,1,0,14.4542,0.304843,0.558011


## Automatically select the variables
This encoder will select all categorical variables to encode, when no variables are specified when calling the encoder.

In [10]:
tree_enc = DecisionTreeEncoder(encoding_method='arbitrary',
                               cv=3,
                               scoring = 'roc_auc',
                               param_grid = {'max_depth': [1, 2, 3, 4]},
                               regression = False,
                              )

tree_enc.fit(X_train,y_train) # to fit you need to pass the target y

In [11]:
# transform and visualise the data

train_t = tree_enc.transform(X_train)
test_t = tree_enc.transform(X_test)

test_t.sample(5)

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked
186,0.617391,0.728358,55.0,0,0,27.7208,0.304843,0.558011
211,0.617391,0.187608,47.0,0,0,42.4,0.304843,0.338957
1124,0.259036,0.728358,28.0,0,0,8.1375,0.304843,0.373494
64,0.617391,0.187608,27.0,1,0,53.1,0.61165,0.338957
195,0.617391,0.728358,16.0,0,0,86.5,0.698113,0.338957
