##  Recipe20 - Pipeline example continued .....
## Replacing missing values by a value at the end of the distribution

In this recipe, we will replace missing values by a value at the end of the distribution, estimated with the Gaussian approximation or the inter-quartile range proximity rule, using pandas and Feature-Engine, all open source Python libraries.

In [70]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import feature_engine.imputation as mdi

In [71]:
# load data
data = pd.read_csv('creditApprovalUCI.csv')
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [72]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

X_train.shape, X_test.shape

((483, 15), (207, 15))

In [73]:
# find categorical variables
cat_cols = [c for c in data.columns if data[c].dtypes=='O']
data[cat_cols].isnull().mean().sort_values()

A1     0.0
A4     0.0
A5     0.0
A6     0.0
A7     0.0
A9     0.0
A10    0.0
A12    0.0
A13    0.0
dtype: float64

In [74]:
# find numerical variables
num_cols = [c for c in data.columns if data[c].dtypes!='O']
data[num_cols].isnull().mean().sort_values()

A2     0.0
A3     0.0
A8     0.0
A11    0.0
A14    0.0
A15    0.0
A16    0.0
dtype: float64

In [75]:
# # find the percentage of missing data within those variables

# X_train.isnull().mean()

In [76]:
# first we need to make lists with the numerical and categorical vars for each imputation method
features_num_arbitrary = ['A3', 'A8']
features_num_median = ['A2', 'A14']

features_cat_frequent = ['A4', 'A5', 'A6', 'A7']
features_cat_missing = ['A1', 'A9', 'A10']

In [77]:
# we instantiate each imputer within a pipeline

pipe = Pipeline(steps=[
    ('imp_num_arbitrary', mdi.ArbitraryNumberImputer(variables = features_num_arbitrary)),
    ('imp_num_median', mdi.MeanMedianImputer(imputation_method = 'median', variables=features_num_median)),
    ('imp_cat_frequent', mdi.CategoricalImputer(variables = features_cat_frequent, imputation_method='frequent',)),
    ('imp_cat_missing', mdi.CategoricalImputer(variables=features_cat_missing, imputation_method='missing'))
])

In [78]:
# now we fit the preprocessor
pipe.fit(X_train)

Pipeline(steps=[('imp_num_arbitrary',
                 ArbitraryNumberImputer(variables=['A3', 'A8'])),
                ('imp_num_median', MeanMedianImputer(variables=['A2', 'A14'])),
                ('imp_cat_frequent',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=['A4', 'A5', 'A6', 'A7'])),
                ('imp_cat_missing',
                 CategoricalImputer(variables=['A1', 'A9', 'A10']))])

In [79]:
# and now we impute the data
X_train = pipe.transform(X_train)
X_test = pipe.transform(X_test)

In [80]:
X_train.isnull().sum()

A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A12    0
A13    0
A14    0
A15    0
dtype: int64

In [81]:
X_test.isnull().sum()

A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A12    0
A13    0
A14    0
A15    0
dtype: int64

## Recipe 21
Download and prepare the Credit Approval data set from the UCI Machine Learning Repository.

**Citation:**

Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.


In [82]:
import os, random
import pandas as pd
import numpy as np

from pathlib import Path, PureWindowsPath

# The path is written in Windows format (backslash separators) and wrapped in
# PureWindowsPath so it can be converted for the current operating system.
# Raw strings keep every backslash literal: sequences such as "\A" or "\C" in
# a normal string are invalid escapes, which newer Python versions warn about.
base_dir = r"c:\Users\Arindam Banerji\CopyFolder\IoT_thoughts\python-projects\kaggle_experiments"
win_path = base_dir + r"\feature-engineering" + r"\fe-recipes"

filename = PureWindowsPath(win_path)

# Convert path to the right format for the current operating system
correct_path = Path(filename)

# Alternate method... neither is clean
# correct_path = os.path.join(
#  "c:\\Users\Arindam Banerji\CopyFolder\IoT_thoughts\python-projects\kaggle_experiments",
#   "feature-engineering", "fe-recipes")

print ("chnaging to ", correct_path )

os.chdir(correct_path)

print (" working in ", os.getcwd())


chnaging to  c:\Users\Arindam Banerji\CopyFolder\IoT_thoughts\python-projects\kaggle_experiments\feature-engineering\fe-recipes
 working in  c:\Users\Arindam Banerji\CopyFolder\IoT_thoughts\python-projects\kaggle_experiments\feature-engineering\fe-recipes


In [83]:
# load data
data = pd.read_csv('crx.data', header=None)

# create variable names according to UCI Machine Learning
# Repo information
varnames = ['A'+str(s) for s in range(1,17)]

# add variable names to dataframe columns
data.columns = varnames

# replace ? by np.nan
data = data.replace('?', np.nan)

# display
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [84]:
# re-cast some variables to the correct types 
data['A2'] = data['A2'].astype('float')
data['A14'] = data['A14'].astype('float')

# encode target to binary
data['A16'] = data['A16'].map({'+':1, '-':0})

# display
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [85]:
# find categorical variables
cat_cols = [c for c in data.columns if data[c].dtypes=='O']
data[cat_cols].head()

Unnamed: 0,A1,A4,A5,A6,A7,A9,A10,A12,A13
0,b,u,g,w,v,t,t,f,g
1,a,u,g,q,h,t,t,f,g
2,a,u,g,q,h,t,f,f,g
3,b,u,g,w,v,t,t,t,g
4,b,u,g,w,v,t,f,f,s


In [86]:
# find numerical variables
num_cols = [c for c in data.columns if data[c].dtypes!='O']
data[num_cols].head()

Unnamed: 0,A2,A3,A8,A11,A14,A15,A16
0,30.83,0.0,1.25,1,202.0,0,1
1,58.67,4.46,3.04,6,43.0,560,1
2,24.5,0.5,1.5,0,280.0,824,1
3,27.83,1.54,3.75,5,100.0,3,1
4,20.17,5.625,1.71,0,120.0,0,1


In [87]:
# fill in missing values

data[num_cols] = data[num_cols].fillna(0)
data[cat_cols] = data[cat_cols].fillna('Missing')

data.isnull().sum()

A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A12    0
A13    0
A14    0
A15    0
A16    0
dtype: int64

In [88]:
# save the data
data.to_csv('creditApprovalUCI.csv', index=False)

# Recipe 22
One Hot Encoding 

In [89]:
# to split the datasets
from sklearn.model_selection import train_test_split

# for one hot encoding with sklearn
from sklearn.preprocessing import OneHotEncoder

# for one hot encoding with feature-engine - the below import causes a name space conflict
# from feature_engine.encoding import OneHotEncoder
# from feature_engine.categorical_encoders import OneHotCategoricalEncoder

In [90]:
# let's load the data set

data = pd.read_csv('creditApprovalUCI.csv')

data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [91]:
# make a list with the categorical variables

vars_categorical = ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [92]:
# let's separate into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['A16'], axis=1),  # predictors
    data['A16'],  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

X_train.shape, X_test.shape

((483, 15), (207, 15))

In [93]:
# let's inspect the unique values of A4

X_train['A4'].unique()

array(['u', 'y', 'Missing', 'l'], dtype=object)

In [94]:
# let's one hot encode A4

tmp = pd.get_dummies(X_train['A4'], drop_first=True)

print(tmp.head())

     l  u  y
596  0  1  0
303  0  1  0
204  0  0  1
351  0  0  1
118  0  1  0


In [95]:
# now let's encode all categorical variables together: train set

X_train_enc = pd.get_dummies(X_train[vars_categorical], drop_first=True)

print(X_train_enc.head())

     A1_a  A1_b  A4_l  A4_u  A4_y  A5_g  A5_gg  A5_p  A6_aa  A6_c  ...  A7_j  \
596     1     0     0     1     0     1      0     0      0     1  ...     0   
303     1     0     0     1     0     1      0     0      0     0  ...     0   
204     0     1     0     0     1     0      0     1      0     0  ...     0   
351     0     1     0     0     1     0      0     1      0     0  ...     0   
118     0     1     0     1     0     1      0     0      0     0  ...     0   

     A7_n  A7_o  A7_v  A7_z  A9_t  A10_t  A12_t  A13_p  A13_s  
596     0     0     1     0     1      1      1      0      0  
303     0     0     1     0     0      0      0      0      0  
204     0     0     1     0     1      1      0      0      0  
351     0     0     0     0     0      0      0      0      0  
118     0     0     1     0     1      1      1      0      0  

[5 rows x 36 columns]


In [96]:
# and in the test set

X_test_enc = pd.get_dummies(X_test[vars_categorical], drop_first=True)

X_test_enc.head()

Unnamed: 0,A1_a,A1_b,A4_l,A4_u,A4_y,A5_g,A5_gg,A5_p,A6_aa,A6_c,...,A7_j,A7_n,A7_o,A7_v,A7_z,A9_t,A10_t,A12_t,A13_p,A13_s
14,1,0,0,1,0,1,0,0,0,0,...,0,0,0,1,0,1,1,1,0,0
586,0,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,1,1,1,0,0
140,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,1,1,1,0,0
492,0,1,0,1,0,1,0,0,0,0,...,0,0,0,1,0,1,1,0,0,0
350,1,0,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [97]:
# explicit re-import, since feature-engine also has a OneHotEncoder() now
from sklearn.preprocessing import OneHotEncoder

# we create and train the encoder

# scikit-learn one hot encoder, configured to return a dense array.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# -- confirm against the installed version.
encoder = OneHotEncoder(categories='auto',
                        drop='first',  # to return k-1 dummies; use drop=None to return k dummies
                        sparse=False)

In [98]:
# fit the encoder to the train set: it will learn the categories to encode

encoder.fit(X_train[vars_categorical])

OneHotEncoder(drop='first', sparse=False)

In [99]:
# transform the train and test sets

X_train_enc = encoder.transform(X_train[vars_categorical])
X_test_enc = encoder.transform(X_test[vars_categorical])

In [100]:
# let's inspect the train set

print(pd.DataFrame(X_train_enc).head())

    0    1    2    3    4    5    6    7    8    9   ...   26   27   28   29  \
0  1.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0  1.0  ...  0.0  0.0  0.0  1.0   
1  1.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  1.0   
2  0.0  1.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0  ...  0.0  0.0  0.0  1.0   
3  0.0  1.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4  0.0  1.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  1.0   

    30   31   32   33   34   35  
0  0.0  1.0  1.0  1.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  1.0  1.0  0.0  0.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  
4  0.0  1.0  1.0  1.0  0.0  0.0  

[5 rows x 36 columns]


In [101]:
# let's inspect the test set

pd.DataFrame(X_test_enc).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [102]:
# let's create the encoder

from feature_engine.encoding import OneHotEncoder

ohe_enc = OneHotEncoder(
    top_categories=None,
    drop_last=True)  # to return k-1, false to return k

In [103]:
ohe_enc.variables

In [104]:
# fit the encoder to the train set: it will learn the variables and 
# categories to encode

ohe_enc.fit(X_train)

OneHotEncoder(drop_last=True)

In [105]:
# we can see which variables the encoder will encode

print (ohe_enc.variables)

None


In [106]:
# let's transform train and test set

X_train_enc = ohe_enc.transform(X_train)
X_test_enc = ohe_enc.transform(X_test)

In [107]:
# let's inspect the encoded train set

X_train_enc.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15,A1_a,A1_b,A4_u,A4_y,...,A7_z,A7_bb,A7_j,A7_Missing,A7_n,A9_t,A10_t,A12_t,A13_g,A13_s
596,46.08,3.0,2.375,8,396.0,4159,1,0,1,0,...,0,0,0,0,0,1,1,1,1,0
303,15.92,2.875,0.085,0,120.0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0
204,36.33,2.125,0.085,1,50.0,1187,0,1,0,1,...,0,0,0,0,0,1,1,0,1,0
351,22.17,0.585,0.0,0,100.0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,1,0
118,57.83,7.04,14.0,6,360.0,1332,0,1,1,0,...,0,0,0,0,0,1,1,1,1,0


In [108]:
print (ohe_enc.variables)

None


In [109]:
# let's inspect the encoded test set

X_test_enc.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15,A1_a,A1_b,A4_u,A4_y,...,A7_z,A7_bb,A7_j,A7_Missing,A7_n,A9_t,A10_t,A12_t,A13_g,A13_s
14,45.83,10.5,5.0,7,0.0,0,1,0,1,0,...,0,0,0,0,0,1,1,1,1,0
586,64.08,20.0,17.5,9,0.0,1000,0,1,1,0,...,0,0,0,0,0,1,1,1,1,0
140,31.25,3.75,0.625,9,181.0,0,1,0,1,0,...,0,0,0,0,0,1,1,1,1,0
492,39.25,9.5,6.5,14,240.0,4607,0,1,1,0,...,0,0,0,0,0,1,1,0,1,0
350,26.17,2.0,0.0,0,276.0,1,1,0,1,0,...,0,0,1,0,0,0,0,1,1,0


## Recipe 23 - drop this....

In [110]:
import pandas as pd

import numpy as np

# to split the datasets
from sklearn.model_selection import train_test_split

# for one hot encoding with feature-engine
from feature_engine.encoding import OneHotEncoder


# from feature_engine.categorical_encoders import OneHotCategoricalEncoder

In [111]:
# let's load the data set

data = pd.read_csv('creditApprovalUCI.csv')

data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [112]:
# make a list with the categorical variables

vars_categorical = ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [113]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['A16'], axis=1),  # predictors
    data['A16'],  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

X_train.shape, X_test.shape

((483, 15), (207, 15))

In [114]:
# let's inspect the unique categories of A6

X_train['A6'].unique()

array(['c', 'q', 'w', 'ff', 'm', 'i', 'e', 'cc', 'x', 'd', 'k', 'j',
       'Missing', 'aa', 'r'], dtype=object)

In [115]:
# let's find the top 5 most frequent categories in A6

X_train['A6'].value_counts().sort_values(ascending=False).head(5)

c     93
q     56
w     48
i     41
ff    38
Name: A6, dtype: int64

In [116]:
# let's make a list with the most frequent categories in A6

top_5 = [
    x for x in X_train['A6'].value_counts().sort_values(
        ascending=False).head(5).index
]

top_5

['c', 'q', 'w', 'i', 'ff']

In [117]:
# and now let's create 5 binary variables to encode A6
# in train and test sets

for label in top_5:
    X_train['A6' + '_' + label] = np.where(
        X_train['A6'] == label, 1, 0)
    
    X_test['A6' + '_' + label] = np.where(
        X_test['A6'] == label, 1, 0)

In [118]:
# let's visualise the result

print(X_train[['A6'] + ['A6'+'_'+c for c in top_5]].head(10))

     A6  A6_c  A6_q  A6_w  A6_i  A6_ff
596   c     1     0     0     0      0
303   q     0     1     0     0      0
204   w     0     0     1     0      0
351  ff     0     0     0     0      1
118   m     0     0     0     0      0
247   q     0     1     0     0      0
652   i     0     0     0     1      0
513   e     0     0     0     0      0
230  cc     0     0     0     0      0
250   e     0     0     0     0      0


In [119]:
# Feature-engine : let's divide in train and test sets (again)

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['A16'], axis=1),  # predictors
    data['A16'],  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

In [120]:
ohe_enc = OneHotEncoder(
    
    # to indicate how many top categories
    top_categories=5,  
    
    # we can select which variables to encode
    variables=['A6', 'A7'],
    
    # to indicate if we drop one of the binaries
    drop_last=False)

ohe_enc.fit(X_train)

OneHotEncoder(top_categories=5, variables=['A6', 'A7'])

In [121]:
# the encoder stores the variables it will encode

ohe_enc.variables

['A6', 'A7']

In [122]:
# the encoder stores the most frequent labels per variable

ohe_enc.encoder_dict_

{'A6': ['c', 'q', 'w', 'i', 'ff'], 'A7': ['v', 'h', 'ff', 'bb', 'z']}

In [123]:
# let's transform train and test sets

X_train_enc = ohe_enc.transform(X_train)
X_test_enc = ohe_enc.transform(X_test)

In [124]:
# let's inspect the result

X_train_enc.head()

Unnamed: 0,A1,A2,A3,A4,A5,A8,A9,A10,A11,A12,...,A6_c,A6_q,A6_w,A6_i,A6_ff,A7_v,A7_h,A7_ff,A7_bb,A7_z
596,a,46.08,3.0,u,g,2.375,t,t,8,t,...,1,0,0,0,0,1,0,0,0,0
303,a,15.92,2.875,u,g,0.085,f,f,0,f,...,0,1,0,0,0,1,0,0,0,0
204,b,36.33,2.125,y,p,0.085,t,t,1,f,...,0,0,1,0,0,1,0,0,0,0
351,b,22.17,0.585,y,p,0.0,f,f,0,f,...,0,0,0,0,1,0,0,1,0,0
118,b,57.83,7.04,u,g,14.0,t,t,6,t,...,0,0,0,0,0,1,0,0,0,0


In [125]:
X_test_enc.head()

Unnamed: 0,A1,A2,A3,A4,A5,A8,A9,A10,A11,A12,...,A6_c,A6_q,A6_w,A6_i,A6_ff,A7_v,A7_h,A7_ff,A7_bb,A7_z
14,a,45.83,10.5,u,g,5.0,t,t,7,t,...,0,1,0,0,0,1,0,0,0,0
586,b,64.08,20.0,u,g,17.5,t,t,9,t,...,0,0,0,0,0,0,1,0,0,0
140,a,31.25,3.75,u,g,0.625,t,t,9,t,...,0,0,0,0,0,0,1,0,0,0
492,b,39.25,9.5,u,g,6.5,t,t,14,f,...,0,0,0,0,0,1,0,0,0,0
350,a,26.17,2.0,u,g,0.0,f,f,0,t,...,0,0,0,0,0,0,0,0,0,0


## Recipe 23 - Restart 

In [126]:
# for integer encoding using sklearn
from sklearn.preprocessing import OrdinalEncoder

# for integer encoding using feature-engine - namespace conflict ....
# from feature_engine.categorical_encoders import OrdinalCategoricalEncoder

In [127]:
data = pd.read_csv('creditApprovalUCI.csv')

data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [128]:
# make a list with the categorical variables

vars_categorical = ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [129]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['A16'], axis=1),  # predictors
    data['A16'],  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

X_train.shape, X_test.shape

((483, 15), (207, 15))

## Ordinal encoding with pandas

In [130]:
# let's create a dictionary with the mappings of categories to numbers for A7

ordinal_mapping = {
    k: i
    for i, k in enumerate(X_train['A7'].unique(), 0)
}

ordinal_mapping

{'v': 0,
 'ff': 1,
 'h': 2,
 'dd': 3,
 'z': 4,
 'bb': 5,
 'j': 6,
 'Missing': 7,
 'n': 8,
 'o': 9}

In [131]:
# replace the labels with the integers

X_train['A7'] = X_train['A7'].map(ordinal_mapping)
X_test['A7'] = X_test['A7'].map(ordinal_mapping)

In [132]:
# let's explore the result

X_train['A7'].head(10)

596    0
303    0
204    0
351    1
118    0
247    2
652    0
513    3
230    0
250    4
Name: A7, dtype: int64

### Putting the code in a function

In [133]:
# we can turn the previous commands into 2 functions

def find_category_mappings(df, variable):
    """Map each distinct value of ``df[variable]`` to an integer code.

    Codes start at 0 and follow the order in which the values are returned
    by ``df[variable].unique()``.
    """
    categories = df[variable].unique()
    return dict(zip(categories, range(len(categories))))


def integer_encode(train, test, variable, ordinal_mapping):
    """Replace the categories of ``variable`` with integers, in place.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Data sets to encode; the ``variable`` column of both is overwritten.
    variable : str
        Name of the column to encode.
    ordinal_mapping : dict
        Category -> integer mapping, e.g. as built by
        ``find_category_mappings`` on the train set.

    Returns
    -------
    None
        Both frames are modified in place. Values of ``variable`` that are
        not keys of ``ordinal_mapping`` become NaN (``Series.map`` behaviour).
    """
    # Operate on the frames that were passed in, not on the notebook-level
    # globals X_train / X_test, so the function works for any pair of frames.
    train[variable] = train[variable].map(ordinal_mapping)
    test[variable] = test[variable].map(ordinal_mapping)

In [134]:
# and now we run a loop over the remaining categorical variables
# and encode them to numbers

for variable in vars_categorical:
    
    if variable != 'A7':  # we encoded this one already
        
        mappings = find_category_mappings(X_train, variable)
        
        integer_encode(X_train, X_test, variable, mappings)

In [135]:
# let's inspect the results

X_train[vars_categorical].head()

Unnamed: 0,A1,A4,A5,A6,A7,A9,A10,A12,A13
596,0,0,0,0,0,0,0,0,0
303,0,0,0,1,0,1,1,1,0
204,1,1,1,2,0,0,0,1,0
351,1,1,1,3,1,1,1,1,0
118,1,0,0,4,0,0,0,0,0


## Ordinal encoding with Scikit-learn

In [136]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['A16'], axis=1),  # predictors
    data['A16'],  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

X_train.shape, X_test.shape

((483, 15), (207, 15))

In [137]:
# let's create an encoder

le = OrdinalEncoder()

In [138]:
# let's fit the encoder to the train set
le.fit(X_train[vars_categorical])

OrdinalEncoder()

In [139]:
# we can see the unique classes

le.categories_

[array(['Missing', 'a', 'b'], dtype=object),
 array(['Missing', 'l', 'u', 'y'], dtype=object),
 array(['Missing', 'g', 'gg', 'p'], dtype=object),
 array(['Missing', 'aa', 'c', 'cc', 'd', 'e', 'ff', 'i', 'j', 'k', 'm',
        'q', 'r', 'w', 'x'], dtype=object),
 array(['Missing', 'bb', 'dd', 'ff', 'h', 'j', 'n', 'o', 'v', 'z'],
       dtype=object),
 array(['f', 't'], dtype=object),
 array(['f', 't'], dtype=object),
 array(['f', 't'], dtype=object),
 array(['g', 'p', 's'], dtype=object)]

In [140]:
# let's transform train and test sets

X_train_enc = le.transform(X_train[vars_categorical])
X_test_enc = le.transform(X_test[vars_categorical])

In [141]:
#let's inspect the result


pd.DataFrame(X_train_enc, columns=vars_categorical).head()

Unnamed: 0,A1,A4,A5,A6,A7,A9,A10,A12,A13
0,1.0,2.0,1.0,2.0,8.0,1.0,1.0,1.0,0.0
1,1.0,2.0,1.0,11.0,8.0,0.0,0.0,0.0,0.0
2,2.0,3.0,3.0,13.0,8.0,1.0,1.0,0.0,0.0
3,2.0,3.0,3.0,6.0,3.0,0.0,0.0,0.0,0.0
4,2.0,2.0,1.0,10.0,8.0,1.0,1.0,1.0,0.0


## Ordinal encoding with Feature-Engine

In [142]:
# let's create the encoder

from feature_engine.encoding import OrdinalEncoder

ordinal_enc = OrdinalEncoder(
    encoding_method='arbitrary',
    variables=vars_categorical)

In [143]:
# let's fit the encoder to the train set

ordinal_enc.fit(X_train)

OrdinalEncoder(encoding_method='arbitrary',
               variables=['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12',
                          'A13'])

In [144]:
# let's inspect which variables the encoder will encode

ordinal_enc.variables

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [145]:
# in the encoder dict we can observe the numbers
# assigned to each category for all the indicated variables

ordinal_enc.encoder_dict_

{'A1': {'a': 0, 'b': 1, 'Missing': 2},
 'A4': {'u': 0, 'y': 1, 'Missing': 2, 'l': 3},
 'A5': {'g': 0, 'p': 1, 'Missing': 2, 'gg': 3},
 'A6': {'c': 0,
  'q': 1,
  'w': 2,
  'ff': 3,
  'm': 4,
  'i': 5,
  'e': 6,
  'cc': 7,
  'x': 8,
  'd': 9,
  'k': 10,
  'j': 11,
  'Missing': 12,
  'aa': 13,
  'r': 14},
 'A7': {'v': 0,
  'ff': 1,
  'h': 2,
  'dd': 3,
  'z': 4,
  'bb': 5,
  'j': 6,
  'Missing': 7,
  'n': 8,
  'o': 9},
 'A9': {'t': 0, 'f': 1},
 'A10': {'t': 0, 'f': 1},
 'A12': {'t': 0, 'f': 1},
 'A13': {'g': 0, 's': 1, 'p': 2}}

In [146]:
# let's transform the train and test sets

X_train = ordinal_enc.transform(X_train)
X_test = ordinal_enc.transform(X_test)

In [147]:
# let's explore the result

X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
596,0,46.08,3.0,0,0,0,0,2.375,0,0,8,0,0,396.0,4159
303,0,15.92,2.875,0,0,1,0,0.085,1,1,0,1,0,120.0,0
204,1,36.33,2.125,1,1,2,0,0.085,0,0,1,1,0,50.0,1187
351,1,22.17,0.585,1,1,3,1,0.0,1,1,0,1,0,100.0,0
118,1,57.83,7.04,0,0,4,0,14.0,0,0,6,0,0,360.0,1332


## Recipe 24 
Replacing category variables by counts ....

In [148]:
# to split the datasets
from sklearn.model_selection import train_test_split

# to encode with feature-engine
from feature_engine.encoding import CountFrequencyEncoder

# from feature_engine.categorical_encoders import CountFrequencyCategoricalEncoder

In [149]:
data = pd.read_csv('creditApprovalUCI.csv')

data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [150]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['A16'], axis=1),  # predictors
    data['A16'],  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

X_train.shape, X_test.shape

((483, 15), (207, 15))

## Count encoding with Pandas

In [151]:
# let's obtain the counts for each category in the variable A7
# strategy is to create a dict mapping labels to counts - then using the map function to make the switch...

count_map = X_train['A7'].value_counts().to_dict()

count_map

{'v': 277,
 'h': 101,
 'ff': 41,
 'bb': 39,
 'z': 7,
 'dd': 5,
 'j': 5,
 'Missing': 4,
 'n': 3,
 'o': 1}

In [152]:
# replace the labels with the counts

X_train['A7'] = X_train['A7'].map(count_map)
X_test['A7'] = X_test['A7'].map(count_map)

In [153]:
# let's explore the result

X_train['A7'].head(10)

596    277
303    277
204    277
351     41
118    277
247    101
652    277
513      5
230    277
250      7
Name: A7, dtype: int64

In [154]:
# if instead of the count we would like the frequency
# we need only divide the count by the total number of observations:

frequency_map = (X_train['A6'].value_counts() / len(X_train) ).to_dict()
frequency_map

{'c': 0.19254658385093168,
 'q': 0.11594202898550725,
 'w': 0.09937888198757763,
 'i': 0.08488612836438923,
 'ff': 0.07867494824016563,
 'k': 0.07867494824016563,
 'aa': 0.07039337474120083,
 'cc': 0.062111801242236024,
 'm': 0.053830227743271224,
 'x': 0.049689440993788817,
 'e': 0.043478260869565216,
 'd': 0.043478260869565216,
 'j': 0.016563146997929608,
 'Missing': 0.008281573498964804,
 'r': 0.002070393374741201}

In [155]:
# replace the labels with the frequency

X_train['A6'] = X_train['A6'].map(frequency_map)
X_test['A6'] = X_test['A6'].map(frequency_map)

## Putting above code into a function

In [156]:
def count_mappings(df, variable):
    """Return a ``{category: count}`` dict for column ``variable`` of ``df``."""
    counts = df[variable].value_counts()
    return counts.to_dict()


def frequency_mappings(df, variable):
    """Return a ``{category: frequency}`` dict for column ``variable``.

    The frequency is the category count divided by the total number of
    rows in ``df`` (``len(df)``).
    """
    n_rows = len(df)
    counts = df[variable].value_counts()
    return (counts / n_rows).to_dict()


def encode(train, test, variable, mapping):
    """Replace the categories of ``variable`` using ``mapping``, in place.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Data sets to encode; the ``variable`` column of both is overwritten.
    variable : str
        Name of the column to encode.
    mapping : dict
        Category -> number mapping (for example counts or frequencies
        learned from the train set).

    Returns
    -------
    None
        Both frames are modified in place. Values of ``variable`` that are
        not keys of ``mapping`` become NaN (``Series.map`` behaviour).
    """
    # Operate on the frames that were passed in, not on the notebook-level
    # globals X_train / X_test, so the function works for any pair of frames.
    train[variable] = train[variable].map(mapping)
    test[variable] = test[variable].map(mapping)

In [157]:
# make a list with the categorical variables - need to automate  this.....

vars_categorical = ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [171]:
# In general, instead of using hard-coded column names, use the following...
cat_vars = data.select_dtypes(include=['object']).columns.tolist()
cat_vars

# Also for numerical cols use the below code....
# num_vars = data.select_dtypes(exclude=['object']).columns.tolist()

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [158]:
# encode multiple categorical variables with the functions
# above

for variable in vars_categorical:
    mappings = count_mappings(X_train, variable)
    encode(X_train, X_test, variable, mappings)

## Count and frequency encoding with Feature-Engine

In [159]:
# let's divide into train and test sets (again)

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['A16'], axis=1),  # predictors
    data['A16'],  # target
    test_size=0.3,  # percentage of obs in test set
    random_state=0)  # seed to ensure reproducibility

In [163]:
# let's create an encoder

# make sure the right import is in place 
from feature_engine.encoding import CountFrequencyEncoder

# does not set the variables field correctly if left as None
#count_enc = CountFrequencyEncoder(
#    encoding_method='count', # to do frequency ==> encoding_method='frequency'
#    variables=None)
 
count_enc = CountFrequencyEncoder(
    encoding_method='count', # to do frequency ==> encoding_method='frequency'
    variables=vars_categorical)

In [164]:
# fit the encoder to the train set
count_enc.fit(X_train)

CountFrequencyEncoder(variables=['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10',
                                 'A12', 'A13'])

In [165]:
# the encoder stores the variables it will encode

count_enc.variables

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [166]:
# the encoder dict contains the counts or frequencies
# per category per variable

count_enc.encoder_dict_

{'A1': {'b': 335, 'a': 144, 'Missing': 4},
 'A4': {'u': 363, 'y': 115, 'Missing': 4, 'l': 1},
 'A5': {'g': 363, 'p': 115, 'Missing': 4, 'gg': 1},
 'A6': {'c': 93,
  'q': 56,
  'w': 48,
  'i': 41,
  'ff': 38,
  'k': 38,
  'aa': 34,
  'cc': 30,
  'm': 26,
  'x': 24,
  'e': 21,
  'd': 21,
  'j': 8,
  'Missing': 4,
  'r': 1},
 'A7': {'v': 277,
  'h': 101,
  'ff': 41,
  'bb': 39,
  'z': 7,
  'dd': 5,
  'j': 5,
  'Missing': 4,
  'n': 3,
  'o': 1},
 'A9': {'t': 256, 'f': 227},
 'A10': {'f': 271, 't': 212},
 'A12': {'f': 263, 't': 220},
 'A13': {'g': 441, 's': 38, 'p': 4}}

In [167]:
# let's transform train and test sets

X_train_enc = count_enc.transform(X_train)
X_test_enc = count_enc.transform(X_test)

In [168]:
# let's inspect the result

X_train_enc.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
596,144,46.08,3.0,363,363,93,277,2.375,256,212,8,220,441,396.0,4159
303,144,15.92,2.875,363,363,56,277,0.085,227,271,0,263,441,120.0,0
204,335,36.33,2.125,115,115,48,277,0.085,256,212,1,263,441,50.0,1187
351,335,22.17,0.585,115,115,38,41,0.0,227,271,0,263,441,100.0,0
118,335,57.83,7.04,363,363,26,277,14.0,256,212,6,220,441,360.0,1332


In [169]:
X_test_enc.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
14,144,45.83,10.5,363,363,56,277,5.0,256,212,7,220,441,0.0,0
586,335,64.08,20.0,363,363,24,101,17.5,256,212,9,220,441,0.0,1000
140,144,31.25,3.75,363,363,30,101,0.625,256,212,9,220,441,181.0,0
492,335,39.25,9.5,363,363,26,277,6.5,256,212,14,263,441,240.0,4607
350,144,26.17,2.0,363,363,8,5,0.0,227,271,0,220,441,276.0,1
