In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from feature_engine.missing_data_imputers import ArbitraryNumberImputer

In [2]:
# Read the CSV file (path is relative to the working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='creditApprovalUCI.csv')

In [18]:
# Hold out 30% of the rows for testing. Column 'A16' is the target and every
# other column is kept as a predictor; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='A16'),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

In [4]:
# Largest observed value of each of the four numerical variables in the
# training split (displayed as the cell output).
X_train.loc[:, ['A2', 'A3', 'A8', 'A11']].max()

A2     76.750
A3     26.335
A8     20.000
A11    67.000
dtype: float64

In [5]:
# Replace missing values in the four numerical variables with the arbitrary
# number 99, in both the training and the test split.
#
# The result of DataFrame.fillna is assigned back rather than calling
# `X_train[var].fillna(99, inplace=True)`: that chained in-place form operates
# on a temporary column selection, which triggers pandas' chained-assignment
# warnings and leaves the frame unchanged when Copy-on-Write is enabled.
# Passing a {column: value} mapping restricts the fill to the listed columns.
fill_values = {var: 99 for var in ['A2', 'A3', 'A8', 'A11']}
X_train = X_train.fillna(fill_values)
X_test = X_test.fillna(fill_values)

In [6]:
# Split again from `data`, this time keeping only the four numerical
# predictors; same 70/30 proportion and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    data[['A2', 'A3', 'A8', 'A11']],
    data['A16'],
    test_size=0.3,
    random_state=0,
)

In [7]:
# scikit-learn imputer configured to replace missing entries with the
# constant 99.
imputer = SimpleImputer(
    strategy='constant',
    fill_value=99,
)

In [8]:
# Fit the imputer on the training split only; the fitted estimator's repr is
# shown as the cell output.
imputer.fit(X=X_train)

SimpleImputer(fill_value=99, strategy='constant')

In [9]:
# Apply the fitted imputer to the training split, then to the test split,
# rebinding both names to the imputed results.
# NOTE: with scikit-learn's default output settings these are NumPy arrays,
# so the column labels are no longer attached.
X_train, X_test = [imputer.transform(split) for split in (X_train, X_test)]

In [12]:
# Rebuild the splits from `data` (numerical predictors only) so that the next
# imputer receives DataFrames with missing values still present.
X_train, X_test, y_train, y_test = train_test_split(
    data[['A2', 'A3', 'A8', 'A11']],
    data['A16'],
    test_size=0.3,
    random_state=0,
)

In [13]:
# feature-engine imputer configured with the arbitrary number 99 for the four
# listed variables.
imputer = ArbitraryNumberImputer(
    arbitrary_number=99,
    variables=['A2', 'A3', 'A8', 'A11'],
)

In [14]:
# Fit on the training split only; the fitted imputer's repr is the cell output.
imputer.fit(X=X_train)

ArbitraryNumberImputer(arbitrary_number=99, variables=['A2', 'A3', 'A8', 'A11'])

In [15]:
# Impute the training split first, then the test split, and rebind both names
# to the transformed results.
X_train, X_test = [imputer.transform(split) for split in (X_train, X_test)]

Unnamed: 0,A2,A3,A8,A11
596,46.08,3.000,2.375,8
303,15.92,2.875,0.085,0
204,36.33,2.125,0.085,1
351,22.17,0.585,0.000,0
118,57.83,7.040,14.000,6
...,...,...,...,...
359,36.75,4.710,0.000,0
192,41.75,0.960,2.500,0
629,19.58,0.665,1.665,0
559,22.83,2.290,2.290,7


# Capturing missing values in a bespoke category

In [17]:
from feature_engine.missing_data_imputers import CategoricalVariableImputer

In [19]:
# Fresh 70/30 split from `data`: 'A16' is the target, all remaining columns
# are predictors, and random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='A16'),
    data['A16'],
    test_size=0.3,
    random_state=0,
)

In [20]:
# Replace missing values in the four categorical variables with the label
# 'Missing', in both the training and the test split.
#
# The result of DataFrame.fillna is assigned back rather than calling
# `X_train[var].fillna('Missing', inplace=True)`: that chained in-place form
# operates on a temporary column selection, which triggers pandas'
# chained-assignment warnings and leaves the frame unchanged when
# Copy-on-Write is enabled. The {column: value} mapping restricts the fill to
# the listed columns.
fill_values = {var: 'Missing' for var in ['A4', 'A5', 'A6', 'A7']}
X_train = X_train.fillna(fill_values)
X_test = X_test.fillna(fill_values)

In [21]:
# Split again from `data`, keeping only the four categorical predictors;
# same 70/30 proportion and seed as the other splits.
X_train, X_test, y_train, y_test = train_test_split(
    data[['A4', 'A5', 'A6', 'A7']],
    data['A16'],
    test_size=0.3,
    random_state=0,
)

In [22]:
# scikit-learn imputer configured to replace missing entries with the
# constant label 'Missing'.
imputer = SimpleImputer(
    strategy='constant',
    fill_value='Missing',
)
# Fit on the training split only; the fitted estimator's repr is the cell output.
imputer.fit(X=X_train)

SimpleImputer(fill_value='Missing', strategy='constant')

In [23]:
# Apply the fitted imputer to the training split, then to the test split,
# rebinding both names to the imputed results.
# NOTE: with scikit-learn's default output settings these are NumPy arrays,
# so the column labels are no longer attached.
X_train, X_test = [imputer.transform(split) for split in (X_train, X_test)]

In [25]:
# Names of the four categorical variables, in the same order used by the
# splits and imputers in this section. The original list repeated 'A4' and
# omitted 'A5'.
# NOTE(review): presumably meant for re-labelling the arrays returned by
# SimpleImputer; no use of `columns` is visible here, so confirm.
columns = ['A4', 'A5', 'A6', 'A7']

In [28]:
# Rebuild the splits from `data` (categorical predictors only) so that the
# next imputer receives DataFrames with missing values still present.
X_train, X_test, y_train, y_test = train_test_split(
    data[['A4', 'A5', 'A6', 'A7']],
    data['A16'],
    test_size=0.3,
    random_state=0,
)

In [29]:
# feature-engine imputer restricted to the four categorical variables; no
# fill label is passed, so the library default is used.
imputer = CategoricalVariableImputer(
    variables=['A4', 'A5', 'A6', 'A7'],
)

In [30]:
# Fit on the training split only; the fitted imputer's repr is the cell output.
imputer.fit(X=X_train)

CategoricalVariableImputer(variables=['A4', 'A5', 'A6', 'A7'])

In [31]:
# Impute the training split first, then the test split, and rebind both names
# to the transformed results.
X_train, X_test = [imputer.transform(split) for split in (X_train, X_test)]

In [32]:
# Fraction of missing values per column in the imputed training split;
# 0.0 everywhere means nothing was left unfilled.
X_train.isna().mean()

A4    0.0
A5    0.0
A6    0.0
A7    0.0
dtype: float64