## Complete Case Analysis

Complete-case analysis (CCA), also called "list-wise deletion" of cases, consists in discarding observations where values in any of the variables are missing. Complete Case Analysis means literally analyzing only those observations for which there is information in all of the variables in the data set.

In [1]:
import pandas as pd

# to show all the columns of the dataframe in the notebeook
pd.set_option('display.max_columns', None)

In [2]:
# load data
data = pd.read_csv('creditcardIU.csv')
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [3]:
data.shape

(690, 16)

In [4]:
data['class'].value_counts()

-    383
+    307
Name: class, dtype: int64

In [48]:
data.isnull().sum()

A1       12
A2       12
A3        0
A4        6
A5        6
A6        9
A7        9
A8        0
A9        0
A10       0
A11       0
A12       0
A13       0
A14      13
A15       0
class     0
dtype: int64

In [6]:
# let's inspect the percentage of missing values in each variable

data.isnull().mean().sort_values(ascending=False)

A14      0.018841
A1       0.017391
A2       0.017391
A6       0.013043
A7       0.013043
A4       0.008696
A5       0.008696
A3       0.000000
A8       0.000000
A9       0.000000
A10      0.000000
A11      0.000000
A12      0.000000
A13      0.000000
A15      0.000000
class    0.000000
dtype: float64

In [13]:
# create a complete case data set

data_cca = data.dropna()

In [9]:
print('Number of total observations: {}'.format(len(data)))
print('Number of observations with complete cases cda: {}'.format(len(data_cca)))


Number of total observations: 690
Number of observations with complete cases: 653


In [18]:
# we can also indicate for which variables we would like the complete cases

data_cba = data.dropna(subset=[
    'A1',
])

In [19]:
print('Number of total observations: {}'.format(len(data)))
print('Number of observations with complete cases cca: {}'.format(len(data_cca)))
print('Number of observations with complete cases cba: {}'.format(len(data_cba)))

Number of total observations: 690
Number of observations with complete cases: 678


## Mean or median Imputation

Mean / median imputation consists in replacing all occurrences of missing values (NA) in a variable by the mean (if the variable has a Gaussian distribution) or median (if the variable has a skewed distribution).

In this recipe, we will replace missing values by the median or the mean using pandas, Scikit-learn and Feature-Engine, all open source Python libraries.

In [22]:
import pandas as pd
from sklearn.impute import SimpleImputer

In [47]:
# load data
data = pd.read_csv('creditcardIU.csv')
data.head()


Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [11]:
data.fillna(data.mean())

  data.fillna(data.mean())


Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,280.0,824,+
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0,-
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,200.0,394,-
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,200.0,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750,-


## Mean / median imputation with pandas

In [20]:
# replace NA in indicated numerical variables

for var in ['A2', 'A3', 'A8', 'A11', 'A15']:
    
    data[var] = data[var].fillna(data[var].median())
   

A2
A3
A8
A11
A15


In [24]:
# check absence of missing values in imputed variables

data[['A2', 'A3', 'A8', 'A11', 'A15']].isnull().sum()

A2     12
A3      0
A8      0
A11     0
A15     0
dtype: int64

## Mean / median imputation with Scikit-learn

In [39]:
# create a median imputation object with SimpleImputer
imputer = SimpleImputer(strategy='median')

# let's fit the imputer to the train set
# the imputer will learn the median of all variables
imputer.fit(data.select_dtypes(exclude = 'object'))

# we can look at the learnt medians:
imputer.statistics_

array([ 28.46,   2.75,   1.  ,   0.  , 160.  ,   5.  ])

In [40]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      678 non-null    object 
 1   A2      678 non-null    float64
 2   A3      690 non-null    float64
 3   A4      684 non-null    object 
 4   A5      684 non-null    object 
 5   A6      681 non-null    object 
 6   A7      681 non-null    object 
 7   A8      690 non-null    float64
 8   A9      690 non-null    object 
 9   A10     690 non-null    object 
 10  A11     690 non-null    int64  
 11  A12     690 non-null    object 
 12  A13     690 non-null    object 
 13  A14     677 non-null    float64
 14  A15     690 non-null    int64  
 15  class   690 non-null    object 
dtypes: float64(4), int64(2), object(10)
memory usage: 86.4+ KB


In [49]:
data.select_dtypes(exclude = 'object')

Unnamed: 0,A2,A3,A8,A11,A14,A15
0,30.83,0.000,1.25,1,202.0,0
1,58.67,4.460,3.04,6,43.0,560
2,24.50,0.500,1.50,0,280.0,824
3,27.83,1.540,3.75,5,100.0,3
4,20.17,5.625,1.71,0,120.0,0
...,...,...,...,...,...,...
685,21.08,10.085,1.25,0,260.0,0
686,22.67,0.750,2.00,2,200.0,394
687,25.25,13.500,2.00,1,200.0,1
688,17.92,0.205,0.04,0,280.0,750


In [42]:
# and now we impute the train and test sets
# NOTE: the data is returned as a numpy array!!!

df = imputer.transform(data.select_dtypes(exclude = 'object'))

In [43]:
df

array([[3.083e+01, 0.000e+00, 1.250e+00, 1.000e+00, 2.020e+02, 0.000e+00],
       [5.867e+01, 4.460e+00, 3.040e+00, 6.000e+00, 4.300e+01, 5.600e+02],
       [2.450e+01, 5.000e-01, 1.500e+00, 0.000e+00, 2.800e+02, 8.240e+02],
       ...,
       [2.525e+01, 1.350e+01, 2.000e+00, 1.000e+00, 2.000e+02, 1.000e+00],
       [1.792e+01, 2.050e-01, 4.000e-02, 0.000e+00, 2.800e+02, 7.500e+02],
       [3.500e+01, 3.375e+00, 8.290e+00, 0.000e+00, 0.000e+00, 0.000e+00]])

In [50]:
# check that missing values were removed

pd.DataFrame(df, columns = data.select_dtypes(exclude = 'object').columns).isnull().sum()

A2     0
A3     0
A8     0
A11    0
A14    0
A15    0
dtype: int64

## Replacing missing values by an arbitrary number

In this recipe, we will replace missing values by an arbitrary number using pandas, Scikit-learn and Feature-Engine, all open source Python libraries.

In [30]:
# to impute missing data with sklearn
from sklearn.impute import SimpleImputer


In [51]:
# load data
data = pd.read_csv('creditcardIU.csv')
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [52]:
# find the percentage of missing data per variable

data.isnull().mean()

A1       0.017391
A2       0.017391
A3       0.000000
A4       0.008696
A5       0.008696
A6       0.013043
A7       0.013043
A8       0.000000
A9       0.000000
A10      0.000000
A11      0.000000
A12      0.000000
A13      0.000000
A14      0.018841
A15      0.000000
class    0.000000
dtype: float64

## Arbitrary imputation with pandas

In [53]:
# find the maximum value per variable
data[['A2','A3', 'A8', 'A11']].max()

A2     80.25
A3     28.00
A8     28.50
A11    67.00
dtype: float64

In [34]:
# replace NA with 99 in indicated numerical variables

for var in ['A2','A3', 'A8', 'A11']:
    
    data[var].fillna(99, inplace=True)

In [35]:
# check absence of missing values
data[['A2','A3', 'A8', 'A11']].isnull().sum()

A2     0
A3     0
A8     0
A11    0
dtype: int64

In [39]:
data[data['A2']==99]

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
83,a,99.0,3.5,u,g,d,v,3.0,t,f,0,t,g,300.0,0,-
86,b,99.0,0.375,u,g,d,v,0.875,t,f,0,t,s,928.0,0,-
92,b,99.0,5.0,y,p,aa,v,8.5,t,f,0,f,g,0.0,0,-
97,b,99.0,0.5,u,g,c,bb,0.835,t,f,0,t,s,320.0,0,-
254,b,99.0,0.625,u,g,k,v,0.25,f,f,0,f,g,380.0,2010,-
286,a,99.0,1.5,u,g,ff,ff,0.0,f,t,2,t,g,200.0,105,-
329,b,99.0,4.0,y,p,i,v,0.085,f,f,0,t,g,411.0,0,-
445,a,99.0,11.25,u,g,ff,ff,0.0,f,f,0,f,g,,5200,-
450,b,99.0,3.0,y,p,i,bb,7.0,f,f,0,f,g,0.0,1,-
500,b,99.0,4.0,u,g,x,v,5.0,t,t,3,t,g,290.0,2279,+


## Arbitrary imputation with Scikit-learn

In [36]:
# create an instance of the simple imputer
imputer = SimpleImputer(strategy='constant', fill_value=999)

# we fit the imputer to the train set
imputer.fit(data)

# we can look at the constant values:
imputer.statistics_

array([999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999,
       999, 999, 999], dtype=object)

In [37]:
# and now we impute the train and test set
# NOTE: the data is returned as a numpy array!!!

df = imputer.transform(data)

In [38]:
# check that missing values were removed
pd.DataFrame(df, columns =data.columns).isnull().sum()

A1       0
A2       0
A3       0
A4       0
A5       0
A6       0
A7       0
A8       0
A9       0
A10      0
A11      0
A12      0
A13      0
A14      0
A15      0
class    0
dtype: int64

## Adding a bespoke category

In this recipe, we will create a 'Missing' category to replace missing values in categorical variables using pandas, Scikit-learn and Feature-Engine, all open source Python libraries.

In [40]:
import pandas as pd

# to impute missing data with sklearn
from sklearn.impute import SimpleImputer


In [41]:
# load data
data = pd.read_csv('creditcardIU.csv')
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [43]:
# find the percentage of missing data per variable

data.isnull().mean()

A1       0.017391
A2       0.017391
A3       0.000000
A4       0.008696
A5       0.008696
A6       0.013043
A7       0.013043
A8       0.000000
A9       0.000000
A10      0.000000
A11      0.000000
A12      0.000000
A13      0.000000
A14      0.018841
A15      0.000000
class    0.000000
dtype: float64

## Adding a bespoke category with pandas

In [44]:
# replace NA in some categorical variables

for var in ['A4', 'A5', 'A6', 'A7']:

    data[var].fillna('Missing', inplace=True)

In [45]:
# check absence of missing values
data[['A4', 'A5', 'A6', 'A7']].isnull().sum()

A4    0
A5    0
A6    0
A7    0
dtype: int64

## Adding a bespoke category with Scikit-learn

In [46]:
# create an instance of the simple imputer
imputer = SimpleImputer(strategy='constant', fill_value='Missing')

# we fit the imputer to the train set
imputer.fit(data)

# we can look at the new category:
imputer.statistics_

array(['Missing', 'Missing', 'Missing', 'Missing', 'Missing', 'Missing',
       'Missing', 'Missing', 'Missing', 'Missing', 'Missing', 'Missing',
       'Missing', 'Missing', 'Missing', 'Missing'], dtype=object)

In [47]:
# and now we impute the train and test set
# NOTE: the data is returned as a numpy array!!!

df = imputer.transform(data)

In [48]:
pd.DataFrame(df, columns =data.columns).isnull().sum()

A1       0
A2       0
A3       0
A4       0
A5       0
A6       0
A7       0
A8       0
A9       0
A10      0
A11      0
A12      0
A13      0
A14      0
A15      0
class    0
dtype: int64

## Random Sample Imputation

In this recipe, we will perform random sample imputation using pandas and Feature Engine.

In [68]:
import pandas as pd


In [69]:
# load data
data = pd.read_csv('creditcardIU.csv')
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [70]:
# find the percentage of missing data within those variables

data.isnull().mean()

A1       0.017391
A2       0.017391
A3       0.000000
A4       0.008696
A5       0.008696
A6       0.013043
A7       0.013043
A8       0.000000
A9       0.000000
A10      0.000000
A11      0.000000
A12      0.000000
A13      0.000000
A14      0.018841
A15      0.000000
class    0.000000
dtype: float64

## Random Sample imputation with pandas

In [54]:
# extract a random sample (as many values as missing values in the variable)

number_missing_values = data['A2'].isnull().sum()
number_missing_values

12

In [66]:
# extract a random sample (as many values as missing values in the variable)

random_sample_train = data['A2'].dropna().sample(number_missing_values, random_state=0)

In [67]:
random_sample_train



284    23.25
148    30.50
449    20.00
397    23.58
600    29.50
398    26.17
531    24.58
312    16.33
442    30.58
578    39.17
14     45.83
483    23.75
Name: A2, dtype: float64

In [63]:
random_sample_train

0

In [61]:
# re-index the random sample so that we can join it to our original data

random_sample_train.index = data[data['A2'].isnull()].index

random_sample_train

83     23.25
86     30.50
92     20.00
97     23.58
254    29.50
286    26.17
329    24.58
445    16.33
450    30.58
500    39.17
515    45.83
608    23.75
Name: A2, dtype: float64

In [62]:
# replace the missing values
data.loc[data['A2'].isnull(), 'A2'] = random_sample_train

data['A2'].isnull().sum()

0

In [84]:
# repeat in a loop for the rest of the variables
# and for both train and test set

for var in ['A1', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8']:

    # extract a random sample
    random_sample_train = data[var].dropna().sample(
        data[var].isnull().sum(), random_state=0)

    # re index the random sample
    random_sample_train.index = data[data[var].isnull()].index

    # replace the NA 
    data.loc[data[var].isnull(), var] = random_sample_train
    
# check missing data
data[['A1', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8']].isnull().sum()

A1    0
A3    0
A4    0
A5    0
A6    0
A7    0
A8    0
dtype: int64

## Adding a missing value indicator variable

In this recipe, we will add binary variables to indicate that a value is missing using pandas, Scikit-learn and Feature-Engine, all open source Python libraries.

In [64]:
import pandas as pd
import numpy as np


In [65]:
# load data
data = pd.read_csv('creditcardIU.csv')
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [66]:
# find the percentage of missing data within those variables

data.isnull().mean()

A1       0.017391
A2       0.017391
A3       0.000000
A4       0.008696
A5       0.008696
A6       0.013043
A7       0.013043
A8       0.000000
A9       0.000000
A10      0.000000
A11      0.000000
A12      0.000000
A13      0.000000
A14      0.018841
A15      0.000000
class    0.000000
dtype: float64

## Add missing indicator with pandas

In [67]:
# add missing indicator

for var in ['A1', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8']:
    data[var+'_NA'] = np.where(data[var].isnull(), 1, 0)
    
# check the new missing indicator variables
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class,A1_NA,A3_NA,A4_NA,A5_NA,A6_NA,A7_NA,A8_NA
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+,0,0,0,0,0,0,0
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+,0,0,0,0,0,0,0
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+,0,0,0,0,0,0,0
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+,0,0,0,0,0,0,0
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+,0,0,0,0,0,0,0


In [68]:
data[data['A4'].isnull()]

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class,A1_NA,A3_NA,A4_NA,A5_NA,A6_NA,A7_NA,A8_NA
206,a,71.58,0.0,,,,,0.0,f,f,0,f,p,,0,+,0,0,1,1,1,1,0
270,b,37.58,0.0,,,,,0.0,f,f,0,f,p,,0,+,0,0,1,1,1,1,0
330,b,20.42,0.0,,,,,0.0,f,f,0,f,p,,0,-,0,0,1,1,1,1,0
456,b,34.58,0.0,,,,,0.0,f,f,0,f,p,,0,-,0,0,1,1,1,1,0
592,b,23.17,0.0,,,,,0.0,f,f,0,f,p,,0,+,0,0,1,1,1,1,0
622,a,25.58,0.0,,,,,0.0,f,f,0,f,p,,0,+,0,0,1,1,1,1,0


In [59]:
# the mean of the missing indicator should be the same as the 
# percentage of missing values in the original variable

data['A1'].isnull().mean(), data['A1_NA'].mean()

(0.017391304347826087, 0.017391304347826087)

## Adding missing indicator with Scikit-learn

In [69]:
import pandas as pd
from sklearn.impute import MissingIndicator

In [70]:
data = pd.read_csv('creditcardIU.csv')

In [71]:
indicator = MissingIndicator(error_on_new=True, features='missing-only')
indicator.fit(data) 

In [75]:
indicator.features_

array([ 0,  1,  3,  4,  5,  6, 13], dtype=int64)

In [74]:
# we can see the features with na:
# the result shows the column index in the NumPy array

data.columns[indicator.features_]

Index(['A1', 'A2', 'A4', 'A5', 'A6', 'A7', 'A14'], dtype='object')

In [76]:
# with Sklearn we need to join the missing indicators dataframe
# to the original X_train

# let's create a column name for each of the new MissingIndicators
indicator_cols = [c+'_NA' for c in data.columns[indicator.features_]]

# and now let's concatenate the original dataset with the missing indicators
data = pd.concat([
    data.reset_index(),
    pd.DataFrame(indicator.transform(data), columns = indicator_cols)],
    axis=1)

data.head()

Unnamed: 0,index,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class,A1_NA,A2_NA,A4_NA,A5_NA,A6_NA,A7_NA,A14_NA
0,0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+,False,False,False,False,False,False,False
1,1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+,False,False,False,False,False,False,False
2,2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+,False,False,False,False,False,False,False
3,3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+,False,False,False,False,False,False,False
4,4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+,False,False,False,False,False,False,False
