In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
#from sklearn.preprocessing import Imputer

In [2]:
# Seed NumPy's legacy global random generator so that any later random draws are repeatable.
np.random.seed(1)

In [3]:
# Load the example table (path is relative to the notebook's working directory).
# The frame displayed below has one missing entry in each of its five columns.
df = pd.read_csv("imputation_data_test.csv")

## Pandas fillna - median: cannot impute non-numeric data

In [4]:
# Show the raw frame so the missing entries are visible before imputing.
df

Unnamed: 0,Gender,Age,PhD,School,Salary
0,Male,20.0,0.0,A,100.0
1,Female,30.0,0.0,B,120.0
2,Male,28.0,1.0,C,140.0
3,Male,54.0,1.0,A,200.0
4,Male,18.0,1.0,B,
5,Male,50.0,1.0,,100.0
6,Female,60.0,,A,115.0
7,Male,,1.0,A,100.0
8,,20.0,0.0,A,180.0


In [5]:
# Fill each numeric column with its own median.
# numeric_only=True is required on pandas >= 2.0: without it, median() tries to
# reduce the string columns (Gender, School) and raises TypeError.
# Non-numeric columns get no fill value, so their missing entries stay missing.
df_imputed = df.fillna(df.median(numeric_only=True))
df_imputed

Unnamed: 0,Gender,Age,PhD,School,Salary
0,Male,20.0,0.0,A,100.0
1,Female,30.0,0.0,B,120.0
2,Male,28.0,1.0,C,140.0
3,Male,54.0,1.0,A,200.0
4,Male,18.0,1.0,B,117.5
5,Male,50.0,1.0,,100.0
6,Female,60.0,1.0,A,115.0
7,Male,29.0,1.0,A,100.0
8,,20.0,0.0,A,180.0


## Pandas fillna - mean: cannot impute non-numeric data

In [6]:
# Show the raw frame again: df itself is never modified, only df_imputed is reassigned.
df

Unnamed: 0,Gender,Age,PhD,School,Salary
0,Male,20.0,0.0,A,100.0
1,Female,30.0,0.0,B,120.0
2,Male,28.0,1.0,C,140.0
3,Male,54.0,1.0,A,200.0
4,Male,18.0,1.0,B,
5,Male,50.0,1.0,,100.0
6,Female,60.0,,A,115.0
7,Male,,1.0,A,100.0
8,,20.0,0.0,A,180.0


In [7]:
# Fill each numeric column with its own mean.
# numeric_only=True is required on pandas >= 2.0: without it, mean() tries to
# reduce the string columns (Gender, School) and raises TypeError.
# Note that a 0/1 column such as PhD receives a fractional fill value this way.
df_imputed = df.fillna(df.mean(numeric_only=True))
df_imputed

Unnamed: 0,Gender,Age,PhD,School,Salary
0,Male,20.0,0.0,A,100.0
1,Female,30.0,0.0,B,120.0
2,Male,28.0,1.0,C,140.0
3,Male,54.0,1.0,A,200.0
4,Male,18.0,1.0,B,131.875
5,Male,50.0,1.0,,100.0
6,Female,60.0,0.625,A,115.0
7,Male,35.0,1.0,A,100.0
8,,20.0,0.0,A,180.0


## Pandas fillna - df.mode().iloc[0]: most frequent value

In [8]:
# Show the raw frame once more as the reference for the mode-based imputation below.
df

Unnamed: 0,Gender,Age,PhD,School,Salary
0,Male,20.0,0.0,A,100.0
1,Female,30.0,0.0,B,120.0
2,Male,28.0,1.0,C,140.0
3,Male,54.0,1.0,A,200.0
4,Male,18.0,1.0,B,
5,Male,50.0,1.0,,100.0
6,Female,60.0,,A,115.0
7,Male,,1.0,A,100.0
8,,20.0,0.0,A,180.0


In [9]:
# df.mode() returns a frame of per-column modes; .iloc[0] keeps its first row,
# giving one fill value per column. Unlike median/mean this also fills the
# string columns (see Gender and School in the output below).
df_imputed = df.fillna(df.mode().iloc[0])
df_imputed

Unnamed: 0,Gender,Age,PhD,School,Salary
0,Male,20.0,0.0,A,100.0
1,Female,30.0,0.0,B,120.0
2,Male,28.0,1.0,C,140.0
3,Male,54.0,1.0,A,200.0
4,Male,18.0,1.0,B,100.0
5,Male,50.0,1.0,A,100.0
6,Female,60.0,1.0,A,115.0
7,Male,20.0,1.0,A,100.0
8,Male,20.0,0.0,A,180.0


## TransformerMixin: most frequent value for non-numeric columns, median for numeric columns

In [10]:
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    """Impute missing values in a pandas DataFrame, one fill value per column.

    - Label-like columns (object, string or categorical dtype) are filled with
      the most frequent value in the column.
    - All other columns are filled with the column median.

    ``fit`` stores the learned fill values in ``self.fill`` (a Series indexed by
    column name); ``transform`` applies them with ``DataFrame.fillna`` and
    returns a new frame. ``fit_transform`` is inherited from TransformerMixin.
    """

    @staticmethod
    def _fill_value(column):
        """Return the value used to fill missing entries of a single column."""
        dtype = column.dtype
        # Checking only for object dtype would route string-dtype and
        # categorical columns to median(), which is not defined for them.
        is_label_like = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if not is_label_like:
            return column.median()

        # value_counts() ignores missing entries and sorts by frequency, so the
        # first index label is the most frequent observed value.
        counts = column.value_counts()
        # A column with no observed values has nothing to impute from; NaN as
        # the fill value leaves it unchanged instead of raising IndexError.
        return counts.index[0] if len(counts) else np.nan

    def fit(self, X, y=None):
        """Learn one fill value per column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose columns are inspected.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.fill = pd.Series([self._fill_value(X[c]) for c in X], index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing entries replaced by the fitted values."""
        return X.fillna(self.fill)

In [11]:
# fit_transform is inherited from TransformerMixin: it fits the imputer on df and
# then transforms the same frame. df itself is not modified (transform uses fillna
# without inplace).
df_imputed = DataFrameImputer().fit_transform(df)
print(df_imputed)

   Gender   Age  PhD School  Salary
0    Male  20.0  0.0      A   100.0
1  Female  30.0  0.0      B   120.0
2    Male  28.0  1.0      C   140.0
3    Male  54.0  1.0      A   200.0
4    Male  18.0  1.0      B   117.5
5    Male  50.0  1.0      A   100.0
6  Female  60.0  1.0      A   115.0
7    Male  29.0  1.0      A   100.0
8    Male  20.0  0.0      A   180.0


## Binary, categorical --> most frequent value; continuous --> median

In [12]:
# Distinct observed values per column; missing entries are not counted.
nu = np.array([df[name].nunique(dropna=True) for name in df.columns])
print('number of uniques of each variable:')
print(nu)

number of uniques of each variable:
[2 7 2 3 6]


In [13]:
def define_variable_type(df,nu,category_threshold=5):
    """Classify each column as binary, categorical or continuous from its unique count.

    Parameters
    ----------
    df : any
        Unused; kept so existing calls ``define_variable_type(df, nu)`` keep working.
    nu : array-like of int
        Number of distinct (non-missing) values of each column.
    category_threshold : int, default 5
        Columns with fewer than this many distinct values (and not exactly 2)
        are treated as categorical. This is a heuristic: a numeric column with
        few distinct values is labelled categorical, and a label column with
        many distinct values is labelled continuous.

    Returns
    -------
    numpy.ndarray of float, same length as ``nu``
        1 = binary, 2 = categorical, 3 = continuous.

    Side effects
    ------------
    Prints the positions of the binary and categorical columns.
    """
    nu = np.asarray(nu)
    is_binary = nu == 2
    # Binary takes precedence, so exclude it from the categorical mask.
    is_category = ~is_binary & (nu < category_threshold)

    # tolist() keeps the printed form a plain Python list of ints.
    i_binary = np.flatnonzero(is_binary).tolist()
    i_category = np.flatnonzero(is_category).tolist()
    print('i_binary:',i_binary)
    print('i_category:',i_category)

    # Everything that is neither binary nor categorical is continuous (3).
    variable_type = np.where(is_binary, 1.0, np.where(is_category, 2.0, 3.0))
    return variable_type

In [14]:
# Codes in the result: 1 = binary, 2 = categorical, 3 = continuous (one entry per column of df).
variable_type = define_variable_type(df,nu)
print(variable_type)

i_binary: [0, 2]
i_category: [3]
[1. 3. 1. 2. 3.]


In [15]:
def impute_missing(df,variable_type):
    """Fill missing values column by column according to each column's type.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    variable_type : sequence of numbers
        One code per column of ``df`` (same order): values below 3 mean
        binary/categorical, anything else means continuous.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which binary/categorical columns are filled with
        their most frequent value and continuous columns with their median.

    Notes
    -----
    - A column typed as continuous but holding non-numeric data (for example a
      label column with many distinct values) has no median, so it is filled
      with its most frequent value instead of raising TypeError.
    - A column with no observed values is left unchanged instead of raising
      IndexError.
    """
    imputed = df.copy()
    for i, col in enumerate(df.columns):
        column = df[col]
        is_continuous = not (variable_type[i] < 3)

        if is_continuous and pd.api.types.is_numeric_dtype(column):
            fill_value = column.median()
        else:
            # mode() ignores missing entries; on ties the first returned value is used.
            modes = column.mode()
            if modes.empty:
                # Nothing observed in this column, so there is nothing to impute from.
                continue
            fill_value = modes.iloc[0]

        imputed[col] = column.fillna(fill_value)
    return imputed

In [16]:
# Impute using the per-column type codes computed above; df is left untouched
# because impute_missing works on a copy.
df_imputed = impute_missing(df,variable_type)
print(df_imputed)

   Gender   Age  PhD School  Salary
0    Male  20.0  0.0      A   100.0
1  Female  30.0  0.0      B   120.0
2    Male  28.0  1.0      C   140.0
3    Male  54.0  1.0      A   200.0
4    Male  18.0  1.0      B   117.5
5    Male  50.0  1.0      A   100.0
6  Female  60.0  1.0      A   115.0
7    Male  29.0  1.0      A   100.0
8    Male  20.0  0.0      A   180.0


In [17]:
def convert_binary_and_category(x,variable_type):
    """
    Encode the columns of ``x`` according to ``variable_type``.

    Parameters
    ----------
    x : numpy.ndarray, shape (n_samples, n_columns)
        Input table without missing values; may be an object array with mixed
        column types.
    variable_type : sequence of numbers
        One code per column: 1 = binary, 2 = categorical, anything else =
        continuous.

    Returns
    -------
    numpy.ndarray of float, shape (n_samples, n_encoded_columns)
        - binary column: one column with -1 for the smallest of its unique
          values (as ordered by ``np.unique``) and +1 otherwise;
        - categorical column: one one-hot column per distinct value;
        - continuous column: copied through unchanged.
        Columns appear in the same order as in ``x``.
    """
    encoded_blocks = []

    for i, i_type in enumerate(variable_type):
        column = x[:, i]

        if i_type == 1:  # binary
            first_value = np.unique(column)[0]
            signs = np.array([-1. if value == first_value else 1. for value in column])
            block = signs[:, np.newaxis]

        elif i_type == 2:  # category
            # OneHotEncoder's `sparse` keyword was renamed to `sparse_output` and
            # later removed. Taking the default sparse result and densifying it
            # with .toarray() works on both old and new scikit-learn releases.
            encoder = OneHotEncoder(categories='auto')
            block = encoder.fit_transform(column.reshape(-1, 1)).toarray()

        else:  # continuous
            block = column[:, np.newaxis]

        encoded_blocks.append(block)

    if not encoded_blocks:
        # No columns requested: keep the row count, return zero columns.
        return np.zeros((x.shape[0], 0))

    # Stack once at the end instead of growing the array inside the loop.
    return np.hstack(encoded_blocks).astype(float)

In [20]:
# convert x
# Convert the imputed frame to a plain array and encode it for modelling.
# df_imputed here is whatever the most recently executed imputation cell produced.
x = np.array(df_imputed)
x_new = convert_binary_and_category(x,variable_type)

# Expected layout for this data: Gender(+-1), Age, PhD(+-1), School one-hot (3 columns), Salary.
print(x_new.shape)
print(x_new)

(9, 7)
[[  1.   20.   -1.    1.    0.    0.  100. ]
 [ -1.   30.   -1.    0.    1.    0.  120. ]
 [  1.   28.    1.    0.    0.    1.  140. ]
 [  1.   54.    1.    1.    0.    0.  200. ]
 [  1.   18.    1.    0.    1.    0.  117.5]
 [  1.   50.    1.    1.    0.    0.  100. ]
 [ -1.   60.    1.    1.    0.    0.  115. ]
 [  1.   29.    1.    1.    0.    0.  100. ]
 [  1.   20.   -1.    1.    0.    0.  180. ]]
