# Handling Missing Values and Creating Polynomial Features

In [1]:
import numpy as np
from sklearn.impute import SimpleImputer

In [2]:
# Toy 3x3 feature matrix, written one row per line; three entries are missing (NaN).
A = np.array(
    [
        [np.nan, np.nan, 2],
        [4, np.nan, 1],
        [10, 5, 9],
    ]
)
print(A)

[[nan nan  2.]
 [ 4. nan  1.]
 [10.  5.  9.]]


In [3]:
# Arithmetic mean of each column (axis=0 reduces over the rows), skipping NaN entries.
np.nanmean(A, axis=0)

array([7., 5., 4.])

In [4]:
# Mean imputation: every NaN entry is replaced by the mean of its own column.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)

# Learn the column means from A and fill in the missing entries in one call.
A_transformed = imp_mean.fit_transform(A)
A_transformed

array([[ 7.,  5.,  2.],
       [ 4.,  5.,  1.],
       [10.,  5.,  9.]])

In [5]:
from sklearn.preprocessing import PolynomialFeatures

A degree-2 polynomial expansion transforms $(X_1, X_2, X_3)$ into $(1, X_1, X_2, X_3, X_1^2, X_1X_2, X_1X_3, X_2^2, X_2X_3, X_3^2)$: a bias (constant) term, the original features, their squares, and all pairwise interaction terms.


In [6]:
# Degree-2 expansion of the imputed matrix: bias column, original features,
# squares and pairwise products.
poly = PolynomialFeatures(degree=2)
poly.fit_transform(A_transformed)

array([[  1.,   7.,   5.,   2.,  49.,  35.,  14.,  25.,  10.,   4.],
       [  1.,   4.,   5.,   1.,  16.,  20.,   4.,  25.,   5.,   1.],
       [  1.,  10.,   5.,   9., 100.,  50.,  90.,  25.,  45.,  81.]])

# Doing It in “One Step” With a Pipeline

In [7]:
from sklearn.pipeline import Pipeline

# Chain both preprocessing steps so that a single fit_transform call on the raw
# matrix runs mean imputation followed by the degree-2 polynomial expansion.
my_pipe = Pipeline(
    steps=[
        ('missing_values', SimpleImputer(strategy='mean', missing_values=np.nan)),
        ('polynomial', PolynomialFeatures(degree=2)),
    ]
)

C = my_pipe.fit_transform(A)
C

array([[  1.,   7.,   5.,   2.,  49.,  35.,  14.,  25.,  10.,   4.],
       [  1.,   4.,   5.,   1.,  16.,  20.,   4.,  25.,   5.,   1.],
       [  1.,  10.,   5.,   9., 100.,  50.,  90.,  25.,  45.,  81.]])