Missing Values

In [None]:
import numpy as np
import pandas as pd

# Toy 5x3 frame with missing entries. `np.nan` is used because the `np.NaN`
# alias was removed in NumPy 2.0 and raises AttributeError there.
X = pd.DataFrame(
    np.array([5, 2, 3,
              np.nan, np.nan, 4,
              -3, 2, 1,
              8, np.nan, 4,
              10, np.nan, 5]).reshape(5, 3),
    columns=['f1', 'f2', 'f3'],
)
print(X)
# Keep an independent snapshot of the raw (un-imputed) data; a plain `z = X`
# would only create a second name for the same object.
z = X.copy()

     f1   f2   f3
0   5.0  2.0  3.0
1   NaN  NaN  4.0
2  -3.0  2.0  1.0
3   8.0  NaN  4.0
4  10.0  NaN  5.0


In [None]:
# Number of missing entries in each column.
X.isna().sum()

f1    1
f2    3
f3    0
dtype: int64

In [None]:
# Total number of missing entries in the whole frame
# (per-column counts summed once more).
X.isna().sum().sum()

4

In [None]:
from sklearn.impute import MissingIndicator

# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0.
missing_indicator = MissingIndicator(missing_values=np.nan)
# Boolean mask of missing entries. Only columns that actually contain missing
# values are reported, so the mask can be narrower than X.
missing_mask = missing_indicator.fit_transform(X)
# Label the mask columns a1, a2, ... based on the mask's real width instead of
# a hard-coded two-element list.
indicator = pd.DataFrame(
    missing_mask,
    columns=[f'a{i}' for i in range(1, missing_mask.shape[1] + 1)],
)
print(indicator)

      a1     a2
0  False  False
1   True   True
2  False  False
3  False   True
4  False   True


In [None]:
from sklearn.impute import SimpleImputer

# Replace each missing entry with the most frequent value of its column.
# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# fit_transform returns a bare ndarray, so carry the column labels over from
# the input frame rather than re-typing them.
X = pd.DataFrame(imp.fit_transform(X), columns=X.columns)
print(X)

     f1   f2   f3
0   5.0  2.0  3.0
1   5.0  2.0  4.0
2  -3.0  2.0  1.0
3   8.0  2.0  4.0
4  10.0  2.0  5.0


In [None]:
# Drop every row that has at least one missing value, then renumber the rows.
# reset_index() keeps the old row labels as an extra "index" column.
df = z.dropna(axis='index').reset_index()
print(df)

   index   f1   f2   f3
0      0  5.0  2.0  3.0
1      2 -3.0  2.0  1.0


Handling Categorical Values

In [None]:
import numpy as np
import pandas as pd

# Small all-categorical frame: one row per person.
X = pd.DataFrame(
    np.array(['M', 'O-', 'medium',
              'M', 'O-', 'high',
              'F', 'O+', 'high',
              'F', 'AB', 'low',
              'F', 'B+', 'medium']).reshape((5, 3)),
    columns=['gender', 'blood_type', 'edu_level'],
)
# Independent copy: with a plain `df = X` both names point at the same frame,
# so encoding a column of X in place would also rewrite df before it is used.
df = X.copy()
print(X)

  gender blood_type edu_level
0      M         O-    medium
1      M         O-      high
2      F         O+      high
3      F         AB       low
4      F         B+    medium


In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Encode edu_level as integer codes. The encoder needs a 2-D input, hence the
# reshape to a single column; the result is flattened back before assignment.
# NOTE(review): the codes come out alphabetical (high=0, low=1, medium=2), not
# low < medium < high. Pass `categories=[['low', 'medium', 'high']]` if the
# real ordering matters.
encoder = OrdinalEncoder(dtype='int')
edu_level_codes = encoder.fit_transform(X['edu_level'].values.reshape(-1, 1))
X['edu_level'] = edu_level_codes.ravel()
print(X)

  gender blood_type  edu_level
0      M         O-          2
1      M         O-          0
2      F         O+          0
3      F         AB          1
4      F         B+          2


In [None]:
from sklearn.preprocessing import LabelEncoder

# Same column encoded with LabelEncoder, which works on a 1-D array directly.
# scikit-learn documents LabelEncoder as intended for target labels (y).
le = LabelEncoder()
edu_level_labels = le.fit_transform(df['edu_level'])
df['edu_level'] = edu_level_labels
print(df)

  gender blood_type  edu_level
0      M         O-          2
1      M         O-          0
2      F         O+          0
3      F         AB          1
4      F         B+          2


In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the two nominal columns; edu_level is appended unchanged.
onehot = OneHotEncoder(dtype='int')
# fit_transform returns a sparse matrix; densify it for display.
encoded = onehot.fit_transform(X[['gender', 'blood_type']]).toarray()
# Take the column labels from the categories the encoder actually learned
# (one array per input column, in output order) instead of a hand-typed list
# that would silently mislabel columns if the data changed.
n = pd.DataFrame(
    encoded,
    columns=[category for feature in onehot.categories_ for category in feature],
)
n['edu_level'] = X.edu_level
print(n)



   F  M  AB  B+  O+  O-  edu_level
0  0  1   0   0   0   1          2
1  0  1   0   0   0   1          0
2  1  0   0   0   1   0          0
3  1  0   1   0   0   0          1
4  1  0   0   1   0   0          2


Numerical Features

In [None]:
import pandas as pd
from sklearn.datasets import load_iris
#for Discretization
from sklearn.preprocessing import KBinsDiscretizer

In [None]:
# Iris measurements as a DataFrame, one column per feature.
d = load_iris()
df = pd.DataFrame(d.data, columns=d.feature_names)
# Three equal-width bins per feature, returned as ordinal bin indices.
disc = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')

In [None]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
# Learn the bin edges from every feature, then map each value to its bin index.
disc.fit(df).transform(df)

array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 2., 0., 0.],
       [0., 2., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 2., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 2., 0., 0.],
       [1., 2., 0., 0.],
       [0., 2., 0., 0.],
       [0., 1., 0., 0.],
       [1., 2., 0., 0.],
       [0., 2., 0., 0.],
       [0., 1., 0., 0.],
       [0., 2., 0., 0.],
       [0., 2., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 2., 0., 0.],
       [1., 2., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 1., 0., 0.],
       [0., 2., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],


In [None]:
# Bin edges learned by the fitted discretizer: one array per feature, and with
# n_bins=3 each array holds four edges.
bins = disc.bin_edges_
print(bins)

[array([4.3, 5.5, 6.7, 7.9]) array([2. , 2.8, 3.6, 4.4])
 array([1.        , 2.96666667, 4.93333333, 6.9       ])
 array([0.1, 0.9, 1.7, 2.5])]


In [None]:
# Binarisation: values strictly above the threshold map to 1, the rest to 0.
from sklearn.preprocessing import Binarizer

b = Binarizer(threshold=5)
print(df.head())

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2


In [None]:
# Binarize sepal length; the double brackets keep the column 2-D, as the
# transformer requires.
b.fit_transform(df[['sepal length (cm)']].to_numpy())

array([[1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],

Scalers

In [None]:
import pandas as pd
from sklearn.datasets import load_iris

# Reload iris so the scaler examples start from the untouched measurements.
d = load_iris()
df = pd.DataFrame(d.data, columns=d.feature_names)
print(df.head())

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
# Standard scaler: centre the feature on its mean and scale to unit variance.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Double brackets keep the single feature as a 2-D (n_samples, 1) array.
print(scaler.fit_transform(df[['petal length (cm)']].to_numpy()))

[[-1.34022653]
 [-1.34022653]
 [-1.39706395]
 [-1.2833891 ]
 [-1.34022653]
 [-1.16971425]
 [-1.34022653]
 [-1.2833891 ]
 [-1.34022653]
 [-1.2833891 ]
 [-1.2833891 ]
 [-1.22655167]
 [-1.34022653]
 [-1.51073881]
 [-1.45390138]
 [-1.2833891 ]
 [-1.39706395]
 [-1.34022653]
 [-1.16971425]
 [-1.2833891 ]
 [-1.16971425]
 [-1.2833891 ]
 [-1.56757623]
 [-1.16971425]
 [-1.05603939]
 [-1.22655167]
 [-1.22655167]
 [-1.2833891 ]
 [-1.34022653]
 [-1.22655167]
 [-1.22655167]
 [-1.2833891 ]
 [-1.2833891 ]
 [-1.34022653]
 [-1.2833891 ]
 [-1.45390138]
 [-1.39706395]
 [-1.34022653]
 [-1.39706395]
 [-1.2833891 ]
 [-1.39706395]
 [-1.39706395]
 [-1.39706395]
 [-1.22655167]
 [-1.05603939]
 [-1.34022653]
 [-1.22655167]
 [-1.34022653]
 [-1.2833891 ]
 [-1.34022653]
 [ 0.53540856]
 [ 0.42173371]
 [ 0.64908342]
 [ 0.13754657]
 [ 0.47857113]
 [ 0.42173371]
 [ 0.53540856]
 [-0.26031542]
 [ 0.47857113]
 [ 0.08070915]
 [-0.14664056]
 [ 0.25122143]
 [ 0.13754657]
 [ 0.53540856]
 [-0.08980313]
 [ 0.36489628]
 [ 0.42173

In [None]:
# MinMaxScaler: rescale the feature linearly so it spans feature_range.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
# Double brackets keep the single feature as a 2-D (n_samples, 1) array.
print(scaler.fit_transform(df[['sepal width (cm)']].to_numpy()))

[[0.625     ]
 [0.41666667]
 [0.5       ]
 [0.45833333]
 [0.66666667]
 [0.79166667]
 [0.58333333]
 [0.58333333]
 [0.375     ]
 [0.45833333]
 [0.70833333]
 [0.58333333]
 [0.41666667]
 [0.41666667]
 [0.83333333]
 [1.        ]
 [0.79166667]
 [0.625     ]
 [0.75      ]
 [0.75      ]
 [0.58333333]
 [0.70833333]
 [0.66666667]
 [0.54166667]
 [0.58333333]
 [0.41666667]
 [0.58333333]
 [0.625     ]
 [0.58333333]
 [0.5       ]
 [0.45833333]
 [0.58333333]
 [0.875     ]
 [0.91666667]
 [0.45833333]
 [0.5       ]
 [0.625     ]
 [0.66666667]
 [0.41666667]
 [0.58333333]
 [0.625     ]
 [0.125     ]
 [0.5       ]
 [0.625     ]
 [0.75      ]
 [0.41666667]
 [0.75      ]
 [0.5       ]
 [0.70833333]
 [0.54166667]
 [0.5       ]
 [0.5       ]
 [0.45833333]
 [0.125     ]
 [0.33333333]
 [0.33333333]
 [0.54166667]
 [0.16666667]
 [0.375     ]
 [0.29166667]
 [0.        ]
 [0.41666667]
 [0.08333333]
 [0.375     ]
 [0.375     ]
 [0.45833333]
 [0.41666667]
 [0.29166667]
 [0.08333333]
 [0.20833333]
 [0.5       ]
 [0.33

In [None]:
# MaxAbsScaler: divide the feature by its largest absolute value.
from sklearn.preprocessing import MaxAbsScaler

scaler = MaxAbsScaler()
# Double brackets keep the single feature as a 2-D (n_samples, 1) array.
print(scaler.fit_transform(df[['sepal length (cm)']].to_numpy()))

[[0.64556962]
 [0.62025316]
 [0.59493671]
 [0.58227848]
 [0.63291139]
 [0.6835443 ]
 [0.58227848]
 [0.63291139]
 [0.55696203]
 [0.62025316]
 [0.6835443 ]
 [0.60759494]
 [0.60759494]
 [0.5443038 ]
 [0.73417722]
 [0.72151899]
 [0.6835443 ]
 [0.64556962]
 [0.72151899]
 [0.64556962]
 [0.6835443 ]
 [0.64556962]
 [0.58227848]
 [0.64556962]
 [0.60759494]
 [0.63291139]
 [0.63291139]
 [0.65822785]
 [0.65822785]
 [0.59493671]
 [0.60759494]
 [0.6835443 ]
 [0.65822785]
 [0.69620253]
 [0.62025316]
 [0.63291139]
 [0.69620253]
 [0.62025316]
 [0.55696203]
 [0.64556962]
 [0.63291139]
 [0.56962025]
 [0.55696203]
 [0.63291139]
 [0.64556962]
 [0.60759494]
 [0.64556962]
 [0.58227848]
 [0.67088608]
 [0.63291139]
 [0.88607595]
 [0.81012658]
 [0.87341772]
 [0.69620253]
 [0.82278481]
 [0.72151899]
 [0.79746835]
 [0.62025316]
 [0.83544304]
 [0.65822785]
 [0.63291139]
 [0.74683544]
 [0.75949367]
 [0.7721519 ]
 [0.70886076]
 [0.84810127]
 [0.70886076]
 [0.73417722]
 [0.78481013]
 [0.70886076]
 [0.74683544]
 [0.77

In [None]:
# RobustScaler: centre on the median and scale by the interquartile range.
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
# Double brackets keep the single feature as a 2-D (n_samples, 1) array.
print(scaler.fit_transform(df[['petal width (cm)']].to_numpy()))

[[-0.73333333]
 [-0.73333333]
 [-0.73333333]
 [-0.73333333]
 [-0.73333333]
 [-0.6       ]
 [-0.66666667]
 [-0.73333333]
 [-0.73333333]
 [-0.8       ]
 [-0.73333333]
 [-0.73333333]
 [-0.8       ]
 [-0.8       ]
 [-0.73333333]
 [-0.6       ]
 [-0.6       ]
 [-0.66666667]
 [-0.66666667]
 [-0.66666667]
 [-0.73333333]
 [-0.6       ]
 [-0.73333333]
 [-0.53333333]
 [-0.73333333]
 [-0.73333333]
 [-0.6       ]
 [-0.73333333]
 [-0.73333333]
 [-0.73333333]
 [-0.73333333]
 [-0.6       ]
 [-0.8       ]
 [-0.73333333]
 [-0.73333333]
 [-0.73333333]
 [-0.73333333]
 [-0.8       ]
 [-0.73333333]
 [-0.73333333]
 [-0.66666667]
 [-0.66666667]
 [-0.73333333]
 [-0.46666667]
 [-0.6       ]
 [-0.66666667]
 [-0.73333333]
 [-0.73333333]
 [-0.73333333]
 [-0.73333333]
 [ 0.06666667]
 [ 0.13333333]
 [ 0.13333333]
 [ 0.        ]
 [ 0.13333333]
 [ 0.        ]
 [ 0.2       ]
 [-0.2       ]
 [ 0.        ]
 [ 0.06666667]
 [-0.2       ]
 [ 0.13333333]
 [-0.2       ]
 [ 0.06666667]
 [ 0.        ]
 [ 0.06666667]
 [ 0.13333

Normalization

In [None]:
# Three sample rows used by every normalization example below.
X = [
    [1, -1, 2],
    [4, 1, 2],
    [0, 1, -1],
]

In [None]:
from sklearn.preprocessing import normalize
import pandas as pd
import math

# Max norm: divide each row (axis=1, the default) by its largest absolute value.
X_norm = normalize(X, norm='max', axis=1)
print(X_norm)

[[ 0.5  -0.5   1.  ]
 [ 1.    0.25  0.5 ]
 [ 0.    1.   -1.  ]]


In [None]:
# Without sklearn: the max norm of a row is its largest absolute value.
df = pd.DataFrame(data=X)
norm_max = [max(abs(value) for value in df.iloc[row]) for row in range(len(X))]
print(norm_max)

[2, 4, 1]


In [None]:
import numpy as np

# Divide every row by its own norm; reshaping the norms to a column lets
# broadcasting line each norm up with its row.
norm_data = df.to_numpy() / np.asarray(norm_max).reshape(-1, 1)
print(norm_data)

[[ 0.5  -0.5   1.  ]
 [ 1.    0.25  0.5 ]
 [ 0.    1.   -1.  ]]


In [None]:
# L1 norm: divide each row (axis=1, the default) by the sum of its absolute values.
X_norm = normalize(X, norm='l1', axis=1)
print(X_norm)

[[ 0.25       -0.25        0.5       ]
 [ 0.57142857  0.14285714  0.28571429]
 [ 0.          0.5        -0.5       ]]


In [None]:
# Without sklearn: the L1 norm of a row is the sum of its absolute values.
# NOTE: despite its name, `norm_max` holds the L1 norms from here on.
df = pd.DataFrame(data=X)
norm_max = [sum(abs(value) for value in df.iloc[row]) for row in range(len(X))]
print(norm_max)

[4, 7, 2]


In [None]:
# Divide every row by its own norm; reshaping the norms to a column lets
# broadcasting line each norm up with its row.
norm_data = df.to_numpy() / np.asarray(norm_max).reshape(-1, 1)
print(norm_data)

[[ 0.25       -0.25        0.5       ]
 [ 0.57142857  0.14285714  0.28571429]
 [ 0.          0.5        -0.5       ]]


In [None]:
# L2 norm: divide each row (axis=1, the default) by its Euclidean length.
X_norm = normalize(X, norm='l2', axis=1)
print(X_norm)

[[ 0.40824829 -0.40824829  0.81649658]
 [ 0.87287156  0.21821789  0.43643578]
 [ 0.          0.70710678 -0.70710678]]


In [None]:
# Alternative: the estimator-style API for the same row-wise L2 normalization.
from sklearn.preprocessing import Normalizer

normalizer = Normalizer(norm='l2')
# Normalizer is stateless, so a bare transform() works, but scikit-learn
# recommends fit_transform() because parameter validation only runs in fit().
print(normalizer.fit_transform(X))

[[ 0.40824829 -0.40824829  0.81649658]
 [ 0.87287156  0.21821789  0.43643578]
 [ 0.          0.70710678 -0.70710678]]


In [None]:
# Without sklearn: the L2 norm of a row is the square root of its summed squares.
df = pd.DataFrame(data=X)
norm_l2 = [
    math.sqrt(sum(value ** 2 for value in df.iloc[row]))
    for row in range(len(X))
]
print(norm_l2)

[2.449489742783178, 4.58257569495584, 1.4142135623730951]


In [None]:
# Divide every entry by the L2 norm of its row; kept as a list of lists and
# only converted to an array for printing.
norm_data = [
    [value / norm_l2[row] for value in df.iloc[row]]
    for row in range(len(X))
]
print(np.array(norm_data))

[[ 0.40824829 -0.40824829  0.81649658]
 [ 0.87287156  0.21821789  0.43643578]
 [ 0.          0.70710678 -0.70710678]]


Polynomial Features

In [None]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# 3 samples x 2 features.
X = np.arange(6).reshape(3, 2)
print(X)
# Degree-2 expansion: bias, each feature, and all degree-2 products.
p = PolynomialFeatures(degree=2)
p.fit_transform(X)

[[0 1]
 [2 3]
 [4 5]]


array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In [None]:
# Degree 3, interaction terms only: products of distinct features are kept,
# pure powers such as x0**2 are left out.
X = np.arange(9).reshape(3, 3)
print(X)
p = PolynomialFeatures(degree=3, interaction_only=True)
p.fit_transform(X)

[[0 1 2]
 [3 4 5]
 [6 7 8]]


array([[  1.,   0.,   1.,   2.,   0.,   0.,   2.,   0.],
       [  1.,   3.,   4.,   5.,  12.,  15.,  20.,  60.],
       [  1.,   6.,   7.,   8.,  42.,  48.,  56., 336.]])

In [None]:
# Spline transformer: expand one feature into B-spline basis columns.
from sklearn.preprocessing import SplineTransformer

# Single feature with five samples, shaped (5, 1).
X = np.arange(5).reshape(5, 1)
print(X)
spline = SplineTransformer(n_knots=3, degree=2)
print(spline.fit_transform(X))

[[0]
 [1]
 [2]
 [3]
 [4]]
[[0.5   0.5   0.    0.   ]
 [0.125 0.75  0.125 0.   ]
 [0.    0.5   0.5   0.   ]
 [0.    0.125 0.75  0.125]
 [0.    0.    0.5   0.5  ]]


In [None]:
# Custom transformers: wrap an arbitrary function as a scikit-learn transformer.
from sklearn.preprocessing import FunctionTransformer

# 3x3 integer grid used as input for the log1p examples.
X = np.arange(9).reshape(3, 3)
print(X)

[[0 1 2]
 [3 4 5]
 [6 7 8]]


In [None]:
# Apply log(1 + x) element-wise through the transformer API; validate=True
# makes the transformer check the input array first.
t = FunctionTransformer(func=np.log1p, validate=True)
t.fit_transform(X)

array([[0.        , 0.69314718, 1.09861229],
       [1.38629436, 1.60943791, 1.79175947],
       [1.94591015, 2.07944154, 2.19722458]])

In [None]:
import pandas as pd

# Same log(1 + x) transform done directly in pandas, without a transformer.
X = pd.DataFrame(np.arange(9).reshape(3, 3))
X.apply(np.log1p)

Unnamed: 0,0,1,2
0,0.0,0.693147,1.098612
1,1.386294,1.609438,1.791759
2,1.94591,2.079442,2.197225
