Python Data Preprocessing Techniques

a. Rescaling Data

In [4]:
import os

import pandas, scipy, numpy
from sklearn.preprocessing import MinMaxScaler

In [5]:
# Location of the red-wine quality CSV. Set the WINE_QUALITY_CSV environment
# variable to point at a local copy of the file; when it is unset the path
# originally hard-coded in this notebook is used, so existing runs are unchanged.
WINE_QUALITY_CSV = os.environ.get(
    "WINE_QUALITY_CSV",
    "/home/anshul/code/machine-learning/machine-learning-data-preprocessing/Data/winequality-red.csv",
)
# The file is read with ';' as the field separator.
df = pandas.read_csv(WINE_QUALITY_CSV, sep=';')

In [6]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [8]:
# Pull the frame's contents out as a plain NumPy array so that columns can be
# selected by position in the cells that follow.
array = df.values

In [10]:
# Separate the data into input features and the output label.
# Columns 0-7 (fixed acidity ... density) are used as the inputs; the
# preprocessing examples in this notebook all operate on these eight features.
x = array[:, 0:8]
# The label is the last column, `quality`. Index 8 is `pH`, which is a
# feature of the wine rather than the value being predicted.
y = array[:, -1]

In [26]:
# Display the feature matrix; NumPy abbreviates the middle rows and columns
# with "..." in the printed form.
x

array([[7.400e+00, 7.000e-01, 0.000e+00, ..., 1.100e+01, 3.400e+01,
        9.978e-01],
       [7.800e+00, 8.800e-01, 0.000e+00, ..., 2.500e+01, 6.700e+01,
        9.968e-01],
       [7.800e+00, 7.600e-01, 4.000e-02, ..., 1.500e+01, 5.400e+01,
        9.970e-01],
       ...,
       [6.300e+00, 5.100e-01, 1.300e-01, ..., 2.900e+01, 4.000e+01,
        9.957e-01],
       [5.900e+00, 6.450e-01, 1.200e-01, ..., 3.200e+01, 4.400e+01,
        9.955e-01],
       [6.000e+00, 3.100e-01, 4.700e-01, ..., 1.800e+01, 4.200e+01,
        9.955e-01]])

In [11]:
# Fit a min-max scaler on the features, then map every column onto the
# closed range [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1)).fit(x)
rescaledX = scaler.transform(x)

In [12]:
# Number of decimal digits NumPy shows when printing arrays from here on.
PRINT_PRECISION = 3
numpy.set_printoptions(precision=PRINT_PRECISION)

In [13]:
# First five rows of the min-max rescaled features.
rescaledX[:5]

array([[0.248, 0.397, 0.   , 0.068, 0.107, 0.141, 0.099, 0.568],
       [0.283, 0.521, 0.   , 0.116, 0.144, 0.338, 0.216, 0.494],
       [0.283, 0.438, 0.04 , 0.096, 0.134, 0.197, 0.17 , 0.509],
       [0.584, 0.11 , 0.56 , 0.068, 0.105, 0.225, 0.191, 0.582],
       [0.248, 0.397, 0.   , 0.068, 0.107, 0.141, 0.099, 0.568]])

b. Standardizing Data

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
# Standardise the features: learn the per-column statistics and apply them
# to the same data in a single call.
scaler = StandardScaler()
rescaledX = scaler.fit_transform(x)

In [17]:
# First five rows of the standardised features.
rescaledX[:5]

array([[-0.528,  0.962, -1.391, -0.453, -0.244, -0.466, -0.379,  0.558],
       [-0.299,  1.967, -1.391,  0.043,  0.224,  0.873,  0.624,  0.028],
       [-0.299,  1.297, -1.186, -0.169,  0.096, -0.084,  0.229,  0.134],
       [ 1.655, -1.384,  1.484, -0.453, -0.265,  0.108,  0.412,  0.664],
       [-0.528,  0.962, -1.391, -0.453, -0.244, -0.466, -0.379,  0.558]])

c. Normalizing Data

In [19]:
from sklearn.preprocessing import Normalizer

In [20]:
# Normalise the feature matrix with scikit-learn's Normalizer, fitting and
# transforming in one step.
scaler = Normalizer()
normalizedX = scaler.fit_transform(x)

In [21]:
# First five rows of the normalised features.
normalizedX[:5]

array([[2.024e-01, 1.914e-02, 0.000e+00, 5.196e-02, 2.079e-03, 3.008e-01,
        9.299e-01, 2.729e-02],
       [1.083e-01, 1.222e-02, 0.000e+00, 3.611e-02, 1.361e-03, 3.472e-01,
        9.306e-01, 1.385e-02],
       [1.377e-01, 1.342e-02, 7.061e-04, 4.060e-02, 1.624e-03, 2.648e-01,
        9.533e-01, 1.760e-02],
       [1.767e-01, 4.416e-03, 8.833e-03, 2.997e-02, 1.183e-03, 2.681e-01,
        9.464e-01, 1.574e-02],
       [2.024e-01, 1.914e-02, 0.000e+00, 5.196e-02, 2.079e-03, 3.008e-01,
        9.299e-01, 2.729e-02]])

d. Binarizing Data

In [27]:
from sklearn.preprocessing import Binarizer

In [28]:
# Build a binarizer with a cut-off of 0.0, then fit it on the features.
binarizer = Binarizer(threshold=0.0)
binarizer = binarizer.fit(x)

In [29]:
# Apply the fitted binarizer to the features. With the 0.0 threshold used
# here, entries equal to 0 come out as 0 and positive entries come out as 1.
binaryX = binarizer.transform(x)

In [30]:
# First five rows of the binarized features.
binaryX[:5]

array([[1., 1., 0., 1., 1., 1., 1., 1.],
       [1., 1., 0., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 0., 1., 1., 1., 1., 1.]])

e. Mean Removal

In [31]:
from sklearn.preprocessing import scale

In [32]:
# Standardise every column of the frame (all twelve, including `quality`).
data_standardized = scale(df)
# Per-column means after scaling; they should be zero up to floating-point
# rounding error.
data_standardized.mean(axis=0)

array([ 3.555e-16,  1.733e-16, -8.887e-17, -1.244e-16,  3.910e-16,
       -6.221e-17,  4.444e-17,  2.364e-14,  2.862e-15,  6.754e-16,
        1.066e-16,  8.887e-17])

In [33]:
# Per-column standard deviations after scaling; each should be 1.
data_standardized.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

f. One Hot Encoding

In [38]:
from sklearn.preprocessing import OneHotEncoder

In [42]:
encoder = OneHotEncoder()
encoder = encoder.fit([[0,1,6,2],
[1,5,3,5],
[2,4,2,7],
[1,0,4,2]
])

In [43]:
encoder.transform([[2,4,3,4]]).toarray()

ValueError: Found unknown categories [4] in column 3 during transform

g. Label Encoding

In [44]:
from sklearn.preprocessing import LabelEncoder

In [45]:
# Encoder that maps class labels to consecutive integer codes.
label_encoder = LabelEncoder()

In [46]:
# Example brand names used as the class labels to encode.
input_classes = [
    'Havells',
    'Philips',
    'Syska',
    'Eveready',
    'Lloyd',
]

In [47]:
# Learn the set of distinct labels; the fitted encoder is returned and shown
# as the cell output.
label_encoder.fit(input_classes)

LabelEncoder()

In [48]:
# Print each learned class next to the integer code it is assigned, which is
# its position in the encoder's classes_ array.
for code, class_name in enumerate(label_encoder.classes_):
    print(class_name, '-->', code)

Eveready --> 0
Havells --> 1
Lloyd --> 2
Philips --> 3
Syska --> 4
