In [1]:
import numpy as np

In [2]:
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Draw the demo array from a dedicated, seeded generator so that Restart & Run All
# reproduces the same 10x2 array of integers in [0, 100) without touching
# NumPy's global random state.
data = np.random.RandomState(101).randint(0, 100, (10, 2))

In [4]:
data  # display the generated 10x2 integer array

array([[72, 15],
       [80, 16],
       [37, 50],
       [62, 45],
       [67, 36],
       [61, 79],
       [10,  2],
       [ 5, 73],
       [66, 85],
       [55,  6]])

## Using Scikit Learn for Preprocessing Data

Let's assume that `data` is the data we have. If we are going to feed this data to a neural network model, one of the first things we need to do is scale it.

In [9]:
# Build the scaler with its target range spelled out; (0, 1) is also the default.
scaler_model = MinMaxScaler(feature_range=(0, 1))

This simply creates a `MinMaxScaler` instance.

In [12]:
scaler_model.fit(data)  # fit the scaler to the data: it learns each column's minimum and maximum



MinMaxScaler(copy=True, feature_range=(0, 1))

In [13]:
scaler_model.transform(data)  # rescale each column into [0, 1] using the min and max learned by fit()

array([[0.89333333, 0.15662651],
       [1.        , 0.1686747 ],
       [0.42666667, 0.57831325],
       [0.76      , 0.51807229],
       [0.82666667, 0.40963855],
       [0.74666667, 0.92771084],
       [0.06666667, 0.        ],
       [0.        , 0.85542169],
       [0.81333333, 1.        ],
       [0.66666667, 0.04819277]])

In [14]:
# Here we can clearly see that, in each column, the minimum value has become 0 and the maximum value has become 1.
# scikit-learn provides other scalers as well (for example, StandardScaler).

In [15]:
# fit_transform() fits and transforms in a single call; on the same data it
# returns the same result as calling fit() followed by transform().
scaler_model.fit_transform(data)



array([[0.89333333, 0.15662651],
       [1.        , 0.1686747 ],
       [0.42666667, 0.57831325],
       [0.76      , 0.51807229],
       [0.82666667, 0.40963855],
       [0.74666667, 0.92771084],
       [0.06666667, 0.        ],
       [0.        , 0.85542169],
       [0.81333333, 1.        ],
       [0.66666667, 0.04819277]])

The **scaler model** should only be fitted to your training data. The idea is to fit the scaler on the training data, and then use that same fitted scaler to
transform both the training data and the test data, so that no information from the test set leaks into the preprocessing.

In [16]:
import pandas as pd

In [18]:
# Seeded generator: the 50x4 table of integers in [0, 100] is identical on every
# run, so everything derived from it can be reproduced after a kernel restart.
mydata = np.random.RandomState(42).randint(0, 101, (50, 4))

In [21]:
# Wrap the random integers in a labelled table: three feature columns plus one label column.
# NOTE: this rebinds `data`, which previously held the 10x2 NumPy array.
data = pd.DataFrame(
    mydata,
    columns=['f1', 'f2', 'f3', 'label'],
)

Here the first three columns (`f1`, `f2`, `f3`) are the features, and the last column (`label`) is the target label.

In [23]:
data  # display the DataFrame (columns f1, f2, f3 and label)

Unnamed: 0,f1,f2,f3,label
0,68,44,40,37
1,44,11,25,36
2,34,11,100,47
3,29,77,51,92
4,28,47,47,99
5,41,94,35,37
6,94,66,70,10
7,11,60,66,81
8,0,75,65,5
9,68,26,53,91


In [24]:
# let's split this into training and test datasets

In [25]:
# Feature matrix: every row, restricted to the three feature columns.
X = data.loc[:, ['f1', 'f2', 'f3']]

In [26]:
# Target vector: the label column as a Series.
y = data.loc[:, 'label']

In [27]:
# now we have the features and the labels

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
# Hold out 33% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [31]:
# test_size is the fraction of the data (here 0.33, i.e. 33%) that goes to the test set
# random_state is the seed for the random number generator that shuffles the data before it is split into the train
# and test sets; fixing it makes the split reproducible from run to run

In [32]:
type(X_train)  # the split keeps the pandas type: X_train is still a DataFrame

pandas.core.frame.DataFrame

In [33]:
X_train.shape  # 33 of the 50 rows end up in the training set, with the 3 feature columns

(33, 3)

In [34]:
X_test.shape  # the remaining 17 of the 50 rows form the test set

(17, 3)