## **Data preprocessing can refer to manipulation or dropping of data before it is used in order to ensure or enhance performance**

In [12]:
# SCALING

from pandas import read_csv
from numpy import set_printoptions
from sklearn import preprocessing

# Read the CSV file and prepare the array

names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv("pima-indians-diabetes.csv", names=names)
array = dataframe.values

# Using MinMaxScaler class to rescale the data in the range of 0 and 1.

data_scaler = preprocessing.MinMaxScaler(feature_range=(0,1))     # ( X - min(X) )/( max(X) - min(X) )
data_rescaled = data_scaler.fit_transform(array)

# Setting the precision to 2

set_printoptions(precision=2)

# Showing the first 10 rows in the output

print ("\nScaled data:\n", data_rescaled[0:10])
print ("\nReScaled data:\n", data_rescaled[0:5])



Scaled data:
 [[0.35 0.74 0.59 0.35 0.   0.5  0.23 0.48 1.  ]
 [0.06 0.43 0.54 0.29 0.   0.4  0.12 0.17 0.  ]
 [0.47 0.92 0.52 0.   0.   0.35 0.25 0.18 1.  ]
 [0.06 0.45 0.54 0.23 0.11 0.42 0.04 0.   0.  ]
 [0.   0.69 0.33 0.35 0.2  0.64 0.94 0.2  1.  ]
 [0.29 0.58 0.61 0.   0.   0.38 0.05 0.15 0.  ]
 [0.18 0.39 0.41 0.32 0.1  0.46 0.07 0.08 1.  ]
 [0.59 0.58 0.   0.   0.   0.53 0.02 0.13 0.  ]
 [0.12 0.99 0.57 0.45 0.64 0.45 0.03 0.53 1.  ]
 [0.47 0.63 0.79 0.   0.   0.   0.07 0.55 1.  ]]

ReScaled data:
 [[0.35 0.74 0.59 0.35 0.   0.5  0.23 0.48 1.  ]
 [0.06 0.43 0.54 0.29 0.   0.4  0.12 0.17 0.  ]
 [0.47 0.92 0.52 0.   0.   0.35 0.25 0.18 1.  ]
 [0.06 0.45 0.54 0.23 0.11 0.42 0.04 0.   0.  ]
 [0.   0.69 0.33 0.35 0.2  0.64 0.94 0.2  1.  ]]


In [13]:
# The code for L1 Normalization is

'''
It may be defined as the normalization technique that modifies the dataset values in a way that in each row the sum of the absolute values will always be up to 1. 
It is also called Least Absolute Deviations.
'''

from pandas import read_csv
from numpy import set_printoptions
from sklearn.preprocessing import Normalizer

# Read the CSV file and prepare the array

names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv("pima-indians-diabetes.csv", names=names)
array = dataframe.values

# Using Normalizer class with L1 to normalize the data.

Data_normalizer = Normalizer(norm='l1').fit(array)
Data_normalized = Data_normalizer.transform(array)

# Setting the precision to 2

set_printoptions(precision=2)

# Showing the first 3 rows in the output

print ("\nNormalized data:\n", Data_normalized [0:3])


Normalized data:
 [[0.02 0.43 0.21 0.1  0.   0.1  0.   0.14 0.  ]
 [0.   0.36 0.28 0.12 0.   0.11 0.   0.13 0.  ]
 [0.03 0.59 0.21 0.   0.   0.07 0.   0.1  0.  ]]


In [15]:
# The code for L2 Normalization is

from pandas import read_csv
from numpy import set_printoptions
from sklearn.preprocessing import Normalizer

# Read the CSV file and prepare the array

names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv("pima-indians-diabetes.csv", names=names)
array = dataframe.values

# Using Normalizer class with L2 to normalize the data.

Data_normalizer = Normalizer(norm='l2').fit(array)
Data_normalized = Data_normalizer.transform(array)

# Setting the precision to 2

set_printoptions(precision=2)

# Showing the first 3 rows in the output

print ("\nNormalized data:\n", Data_normalized [0:3])


Normalized data:
 [[0.03 0.83 0.4  0.2  0.   0.19 0.   0.28 0.01]
 [0.01 0.72 0.56 0.24 0.   0.22 0.   0.26 0.  ]
 [0.04 0.92 0.32 0.   0.   0.12 0.   0.16 0.01]]


In [16]:
#STANDARDISATION

'''
Transforming data with gaussian distribution

Standard normal Distribution = z = (x-mean) / standard deviation

Why Gaussian is helpful?
can be used to calculate the probability for any individual observation from the sample space.
'''

from pandas import read_csv
from numpy import set_printoptions
from sklearn.preprocessing import StandardScaler

# Read the CSV file and prepare the array

names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv("pima-indians-diabetes.csv", names=names)
array = dataframe.values

# Using StandardScaler class to rescale the data.

data_scaler = StandardScaler().fit(array)
data_rescaled = data_scaler.transform(array)

# Setting the precision to 2

set_printoptions(precision=2)

# Showing the first 3 rows in the output

print ("\nRescaled data:\n", data_rescaled [0:3])


Rescaled data:
 [[ 0.64  0.85  0.15  0.91 -0.69  0.2   0.47  1.43  1.37]
 [-0.84 -1.12 -0.16  0.53 -0.69 -0.68 -0.37 -0.19 -0.73]
 [ 1.23  1.94 -0.26 -1.29 -0.69 -1.1   0.6  -0.11  1.37]]


In [17]:
# BINARIZATION

'''
the process of dividing data into two groups and assigning one out. of two values to all the members of the same group. 
This is usually accomplished by defining a threshold t and assigning the value 0 to all the data points below the threshold and 1 to those above it.
'''

from pandas import read_csv
from numpy import set_printoptions
from sklearn.preprocessing import Binarizer

# Read the CSV file and prepare the array

names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv("pima-indians-diabetes.csv", names=names)
array = dataframe.values

# Using Binarize class to convert the data into binary values.

binarizer = Binarizer(threshold=0.5).fit(array)
Data_binarized = binarizer.transform(array)

# Showing the first 3 rows in the output

print ("\nBinary data:\n", Data_binarized [0:3])


Binary data:
 [[1. 1. 1. 1. 0. 1. 1. 1. 1.]
 [1. 1. 1. 1. 0. 1. 0. 1. 0.]
 [1. 1. 1. 0. 0. 1. 1. 1. 1.]]
