# Feature Scaling

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

We will use a small dataset that contains (Physics,Biology and Maths) marks of a classroom of students.

In [2]:
df = pd.read_csv("data/grades.csv")

Show the first 5 rows of data.

In [3]:
df.head()

Unnamed: 0,Name,Physics,Biology,Maths
0,Arnold,80,78,70
1,Bob,60,98,45
2,Cassandra,34,56,65
3,Donovan,56,65,32
4,Emily,98,23,55


We can perform some preprocessing techniques to scale our data. 

In [None]:
Min-Max normalization involves scaling features to lie between a given minimum and maximum value, often between zero and one.

In [None]:
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()

In [None]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()

In [None]:
physics=df["Physics"].values
physics

The scaler expects a 2d array, so we will use the reshape function to convert our 1d data to a 2d data array.

In [None]:
physics.shape

In [None]:
physics_2d=physics.reshape(-1,1)

In [None]:
physics_2d.shape

In [None]:
physics_scaled=scaler.fit_transform(physics_2d)

In [None]:
physics_scaled

In [None]:
df_new=pd.DataFrame(physics_scaled)

In [None]:
df_new.hist()

In [None]:
splot=sns.distplot(df_new)
splot.set_xlim(0, 1)

In [None]:
df_new.skew()

As you can see, Min-Max Scaler does not make your data look more like a normal distribution.

In many Machine Learning modeling scenarios, **normality** of the features in a dataset is desirable. Power transforms are a family of parametric, monotonic transformations that aim to map data from any distribution to as close to a **Gaussian distribution** as possible in order to stabilize variance and **minimize skewness**.

In [None]:
scaler = preprocessing.PowerTransformer(method='box-cox', standardize=False)

In [None]:
physics_scaled=scaler.fit_transform(physics_2d)

In [None]:
physics_scaled

In [None]:
df_new=pd.DataFrame(physics_scaled)
df_new.hist()

In [None]:
splot=sns.distplot(df_new)

In [None]:
df["Physics"].skew()

In [None]:
df_new.skew()

In [None]:
np.random.randint(low=1, high=100, size=4)

In [None]:
np.random.random_sample()

In [None]:
from scipy.stats import skewnorm

In [None]:
rand_vars = skewnorm.rvs(5, size=10000)

In [None]:
fig, ax = plt.subplots(1, 1)
# ax.hist(rand_vars, density=True, histtype='stepfilled')
ax.hist(rand_vars)
plt.show()

In [None]:
df=pd.DataFrame(rand_vars)

In [None]:
df.skew()

In [None]:
from sklearn.preprocessing import FunctionTransformer
transformer = FunctionTransformer(np.log1p, validate=True)

In [None]:
data_trans=transformer.transform(df)

In [None]:
df_new=pd.DataFrame(data_trans)

In [None]:
fig, ax = plt.subplots(1, 1)
ax.hist(data_trans)
plt.show()

In [None]:
df_new.skew()

In [None]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()

In [None]:
df_scaled=scaler.fit_transform(df)

In [None]:
df_new=pd.DataFrame(df_scaled)

In [None]:
df_new.skew()

In [None]:
scaler = preprocessing.PowerTransformer(standardize=False)

In [None]:
df_scaled=scaler.fit_transform(df)

In [None]:
df_new=pd.DataFrame(df_scaled)
df_new.skew()

In [None]:
fig, ax = plt.subplots(1, 1)
ax.hist(df_scaled, density=True, histtype='stepfilled')
plt.show()