# Handling Numerical Data 

In [1]:
import numpy as np
from sklearn import preprocessing

## 4.1 Rescaling a Feature

In [2]:
# Column vector holding one numerical feature (one row per observation).
feature = np.array([-500.5, -100.1, 0, 100.1, 500.5]).reshape(-1, 1)

# Learn the feature's minimum and maximum, then map every value into [0, 1].
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
minmax_scale.fit(feature)
scaled_feature = minmax_scale.transform(feature)
scaled_feature

array([[0. ],
       [0.4],
       [0.5],
       [0.6],
       [1. ]])

Note: mechanism of `MinMaxScaler`:  

$x_i'=\displaystyle\frac{x_i-\min{x}}{\max{x}-\min{x}} $, x denotes a vector.

## 4.2 Standardizing a Feature

In [3]:
# A single feature that contains one extreme value (9000.9).
x = np.array([-1000.1, -200.2, 500.5, 600.6, 9000.9]).reshape(-1, 1)

# Rescale the feature to zero mean and unit variance.
scaler = preprocessing.StandardScaler()
scaler.fit(x)
standardized = scaler.transform(x)
standardized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

Note: mechanism of `StandardScaler`:  

$x_i'=\displaystyle\frac{x_i-\bar{x}}{\sigma} $, x denotes a vector, $\sigma$ denotes std, while $\bar{x}$ denotes mean.  

<b><font color=#FE2D00> Attention: </font></b>

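If the data contain significant outliers, they distort the mean and standard deviation and therefore the standardized values. In that case, use `RobustScaler` instead, which rescales the feature using the median and the interquartile range.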

In [4]:
# RobustScaler centers on the median and scales by the interquartile range,
# so the extreme value in `x` no longer dominates the scaling.
robust_scaler = preprocessing.RobustScaler()
robust_scaler.fit(x)
robust_scaler.transform(x)

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

## 4.3 Normalizing Observations

In [6]:
# Each row is one observation described by two features.
features = np.array([[0.5, 0.5],
                     [1.1, 3.4],
                     [1.5, 20.2],
                     [1.63, 34.4],
                     [10.9, 3.3]])

# Rescale every observation (row) to unit L2 norm.
# Normalizer is stateless, but scikit-learn recommends fit_transform over a
# bare transform because parameter validation only happens in fit.
normalizer = preprocessing.Normalizer(norm='l2')
normalizer.fit_transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

Mechanism of `Normalizer`:  
when `norm='l2'`, each observation (row) $x$ is divided by its Euclidean norm $\begin{Vmatrix} x \end{Vmatrix}_2 = \sqrt{x_1^2+\cdots+x_n^2}$, so every row ends up with unit length.

## 4.4 Generating Polynomial and Interaction Features

In [8]:
# Three identical observations with x1 = 2 and x2 = 3.
features_poly = np.tile([2, 3], (3, 1))

# degree=2 adds the squares and the pairwise product of the features;
# include_bias=False leaves out the constant column of ones.
polynomial_interaction = preprocessing.PolynomialFeatures(degree=2,
                                                          include_bias=False)
polynomial_interaction.fit(features_poly)
polynomial_interaction.transform(features_poly)

array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

Explanation:  
when `degree=2`, `PolynomialFeatures` generates $x_1, x_2, x_1^2, x_1x_2, x_2^2$ (in that column order) from $x_1, x_2$.

## 4.5 Transforming Features

In [9]:
def add_ten(x):
    """Return ``x`` with 10 added to it.

    The addition uses the ``+`` operator, so NumPy arrays and pandas
    objects are shifted element-wise.
    """
    shifted = x + 10
    return shifted

In [10]:
# Small integer matrix used to demonstrate a custom transformation.
features_fun = np.array([[2, 3], [1, 4], [5, 3]])

# Wrap the plain function in a scikit-learn transformer and apply it.
ten_transformer = preprocessing.FunctionTransformer(func=add_ten)
ten_transformer.transform(features_fun)

array([[12, 13],
       [11, 14],
       [15, 13]])

This is similar to the pandas `apply` method:

In [12]:
import pandas as pd

# Same data as a DataFrame; apply() hands each column to add_ten in turn.
df = pd.DataFrame(data=features_fun, columns=['feature1', 'feature2'])
df.apply(add_ten, axis=0)

Unnamed: 0,feature1,feature2
0,12,13
1,11,14
2,15,13


## 4.6 Detecting Outliers

In [13]:
from sklearn.datasets import make_blobs
from sklearn.covariance import EllipticEnvelope

# Simulate 10 two-dimensional observations around a single center.
features_outliers, _ = make_blobs(
    n_features=2, n_samples=10, centers=1, random_state=1)

# Replace the first observation with an extreme value in both features.
features_outliers[0, :] = 100000

# contamination is the expected proportion of outliers in the data.
# random_state is pinned so that the fit is reproducible across runs.
outlier_detector = EllipticEnvelope(contamination=.1, random_state=0)
outlier_detector.fit(features_outliers)

# predict() labels outliers as -1 and inliers as 1.
outlier_detector.predict(features_outliers)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

If you expect your data to contain only a few outliers, set `contamination` (the expected proportion of outliers) to a small value; if you believe outliers are likely, set it to a higher value. Sometimes we can instead use the interquartile range (IQR) to detect potential outliers:

In [14]:
# Reuse the first column of `features` (the matrix defined in section 4.3).
feature_temp = features[:, 0]


def indicies_of_outliers(x, iqr_multiplier=1.5):
    """Locate values lying outside the interquartile-range (IQR) fences.

    Parameters
    ----------
    x : array-like
        Numeric values to inspect. Lists and tuples are accepted because
        the input is converted with ``np.asarray``.
    iqr_multiplier : float, default 1.5
        How many IQRs below the first quartile or above the third quartile
        a value must lie to be flagged.

    Returns
    -------
    tuple of numpy.ndarray
        The result of ``np.where``: one index array per dimension of ``x``.
    """
    # Plain Python sequences do not support element-wise comparison.
    x = np.asarray(x)
    q1, q3 = np.percentile(x, [25, 75])
    spread = (q3 - q1) * iqr_multiplier
    lower_bound = q1 - spread
    upper_bound = q3 + spread
    return np.where((x > upper_bound) | (x < lower_bound))


# Returns a tuple of index arrays; here position 4 (the value 10.9) is flagged.
indicies_of_outliers(feature_temp)

(array([4], dtype=int64),)

## 4.7 Handling Outliers

In [15]:
# Toy housing data; the last row has implausible bathroom and area values.
houses = pd.DataFrame({
    'Price': [534433, 334235, 296455, 1232343],
    'Bathrooms': [2, 3.5, 2, 116],
    'Square_Feet': [1500, 2500, 1500, 48000],
})

# Strategy 1: drop the observations considered outliers.
keep_rows = houses['Bathrooms'] < 20
houses[keep_rows]

Unnamed: 0,Price,Bathrooms,Square_Feet
0,534433,2.0,1500
1,334235,3.5,2500
2,296455,2.0,1500


In [16]:
# Strategy 2: keep the rows but flag the outliers (1 = outlier, 0 = normal).
houses["Outliers"] = np.where(houses["Bathrooms"] < 20, 0, 1)

# Strategy 3: log-transform the feature to dampen the extreme value.
# np.log is vectorized, so it is applied to the whole column at once
# instead of looping over the values in Python.
houses["Log_of_Square_Feet"] = np.log(houses["Square_Feet"])
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outliers,Log_of_Square_Feet
0,534433,2.0,1500,0,7.31322
1,334235,3.5,2500,0,7.824046
2,296455,2.0,1500,0,7.31322
3,1232343,116.0,48000,1,10.778956


## 4.8 Discretizing Features

In [17]:
# One numerical feature (age) stored as a column vector.
age = np.array([[6],
                [12],
                [20],
                [36],
                [65]])

# Values above the threshold become 1, all others 0.
# `threshold` must be passed by keyword: recent scikit-learn releases reject
# positional constructor arguments for this estimator.
binarizer = preprocessing.Binarizer(threshold=18)
binarizer.fit_transform(age)



array([[0],
       [0],
       [1],
       [1],
       [1]])

In [18]:
# Bin the ages using the edges 20, 30 and 64. By default the left edge of a
# bin is inclusive, so the value 20 lands in bin 1.
np.digitize(age, bins=[20,30,64])

array([[0],
       [0],
       [1],
       [2],
       [3]], dtype=int64)

In [19]:
# right=True makes the right edge of a bin inclusive instead, so the value 20
# now lands in bin 0.
np.digitize(age, bins=[20,30,64], right=True)

array([[0],
       [0],
       [0],
       [2],
       [3]], dtype=int64)

## 4.9 Grouping Observations Using Clustering

In [22]:
from sklearn.cluster import KMeans

# Simulate 50 two-dimensional observations drawn around 3 centers.
features2, _ = make_blobs(n_samples=50, n_features=2,
                          centers=3, random_state=1)
dataframe = pd.DataFrame(features2, columns=['feature1', 'feature2'])

# n_init is set explicitly: its default changed from 10 to 'auto' in newer
# scikit-learn releases, which could otherwise alter the clustering result.
clusterer = KMeans(n_clusters=3, n_init=10, random_state=0)
clusterer.fit(features2)

# Store each observation's cluster membership as a new feature.
dataframe['group'] = clusterer.predict(features2)
dataframe.head()

Unnamed: 0,feature1,feature2,group
0,-9.877554,-3.336145,2
1,-7.28721,-8.353986,0
2,-6.943061,-7.023744,0
3,-7.440167,-8.791959,0
4,-6.641388,-8.075888,0


## 4.10 Deleting Observations with Missing Values

In [23]:
# Feature matrix whose last observation is missing its second feature.
data = np.array([[1.1, 11.1], [2.2, 22.2], [3.3, 33.3], [4.4, np.nan]])

# Keep only the rows in which every entry is a number (no NaN).
complete_rows = (~np.isnan(data)).all(axis=1)
data[complete_rows]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3]])

In [24]:
# Load the same matrix into a DataFrame and drop every row containing a NaN.
dataframe2 = pd.DataFrame(data=data, columns=['feature1', 'feature2'])
dataframe2.dropna(axis=0, how='any')

Unnamed: 0,feature1,feature2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3


## 4.11 Imputing Missing Values??