# CHAPTER-4 Handling Numerical Data

Transforming raw numerical data into features built for Machine Learning algorithms

## 4.1 Rescaling a feature

In [18]:
# Rescaling the values of a numerical feature to be between two values.
# Rescaling is very common in Machine Learning

import numpy as np
from sklearn import preprocessing

In [19]:
# creating a feature
# feature vecotr is like a sample object

feature = np.array([[-500.5],
                  [-100.1],
                  [0],
                  [100.1],
                  [900.9]])

In [20]:
# creating a scaler

minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))

# scale feature

scaled_feature = minmax_scale.fit_transform(feature)

# we can also use fit() to calculate min and max and then transform() 
# both does the same job but it is btter to use seperately in some scenarios where we want to use the same transformation again

In [21]:
scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

## 4.2 Standardizing a feature

In [22]:
# Principal component analysis works better with Standardization scaling
# Neural Networks work btter with min-max scaling

In [23]:
# creating a feature

feature1 = np.array([[-1000.1],
                     [-200.2],
                     [500.5],
                     [600.6],
                     [9000.9]])

# creating a scaler
scaler = preprocessing.StandardScaler()

# rescaleing the feature to be standard normally distributed i.e., mean = 0 and standard deviation = 1

In [24]:
# transforming the feature
standardized = scaler.fit_transform(feature1)

In [25]:
standardized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

In [29]:
print("Mean: ", round(standardized.mean()))
print("Standard deviation: ", standardized.std())

Mean:  0
Standard deviation:  1.0


In [30]:
# creating a scaler
scaler = preprocessing.RobustScaler()

rob_scaler = scaler.fit_transform(feature1)

rob_scaler

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

## 4.3 Normalizing Observations

Rescaling the feature values of observations to have unit norm(sum of thier length is 1). This is often used when we have many equvivalent features(text classification)

In [32]:
from sklearn.preprocessing import Normalizer

In [41]:
# Creating a feature matrix

feature2 = np.array([[0.5, 0.5],
                    [1.1, 3.4],
                    [1.5, 20.2],
                    [1.63, 34.4],
                    [10.9, 3.3]])

In [42]:
# creating a normalizer

normalizer = Normalizer(norm="l2")

In [43]:
normalizer.transform(feature2)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [47]:
norm_l1 = Normalizer(norm="l1").transform(feature2)
norm_l1
# this norm="l1" scales values such that they sum to 1

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

In [48]:
norm_l2 = Normalizer(norm="l2").transform(feature2)
norm_l2

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [50]:
print('sum first elements of norm_l1: ',norm_l1[0,0]+norm_l1[0,1])

sum of norm_1:  1.0


## 4.4 Generating Polynomial and Interaction features

Scikit-learn offers built-in methods to create polynomial and interaction features.

Polynomial features are often created when we want to include the notion that there exists a nonlinear relationship between the features and the target. Also when effect of one feature dependent on another feature

In [52]:
from sklearn.preprocessing import PolynomialFeature

In [54]:
# Creating a feature matrix

feature3 = np.array([[2, 3],
                      [2, 3],
                      [2, 3]])

In [55]:
polynomial_interaction = PolynomialFeatures(degree = 2, include_bias = False)

# degree is for the maximum degree of the polynomial

In [56]:
polynomial_interaction.fit_transform(feature3)

array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

In [57]:
interaction = PolynomialFeatures(degree = 2, interaction_only = True,  include_bias = False)

interaction.fit_transform(feature3)

array([[2., 3., 6.],
       [2., 3., 6.],
       [2., 3., 6.]])

## 4.5 Transforming Features

To do custom transformation to or more features

In [60]:
from sklearn.preprocessing import FunctionTransformer

In [65]:
# creating a feature

feature4 = np.array([[2, 3],
                     [2, 3],
                     [2, 3]])

In [66]:
# defining a funcion

def add_ten(x):
    return x + 10

# we can also create much complex functions for applying to these transformations 

In [70]:
# creating a transformer

ten_transformer = FunctionTransformer(add_ten)

# transforming feature matrix

ten_transformer.transform(feature4)

array([[12, 13],
       [12, 13],
       [12, 13]])

In [71]:
# This transformation can also be done in pandas using apply

import pandas as pd

In [75]:
df = pd.DataFrame(feature4, columns = ["feature_1", "feature_2"])

df.apply(add_ten)

Unnamed: 0,feature_1,feature_2
0,12,13
1,12,13
2,12,13


## 4.6 Detecting outliers

Assuming the data is normally distributed and drawing a ellipse around the data, and the observations inside the ellipse is denoted as 1 and outside are with -1

In [2]:
from sklearn.covariance import EllipticEnvelope

# an object for detecting outlier in a Guassian distributed dataset

from sklearn.datasets import make_blobs

# for generating isotropic Gaussian blobs for clustering

In [3]:
# creating a simulated data

features, _ = make_blobs(n_samples = 10,
                        n_features = 2,
                        centers = 1,
                        random_state = 1)

# n_samples: total number of points equally divided among clusters
# n_features: no of features for each sample
# centers: no of centers to generate or fixed center location
# random_state: random number generation for dataset creation 

In [9]:
# replacing the first observations with extreme values

features[0,0] = 10000
features[0,1] = 10000

In [21]:
outlier_detector = EllipticEnvelope(contamination = 0.1)

# EllipticEnvelope:Object for detecting outliers
# contamination: proportion of the outliers in the dataset(estimate of cleansiness of data)
# contamination is a value to be assumed, lower value if we assume less outliers, higher values if we assume more outliers

In [22]:
outlier_detector.fit(features)

In [23]:
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [26]:
# features

In [29]:
# we can also find outliers using interquartile range

# creating a feature
feature = features[:,0]

# function to return index of outliers

def indicesOfOutliers(x):
    q1, q3 = np.percentile(x, [25,75])
    iqr = q3 - q1
    lowerBound = q1 - (iqr * 1.5)
    upperBound = q3 + (iqr * 1.5)
    return np.where((x > upperBound) | (x < lowerBound))

In [30]:
indicesOfOutliers(feature)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(array([0], dtype=int64),)

## 4.7 Handling Outliers

In [31]:
# 1. Dropping the outliers

In [33]:
# creating a dataframe
houses = pd.DataFrame()

houses['Prices'] = [534433, 392333, 293222, 4322032]
houses['Bathrooms'] =  [2, 3.5, 2, 116]
houses['Square_Feet'] = [1500, 2500, 1500, 48000]

<IPython.core.display.Javascript object>

In [34]:
# filtering observations

houses[houses['Bathrooms'] < 20]

Unnamed: 0,Prices,Bathrooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [35]:
# 2. Marking the outlier

In [39]:
# creating a feature based on boolean condition
houses['Outlier'] = np.where(houses['Bathrooms'] < 20, 0, 1)

# print data
houses

<IPython.core.display.Javascript object>

Unnamed: 0,Prices,Bathrooms,Square_Feet,Outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [42]:
# 3. Transforming the feature to dampen the effect of outlier

houses["Log_Of_Square_Feet"] = [np.log(x) for x in houses["Square_Feet"]]

houses

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Prices,Bathrooms,Square_Feet,Outlier,Log_Of_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


When Outliers are present with Standardization, mean and variance can be influenced. RobustScaler is most robust against outliers

## 4.8 Discretizating features

Breaking a numerical feature to discrete bins

In [45]:
# 1. Binarizing the feature with threshold 

from sklearn.preprocessing import Binarizer

In [46]:
# creating feature

age = np.array([[6],
                [12],
                [20],
                [36],
                [65]])

<IPython.core.display.Javascript object>

In [57]:
# creating binarizer

binarizer = Binarizer().fit_transform(age)

binarizer

array([[1],
       [1],
       [1],
       [1],
       [1]])

In [58]:
# 2. we can break up numerical features into multiple thresholds

In [59]:
# Bin feature

np.digitize(age, bins=[20,30,64])
# 3 thresholds

<IPython.core.display.Javascript object>

array([[0],
       [0],
       [1],
       [2],
       [3]], dtype=int64)

In [60]:
np.digitize(age, bins=[20,30,64], right=True)

<IPython.core.display.Javascript object>

array([[0],
       [0],
       [0],
       [2],
       [3]], dtype=int64)

In [61]:
np.digitize(age, bins=[18])
# single threshold

<IPython.core.display.Javascript object>

array([[0],
       [0],
       [1],
       [1],
       [1]], dtype=int64)

## 4.9 Grouping observations using clustering

To cluster observations so that similar observations are grouped together. clustering is a preprocessing step

If we have k-groups we can use k-means to clustering to group similar observations and output new feature containing each observation group membership. k-means is a unsupervised algorithm

In [62]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

In [63]:
# Simulated feature matrix

features, _ = make_blobs(n_samples = 50,
                        n_features = 2,
                        centers = 3,
                        random_state = 1)

In [64]:
# creating dataframe

dataframe = pd.DataFrame(features, columns = ['feature1','feature2'])

<IPython.core.display.Javascript object>

In [66]:
# Making k-means clusterer

clusterer = KMeans(3, random_state = 0)

# 3 is the no of clusters to form

In [72]:
# fit clusterer
clusterer.fit(features)



In [79]:
# predict values

dataframe["group"] = clusterer.predict(features)

In [80]:
# View first few observations
dataframe.head(5)

Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,0
1,-7.28721,-8.353986,2
2,-6.943061,-7.023744,2
3,-7.440167,-8.791959,2
4,-6.641388,-8.075888,2


## 4.10 Deleting observations with missing values

Handling missing values can be done using numpy and pandas

In [81]:
# creating a feature matrix

features = np.array([[1.1, 11.1],
                     [2.2, 22.2],
                     [3.3, 33.3],
                     [4.4, 44.4],
                     [np.nan, 55]])

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [84]:
# keeping observations which are not missing

features[~np.isnan(features).any(axis=1)]

<IPython.core.display.Javascript object>

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [86]:
dataframe = pd.DataFrame(features, columns = ['feature1', 'feature2'])
dataframe.dropna()

<IPython.core.display.Javascript object>

Unnamed: 0,feature1,feature2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


## 4.11 Imputing missing values

Filling or predicting missing values

If we have small amount of data we can predict missing values using k-nearest neighbors(KNN)

In [96]:
pip install fancyimpute

Collecting fancyimputeNote: you may need to restart the kernel to use updated packages.

  Downloading fancyimpute-0.7.0.tar.gz (25 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting knnimpute>=0.1.0
  Downloading knnimpute-0.1.0.tar.gz (8.3 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting cvxpy
  Downloading cvxpy-1.4.1-cp310-cp310-win_amd64.whl (1.0 MB)
     ---------------------------------------- 1.0/1.0 MB 7.3 MB/s eta 0:00:00
Collecting cvxopt
  Downloading cvxopt-1.3.2-cp310-cp310-win_amd64.whl (12.8 MB)
     ---------------------------------------- 12.8/12.8 MB 9.2 MB/s eta 0:00:00
Collecting nose
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
     -------------------------------------- 154.7/154.7 kB 4.7 MB/s eta 0:00:00
Collecting osqp>=0.6.2
  Downloading osqp-0.6.3-cp310-cp310-win_amd64.whl (292 kB)
     ----------------------------------

In [103]:
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs

In [104]:
# creating a simulated feature matrix

features, _ = make_blobs(n_samples = 1000,
                        n_features = 2,
                        random_state = 1)

# Standardizing the feature
scaler = StandardScaler()
standardized_feature = scaler.fit_transform(features)

In [105]:
# replacing the first value with missing values

true_val = standardized_feature[0,0] 
standardized_feature[0,0] = np.nan

<IPython.core.display.Javascript object>

In [120]:
# Predicting the missing values in the feature matrix

knn_imputed = KNN(k=5, verbose = 0).fit_transform(standardized_feature)

# k=5: no of neighboring rows to use

In [121]:
print("True Value:", true_val)
print("Imputed Value:", knn_imputed[0,0])

True Value: 0.8730186113995938
Imputed Value: 1.0955332713113226
