<a href="https://colab.research.google.com/github/AjayNRG/Machine_Learning_Algorithms_Giuseppe_Bonnacorso/blob/master/Feature_selection_and_Feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston

In [0]:
boston = load_boston()

In [0]:
X = boston.data
Y = boston.target

In [0]:
print(X.shape)
print(Y.shape)

(506, 13)
(506,)


In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
test_size=0.25, random_state=1000)

#A. Managing categorical data


In [0]:
X = np.random.uniform(0,1,size=(10,2))
Y = np.random.choice(('Male','Female'),size=10)
X[0]

array([0.47039087, 0.95033027])

In [0]:
Y[0]

'Male'

###1. LabelEncoder

In [0]:
from sklearn.preprocessing import LabelEncoder
le  = LabelEncoder()
yt = le.fit_transform(Y)
print(yt)

[1 1 0 0 0 1 1 0 0 0]


In [0]:
le.classes_

array(['Female', 'Male'], dtype='<U6')

In [0]:
output = [1, 0, 1, 1, 0, 0]
decoded_output = [le.classes_[i] for i in output]
decoded_output

['Male', 'Female', 'Male', 'Male', 'Female', 'Female']

###2. LabelBinarizer

In [0]:
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
Yb = lb.fit_transform(Y)
Yb

array([[1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0]])

In [0]:
lb.inverse_transform(Yb)

array(['Male', 'Male', 'Female', 'Female', 'Female', 'Male', 'Male',
       'Female', 'Female', 'Female'], dtype='<U6')

###3. DictVectorizer and FeatureHasher
######scikit-learn offers the classes DictVectorizer and FeatureHasher; they both produce sparse matrices of real numbers that can be fed into any machine learning model.The latter has a limited memory consumption and adopts MurmurHash 3 

#B. Managing missing features

In [0]:
from sklearn.impute import SimpleImputer

data = np.array([[1, np.nan, 2], [2, 3, np.nan], [-1, 4, 2]])
print(data)

[[ 1. nan  2.]
 [ 2.  3. nan]
 [-1.  4.  2.]]


In [0]:
imp = SimpleImputer(strategy='mean')
imp.fit_transform(data)

array([[ 1. ,  3.5,  2. ],
       [ 2. ,  3. ,  2. ],
       [-1. ,  4. ,  2. ]])

In [0]:
imp = SimpleImputer(strategy='median')
imp.fit_transform(data)

array([[ 1. ,  3.5,  2. ],
       [ 2. ,  3. ,  2. ],
       [-1. ,  4. ,  2. ]])

In [0]:
imp = SimpleImputer(strategy='most_frequent')
imp.fit_transform(data)

array([[ 1.,  3.,  2.],
       [ 2.,  3.,  2.],
       [-1.,  4.,  2.]])

#C. Data scaling and normalization

####1. StandardScaler

In [0]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
scaled_data = ss.fit_transform(data)
scaled_data

array([[ 0.26726124,         nan,  0.        ],
       [ 1.06904497, -1.        ,         nan],
       [-1.33630621,  1.        ,  0.        ]])

#### 2.RobustScaler

In [0]:
from sklearn.preprocessing import RobustScaler
rb1 = RobustScaler(quantile_range=(15,85))
scaled_data1 = rb1.fit_transform(data)
scaled_data1

array([[ 0.        ,         nan,  0.        ],
       [ 0.47619048, -0.71428571,         nan],
       [-0.95238095,  0.71428571,  0.        ]])

In [0]:
rb2 = RobustScaler(quantile_range=(25,75))
scaled_data2 = rb1.fit_transform(data)
scaled_data2

array([[ 0.        ,         nan,  0.        ],
       [ 0.47619048, -0.71428571,         nan],
       [-0.95238095,  0.71428571,  0.        ]])

In [0]:
rb2 = RobustScaler(quantile_range=(30, 60))
scaled_data2 = rb2.fit_transform(data)
scaled_data2

array([[ 0.        ,         nan,  0.        ],
       [ 1.        , -1.66666667,         nan],
       [-2.        ,  1.66666667,  0.        ]])

###### Other options include MinMaxScaler and MaxAbsScaler, which scale data by removing elements that don't belong to a given range (the former) or by Considering a maximum absolute value (the latter).

####3. Normalizer

In [0]:
from sklearn.preprocessing import Normalizer

data = np.array([1.0, 2.0])

n_max = Normalizer(norm='max')
n_max.fit_transform(data.reshape(1, -1))

array([[0.5, 1. ]])

In [0]:
n_max = Normalizer(norm='l1') #L1- Norm
n_max.fit_transform(data.reshape(1, -1))

array([[0.33333333, 0.66666667]])

In [0]:
n_max = Normalizer(norm='l2') #L2- Norm
n_max.fit_transform(data.reshape(1, -1))

array([[0.4472136 , 0.89442719]])

#D. Feature selection and filtering

#### 1. VarianceThreshold

In [0]:
from sklearn.feature_selection import VarianceThreshold

Xx = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
vt = VarianceThreshold()
Xx_vt = vt.fit_transform(Xx)
Xx_vt

array([[2, 0],
       [1, 4],
       [1, 1]])

In [0]:
vt = VarianceThreshold(threshold=1)
Xx_vt = vt.fit_transform(Xx)
Xx_vt

array([[0],
       [4],
       [1]])

####2. SelectKBest and SelectPercentile

###### Two examples of feature selection that use the classes SelectKBest (which selects the best K high-score features) and SelectPercentile (which selects only a subset of features belonging to a certain percentile) are shown next. 

In [0]:
from sklearn.datasets import load_boston, load_iris
from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2, f_regression

In [0]:
regr_data = load_boston()
regr_data.data.shape

(506, 13)

In [0]:
kb_regr = SelectKBest(f_regression)
X_b = kb_regr.fit_transform(regr_data.data, regr_data.target)
X_b.shape

(506, 10)

In [0]:
kb_regr.scores_

array([ 89.48611476,  75.2576423 , 153.95488314,  15.97151242,
       112.59148028, 471.84673988,  83.47745922,  33.57957033,
        85.91427767, 141.76135658, 175.10554288,  63.05422911,
       601.61787111])

In [0]:
class_data = load_iris()
class_data.data.shape

(150, 4)

In [0]:
perc_class = SelectPercentile(chi2,percentile=15)
X_p = perc_class.fit_transform(class_data.data, class_data.target)
X_p.shape

(150, 1)

In [0]:
perc_class.scores_

array([ 10.81782088,   3.7107283 , 116.31261309,  67.0483602 ])

#E. Principal component analysis

####1. PCA

In [0]:
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA

digits = load_digits()

In [35]:
digits.data.shape

(1797, 64)

######Each image is a vector of 64 unsigned int (8 bit) numbers (0, 255), so the initial number of components is indeed 64. However, the total amount of black pixels is often predominant and the basic signs needed to write 10 digits are similar, so it's reasonable to assume both high cross-correlation and a low variance on several components. Trying with 36 principal components

In [0]:
pca = PCA(n_components=36, whiten=True)
x_pca = pca.fit_transform(digits.data / 255)

In [38]:
x_pca.shape

(1797, 36)

In [44]:
pca.explained_variance_ratio_

array([0.14890594, 0.13618771, 0.11794594, 0.08409979, 0.05782415,
       0.0491691 , 0.04315987, 0.03661373, 0.03353248, 0.03078806,
       0.02372341, 0.02272697, 0.01821863, 0.01773855, 0.01467101,
       0.01409716, 0.01318589, 0.01248138, 0.01017718, 0.00905617,
       0.00889538, 0.00797123, 0.00767493, 0.00722904, 0.00695889,
       0.00596081, 0.00575615, 0.00515158, 0.0048954 , 0.00428888,
       0.00373606, 0.00353273, 0.00336683, 0.0032803 , 0.00308312,
       0.00293777])

In [0]:
X_rebuilt = pca.inverse_transform(x_pca)

####2. Non-negative matrix factorization

###### When the dataset is made up of non-negative elements, it's possible to use non-negative matrix factorization (NNMF) instead of standard PCA. The algorithm optimizes a loss function (alternatively on W and H) based on the Frobenius norm

In [47]:
from sklearn.datasets import load_iris
from sklearn.decomposition import NMF

iris = load_iris()
iris.data.shape

(150, 4)

In [0]:
nmf =  NMF(n_components=3, init='random' ,l1_ratio=0.1)
Xt = nmf.fit_transform(iris.data)

In [53]:
nmf.reconstruction_err_

1.8857626377510346

In [54]:
iris.data[0]

array([5.1, 3.5, 1.4, 0.2])

In [55]:
Xt[0]

array([0.22249994, 1.56326316, 1.94918153])

In [60]:
nmf.inverse_transform(Xt[0])

array([5.09911539, 3.50130596, 1.40116508, 0.19759821])

####3. Sparse PCA

######The following snippet shows a sparse PCA with 60 components. In this context, they're usually called atoms and the amount of sparsity can be controlled via L1-norm regularization (higher alpha parameter values lead to more sparse results). This approach is very common in classification algorithms and will be discussed in the next chapters:

In [0]:
from sklearn.decomposition import SparsePCA

spca = SparsePCA(n_components = 60, alpha = 0.1)
X_pca = spca.fit_transform(digits.data / 255)

In [63]:
spca.components_.shape

(60, 64)

####4. Kernel PCA

####Let's consider a dataset made up of a circle with a blob inside:

In [66]:
from sklearn.datasets import make_circles
Xb, Yb = make_circles(n_samples=500, factor=0.1, noise=0.05)
print(Xb.shape)
print(Yb.shape)

(500, 2)
(500,)


In [0]:
from sklearn.decomposition import KernelPCA

In [0]:
kpca = KernelPCA(n_components=2, kernel='rbf',fit_inverse_transform=True, gamma=1.0)
X_kpca = kpca.fit_transform(Xb)

#F. Atom extraction and dictionary learning

In [0]:
from sklearn.decomposition import DictionaryLearning
dl = DictionaryLearning(n_components=36, fit_algorithm='lars',transform_algorithm='lasso_lars')
X_dict = dl.fit_transform(digits.data)

In [0]:
X_dict[0]