In [28]:
# Dimensionality reduction using feature selection
import numpy as np
import pandas as pd

from sklearn import datasets, linear_model
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
from sklearn.feature_selection import SelectPercentile

import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV

In [3]:
# Variance thresholding on numerical features:
# keep only the columns whose variance is above a chosen cutoff

# load the iris dataset and split it into feature matrix / target vector
iris = datasets.load_iris()
features, target = iris.data, iris.target

# selector that keeps only features whose variance is above 0.5
thresholder = VarianceThreshold(threshold=0.5)

# learn the per-feature variances, then keep the high-variance columns
features_high_variance = thresholder.fit(features).transform(features)

# preview the first three rows of the reduced matrix
features_high_variance[:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

In [4]:
# refit the thresholder, then inspect the variance it computed for each feature
thresholder.fit(features)
thresholder.variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

In [8]:
# Caveat: variance thresholding is uninformative on standardized features,
# because standardizing rescales every feature to unit variance

# standardize the feature matrix
scaler = StandardScaler()
features_std = scaler.fit(features).transform(features)

# fit a default-threshold selector and show the per-feature variances
selector = VarianceThreshold().fit(features_std)
selector.variances_

array([1., 1., 1., 1.])

In [9]:
# Variance thresholding on binary features

# Feature matrix (one column per feature):
#   Feature 0: 80% class 0, 20% class 1
#   Feature 1: 20% class 0, 80% class 1
#   Feature 2: 60% class 0, 40% class 1
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

# A binary (Bernoulli) feature has variance p * (1 - p); with p = 0.75 the
# cutoff removes features where one class covers more than 75% of the rows.
thresholder = VarianceThreshold(threshold=0.75 * (1 - 0.75))
thresholder.fit_transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

In [15]:
# Handling highly correlated features

# When two or more features are highly correlated, keep one of them and
# drop the others.
features = np.array([[1, 1, 1],
                     [2, 2, 0],
                     [3, 3, 1],
                     [4, 4, 0],
                     [5, 5, 1],
                     [6, 6, 0],
                     [7, 7, 1],
                     [8, 7, 0],
                     [9, 7, 1]])

# convert feature matrix into DataFrame
dataframe = pd.DataFrame(features)

# absolute pairwise correlations (only the strength matters, not the sign)
corr_matrix = dataframe.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each
# feature pair appears once and no feature is compared with itself.
# The builtin `bool` is used on purpose: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# labels of the columns whose correlation with an earlier column exceeds 0.95
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# Drop by label rather than by position: `to_drop` holds column labels, so
# this stays correct even when the labels are not 0..n-1.
dataframe.drop(columns=to_drop).head(3)

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1


In [18]:
# Removing irrelevant features for classification:
# score each feature against the target vector with the chi-squared statistic

# load data
iris = load_iris()
features = iris.data
target = iris.target

# Convert to categorical data by truncating the values to integers.
# The result goes into a separate array so that `features` keeps the original
# continuous measurements: the ANOVA F-value cells further down reuse
# `features` and are meant to run on quantitative data, not truncated values.
features_int = features.astype(int)

# select two features with highest chi-squared statistics
chi2_selector = SelectKBest(chi2, k=2)
features_kbest = chi2_selector.fit_transform(features_int, target)

# Show results
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


In [19]:
# For quantitative features, rank them by the ANOVA F-value
# computed between each feature and the target vector

# keep the two features with the highest F-values
fvalue_selector = SelectKBest(score_func=f_classif, k=2)
features_kbest = fvalue_selector.fit(features, target).transform(features)

# report the feature count before and after selection
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


In [21]:
# When a fixed number of features cannot be chosen up front,
# SelectPercentile keeps a given percentage of the features instead

# keep the top 75% of features ranked by F-value
fvalue_selector = SelectPercentile(score_func=f_classif, percentile=75)
features_kbest = fvalue_selector.fit(features, target).transform(features)

# report the feature count before and after selection
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 3


In [31]:
# Recursive feature elimination (RFE) with cross-validation (CV) -> RFECV

# silence a noisy but harmless warning raised from scipy
warnings.filterwarnings(
    action='ignore',
    module='scipy',
    message='^internal gelsd',
)

# synthetic regression problem: 10,000 samples, 100 features, 2 informative
# (only the feature matrix and target vector are returned by this call;
# the true coefficients are not requested)
features, target = make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)

# estimator used to evaluate each candidate feature subset
ols = linear_model.LinearRegression()

# eliminate features one step at a time, scoring by negative mean squared error
rfecv = RFECV(estimator=ols, step=1, scoring='neg_mean_squared_error')
rfecv.fit(features, target)

# feature matrix restricted to the features RFECV kept
rfecv.transform(features)

array([[ 8.50798578e-03, -9.17608307e-01,  7.03127695e-01,
        -1.31422783e+00, -2.69038359e-01],
       [-1.07500204e+00,  2.18090991e+00,  2.56148527e+00,
         1.69309663e-03, -7.19902387e-01],
       [ 1.37940721e+00,  7.64016678e-01, -1.77039484e+00,
         9.15427035e-01,  4.95589417e-01],
       ...,
       [-8.03316558e-01,  3.73094914e-01, -1.60648007e+00,
         3.59151116e-02,  2.29001330e-01],
       [ 3.95088442e-01,  7.12615392e-01, -1.34564911e+00,
         7.14224184e-01,  2.72381805e-02],
       [-5.53830347e-01,  1.27270420e+00,  8.28801124e-01,
         1.10168019e+00, -1.08865284e+00]])

In [32]:
# number of features kept by the fitted RFECV selector
# (matches the column count of the transformed matrix shown above)
rfecv.n_features_

5