# Thresholding Numerical Feature Variance

In [2]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

In [5]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import VarianceThreshold

In [7]:
# Keep only the iris feature matrix; variance thresholding below is fitted without labels.
features = load_iris().data

In [17]:
# Selector that drops low-variance columns; 0.6 is the variance cut-off used for this demo.
model = VarianceThreshold(threshold=0.6)

In [18]:
# Shape before thresholding, for comparison with the transformed matrix.
features.shape

(150, 4)

In [19]:
# Fit the selector and apply it in one step; the shape shows how many columns survive.
model.fit_transform(features).shape

(150, 2)

In [20]:
# Per-feature variances learned during fit, to see which ones clear the 0.6 threshold.
model.fit(features).variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

In [None]:
# If the features have been standardized (zero mean, unit variance), every
# feature has variance 1, so variance thresholding cannot tell them apart:

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
# Scaler used to standardize each feature before re-checking the variances.
scaler = StandardScaler()

In [24]:
# Standardized version of the iris feature matrix.
newfeatures = scaler.fit_transform(features)

In [25]:
# Refit the same selector on the standardized data: all variances come out as 1,
# so no threshold can separate informative features from uninformative ones.
# Note that this refits `model`, replacing what it learned from the raw features.
model.fit(newfeatures).variances_

array([1., 1., 1., 1.])

# Thresholding Binary Feature Variance

In [26]:
# Var(x) = p(1 - p), where p is the proportion of observations of class 1.

In [30]:
# Toy binary matrix: columns 0 and 1 are dominated by a single class (4 of 5 rows),
# while column 2 is split 3/2 between the two classes.
features2 = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

In [39]:
# For a binary feature Var(x) = p(1 - p); with p = 0.6 the cut-off is 0.24, so
# features dominated by one class (more than roughly 60% of the rows) fall below
# it and are dropped.
# NOTE(review): the third column of features2 has p = 0.4, i.e. a variance that is
# mathematically equal to this threshold, so whether it is kept may hinge on
# floating-point rounding — confirm, or choose a threshold that is not a tie.
thresholder = VarianceThreshold(threshold=0.6 * (1 - 0.6))

In [40]:
# Apply the binary-variance threshold; only the more balanced column remains.
thresholder.fit_transform(features2)

array([[0],
       [1],
       [0],
       [1],
       [0]])

# Handling Highly Correlated Features

In [42]:
# Toy matrix for the correlation demo: columns 0 and 1 are identical except in the
# last two rows, and column 2 simply alternates between 1 and 0.
features3 = np.array(
    [
        [1, 1, 1],
        [2, 2, 0],
        [3, 3, 1],
        [4, 4, 0],
        [5, 5, 1],
        [6, 6, 0],
        [7, 7, 1],
        [8, 7, 0],
        [9, 7, 1],
    ]
)

In [43]:
# Wrap the array in a DataFrame so pandas' correlation helpers can be used.
df = pd.DataFrame(features3)

In [47]:
# Pairwise correlation between the columns.
df.corr()

Unnamed: 0,0,1,2
0,1.0,0.976103,0.0
1,0.976103,1.0,-0.034503
2,0.0,-0.034503,1.0


In [48]:
# Use magnitudes only: a strong negative correlation is as redundant as a strong positive one.
corr_matrix = df.corr().abs()

In [53]:
# Keep only the strict upper triangle (k=1 excludes the diagonal) so every feature
# pair appears exactly once; all other entries become NaN.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in later releases.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


In [54]:
# Show the masked matrix: a single entry is left for each pair of features.
upper

Unnamed: 0,0,1,2
0,,0.976103,0.0
1,,,0.034503
2,,,


In [56]:
# Flag every column that correlates above 0.95 with some earlier column
# (NaN entries compare as False, so the masked cells are ignored).
to_drop = [col for col in upper.columns if (upper[col] > 0.95).any()]

In [58]:
# Columns flagged as redundant.
to_drop

[1]

In [62]:
# Preview the frame without the redundant columns (`df` itself is left untouched).
df.drop(columns=to_drop).head()

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1
3,4,0
4,5,1


# Removing Irrelevant Features for Classification

In [63]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

In [65]:
# Load iris once and split it into the feature matrix and the class labels.
iris = load_iris()
features4, target4 = iris.data, iris.target

# Display the class names that the integer targets refer to.
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

### If the features are categorical, calculate a chi-square (χ²) statistic between each feature and the target vector

In [67]:
# Inspect the dtype of the feature matrix before converting it.
features4.dtype

dtype('float64')

In [76]:
# Cast to integers so the measurements can stand in for categorical data in the
# chi-square demo. The cast discards the fractional part, and because it rebinds
# `features4`, every later cell that reads `features4` sees the truncated values.
features4 = features4.astype(int)
features4.shape

(150, 4)

In [75]:
# Score each feature against the target with the chi-square statistic and keep the two best.
chi2_selector = SelectKBest(score_func=chi2, k=2)
new_features4 = chi2_selector.fit_transform(features4, target4)

# Confirm that two columns remain.
new_features4.shape

(150, 2)

### If the features are quantitative, compute the ANOVA F-value between each feature and the target vector

In [77]:
# The ANOVA F-test is meant for quantitative features, but `features4` was truncated
# to integers for the chi-square demo. Score the original float measurements
# (`iris.data`) instead so the F-values reflect the real data.
F_selector = SelectKBest(f_classif, k=2)
features4_new = F_selector.fit_transform(iris.data, target4)

In [79]:
# Shape of the feature matrix selected by the F-test.
features4_new.shape

(150, 2)

### Instead of selecting a specific number of features, we can also use SelectPercentile to select the top n percent of features:

In [82]:
from sklearn.feature_selection import SelectPercentile

In [83]:
# Keep the top 75% of features ranked by ANOVA F-value. The F-test is run on the
# original float measurements (`iris.data`) rather than `features4`, which was
# truncated to integers for the chi-square demo.
F2_selector = SelectPercentile(f_classif, percentile=75)
percentile_feature = F2_selector.fit_transform(iris.data, target4)

In [84]:
# Shape of the feature matrix kept by the percentile selector.
percentile_feature.shape

(150, 3)

# Recursively Eliminating Features

In [2]:
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

In [3]:
# Synthetic regression problem: 100 features of which only 2 carry signal, with a
# fixed seed so the data is reproducible.
# NOTE: this rebinds `features`, which earlier cells used for the iris matrix.
features, target = make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)

In [90]:
# Linear regression estimator that the recursive feature elimination below wraps.
Linearmodel = LinearRegression()

In [99]:
# Recursive feature elimination with cross-validation: remove one feature per
# iteration (step=1) and score each candidate subset by negated MSE over 8 folds.
modelRFE = RFECV(
    estimator=Linearmodel,
    step=1,
    scoring="neg_mean_squared_error",
    cv=8,
)

In [100]:
# Fit the eliminator and reduce the matrix to the selected columns in one step;
# the shape shows how many features were kept.
modelRFE.fit_transform(features, target).shape

(10000, 2)

In [101]:
# Boolean mask over the input columns: True marks a selected feature.
modelRFE.support_

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [106]:
# Ranking of every feature: rank 1 marks the selected ones.
modelRFE.ranking_

array([55, 29, 86, 94, 78,  1, 59, 83,  4, 17,  6, 67, 80, 48, 39, 11, 26,
       45, 13, 40, 35, 24, 60,  8, 37, 56, 49, 46, 57, 91, 12, 70,  2, 16,
       79, 92, 73, 87, 66,  1, 81, 41, 74, 97, 36, 19, 95, 88, 54, 69, 72,
       53, 61, 28,  9,  3, 58, 82, 33, 76, 43, 47, 10, 63, 23, 99, 32, 34,
       18, 77, 64, 96, 90,  5, 62, 21, 93, 20, 22, 15, 42, 44, 25, 89, 85,
       50, 30, 52, 38, 68, 31, 65, 71, 75, 14,  7, 84, 51, 27, 98])

In [107]:
# Number of features selected by the cross-validated elimination.
modelRFE.n_features_

2