# Feature Selection

## Constant features

Constant features take the same value in every row of the dataset. Such features carry no information that a machine learning model can use.

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

In [3]:
# Load the toy dataset from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='toy_dataset.csv')

In [4]:
# (rows, columns) of the raw frame; the column count still includes 'target'.
df.shape

(50000, 301)

In [5]:
# Separate the label column from the predictors; df itself is left untouched.
y = df['target']
X = df.drop(columns=['target'])

### Variance Threshold

In [6]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [7]:
# threshold=0 keeps only features whose variance is strictly greater than zero,
# i.e. it rejects columns that hold a single value in the data passed to fit.
sel = VarianceThreshold(threshold=0)

In [8]:
# Fit on the training split only, so the test split plays no part in
# deciding which features are kept.
sel.fit(X_train)

VarianceThreshold(threshold=0)

In [9]:
# get_support() is a boolean mask over the columns; summing it counts the
# features the selector keeps.
sel.get_support().sum()

266

In [10]:
# Invert the selector's keep-mask to obtain the rejected (constant) columns,
# then show how many there are.
rejected_mask = ~sel.get_support()
constant = X_train.columns[rejected_mask]
len(constant)

34

In [11]:
# Labels of the columns the selector keeps, in their original order.
kept_mask = sel.get_support()
feat_names = X_train.columns[kept_mask]
feat_names

Index(['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'var_6', 'var_7', 'var_8',
       'var_9', 'var_10',
       ...
       'var_289', 'var_290', 'var_291', 'var_292', 'var_293', 'var_295',
       'var_296', 'var_298', 'var_299', 'var_300'],
      dtype='object', length=266)

In [12]:
# VarianceThreshold.transform returns a bare NumPy array, which would discard
# the column labels and the row index. Rebuild DataFrames so the reduced
# splits keep the names of the surviving features and stay aligned with
# y_train / y_test by index.
# The labels are captured before X_train is reassigned below.
kept_columns = X_train.columns[sel.get_support()]
X_train = pd.DataFrame(
    sel.transform(X_train), columns=kept_columns, index=X_train.index
)
X_test = pd.DataFrame(
    sel.transform(X_test), columns=kept_columns, index=X_test.index
)

In [13]:
# Both splits should report the same number of columns after the transform.
X_train.shape, X_test.shape

((35000, 266), (15000, 266))

### Standard Deviation Approach (numerical)

In [14]:
# Split again from the untouched X / y so this approach starts from the full
# feature set; same 30% hold-out and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [15]:
# Confirm the fresh split is back to the full feature set before filtering.
X_train.shape, X_test.shape

((35000, 300), (15000, 300))

In [16]:
# Flag every training column whose standard deviation is exactly zero.
# Note the exact comparison with 0: columns whose std is NaN are not flagged.
constant_features = []
for column_name in X_train.columns:
    if X_train[column_name].std() == 0:
        constant_features.append(column_name)

In [17]:
# Number of columns flagged as constant by the zero-std test.
len(constant_features)

34

In [18]:
# Drop the flagged columns from both splits. The result is assigned back
# instead of using inplace=True, so the frames returned by train_test_split
# are never mutated in place. The list was derived from the training split
# only and is applied unchanged to the test split.
X_train = X_train.drop(columns=constant_features)
X_test = X_test.drop(columns=constant_features)

In [19]:
# Shapes after dropping the zero-std columns from both splits.
X_train.shape, X_test.shape

((35000, 266), (15000, 266))

### Unique Values Approach (numerical and categorical)

In [20]:
# Split once more from the untouched X / y so the unique-values approach also
# starts from the full feature set; same 30% hold-out and seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [21]:
# Confirm the fresh split has the full feature set again.
X_train.shape, X_test.shape

((35000, 300), (15000, 300))

In [22]:
# Cast every training column to object dtype so the check below is run on
# non-numeric columns. Only X_train is cast; X_test keeps its original dtypes.
X_train = X_train.astype(object)

In [23]:
# Verify the cast: every column should now be reported as object.
X_train.dtypes

var_1      object
var_2      object
var_3      object
var_4      object
var_5      object
            ...  
var_296    object
var_297    object
var_298    object
var_299    object
var_300    object
Length: 300, dtype: object

In [24]:
# Count the distinct values in each training column (nunique ignores NaN by
# default) and keep the labels of the columns that hold exactly one.
distinct_counts = X_train.nunique()
constant_features = distinct_counts.index[distinct_counts == 1].tolist()

In [25]:
# Number of columns with a single distinct value.
len(constant_features)

34

In [26]:
# Drop the single-valued columns from both splits. The result is assigned back
# instead of using inplace=True, so neither frame is mutated in place. The
# list was derived from the training split only and is applied unchanged to
# the test split.
X_train = X_train.drop(columns=constant_features)
X_test = X_test.drop(columns=constant_features)

In [27]:
# Shapes after dropping the single-valued columns from both splits.
X_train.shape, X_test.shape

((35000, 266), (15000, 266))