## Feature Selection: Quasi Constant Features

**Quasi Constant Features** are those that show the same value for a great majority of observations in the dataset. 

In [1]:
# Import Dependencies
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

In [2]:
# Load Data
df = pd.read_csv("C:/Users/H A R I H A R A N/Desktop/sem 8/Heart Disease/heart.csv")
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Check for presence of Null features
[col for col in df.columns if df[col].isnull().sum() > 0]

[]

In [4]:
X = df.drop(['target'], axis=1)
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [5]:
y = df['target']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [6]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((212, 13), (212,), (91, 13), (91,))

In [7]:
# Remove Constant Features
# Find all constant Features
# For each feature in training data, if it's standard deviation == 0, it is a constant variable.
const_features = [feat for feat in X_train.columns if X_train[feat].std() == 0]

In [8]:
len(const_features)

0

In [9]:
# Remove constant features
X_train.drop(labels=const_features, axis=1, inplace=True)
X_test.drop(labels=const_features, axis=1, inplace=True)

In [10]:
# Feature Selector
# 0.01: Approx. 99% of observations
feature_selector = VarianceThreshold(threshold=0.01)
feature_selector.fit(X_train)

VarianceThreshold(threshold=0.01)

In [11]:
# Num. features to be retained
sum(feature_selector.get_support())

13

In [14]:
# Features that are Quasi -constant
[x for x in X_train.columns if x not in X_train.columns[feature_selector.get_support()]]

[]

In [13]:
# Percentage of observations showing all different values
X_train['num_var34_0'].value_counts() / np.float(len(X_train)) 

KeyError: 'num_var34_0'

For the above value, we can see that a "0" appears > 99% of the time and a "3" for 0.0029%. So, this variable is almost constant.