[Reference](https://medium.com/@WojtekFulmyk/feature-selection-in-data-preprocessing-421d5b527005)

In [1]:
from scipy.stats import pearsonr

height = [1, 2, 3, 4, 5]
weight = [3, 4, 5, 9, 10]

# cut-offs beyond which the linear relationship is treated as strong
strong_threshold = 0.7
strong_inverse_threshold = -0.7

# Pearson correlation coefficient (r) and its p-value
r, p_value = pearsonr(height, weight)
print("Correlation:", r)

# classify the correlation first, then report the matching selection decision
if r > strong_threshold:
    strength, decision = "Strong positive", "Select both features."
elif r < strong_inverse_threshold:
    strength, decision = "Strong negative", "Select both features."
else:
    strength, decision = "Weak", "Only select height feature."

print(strength + " correlation between height and weight.")
print(decision)

Correlation: 0.9645788568769382
Strong positive correlation between height and weight.
Select both features.


In [2]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import itertools

# load the iris dataset once; the same object supplies data, target and names
iris = load_iris()
X, y = iris.data, iris.target
feature_names = iris.feature_names

# mean cross-validated accuracy a subset must exceed to be reported as "high"
accuracy_threshold = 0.90

# generate every non-empty feature combination, largest subsets first
combos = itertools.chain.from_iterable(
  itertools.combinations(feature_names, size)
  for size in range(len(feature_names), 0, -1))

# evaluate feature subsets
for features in combos:

  # column indices of the features that belong to this subset
  columns = [feature_names.index(name) for name in features]
  X_temp = X[:, columns]

  # random_state is pinned because the tree permutes features at each split;
  # without it the scores can change from one run to the next
  model = DecisionTreeClassifier(random_state=0)
  scores = cross_val_score(model, X_temp, y, cv=5)
  mean_accuracy = scores.mean()

  print("Selected features:", features)
  # NOTE: "all features" in the message means all features of the current subset
  if mean_accuracy > accuracy_threshold:
    print("Accuracy:", mean_accuracy,"High accuracy. Select all features.")
  else:
    print("Accuracy:", mean_accuracy, "Low accuracy with features. Do not select.")

Selected features: ('sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)')
Accuracy: 0.9600000000000002 High accuracy. Select all features.
Selected features: ('sepal length (cm)', 'sepal width (cm)', 'petal length (cm)')
Accuracy: 0.9400000000000001 High accuracy. Select all features.
Selected features: ('sepal length (cm)', 'sepal width (cm)', 'petal width (cm)')
Accuracy: 0.9533333333333334 High accuracy. Select all features.
Selected features: ('sepal length (cm)', 'petal length (cm)', 'petal width (cm)')
Accuracy: 0.9533333333333334 High accuracy. Select all features.
Selected features: ('sepal width (cm)', 'petal length (cm)', 'petal width (cm)')
Accuracy: 0.9600000000000002 High accuracy. Select all features.
Selected features: ('sepal length (cm)', 'sepal width (cm)')
Accuracy: 0.7266666666666667 Low accuracy with features. Do not select.
Selected features: ('sepal length (cm)', 'petal length (cm)')
Accuracy: 0.9133333333333333 High accuracy. Select a

In [3]:
from sklearn.datasets import load_iris
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
import numpy as np

# feature mapping dictionary: the i-th key is used below as the label for the
# i-th column of X (and therefore for the i-th Lasso coefficient)
feature_mapping = dict(zip(
    'ABCD',
    ('sepal length', 'sepal width', 'petal length', 'petal width')))

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# fit the L1-regularised linear model on the training split
lasso = Lasso(alpha=0.04)
lasso.fit(X_train, y_train)

# positions of the coefficients that are not exactly zero
coefs = lasso.coef_
selected = np.nonzero(coefs)[0]

# get the selected letters (key list built once, then indexed by position)
letters = list(feature_mapping)
selected_letters = [letters[i] for i in selected]
print(feature_mapping)
print("Selected features:")
print(selected_letters)
# NOTE: Lasso is a regressor, so score() reports R^2 on the test split rather
# than classification accuracy, despite the label printed here
print("Lasso accuracy:", lasso.score(X_test, y_test))

{'A': 'sepal length', 'B': 'sepal width', 'C': 'petal length', 'D': 'petal width'}
Selected features:
['C', 'D']
Lasso accuracy: 0.9090708820329703
