Often in data science we have hundreds or even millions of features, and we want a way to create a model that includes only the most important ones. This has three benefits. First, it makes the model simpler to interpret. Second, it can reduce the variance of the model, and therefore overfitting. Finally, it reduces the computational cost (and time) of training the model.

In [8]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
import warnings

# Suppress every warning so the cell outputs stay uncluttered
warnings.filterwarnings('ignore')

# Fetch the iris dataset bundled with scikit-learn
iris = datasets.load_iris()

# Display names for the feature columns, one per column of iris.data
feat_labels = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']

# Feature matrix (X) and class labels (y)
X, y = iris.data, iris.target

In [3]:
# Peek at the first five rows of the feature matrix
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [4]:
# View the target data: evaluating y as the last expression of the cell
# displays the label array holding one class label per sample
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [5]:
# Hold out 40% of the samples for testing; the remaining 60% are used for training.
# The fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)


In [9]:
# Random forest whose impurity-based importances are used to rank the features.
# random_state pins the result; n_jobs=-1 uses all available cores.
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

# Single decision tree used as a second importance estimator.
# It is seeded like the forest: an unseeded tree permutes the features at every
# split, so ties between equally good splits can be broken differently on each
# run, which makes the resulting feature selection non-reproducible.
clf2 = DecisionTreeClassifier(random_state=0)

# Train the random forest on the training split (only the forest is fitted here)
clf.fit(X_train, y_train)

# Print the name and gini importance of each feature as a (name, importance) tuple
for feature in zip(feat_labels, clf.feature_importances_):
    print(feature)

('Sepal Length', 0.11024282328064558)
('Sepal Width', 0.016255033655398383)
('Petal Length', 0.45028123999239505)
('Petal Width', 0.42322090307156096)


In [25]:
# Predict the class of every held-out sample with the forest trained on all features
y_pred = clf.predict(X_test)

# Fraction of test samples classified correctly by the full (4-feature) model
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9333333333333333

By default, SelectFromModel keeps the features whose importance is greater than or equal to the mean importance of all the features. Below we pass an explicit threshold of 0.15 instead.

In [10]:
# Selector backed by the random forest: it keeps the features whose
# importance is at least 0.15
sfm = SelectFromModel(estimator=clf, threshold=0.15)
# Same selection rule, driven by the decision tree instead
sfm2 = SelectFromModel(estimator=clf2, threshold=0.15)
# Fit the forest-based selector on the training split
sfm.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(n_estimators=10000, n_jobs=-1,
                                                 random_state=0),
                threshold=0.15)

In [11]:
# Fit the decision-tree-based selector on the same training split
sfm2.fit(X_train, y_train)

SelectFromModel(estimator=DecisionTreeClassifier(), threshold=0.15)

In [12]:
# Names of the features kept by the forest-based selector
for label, kept in zip(feat_labels, sfm.get_support()):
    if kept:
        print(label)
print("----------------")
# Names of the features kept by the tree-based selector
for label, kept in zip(feat_labels, sfm2.get_support()):
    if kept:
        print(label)

Petal Length
Petal Width
----------------
Petal Length
Petal Width


In [16]:
# Look up the label stored at index 2 of feat_labels (the third feature name)
feat_labels[2]

'Petal Length'

In [13]:
# Reduce both splits to the columns chosen by the forest-based selector.
# Note: the same transform must be applied to the training AND the test features
# so that both matrices contain identical columns.
X_important_train, X_important_test = (
    sfm.transform(split) for split in (X_train, X_test)
)

In [23]:
# Display the reduced training matrix returned by sfm.transform(X_train)
X_important_train

array([[4.5, 1.6],
       [1.6, 0.2],
       [5.1, 1.9],
       [4.2, 1.3],
       [3.6, 1.3],
       [4. , 1.3],
       [4.6, 1.4],
       [6. , 1.8],
       [1.5, 0.2],
       [1.1, 0.1],
       [5.3, 1.9],
       [4.2, 1.2],
       [1.7, 0.2],
       [1.5, 0.4],
       [4.9, 1.5],
       [1.5, 0.2],
       [5.1, 1.8],
       [3. , 1.1],
       [1.4, 0.3],
       [4.5, 1.5],
       [6.1, 2.5],
       [4.2, 1.3],
       [1.4, 0.1],
       [5.9, 2.1],
       [5.7, 2.3],
       [5.8, 2.2],
       [5.6, 2.1],
       [1.6, 0.2],
       [1.6, 0.2],
       [5.1, 2. ],
       [5.7, 2.1],
       [1.3, 0.3],
       [5.4, 2.3],
       [1.4, 0.2],
       [5. , 2. ],
       [5.4, 2.1],
       [1.3, 0.2],
       [1.4, 0.2],
       [5.8, 1.6],
       [1.4, 0.3],
       [1.3, 0.2],
       [1.7, 0.4],
       [4. , 1.3],
       [5.9, 2.3],
       [6.6, 2.1],
       [1.4, 0.2],
       [1.5, 0.2],
       [1.4, 0.2],
       [4.5, 1.3],
       [4.4, 1.4],
       [1.2, 0.2],
       [1.7, 0.5],
       [4.3,

In [24]:
# Fresh random forest with the same hyper-parameters as the full-feature model
clf_important = RandomForestClassifier(
    n_estimators=10000,
    n_jobs=-1,
    random_state=0,
)

# Fit it on the reduced training matrix, i.e. only the selected features
clf_important.fit(X_important_train, y_train)

RandomForestClassifier(n_estimators=10000, n_jobs=-1, random_state=0)

In [27]:
# Predict the held-out samples using only the selected features
pred = clf_important.predict(X_important_test)

# Fraction of test samples classified correctly by the limited (2-feature) model
accuracy_score(y_true=y_test, y_pred=pred)

0.9