scikit-learn · StefanieSenger · Mar 13, 2016 · Mar 13, 2016 · Mar 14, 2016 · Mar 14, 2016
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
@@ -578,6 +578,8 @@ From text
    :template: function.rst
 
    feature_selection.chi2
+   feature_selection.info_gain
+   feature_selection.info_gain_ratio
    feature_selection.f_classif
    feature_selection.f_regression
    feature_selection.r_regression

diff --git a/doc/modules/feature_selection.rst b/doc/modules/feature_selection.rst
@@ -89,7 +89,8 @@ and p-values (or only scores for :class:`SelectKBest` and
 
 * For regression: :func:`r_regression`, :func:`f_regression`, :func:`mutual_info_regression`
 
-* For classification: :func:`chi2`, :func:`f_classif`, :func:`mutual_info_classif`
+* For classification: :func:`chi2`, :func:`info_gain`, :func:`info_gain_ratio`,
+  :func:`f_classif`, :func:`mutual_info_classif`.
 
 The methods based on F-test estimate the degree of linear dependency between
 two random variables. On the other hand, mutual information methods can capture
@@ -100,8 +101,9 @@ applied to non-negative features, such as frequencies.
 .. topic:: Feature selection with sparse data
 
    If you use sparse data (i.e. data represented as sparse matrices),
-   :func:`chi2`, :func:`mutual_info_regression`, :func:`mutual_info_classif`
-   will deal with the data without making it dense.
+   :func:`chi2`, :func:`mutual_info_regression`, :func:`mutual_info_classif`,
+   :func:`info_gain` and :func:`info_gain_ratio` will deal with the data without
+   making it dense.
 
 .. warning::
 

diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst
@@ -38,7 +38,7 @@ See :ref:`array_api` for more details.
 
 **Classes:**
 
-- 
+-
 
 Changelog
 ---------
@@ -54,6 +54,15 @@ Changelog
     :pr:`123456` by :user:`Joe Bloggs <joeongithub>`.
     where 123455 is the *pull request* number, not the issue number.
 
+:mod:`sklearn.feature_selection`
+................................
+
+- |Feature| :func:`~feature_selection.info_gain` and
+  :func:`~feature_selection.info_gain_ratio` can now be used for
+  univariate feature selection.
+  :pr:`28905` by :user:`Viktor Pekar <vpekar>` and
+  :user:`Stefanie Senger <StefanieSenger>`.
+
 Thanks to everyone who has contributed to the maintenance and improvement of
 the project since version 1.5, including:
 

diff --git a/examples/feature_selection/plot_compare_feature_selection.py b/examples/feature_selection/plot_compare_feature_selection.py
@@ -0,0 +1,115 @@
+"""
+=========================================
+Comparison of feature selection functions
+=========================================
+
+This example illustrates the performance of different univariate feature selection
+functions on a text classification task (the 20 newsgroups dataset).
+
+The plot shows the accuracy of a multinomial Naive Bayes classifier as a function of
+the number of top-scoring features selected for training, using four scoring
+functions: chi-square, information gain, information gain ratio and F-test. The mutual
+information estimator of Kraskov et al., based on k-nearest neighbor distances, is too
+slow for this example and is therefore excluded.
+"""
+
+# %%
+# Load data
+# =========
+from sklearn.datasets import fetch_20newsgroups
+
+remove = ("headers", "footers", "quotes")
+data_train = fetch_20newsgroups(
+    subset="train", categories=None, shuffle=True, random_state=42, remove=remove
+)
+data_test = fetch_20newsgroups(
+    subset="test", categories=None, shuffle=True, random_state=42, remove=remove
+)
+
+# %%
+# Train-test split
+# ================
+import numpy as np
+
+from sklearn.feature_extraction.text import CountVectorizer
+
+y_train, y_test = data_train.target, data_test.target
+categories = data_train.target_names  # for case categories == None
+
+vectorizer = CountVectorizer(max_df=0.5, stop_words="english")
+X_train = vectorizer.fit_transform(data_train.data)
+X_test = vectorizer.transform(data_test.data)
+feature_names = vectorizer.get_feature_names_out()
+cutoffs = [
+    int(x) for x in np.logspace(np.log10(1000.0), np.log10(X_train.shape[1]), num=10)
+]
+
+
+# %%
+# Calculate accuracy of Naive Bayes classifier
+# ============================================
+import time
+
+from sklearn import metrics
+from sklearn.feature_selection import (
+    SelectKBest,
+    chi2,
+    f_classif,
+    info_gain,
+    info_gain_ratio,
+)
+from sklearn.naive_bayes import MultinomialNB
+
+results = {}
+
+clf = MultinomialNB(alpha=0.01)
+
+for func in [chi2, info_gain, info_gain_ratio, f_classif]:
+
+    results[func.__name__] = []
+
+    for k in cutoffs:
+
+        # apply feature selection
+        t0 = time.time()
+        selector = SelectKBest(func, k=k)
+        X_train2 = selector.fit_transform(X_train, y_train)
+        X_test2 = selector.transform(X_test)
+        duration = time.time() - t0
+
+        # keep selected feature names
+        feature_names2 = [feature_names[i] for i in selector.get_support(indices=True)]
+        feature_names2 = np.asarray(feature_names2)
+
+        # train and evaluate a classifier
+        clf.fit(X_train2, y_train)
+        pred = clf.predict(X_test2)
+        score = metrics.accuracy_score(y_test, pred)
+
+        results[func.__name__].append((score, duration))
+
+# %%
+# Plot results
+# ============
+import matplotlib.pyplot as plt
+
+f, (ax1, ax2) = plt.subplots(2, sharex=True, figsize=(12, 8))
+ax1.set_title("20 newsgroups dataset")
+
+ax1.set_xlabel("#Features")
+ax1.set_ylabel("Accuracy")
+ax2.set_ylabel("Time, secs")
+colors = "bgrcmyk"
+plt.ticklabel_format(useOffset=False)
+
+for i, (name, results) in enumerate(results.items()):
+    scores, durations = zip(*results)
+    ax1.plot(cutoffs, scores, color=colors[i], label=name)
+    ax2.plot(cutoffs, durations, color=colors[i], label=name)
+
+ax1.grid(True)
+ax2.grid(True)
+ax1.legend(loc="best")
+ax2.legend(loc="best")
+
+_ = plt.show()
diff --git a/sklearn/feature_selection/__init__.py b/sklearn/feature_selection/__init__.py
@@ -20,6 +20,8 @@
     f_classif,
     f_oneway,
     f_regression,
+    info_gain,
+    info_gain_ratio,
     r_regression,
 )
 from ._variance_threshold import VarianceThreshold
@@ -37,6 +39,8 @@
     "SelectPercentile",
     "VarianceThreshold",
     "chi2",
+    "info_gain",
+    "info_gain_ratio",
     "f_classif",
     "f_oneway",
     "f_regression",