In [1]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression

## 20 Newsgroups (20NG)

In [2]:
# Load the 'train' and then the 'test' partition of 20 Newsgroups, pointing
# both calls at the same local data_home folder.
news_train, news_test = (
    fetch_20newsgroups(data_home='../data/20newsgroups/', subset=split)
    for split in ('train', 'test')
)

In [3]:
# Binary bag-of-words with English stop words removed. The vectorizer is
# fitted on the training documents only; the test documents are then encoded
# with that same fitted vectorizer (tuple items evaluate left to right, so the
# fit always happens first).
vectorizer = CountVectorizer(stop_words='english', binary=True)
counts_train, counts_test = (
    vectorizer.fit_transform(news_train.data),
    vectorizer.transform(news_test.data),
)

In [4]:
# Invert the fitted vocabulary (term -> column index) so a column index can be
# mapped back to its term.
reverse_map = dict(
    zip(vectorizer.vocabulary_.values(), vectorizer.vocabulary_.keys())
)

## Chi-squared (chi2) feature selection

In [5]:
from sklearn.feature_selection import chi2

In [6]:
%%time
# Score every vocabulary column against the newsgroup label; chi2 returns the
# statistics and the matching p-values as two separate arrays.
_20_chi2, _20_pval = chi2(X=counts_train, y=news_train.target)

CPU times: user 234 ms, sys: 25.7 ms, total: 260 ms
Wall time: 260 ms


In [8]:
# Column indices of the 200 highest chi2 scores, best first: read the
# ascending argsort backwards from its last entry and stop after 200 items.
top_200 = np.argsort(_20_chi2)[:-201:-1]

In [9]:
# Display the ten best-ranked chi2 columns as terms instead of indices.
[reverse_map[column] for column in top_200[:10]]

['clipper',
 'encryption',
 'sale',
 'dod',
 'bike',
 'hockey',
 'windows',
 'israeli',
 'israel',
 'god']

In [10]:
# Keep only the 200 chi2-selected columns (in ranking order) in both the
# training and the test document-term matrices.
chi_counts_train, chi_counts_test = (
    matrix[:, top_200] for matrix in (counts_train, counts_test)
)

### predict

In [11]:
# Classifier for the chi2-selected features: logistic regression with an L2
# penalty; every other hyper-parameter is left at the library default.
_20_lr = LogisticRegression(penalty='l2')

In [12]:
%%time
# Train on the chi2-selected training columns. fit() is the cell's last
# expression, so its return value (the estimator) is what gets displayed.
_20_lr.fit(X=chi_counts_train, y=news_train.target)

CPU times: user 809 ms, sys: 36 µs, total: 809 ms
Wall time: 812 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
# Accuracy of the chi2-feature model on each split: the fraction of documents
# whose predicted newsgroup equals the true label.
# (The cell text had been pasted into itself mid-token, which made it a
# SyntaxError; these are the four intended statements.)
yhat = _20_lr.predict(chi_counts_train)
print('Train acc:', sum(yhat == news_train.target)/len(news_train.target))
_yhat = _20_lr.predict(chi_counts_test)
print('Test acc:', sum(_yhat == news_test.target)/len(news_test.target))

Train acc: 0.707353721053562
Test acc: 0.6344928305894849


## Mutual information (MI) feature selection

In [5]:
from sklearn.feature_selection import mutual_info_classif

In [6]:
%%time
# Mutual-information score of every vocabulary column against the newsgroup
# label, with a fixed random_state. Slow step: the recorded run of this cell
# took roughly five minutes.
_20_mi = mutual_info_classif(
    X=counts_train,
    y=news_train.target,
    random_state=666,
)

CPU times: user 5min 11s, sys: 193 ms, total: 5min 11s
Wall time: 5min 14s


In [14]:
# Column indices of the 200 highest MI scores, best first: read the ascending
# argsort backwards from its last entry and stop after 200 items.
top_200_mi = np.argsort(_20_mi)[:-201:-1]

In [16]:
# Display the ten best-ranked MI columns as terms instead of indices.
[reverse_map[column] for column in top_200_mi[:10]]

['windows',
 'god',
 'clipper',
 'sale',
 'dod',
 'government',
 'team',
 'encryption',
 'people',
 'car']

In [19]:
# Keep only the 200 MI-selected columns (in ranking order) in both the
# training and the test document-term matrices.
mi_counts_train, mi_counts_test = (
    matrix[:, top_200_mi] for matrix in (counts_train, counts_test)
)

### predict

In [21]:
# Fresh classifier for the MI-selected features: logistic regression with an
# L2 penalty, all other hyper-parameters at the library default.
# NOTE: this rebinds _20_lr, so the model fitted on the chi2 features is no
# longer reachable under that name after this cell runs.
_20_lr = LogisticRegression(penalty='l2')

In [22]:
%%time
# Train on the MI-selected training columns. fit() is the cell's last
# expression, so its return value (the estimator) is what gets displayed.
_20_lr.fit(X=mi_counts_train, y=news_train.target)

CPU times: user 2.11 s, sys: 2.95 ms, total: 2.11 s
Wall time: 2.14 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [24]:
# Accuracy of the MI-feature model on each split: predict both splits first,
# then report the fraction of documents whose prediction equals the label.
yhat = _20_lr.predict(mi_counts_train)
_yhat = _20_lr.predict(mi_counts_test)

train_hits = (yhat == news_train.target).sum()
print('Train acc:', train_hits / len(news_train.target))

test_hits = (_yhat == news_test.target).sum()
print('Test acc:', test_hits / len(news_test.target))

Train acc: 0.7352837192858406
Test acc: 0.6313064259160913


## strong l1