In [1]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression

## 20NG

In [2]:
news_train = fetch_20newsgroups(
    data_home='../data/20newsgroups/', 
    subset='train')
news_test = fetch_20newsgroups(
    data_home='../data/20newsgroups/', 
    subset='test')

In [3]:
vectorizer = CountVectorizer(binary=True, stop_words='english')
counts_train = vectorizer.fit_transform(news_train.data)
counts_test = vectorizer.transform(news_test.data)

In [4]:
reverse_map = {v: k for k, v in vectorizer.vocabulary_.items()}

## chi2

In [5]:
from sklearn.feature_selection import chi2

In [6]:
%%time
_20_chi2, _20_pval = chi2(counts_train, news_train.target)

CPU times: user 234 ms, sys: 25.7 ms, total: 260 ms
Wall time: 260 ms


In [8]:
top_200 = np.argsort(_20_chi2)[::-1][:200]

In [9]:
[reverse_map[d] for d in top_200[:10]]

['clipper',
 'encryption',
 'sale',
 'dod',
 'bike',
 'hockey',
 'windows',
 'israeli',
 'israel',
 'god']

In [10]:
chi_counts_train = counts_train[:, top_200]
chi_counts_test = counts_test[:, top_200]

### predict

In [11]:
_20_lr = LogisticRegression(penalty='l2')

In [12]:
%%time
_20_lr.fit(chi_counts_train, news_train.target)

CPU times: user 809 ms, sys: 36 µs, total: 809 ms
Wall time: 812 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
yhat = _20_lr.predict(chi_counts_train)
print('Train acc:', sum(yhat == news_train.target)/len(news_train.target))
_yhat = _20_lr.predict(chi_counts_test)
print('Test acc:', sum(_yhyhat = _20_lr.predict(chi_counts_train)
print('Train acc:', sum(yhat == news_train.target)/len(news_train.target))
_yhat = _20_lr.predict(chi_counts_test)
print('Test acc:', sum(_yhat == news_test.target)/len(news_test.target))at == news_test.target)/len(news_test.target))

Train acc: 0.707353721053562
Test acc: 0.6344928305894849


## MI

In [5]:
from sklearn.feature_selection import mutual_info_classif

In [6]:
%%time
_20_mi = mutual_info_classif(counts_train, news_train.target, random_state=666)

CPU times: user 5min 11s, sys: 193 ms, total: 5min 11s
Wall time: 5min 14s


In [14]:
top_200_mi = np.argsort(_20_mi)[::-1][:200]

In [16]:
[reverse_map[d] for d in top_200_mi[:10]]

['windows',
 'god',
 'clipper',
 'sale',
 'dod',
 'government',
 'team',
 'encryption',
 'people',
 'car']

In [19]:
mi_counts_train = counts_train[:, top_200_mi]
mi_counts_test = counts_test[:, top_200_mi]

### predict

In [21]:
_20_lr = LogisticRegression(penalty='l2')

In [22]:
%%time
_20_lr.fit(mi_counts_train, news_train.target)

CPU times: user 2.11 s, sys: 2.95 ms, total: 2.11 s
Wall time: 2.14 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [24]:
yhat = _20_lr.predict(mi_counts_train)
print('Train acc:', sum(yhat == news_train.target)/len(news_train.target))
_yhat = _20_lr.predict(mi_counts_test)
print('Test acc:', sum(_yhat == news_test.target)/len(news_test.target))

Train acc: 0.7352837192858406
Test acc: 0.6313064259160913


## strong l1

In [6]:
from scipy.sparse import csc_matrix

In [7]:
_20_lr = LogisticRegression(penalty='l1', C=0.1, n_jobs=-1)

In [8]:
%%time
_20_lr.fit(counts_train, news_train.target)

  " = {}.".format(self.n_jobs))


CPU times: user 11 s, sys: 38.4 ms, total: 11 s
Wall time: 11.1 s


LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
yhat = _20_lr.predict(counts_train)
print('Train acc:', sum(yhat == news_train.target)/len(news_train.target))
_yhat = _20_lr.predict(counts_test)
print('Test acc:', sum(_yhat == news_test.target)/len(news_test.target))

Train acc: 0.8238465617818632
Test acc: 0.7276951672862454


In [10]:
top_200 = np.argsort(_20_lr.coef_, axis=1)[:,::-1][:,:200]

In [12]:
%%time
new_counts = lil_matrix(counts_train.shape, dtype='int16')
for i in range(20):
    tg = np.where(news_train.target == i)[0]  
    
    rows = np.arange(0, len(tg), 1/200)
    cols = np.tile(top_200[i], len(tg))
    data = [1] * (len(rows))
    new_counts[tg] = csc_matrix((data, (rows, cols)), shape=(len(tg), counts_train.shape[1]))
    
    print('done for', news_train.target_names[i])

done for alt.atheism
done for comp.graphics
done for comp.os.ms-windows.misc
done for comp.sys.ibm.pc.hardware
done for comp.sys.mac.hardware
done for comp.windows.x
done for misc.forsale
done for rec.autos
done for rec.motorcycles
done for rec.sport.baseball
done for rec.sport.hockey
done for sci.crypt
done for sci.electronics
done for sci.med
done for sci.space
done for soc.religion.christian
done for talk.politics.guns
done for talk.politics.mideast
done for talk.politics.misc
done for talk.religion.misc
CPU times: user 1min 34s, sys: 2.59 s, total: 1min 36s
Wall time: 1min 36s


In [13]:
%%time
_20_lr_new = LogisticRegression(n_jobs=-1)
_20_lr_new.fit(new_counts, news_train.target)
yhat = _20_lr_new.predict(new_counts)
print('Train acc:', sum(yhat == news_train.target)/len(news_train.target))

  " = {}.".format(self.n_jobs))


Train acc: 1.0
CPU times: user 18.2 s, sys: 1.15 s, total: 19.3 s
Wall time: 10.2 s


In [16]:
%%time
new_counts_test = lil_matrix(counts_test.shape, dtype='int16')
for i in range(20):
    tg = np.where(news_test.target == i)[0]
    
    rows = np.arange(0, len(tg), 1/200)
    cols = np.tile(top_200[i], len(tg))
    data = [1] * (len(rows))
    
    new_counts_test[tg] = csc_matrix((data, (rows, cols)), shape=(len(tg), counts_test.shape[1]))
    
    print('done', news_test.target_names[i])

done alt.atheism
done comp.graphics
done comp.os.ms-windows.misc
done comp.sys.ibm.pc.hardware
done comp.sys.mac.hardware
done comp.windows.x
done misc.forsale
done rec.autos
done rec.motorcycles
done rec.sport.baseball
done rec.sport.hockey
done sci.crypt
done sci.electronics
done sci.med
done sci.space
done soc.religion.christian
done talk.politics.guns
done talk.politics.mideast
done talk.politics.misc
done talk.religion.misc
CPU times: user 55.7 s, sys: 1.79 s, total: 57.5 s
Wall time: 57.8 s


In [18]:
yhat = _20_lr_new.predict(new_counts_test)
print('Test acc:', sum(yhat == news_test.target)/len(news_test.target))

Test acc: 1.0


In [28]:
news_test.target.shape[0]

7532

In [31]:
for i in range(10):
    idx = np.random.randint(news_test.target.shape[0])
    yhat = _20_lr_new.predict(new_counts_test[idx])
    y = news_test.target[idx]
    print(idx, 'o', yhat[0], 'p', y)

4800 o 0 p 0
712 o 15 p 15
748 o 5 p 5
4451 o 14 p 14
7227 o 15 p 15
1456 o 8 p 8
1214 o 4 p 4
1305 o 10 p 10
6465 o 7 p 7
1401 o 9 p 9
