In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.optimize as opt
import sklearn.linear_model
import sklearn.model_selection
from sklearn.model_selection import KFold

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import HashingVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import accuracy_score

Load the dataset

In [2]:
data = fetch_20newsgroups()

In [3]:
data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [4]:
print(len(data.data))
data.data[1]

11314


"From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 11\nNNTP-Posting-Host: carson.u.washington.edu\n\nA fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't answered this\npoll. Thanks.\n\nGuy Kuo <guykuo@u.washington.edu>\n"

In [5]:
print(len(data.target_names))
data.target_names[:10]

20


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball']

In [6]:
print(len(data.target))
data.target[:10]

11314


array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

Use HashingVectorizer to encode the text into sparse features:

In [7]:
vectorizer = HashingVectorizer(stop_words = 'english', 
                      n_features=2**10, 
                      binary=True, 
                      token_pattern=r'\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b')

X = vectorizer .fit_transform(data.data)

In [8]:
X

<11314x1024 sparse matrix of type '<class 'numpy.float64'>'
	with 1016839 stored elements in Compressed Sparse Row format>

In [9]:
y = data.target

Use the K-Fold cross-validation to split the dataset into training and test parts:

In [10]:
def splitter(chunks):
    return KFold(chunks, shuffle=True)

In [11]:
def test_model(model, chunks):
    for train_index, test_index in splitter(chunks).split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        y_pred_ch = model.predict(X_test)
        print('Chunk\'s accuracy score: ', accuracy_score(y_test, y_pred_ch))
    y_pred = model.predict(X)
    print('Accuracy score (on all dataset): ', accuracy_score(y, y_pred))

Experiment with different models (L1, L2, ...)

What model worked best?

In [12]:
model = LogisticRegression(penalty='l1', solver='liblinear')
test_model(model, 5)

Chunk's accuracy score:  0.6425099425541316
Chunk's accuracy score:  0.65974370304905
Chunk's accuracy score:  0.6504639858594786
Chunk's accuracy score:  0.6593018117543085
Chunk's accuracy score:  0.6445623342175066
Accuracy score (on all dataset):  0.7285663779388368


In [13]:
model = LogisticRegression(penalty='l2');
test_model(model, 5)

Chunk's accuracy score:  0.7339814405656209
Chunk's accuracy score:  0.7304463102076889
Chunk's accuracy score:  0.7247017233760495
Chunk's accuracy score:  0.7392841361025188
Chunk's accuracy score:  0.7263483642793988
Accuracy score (on all dataset):  0.8324200106063284


In [14]:
model = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.2)
test_model(model, 5)

Chunk's accuracy score:  0.722492266902342
Chunk's accuracy score:  0.7273530711444984
Chunk's accuracy score:  0.7176314626601856
Chunk's accuracy score:  0.7238179407865665
Chunk's accuracy score:  0.7298850574712644
Accuracy score (on all dataset):  0.8209298214601379
