In [2]:
import sklearn

import numpy as np
import pandas as pd

In [3]:
from sklearn.datasets import fetch_20newsgroups

In [4]:
newsgroups_data = fetch_20newsgroups()

In [5]:
newsgroups_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [6]:
print(newsgroups_data.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features       

In [7]:
newsgroups_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [8]:
np.unique(newsgroups_data.target)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [9]:
len(newsgroups_data.data), len(newsgroups_data.target)

(11314, 11314)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
tfidf_vect = TfidfVectorizer(stop_words='english')

newsgroups_data_transformed = tfidf_vect.fit_transform(newsgroups_data.data)

In [13]:
newsgroups_data_transformed.shape

(11314, 129796)

In [14]:
len(tfidf_vect.get_feature_names())

129796

In [15]:
import random

random.sample(tfidf_vect.vocabulary_.items(), 10)

[('cashed', 37972),
 ('w91', 122628),
 ('6emasri', 17559),
 ('egr', 50657),
 ('keyring', 72225),
 ('worldlink', 124754),
 ('mprc', 83957),
 ('115256', 3227),
 ('cg3ac_enblen', 38758),
 ('aktueel', 27176)]

In [16]:
print(newsgroups_data_transformed[0])

  (0, 86416)	0.14330464297977982
  (0, 35135)	0.10188109676312235
  (0, 65968)	0.10658183340971177
  (0, 114195)	0.06002582888934523
  (0, 78809)	0.06524029473980168
  (0, 76578)	0.0752490171119318
  (0, 57203)	0.16977226500364592
  (0, 67023)	0.07965653370342658
  (0, 63238)	0.09086750717799585
  (0, 95944)	0.11792442679286105
  (0, 127721)	0.0660283455431985
  (0, 109044)	0.11811852219269026
  (0, 51651)	0.10581100308545811
  (0, 83103)	0.09633120317294654
  (0, 113755)	0.1926949257821117
  (0, 73061)	0.04662587301170703
  (0, 34131)	0.09493746671845804
  (0, 101175)	0.08899924936054199
  (0, 105907)	0.10749912859686628
  (0, 35560)	0.1446512460011004
  (0, 26070)	0.10385185139503332
  (0, 108033)	0.08197182211166716
  (0, 99619)	0.06171903092868097
  (0, 48552)	0.1263844988551673
  (0, 34943)	0.18203649549572573
  :	:
  (0, 76574)	0.09842306773884467
  (0, 109354)	0.11773212031617089
  (0, 48550)	0.10908149802523066
  (0, 45232)	0.07212208178051426
  (0, 104609)	0.09217540920934716


In [17]:
from sklearn.model_selection import train_test_split

In [18]:
x_train, x_test, y_train, y_test = train_test_split(newsgroups_data_transformed,
                                                   newsgroups_data.target,
                                                   shuffle=True,
                                                   test_size=0.2)

In [19]:
x_train.shape, y_train.shape

((9051, 129796), (9051,))

In [20]:
from sklearn.neural_network import MLPClassifier

In [23]:
# adam optimizer used when the data set is large
mlp_clf = MLPClassifier(activation = 'relu',
                       hidden_layer_sizes = (32,),
                       solver='adam',
                       verbose=True,
                       max_iter=50)

In [24]:
mlp_clf.fit(x_train, y_train)

Iteration 1, loss = 2.90248604
Iteration 2, loss = 2.51863079
Iteration 3, loss = 2.00548773
Iteration 4, loss = 1.46579503
Iteration 5, loss = 1.01125629
Iteration 6, loss = 0.68815903
Iteration 7, loss = 0.47696704
Iteration 8, loss = 0.34108540
Iteration 9, loss = 0.25181707
Iteration 10, loss = 0.19157795
Iteration 11, loss = 0.14948560
Iteration 12, loss = 0.11946847
Iteration 13, loss = 0.09754985
Iteration 14, loss = 0.08090300
Iteration 15, loss = 0.06827859
Iteration 16, loss = 0.05846600
Iteration 17, loss = 0.05074945
Iteration 18, loss = 0.04453936
Iteration 19, loss = 0.03954318
Iteration 20, loss = 0.03545898
Iteration 21, loss = 0.03205534
Iteration 22, loss = 0.02916175
Iteration 23, loss = 0.02678203
Iteration 24, loss = 0.02468484
Iteration 25, loss = 0.02293187
Iteration 26, loss = 0.02143281
Iteration 27, loss = 0.02008764
Iteration 28, loss = 0.01886820
Iteration 29, loss = 0.01790081
Iteration 30, loss = 0.01701363
Iteration 31, loss = 0.01610783
Iteration 32, los



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(32,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=50, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=True, warm_start=False)

In [26]:
y_pred = mlp_clf.predict(x_test)

In [27]:
pred_results = pd.DataFrame({'y_test':y_test, 'y_pred':y_pred})

In [28]:
pred_results.sample(10)

Unnamed: 0,y_test,y_pred
1107,3,3
807,2,2
42,0,15
101,6,8
1896,13,13
1449,16,16
1976,18,18
747,2,2
1692,19,19
2129,14,14


In [29]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.9244365885992046