# Multilayer Classification on Text data

In [1]:
import pandas as pd 
import numpy as np

import sklearn

In [2]:
from sklearn.datasets import fetch_20newsgroups


## Exploering our data

In [3]:
new_groups= fetch_20newsgroups()

In [4]:
new_groups.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [7]:
print(new_groups.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

Classes                     20
Samples total            18846
Dimensionality               1
Features                  text

In [8]:
new_groups.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [None]:
print(new_groups.data[4])

From: guykuo@carson.u.washington.edu (Guy Kuo)
Subject: SI Clock Poll - Final Call
Summary: Final call for SI clock reports
Keywords: SI,acceleration,clock,upgrade
Article-I.D.: shelley.1qvfo9INNc3s
Organization: University of Washington
Lines: 11
NNTP-Posting-Host: carson.u.washington.edu

A fair number of brave souls who upgraded their SI clock oscillator have
shared their experiences for this poll. Please send a brief message detailing
your experiences with the procedure. Top speed attained, CPU rated speed,
add on cards and adapters, heat sinks, hour of usage per day, floppy disk
functionality with 800 and 1.4 m floppies are especially requested.

I will be summarizing in the next two days, so please add to the network
knowledge base if you have done the clock upgrade and haven't answered this
poll. Thanks.

Guy Kuo <guykuo@u.washington.edu>



In [None]:
print(new_groups.target[4])  # sci.space


14


In [12]:
np.unique(new_groups.target)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [14]:
len(new_groups.data)   , len(new_groups.target)

(11314, 11314)

## Create Feature Vectors from Text Using TF-IDF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
tfidf_vect= TfidfVectorizer(stop_words='english')

new_groups_transformed= tfidf_vect.fit_transform(new_groups.data)

In [17]:
new_groups_transformed.shape

(11314, 129796)

In [18]:
len(tfidf_vect.get_feature_names_out())

129796

In [22]:
print(new_groups_transformed[0])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 55 stored elements and shape (1, 129796)>
  Coords	Values
  (0, 75215)	0.38538985156422345
  (0, 122887)	0.282869751755441
  (0, 118013)	0.23076236589534987
  (0, 50455)	0.05948476266845307
  (0, 114439)	0.06768238878777005
  (0, 111094)	0.020865105019220037
  (0, 37722)	0.41534653529092685
  (0, 87451)	0.03885306291479392
  (0, 94962)	0.03754552571724598
  (0, 63970)	0.03857974543636419
  (0, 98748)	0.17501596694257227
  (0, 90192)	0.021706106200820422
  (0, 118714)	0.04039328791909072
  (0, 79519)	0.11911704310036365
  (0, 40939)	0.08497090499024601
  (0, 91885)	0.10797335594250271
  (0, 75888)	0.020933445618156278
  (0, 4605)	0.06897342558445459
  (0, 124627)	0.0967471326603278
  (0, 51714)	0.1460907895102532
  (0, 104609)	0.09217540920934718
  (0, 45232)	0.07212208178051426
  (0, 48550)	0.10908149802523068
  (0, 109354)	0.1177321203161709
  (0, 76574)	0.09842306773884468
  :	:
  (0, 34943)	0.18203649549572576
  (0, 48552

In [23]:
from sklearn.model_selection import train_test_split


In [24]:
x_train, x_test, y_train, y_test= train_test_split(new_groups_transformed,
                                                   new_groups.target,
                                                   shuffle=True,
                                                   test_size=0.2)

In [25]:
x_train.shape , y_train.shape

((9051, 129796), (9051,))

In [26]:
x_test.shape, y_test.shape

((2263, 129796), (2263,))

## Build A Prototype

In [None]:
from sklearn.neural_network import MLPClassifier

mlp_clf= MLPClassifier(
        activation='relu',
        hidden_layer_sizes=(32,),
        solver='adam',
        max_iter=50,
        verbose=0
        )

In [28]:
mlp_clf.fit(x_train,y_train)



## Let's Evaluate our model

In [29]:
y_pred= mlp_clf.predict(x_test)

In [31]:
pred_results= pd.DataFrame({
    'y_test': y_test,
    'y_pred':y_pred
})
pred_results.sample(10)

Unnamed: 0,y_test,y_pred
680,4,4
1686,5,5
860,10,10
483,13,13
1031,6,6
1404,12,12
799,13,13
1811,1,1
1453,15,15
1901,0,0


In [32]:
from sklearn.metrics import accuracy_score


accuracy_score(y_test,y_pred)

0.9306230667255855

## Let's try another one 

we'll add more hidden layers , and let the model train more

> i just think this will take a 5m

In [33]:
mlp_clf_2= MLPClassifier(
        activation='relu',
        hidden_layer_sizes=(32,32),
        solver='adam',
        max_iter=100,
        verbose=0
        ).fit(x_train,y_train)

In [35]:
y_pred_2= mlp_clf_2.predict(x_test)

In [48]:
result=pd.DataFrame({
    'test': y_test,
    "predicted" : y_pred_2
})
result.head()

Unnamed: 0,test,predicted
0,5,5
1,5,5
2,4,4
3,10,10
4,16,16


In [43]:
accuracy_score(y_test,y_pred_2)

0.9111798497569598