## Classify newspaper articles

In [15]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.neural_network import MLPClassifier

### Function for data cleaning e bag of words

In [2]:
import string
import spacy
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
english_stopwords = stopwords.words('english')
nlp = spacy.load('en_core_web_sm')
punctuation = set(string.punctuation)

def data_cleaner(dataset):
    dataset_to_return = []
    for sentence in dataset:
        sentence = sentence.lower()
        for c in string.punctuation:
            sentence = sentence.replace(c, " ")
        document = nlp(sentence)
        sentence = ' '.join(token.lemma_ for token in document)
        sentence = ' '.join(word for word in sentence.split() if word not in english_stopwords)
        sentence = re.sub('\d', '', sentence)
        dataset_to_return.append(sentence)

    return dataset_to_return


def bow_tfidf(dataset, tfidf_vectorizer):
    if tfidf_vectorizer == None:
        tfidf_vectorizer = TfidfVectorizer()
        X = tfidf_vectorizer.fit_transform(dataset)
    else:
        X = tfidf_vectorizer.transform(dataset)
        
    return X.toarray(), tfidf_vectorizer

### Import dataset (train e test)

In [4]:
# train e test dataset
train_dataset = fetch_20newsgroups(subset='train')
test_dataset = fetch_20newsgroups(subset='test')

train_data = train_dataset['data']
test_data = test_dataset['data']

train_target = train_dataset['target']
test_target = test_dataset['target']

In [5]:
dataset_train = pd.DataFrame({'data': train_dataset['data'],
                             'target': train_dataset['target']})
dataset_train

Unnamed: 0,data,target
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14
...,...,...
11309,From: jim.zisfein@factory.com (Jim Zisfein) \n...,13
11310,From: ebodin@pearl.tufts.edu\nSubject: Screen ...,4
11311,From: westes@netcom.com (Will Estes)\nSubject:...,3
11312,From: steve@hcrlgw (Steven Collins)\nSubject: ...,1


In [6]:
dataset_test = pd.DataFrame({'data': test_dataset['data'],
                             'target': test_dataset['target']})
dataset_test

Unnamed: 0,data,target
0,From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. ...,7
1,From: Rick Miller <rick@ee.uwm.edu>\nSubject: ...,5
2,From: mathew <mathew@mantis.co.uk>\nSubject: R...,0
3,From: bakken@cs.arizona.edu (Dave Bakken)\nSub...,17
4,From: livesey@solntze.wpd.sgi.com (Jon Livesey...,19
...,...,...
7527,From: richmond@spiff.Princeton.EDU (Stupendous...,14
7528,From: smytonj@murr11.alleg.edu (Jim Smyton)\nS...,4
7529,From: hhenderson@vax.clarku.edu\nSubject: RE: ...,9
7530,From: b859zam@utarlg.uta.edu \nSubject: INTEL ...,6


In [7]:
# # train e test target
target_names = train_dataset['target_names']
target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [8]:
target_names = test_dataset['target_names']
target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

### Preprocessing

In [9]:
training_data_cleaned, tfidf_vectorizer = bow_tfidf(data_cleaner(train_data), None)

In [10]:
test_data_cleaned, tfidf_vectorizer = bow_tfidf(data_cleaner(test_data), tfidf_vectorizer)

In [11]:
training_data_cleaned

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
len(training_data_cleaned[0])

84432

In [None]:
test_data_cleaned

In [13]:
len(test_data_cleaned[0])

84432

### Model training: Multi-layer Perceptron Classifier

In [16]:
clf = MLPClassifier(activation='logistic',
                    hidden_layer_sizes=(100,),
                    max_iter=100,
                    solver='adam',
                    tol=0.005,
                    verbose=True)

clf.fit(training_data_cleaned,train_target)

Iteration 1, loss = 2.96146401
Iteration 2, loss = 2.84526560
Iteration 3, loss = 2.70603188
Iteration 4, loss = 2.50568886
Iteration 5, loss = 2.23764148
Iteration 6, loss = 1.91021857
Iteration 7, loss = 1.57206142
Iteration 8, loss = 1.26412353
Iteration 9, loss = 1.01039998
Iteration 10, loss = 0.81027177
Iteration 11, loss = 0.65603040
Iteration 12, loss = 0.53754403
Iteration 13, loss = 0.44659684
Iteration 14, loss = 0.37613076
Iteration 15, loss = 0.32093304
Iteration 16, loss = 0.27732589
Iteration 17, loss = 0.24240202
Iteration 18, loss = 0.21418043
Iteration 19, loss = 0.19122876
Iteration 20, loss = 0.17204512
Iteration 21, loss = 0.15615362
Iteration 22, loss = 0.14280151
Iteration 23, loss = 0.13150143
Iteration 24, loss = 0.12191761
Iteration 25, loss = 0.11363831
Iteration 26, loss = 0.10659497
Iteration 27, loss = 0.10039297
Iteration 28, loss = 0.09503078
Iteration 29, loss = 0.09032187
Iteration 30, loss = 0.08616931
Iteration 31, loss = 0.08251046
Iteration 32, los

MLPClassifier(activation='logistic', max_iter=100, tol=0.005, verbose=True)

#### Test del modello

In [17]:
clf.score(test_data_cleaned, test_target)

0.8570100902814658

In [18]:
target = clf.predict(bow_tfidf(data_cleaner(["This is a mac book pro!!!"]),tfidf_vectorizer)[0])[0]

In [19]:
target

4

In [20]:
test_dataset['target_names'][target]

'comp.sys.mac.hardware'

In [25]:
test_data[100]

'Subject: help\nFrom: C..Doelle@p26.f3333.n106.z1.fidonet.org (C. Doelle)\nLines: 13\n\nHello All!\n\n    It is my understanding that all True-Type fonts in Windows are loaded in\nprior to starting Windows - this makes getting into Windows quite slow if you\nhave hundreds of them as I do.  First off, am I correct in this thinking -\nsecondly, if that is the case - can you get Windows to ignore them on boot and\nmaybe make something like a PIF file to load them only when you enter the\napplications that need fonts?  Any ideas?\n\n\nChris\n\n * Origin: chris.doelle.@f3333.n106.z1.fidonet.org (1:106/3333.26)\n'

In [26]:
test_dataset['target_names'][clf.predict([test_data_cleaned[100]])[0]]

'comp.os.ms-windows.misc'

In [27]:
test_dataset['target_names'][test_target[100]]

'comp.os.ms-windows.misc'