In [35]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

In [36]:
newsgroups = fetch_20newsgroups(subset='all',
                                remove = ('headers', 'footers', 'quotes'))


In [37]:
newsgroups.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [38]:
print(newsgroups['DESCR'])

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

Classes                     20
Samples total            18846
Dimensionality               1
Features                  text

In [39]:
print(newsgroups['target_names'])

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


we construct the DataFrame

In [40]:
df = pd.DataFrame({'text': newsgroups.data, 
                   'target': newsgroups.target})
df.head()

Unnamed: 0,text,target
0,\n\nI am sure some bashers of Pens fans are pr...,10
1,My brother is in the market for a high-perform...,3
2,\n\n\n\n\tFinally you said what you dream abou...,17
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3
4,1) I have an old Jasmine drive which I cann...,4


In [41]:
df['target'].value_counts()

target
10    999
15    997
8     996
9     994
11    991
7     990
13    990
5     988
14    987
2     985
12    984
3     982
6     975
1     973
4     963
17    940
16    910
0     799
18    775
19    628
Name: count, dtype: int64

In [42]:
print(df['text'].values[0]) 



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!




Our goal is classificate this text.
Import libraries

In [43]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

In [44]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dianaterraza/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dianaterraza/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dianaterraza/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/dianaterraza/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### 1. Bring everything in lowercase 

Lowercasing is normalization

In [45]:
df['text_processed'] = df['text'].apply(lambda doc: doc.lower())
df['text_processed'].head()

0    \n\ni am sure some bashers of pens fans are pr...
1    my brother is in the market for a high-perform...
2    \n\n\n\n\tfinally you said what you dream abou...
3    \nthink!\n\nit's the scsi card doing the dma t...
4    1)    i have an old jasmine drive which i cann...
Name: text_processed, dtype: object

### Removing all the numbers 

In [46]:
df['text_processed'] = df['text_processed'].apply(lambda doc: re.sub(r'\d+','',doc))
df['text_processed'].head()

0    \n\ni am sure some bashers of pens fans are pr...
1    my brother is in the market for a high-perform...
2    \n\n\n\n\tfinally you said what you dream abou...
3    \nthink!\n\nit's the scsi card doing the dma t...
4    )    i have an old jasmine drive which i canno...
Name: text_processed, dtype: object

### And then tokenization: 

In [47]:
df['text_processed'] = df['text_processed'].apply(lambda doc: doc.split())
df['text_processed'].head()

0    [i, am, sure, some, bashers, of, pens, fans, a...
1    [my, brother, is, in, the, market, for, a, hig...
2    [finally, you, said, what, you, dream, about.,...
3    [think!, it's, the, scsi, card, doing, the, dm...
4    [), i, have, an, old, jasmine, drive, which, i...
Name: text_processed, dtype: object

### Removing puntuation 

tokens is the full list, for every word in the list im removing the punctuation

In [48]:
df['text_processed'] = df['text_processed'].apply(
    lambda tokens: [w.translate(str.maketrans('','',string.punctuation)) for w in tokens]) 

df['text_processed'].head()

0    [i, am, sure, some, bashers, of, pens, fans, a...
1    [my, brother, is, in, the, market, for, a, hig...
2    [finally, you, said, what, you, dream, about, ...
3    [think, its, the, scsi, card, doing, the, dma,...
4    [, i, have, an, old, jasmine, drive, which, i,...
Name: text_processed, dtype: object

### Removing non alphabetic or short words

In [49]:
df['text_processed'] = df['text_processed'].apply(
    lambda tokens: [word for word in tokens if word.isalpha() and len(word) > 1])

df['text_processed'].head()

0    [am, sure, some, bashers, of, pens, fans, are,...
1    [my, brother, is, in, the, market, for, highpe...
2    [finally, you, said, what, you, dream, about, ...
3    [think, its, the, scsi, card, doing, the, dma,...
4    [have, an, old, jasmine, drive, which, cannot,...
Name: text_processed, dtype: object

### Removing the stopwords : unique of set 

In [50]:
stop_words = set(stopwords.words('english'))
df['text_processed'] = df['text_processed'].apply(
    lambda tokens: [w for w in tokens if w not in stop_words])

df['text_processed'].head()

0    [sure, bashers, pens, fans, pretty, confused, ...
1    [brother, market, highperformance, video, card...
2    [finally, said, dream, mediterranean, new, are...
3    [think, scsi, card, dma, transfers, disks, scs...
4    [old, jasmine, drive, cannot, use, new, system...
Name: text_processed, dtype: object

In [None]:
stop_words # this is the words we remove 

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [52]:
from nltk.corpus import wordnet 

Function that converts or tag nltk text in a noun, verb, adverb. 

In [63]:
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/dianaterraza/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [65]:
def get_wordnet_pos(treebank_tag):
    """
    Convert an NLTK POS tag to the corresponding WordNet POS tag.
    :param treebank_tag: The POS tag in the Treebank format from NLTK
    :return: The corresponding WordNet part of speech
    """
    if treebank_tag.startswith('J'):  # Check if the tag indicates an adjective
        return wordnet.ADJ  # Return the WordNet constant for adjectives
    elif treebank_tag.startswith('N'):  # Check if the tag indicates a noun
        return wordnet.NOUN  # Return the WordNet constant for nouns
    elif treebank_tag.startswith('V'):  # Check if the tag indicates a verb
        return wordnet.VERB  # Return the WordNet constant for verbs
    elif treebank_tag.startswith('R'):  # Check if the tag indicates an adverb
        return wordnet.ADV  # Return the WordNet constant for adverbs
    else:
        return wordnet.NOUN  # Default to returning the noun constant if no match is found

Remember tokens is a list 

In [66]:
lemmatizer = WordNetLemmatizer()

In [67]:
df['text_processed'] = df['text_processed'].apply(
    lambda tokens: [lemmatizer.lemmatize(w,
                                         pos = get_wordnet_pos(nltk.pos_tag([w])[0][1])) 
    for w in tokens]
)

df['text_processed'].head()

0    [sure, bashers, pen, fan, pretty, confuse, lac...
1    [brother, market, highperformance, video, card...
2    [finally, say, dream, mediterranean, new, area...
3    [think, scsi, card, dma, transfer, disk, scsi,...
4    [old, jasmine, drive, cannot, use, new, system...
Name: text_processed, dtype: object

And then we dont have lists, we have the text again 

In [68]:
df['text_processed']=df['text_processed'].apply(lambda tokens: ' '.join(tokens))
df['text_processed'].head()

0    sure bashers pen fan pretty confuse lack kind ...
1    brother market highperformance video card supp...
2    finally say dream mediterranean new area great...
3    think scsi card dma transfer disk scsi card dm...
4    old jasmine drive cannot use new system unders...
Name: text_processed, dtype: object

In [69]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

Split the data train and test 

In [70]:
X_train, X_test, y_train, y_test = train_test_split(df['text_processed'], 
                                                    df['target'], 
                                                    test_size=0.2,
                                                    random_state=42)

In [71]:
X_train.shape

(15076,)

In [72]:
X_test.shape

(3770,)

And then vectorize to do our model (only numbers)

In [73]:
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

In [74]:
X_train_counts.shape

(15076, 91200)

In [75]:
X_test_counts.shape 

(3770, 91200)

91200 is the number of words, of tokens. 

### Transform the data into DF - IFD

In [106]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [108]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [109]:
X_train_tfidf.shape

(15076, 91200)

The shape didnt change, tf - tdf is normalizer

In [110]:
vectorizer.get_feature_names_out()

array(['aa', 'aaa', 'aaaaa', ..., 'zzzzzzt', 'µsec', 'úz'],
      shape=(91200,), dtype=object)

In [111]:
vectorizer.vocabulary_

{'ive': 35150,
 'gotten': 27967,
 'post': 63951,
 'group': 28537,
 'last': 38600,
 'couple': 14664,
 'day': 16372,
 'recently': 67863,
 'add': 723,
 'feed': 23933,
 'list': 39862,
 'near': 56366,
 'death': 16575,
 'see': 72299,
 'mail': 41507,
 'side': 73695,
 'im': 32666,
 'get': 27110,
 'right': 69634,
 'amount': 2491,
 'traffic': 81266,
 'patrick': 61497,
 'mahan': 41454,
 'tgv': 79613,
 'window': 87306,
 'washer': 86264,
 'mahantgvcom': 41457,
 'wake': 86073,
 'person': 62247,
 'unnecessarily': 83471,
 'consider': 13787,
 'lazarus': 38758,
 'long': 40260,
 'capital': 9959,
 'crime': 15005,
 'first': 24488,
 'offense': 59029,
 'notebook': 58148,
 'interest': 34206,
 'id': 32247,
 'fight': 24275,
 'ticket': 80327,
 'there': 79816,
 'chance': 10996,
 'cop': 14253,
 'wont': 87768,
 'show': 73566,
 'secondly': 72224,
 'point': 63576,
 'lie': 39513,
 'purgered': 66289,
 'beleive': 6523,
 'yore': 90437,
 'charge': 11083,
 'go': 27677,
 'mph': 50941,
 'speed': 75577,
 'severe': 72979,
 'co

In [89]:
X_test_tfidf.toarray()

array([[0.        , 0.13957863, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], shape=(3770, 91200))

Classification problem, is supervised model because we have labels (x,y)

### We choose model naive bayes 

In [90]:
from sklearn.naive_bayes import MultinomialNB

In [91]:
from sklearn.metrics import classification_report

### Bag of words

In [92]:
model = MultinomialNB()
model.fit(X_train_counts, y_train)

y_pred = model.predict(X_test_counts)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.39      0.49       151
           1       0.53      0.74      0.61       202
           2       0.85      0.27      0.41       195
           3       0.54      0.74      0.62       183
           4       0.82      0.63      0.71       205
           5       0.72      0.79      0.75       215
           6       0.87      0.58      0.69       193
           7       0.88      0.69      0.77       196
           8       0.49      0.68      0.57       168
           9       0.94      0.77      0.85       211
          10       0.90      0.89      0.90       198
          11       0.64      0.79      0.71       201
          12       0.79      0.56      0.66       202
          13       0.84      0.81      0.83       194
          14       0.74      0.78      0.76       189
          15       0.46      0.91      0.61       202
          16       0.73      0.66      0.70       188
          17       0.62    

### TF-IDF

In [93]:
model = MultinomialNB()
model.fit(X_train_counts, y_train)

y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.43      0.52       151
           1       0.57      0.75      0.65       202
           2       0.86      0.36      0.51       195
           3       0.56      0.76      0.64       183
           4       0.84      0.68      0.75       205
           5       0.78      0.81      0.79       215
           6       0.86      0.59      0.70       193
           7       0.86      0.71      0.78       196
           8       0.49      0.72      0.59       168
           9       0.94      0.82      0.88       211
          10       0.91      0.90      0.91       198
          11       0.69      0.79      0.74       201
          12       0.78      0.61      0.69       202
          13       0.84      0.88      0.86       194
          14       0.72      0.82      0.77       189
          15       0.51      0.90      0.65       202
          16       0.75      0.73      0.74       188
          17       0.64    