In [111]:
from sklearn.datasets import fetch_20newsgroups # dataset de texto para classificação contendo 20 classes
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer # carrega Vectorizer e TFIDF
from sklearn.naive_bayes import MultinomialNB # algoritmo do Naive Bayes
from sklearn.pipeline import Pipeline # Cria pipeline contendo todas as transformações e modelo
from nltk.stem.snowball import SnowballStemmer # Função que retorna a palavra a sua raiz
import numpy as np
from sklearn.linear_model import SGDClassifier # Algoritmo Gradient Descendente Stocastico
from sklearn.model_selection import GridSearchCV
import nltk 
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
import warnings
import matplotlib.pyplot as plt 

warnings.simplefilter('ignore')
#nltk.download()

In [112]:
newsgroups = fetch_20newsgroups(subset='train') # Carrega o dataset de treinamento do fetch 20

In [113]:
list(newsgroups.target_names) # retorna as classes disponíveis para treinamento

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [None]:
categories = [
    'comp.graphics', 
    'comp.os.ms-windows.misc', 
    'comp.sys.ibm.pc.hardware', 
    'comp.sys.mac.hardware', 
    'comp.windows.x', 

    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
    'soc.religion.christian'
]

In [117]:
# Vai carregar as 10 classes para treinamento.

df_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
df_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True)

#### **Feature Enginnering** 

In [None]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df_train.data)
X_train_counts.shape

(5487, 94318)

In [None]:
# TF-IDF 
tfidf_transformer = TfidfTransformer() 
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Abordagem I

### Instancia o algoritmo Naive Bayes para treinamento

In [None]:
clf = MultinomialNB()
clf.fit(X_train_tfidf, df_train.target)

### Pipeline

In [None]:
clf_1 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())])

#### Realiza o treinamento do modelo no pipeline

In [None]:
clf_trained = clf_1.fit(df_train.data, df_train.target)

#### **Tuning de parametros**

#### Modelo usando **Naive bayes** com Grid Search

In [None]:
parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}

In [None]:
gs_clf = GridSearchCV(clf_trained, parameters, n_jobs=-1)
# treinamento do modelo 
gs_clf = gs_clf.fit(df_train.data, df_train.target)

In [None]:
print(gs_clf.best_score_)
gs_clf.best_params_

0.9205410350799414


{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

#### Faz a predição dos dados de teste

In [None]:
pred = clf_trained.predict(df_test.data)

In [None]:
acc = np.mean(pred == df_test.target)
print('>>>> Acurácia: ', acc)

>>>> Acurácia:  0.7567049808429118


In [None]:
creport = classification_report(df_test.target, pred, target_names=df_test.target_names)
print(creport)

                          precision    recall  f1-score   support

           comp.graphics       0.83      0.72      0.77       389
 comp.os.ms-windows.misc       0.81      0.67      0.73       394
comp.sys.ibm.pc.hardware       0.74      0.85      0.79       392
   comp.sys.mac.hardware       0.88      0.83      0.85       385
          comp.windows.x       0.89      0.79      0.84       395
  soc.religion.christian       0.53      0.99      0.69       398
      talk.politics.guns       0.64      0.96      0.77       364
   talk.politics.mideast       0.95      0.92      0.93       376
      talk.politics.misc       0.97      0.42      0.59       310
      talk.religion.misc       1.00      0.15      0.26       251

                accuracy                           0.76      3654
               macro avg       0.82      0.73      0.72      3654
            weighted avg       0.81      0.76      0.74      3654



# Abordagem II

In [None]:
clf_2 = Pipeline([
    ('vect', CountVectorizer()), 
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=25, random_state=42))])

In [118]:
# Treinando o modelo na abordagem dois
svm_trained = clf_2.fit(df_train.data, df_train.target)

In [None]:
pred = svm_trained.predict(df_test.data)

### Acurácia

In [None]:
acc = np.mean(pred == df_test.target)
print(f'Acurácia: {acc}')

Acurácia: 0.8070607553366174


#### **Tuning de parametros**

#### Modelo usando **SGD** com Grid Search

In [None]:
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False),'clf-svm__alpha': (1e-2, 1e-3)}

In [106]:
gs_clf_svm = GridSearchCV(svm_trained, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(df_train.data, df_train.target)

In [107]:
print(gs_clf_svm.best_score_)
gs_clf_svm.best_params_

0.89921876686376


{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [108]:
pred = gs_clf_svm.predict(df_test.data)

In [109]:
acc = np.mean(pred == df_test.target)
print(f'Acurácia: {acc}')

Acurácia: 0.8147235905856596


In [110]:
creport = classification_report(df_test.target, pred, target_names=df_test.target_names)
print(creport)

                          precision    recall  f1-score   support

           comp.graphics       0.80      0.77      0.78       389
 comp.os.ms-windows.misc       0.76      0.79      0.77       394
comp.sys.ibm.pc.hardware       0.81      0.78      0.80       392
   comp.sys.mac.hardware       0.86      0.86      0.86       385
          comp.windows.x       0.85      0.79      0.82       395
  soc.religion.christian       0.79      0.97      0.87       398
      talk.politics.guns       0.70      0.97      0.81       364
   talk.politics.mideast       0.93      0.95      0.94       376
      talk.politics.misc       0.85      0.61      0.71       310
      talk.religion.misc       0.90      0.55      0.68       251

                accuracy                           0.81      3654
               macro avg       0.83      0.80      0.80      3654
            weighted avg       0.82      0.81      0.81      3654

