In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#np.set_printoptions(suppress=True)

from sklearn.datasets import fetch_20newsgroups

# Work with three of the 20 Newsgroups categories only.
categories = ['rec.motorcycles', 'comp.graphics', 'sci.space']

# Fetch the train split first, then the test split, with identical settings.
data_train, data_test = (
    fetch_20newsgroups(subset=subset_name, categories=categories, shuffle=True)
    for subset_name in ('train', 'test')
)

separator = "*" * 100

# Corpus sizes for both splits.
print(separator)
print("Number of documents in Train: ", len(data_train.data))
print("Number of documents in Test: ", len(data_test.data))

# Preview the second raw document (index 1) of each split, train first.
for bunch in (data_train, data_test):
    print(separator)
    print(bunch.data[1])

****************************************************************************************************
Number of documents in Train:  1775
Number of documents in Test:  1181
****************************************************************************************************
From: asimov@wk223.nas.nasa.gov (Daniel A. Asimov)
Subject: Re: Sunrise/ sunset times
Organization: NAS, NASA Ames Research Center, Moffett Field, California
Lines: 19

In article <1993Apr21.141824.23536@cbis.ece.drexel.edu> jpw@cbis.ece.drexel.edu (Joseph Wetstein) writes:
>
>Hello. I am looking for a program (or algorithm) that can be used
>to compute sunrise and sunset times.
>
>Joe Wetstein

There is a wonderful book by Jean Meeus called
"Astronomical Algorithms," (1991) which I am fairly sure
contains an algorithm for sunrise and sunset times.


Dan Asimov
Mail Stop T045-1
NASA Ames Research Center
Moffett Field, CA 94035-1000

asimov@nas.nasa.gov
(415) 604-4799

**************************************************

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# The hashing vectorizer is used through transform() only; no fit step is run.
# alternate_sign=False is passed so hashed values are not sign-flipped
# (the printed row below shows only positive weights).
vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False)

# Vectorize both splits with the same vectorizer instance (train first).
x_train, x_test = (
    vectorizer.transform(bunch.data) for bunch in (data_train, data_test)
)

# Class labels as provided by the dataset loader.
y_train, y_test = data_train.target, data_test.target

# Inspect the sparse representation of the first training document.
print(x_train[0])

  (0, 27819)	0.09805806756909202
  (0, 29926)	0.09805806756909202
  (0, 30686)	0.09805806756909202
  (0, 31025)	0.09805806756909202
  (0, 42971)	0.09805806756909202
  (0, 51071)	0.09805806756909202
  (0, 61110)	0.09805806756909202
  (0, 83771)	0.09805806756909202
  (0, 91615)	0.09805806756909202
  (0, 104938)	0.09805806756909202
  (0, 113308)	0.09805806756909202
  (0, 114899)	0.09805806756909202
  (0, 125641)	0.09805806756909202
  (0, 146963)	0.09805806756909202
  (0, 151854)	0.09805806756909202
  (0, 179078)	0.09805806756909202
  (0, 182920)	0.09805806756909202
  (0, 186328)	0.09805806756909202
  (0, 193788)	0.09805806756909202
  (0, 202285)	0.09805806756909202
  (0, 224706)	0.19611613513818404
  (0, 231399)	0.09805806756909202
  (0, 231969)	0.09805806756909202
  (0, 236961)	0.09805806756909202
  (0, 249513)	0.09805806756909202
  :	:
  (0, 684195)	0.09805806756909202
  (0, 697057)	0.09805806756909202
  (0, 720286)	0.09805806756909202
  (0, 752539)	0.09805806756909202
  (0, 773314)	0.2

In [3]:
from sklearn.naive_bayes import MultinomialNB

# Build the Multinomial Naive Bayes model with default settings and train it
# on the hashed training features; fit() returns the estimator itself.
nb_classifier = MultinomialNB().fit(x_train, y_train)

# Bare last expression so the cell displays the fitted estimator's repr.
nb_classifier

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [4]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Evaluate the fitted classifier on the training and test splits.
#
# Predictions are computed once per split and reused for every metric below;
# the original cell called nb_classifier.predict() three times on each matrix.
y_train_pred = nb_classifier.predict(x_train)
y_test_pred = nb_classifier.predict(x_test)

rule = "--" * 50

print(rule)
print("Accuracy Score for Training Data")
print(accuracy_score(y_train, y_train_pred))

print(rule)

print("Classification report for Training Data")
print(classification_report(y_train, y_train_pred))

print(rule)

print("Accuracy Score for Testing Data")
print(accuracy_score(y_test, y_test_pred))

print(rule)

print("Classification report for Testing Data")
print(classification_report(y_test, y_test_pred))

print(rule)

# confusion_matrix is called as (true labels, predicted labels).
print("Train Confusion Matrix")
print(confusion_matrix(y_train, y_train_pred))

print(rule)

print("\nTest Confusion Matrix")
print(confusion_matrix(y_test, y_test_pred))

----------------------------------------------------------------------------------------------------
Accuracy Score for Training Data
0.9932394366197184
----------------------------------------------------------------------------------------------------
Classification report for Training Data
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       584
           1       0.99      1.00      1.00       598
           2       0.99      1.00      0.99       593

   micro avg       0.99      0.99      0.99      1775
   macro avg       0.99      0.99      0.99      1775
weighted avg       0.99      0.99      0.99      1775

----------------------------------------------------------------------------------------------------
Accuracy Score for Testing Data
0.9576629974597799
----------------------------------------------------------------------------------------------------
Classification report for Testing Data
              precision    recall