In [1]:
# import all libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report




In [2]:
# 2. Fetch the 20 Newsgroups corpus through scikit-learn.
# subset="all" requests every document instead of a single split.
newsgroups = fetch_20newsgroups(subset="all")


In [3]:
# 3. Fetch only the training split of 20 Newsgroups (all categories).
train_data = fetch_20newsgroups(subset="train")


In [4]:
# 4. Fetch only the test split of 20 Newsgroups (all categories).
test_data = fetch_20newsgroups(subset="test")


In [5]:
#✅ 5. Show every category name carried by the full training split.
all_label_names = train_data.target_names
print("Target Labels:", all_label_names)


Target Labels: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [6]:
#✅ 6. Categories the rest of the notebook restricts itself to.
cat = [
    "alt.atheism",
    "comp.graphics",
    "sci.space",
]


In [11]:
#✅ 7. Fetch the training split again, limited to the categories listed in `cat`.
train_subset = fetch_20newsgroups(categories=cat, subset="train")


In [12]:
#✅ 8. Fetch the test split, limited to the same categories as the training subset.
test_subset = fetch_20newsgroups(categories=cat, subset="test")


In [13]:
#✅ 9. Show the label names attached to the reduced training set.
subset_label_names = train_subset.target_names
print("Training set target names:", subset_label_names)


Training set target names: ['alt.atheism', 'comp.graphics', 'sci.space']


In [14]:
#✅ 10. Display the raw text of the fifth training article (index 4).
fifth_article = train_subset.data[4]
print("5th Article Data:\n", fifth_article)


5th Article Data:
 From: henry@zoo.toronto.edu (Henry Spencer)
Subject: Re: TRUE "GLOBE", Who makes it?
Organization: U of Toronto Zoology
Lines: 12

In article <bill.047m@xpresso.UUCP> bill@xpresso.UUCP (Bill Vance) writes:
>It has been known for quite a while that the earth is actually more pear
>shaped than globular/spherical.  Does anyone make a "globe" that is accurate
>as to actual shape, landmass configuration/Long/Lat lines etc.?

I don't think you're going to be able to see the differences from a sphere
unless they are greatly exaggerated.  Even the equatorial bulge is only
about 1 part in 300 -- you'd never notice a 1mm error in a 30cm globe --
and the other deviations from spherical shape are much smaller.
-- 
SVR4 resembles a high-speed collision   | Henry Spencer @ U of Toronto Zoology
between SVR3 and SunOS.    - Dick Dunn  |  henry@zoo.toronto.edu  utzoo!henry



In [15]:
#✅ 11. Print shape of data and targets
#
# `.data` is indexed above as a sequence of article strings, so its "shape" is
# reported as a one-element tuple built from len(). Calling np.shape() on it
# would first copy the whole corpus into a NumPy array just to read its length.
print("Training Set Shape:", (len(train_subset.data),))
print("Test Set Shape:", (len(test_subset.data),))

# The heading also asks for the targets, which the original cell never printed.
# np.shape() reads an existing `.shape` attribute when the object has one.
print("Training Targets Shape:", np.shape(train_subset.target))
print("Test Targets Shape:", np.shape(test_subset.target))


Training Set Shape: (1657,)
Test Set Shape: (1102,)


In [16]:
#✅ 12. Show the file paths recorded for the first five training articles.
first_five_paths = train_subset.filenames[:5]
print("Training set filenames:", first_five_paths)


Training set filenames: ['/home/user/scikit_learn_data/20news_home/20news-bydate-train/sci.space/60869'
 '/home/user/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38633'
 '/home/user/scikit_learn_data/20news_home/20news-bydate-train/alt.atheism/53534'
 '/home/user/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38516'
 '/home/user/scikit_learn_data/20news_home/20news-bydate-train/sci.space/61210']


In [28]:
#✅ 13. Convert the training articles to numeric features with CountVectorizer.
# fit_transform both fits the vectorizer on the training text and encodes that text.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(raw_documents=train_subset.data)



In [20]:
#✅ 14. Train a Bernoulli Naive Bayes classifier on the vectorized training articles.
model = BernoulliNB()
# Kept as the cell's final expression so the notebook displays the fitted estimator.
model.fit(X=X_train, y=train_subset.target)


In [21]:
#✅ 15. Encode the test articles with the vectorizer fitted on the training set.
# Only transform() is called here, so the vectorizer is not refitted on test text.
X_test = vectorizer.transform(raw_documents=test_subset.data)


In [22]:
#✅ 16. Predict a target label for every test article.
y_pred = model.predict(X=X_test)


In [23]:
#✅ 17. Score the BernoulliNB predictions against the true test labels.
accuracy = accuracy_score(y_true=test_subset.target, y_pred=y_pred)
print("Test Set Accuracy:", accuracy)


Test Set Accuracy: 0.852994555353902


In [24]:
#✅ 18. Swap in TfidfVectorizer for the features and MultinomialNB for the classifier.

# Fit the TF-IDF vectorizer on the training text, then reuse it unchanged for the test text.
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(raw_documents=train_subset.data)
X_test_tfidf = tfidf_vect.transform(raw_documents=test_subset.data)

# fit() returns the estimator itself, so construction and training can be chained.
mnb = MultinomialNB().fit(X_train_tfidf, train_subset.target)

# Predict on the test features and compare with the true test labels.
y_pred_tfidf = mnb.predict(X=X_test_tfidf)
accuracy_tfidf = accuracy_score(y_true=test_subset.target, y_pred=y_pred_tfidf)

print("TFIDF Test Set Accuracy:", accuracy_tfidf)


TFIDF Test Set Accuracy: 0.9473684210526315


In [25]:
#✅ 19. Report the test-set accuracy of both classifiers, one line each.
for accuracy_label, accuracy_value in (
    ("BernoulliNB Accuracy:", accuracy),
    ("MultinomialNB with TFIDF Accuracy:", accuracy_tfidf),
):
    print(accuracy_label, accuracy_value)


BernoulliNB Accuracy: 0.852994555353902
MultinomialNB with TFIDF Accuracy: 0.9473684210526315


In [26]:
#✅ 20. Repeat the CountVectorizer + BernoulliNB run with English stop words excluded.

# Count features built with stop_words='english'; fitted on train, reused for test.
vectorizer_stopwords = CountVectorizer(stop_words='english')
X_train_sw = vectorizer_stopwords.fit_transform(raw_documents=train_subset.data)
X_test_sw = vectorizer_stopwords.transform(raw_documents=test_subset.data)

# fit() returns the estimator itself, so construction and training can be chained.
bnb_sw = BernoulliNB().fit(X_train_sw, train_subset.target)

# Predict on the test features and score against the true test labels.
y_pred_sw = bnb_sw.predict(X=X_test_sw)
accuracy_sw = accuracy_score(y_true=test_subset.target, y_pred=y_pred_sw)
print("Accuracy with stopwords removed:", accuracy_sw)



Accuracy with stopwords removed: 0.8856624319419237
