In [4]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Loadingthe twenty user groups dataset
twenty_users = fetch_20newsgroups()

# Creating a CountVectorizer with default parameters
count_vectorizer = CountVectorizer()

# Transforming the training data using CountVectorizer
X_train_counts = count_vectorizer.fit_transform(twenty_users.data)

# Getting the total number of documents
total_documents = X_train_counts.shape[0]

# Calculating the document frequency of each word
word_freq = np.asarray(X_train_counts.sum(axis=0)).squeeze()
document_freq = word_freq / total_documents

# Finding words appearing in less than 2.5% or more than 97.5% of documents
stopwords_25 = [word for word, df in zip(count_vectorizer.get_feature_names_out(), document_freq) if df < 0.025]
stopwords_975 = [word for word, df in zip(count_vectorizer.get_feature_names_out(), document_freq) if df > 0.975]

# Adding the stopwords to the CountVectorizer
count_vectorizer.stop_words_.update(stopwords_25)
count_vectorizer.stop_words_.update(stopwords_975)

# Transforming the testing data using CountVectorizer
X_test_counts = count_vectorizer.transform(twenty_users.data)

# Training a Multinomial Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_counts, twenty_users.target)

# Prediction on the training and testing data
train_predictions = classifier.predict(X_train_counts)
test_predictions = classifier.predict(X_test_counts)

# Calculating accuracy on the training and testing data
train_accuracy = accuracy_score(twenty_users.target, train_predictions)
test_accuracy = accuracy_score(twenty_users.target, test_predictions)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)


Training Accuracy: 0.9245182959165635
Testing Accuracy: 0.9245182959165635


    What does this do to the accuracy of the classifier on the training set? On the testing set?