# **Assuming a set of documents that need to be classified, use the naïve Bayesian Classifier model to perform this task. Built-in Java classes/API can be used to write the program. Calculate the accuracy, precision, and recall for your data set.** #


In [3]:
# Import necessary libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
import numpy as np

# Naive Bayes text classification on the 20 newsgroups corpus.
#
# The script first walks through feature extraction and training by hand
# (steps 2-4), then builds an equivalent Pipeline (step 5). All predictions
# and metrics below are produced by the pipeline (`text_clf`); the standalone
# `clf` from step 4 is not used for prediction in this cell.

# 1. Load the 20 newsgroups dataset for training
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
# Report the number of documents. len() of the returned container itself
# counts its fields rather than the documents, so measure `.data` instead.
print("Length of the training dataset: ", len(twenty_train.data))

# 2. Extract features from text using CountVectorizer (word counts)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
print('Dimensions of word counts: ', X_train_counts.shape)

# 3. Transform word counts into TF-IDF (Term Frequency-Inverse Document Frequency)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print('Dimensions of TF-IDF matrix: ', X_train_tfidf.shape)

# 4. Train a Naive Bayes classifier on the manually built TF-IDF features
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

# 5. Build a pipeline that chains the same three stages, so raw text can be
#    passed straight to fit()/predict() without repeating steps 2-3 by hand
text_clf = Pipeline([
    ('vect', CountVectorizer()),           # Step 1: CountVectorizer
    ('tfidf', TfidfTransformer()),         # Step 2: TfidfTransformer
    ('clf', MultinomialNB())               # Step 3: Naive Bayes Classifier
])

# Train the pipeline on the raw training documents (fit() updates it in place)
text_clf.fit(twenty_train.data, twenty_train.target)

# 6. Load the 20 newsgroups dataset for testing
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)

# 7. Make predictions on the test data
predicted = text_clf.predict(twenty_test.data)

# 8. Calculate the prediction accuracy as the fraction of matching labels
accuracy = np.mean(predicted == twenty_test.target)
print("Predicted Accuracy: {:.2f}%".format(accuracy * 100))

# 9. Performance metrics: Accuracy, Precision, Recall
#    average=None yields one precision/recall value per class instead of a
#    single aggregated score.
print("Accuracy: ", metrics.accuracy_score(twenty_test.target, predicted))
print("Precision: ", metrics.precision_score(twenty_test.target, predicted, average=None))
print("Recall: ", metrics.recall_score(twenty_test.target, predicted, average=None))

# 10. Full classification report (per-class precision, recall, F1 and support)
print(metrics.classification_report(twenty_test.target, predicted, target_names=twenty_test.target_names))


Length of the training dataset:  5
Dimensions of word counts:  (11314, 130107)
Dimensions of TF-IDF matrix:  (11314, 130107)
Predicted Accuracy: 77.39%
Accuracy:  0.7738980350504514
Precision:  [0.80193237 0.81028939 0.81904762 0.67180617 0.85632184 0.88955224
 0.93127148 0.84651163 0.93686869 0.92248062 0.89170507 0.59379845
 0.83629893 0.92113565 0.84172662 0.43896976 0.64339623 0.92972973
 0.95555556 0.97222222]
Recall:  [0.52037618 0.64781491 0.65482234 0.77806122 0.77402597 0.75443038
 0.69487179 0.91919192 0.9321608  0.89924433 0.96992481 0.96717172
 0.59796438 0.73737374 0.89086294 0.98492462 0.93681319 0.91489362
 0.41612903 0.13944223]
                          precision    recall  f1-score   support

             alt.atheism       0.80      0.52      0.63       319
           comp.graphics       0.81      0.65      0.72       389
 comp.os.ms-windows.misc       0.82      0.65      0.73       394
comp.sys.ibm.pc.hardware       0.67      0.78      0.72       392
   comp.sys.mac.