In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
import nltk

In [2]:

# Download requirement
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
newsgroups = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)
X, y = newsgroups.data, newsgroups.target

# stop words and stemmer
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

# Preprocess text: tokenization, stopword removal, stemming
def preprocess(text):
    tokens = word_tokenize(text)
    tokens = [stemmer.stem(word) for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

X_preprocessed = [preprocess(text) for text in X]

# Split the dataset 
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.3, random_state=42)

# pipeline with text preprocessing and Naive Bayes classifier
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Train the classifier
text_clf.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = text_clf.predict(X_test)

# Evaluate the model
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1:.4f}')
print('\nClassification Report:\n')
print(classification_report(y_test, y_pred, target_names=newsgroups.target_names))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\22anj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\22anj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Precision: 0.8795
Recall: 0.8569
F1-Score: 0.8505

Classification Report:

                          precision    recall  f1-score   support

             alt.atheism       0.87      0.73      0.79       236
           comp.graphics       0.82      0.83      0.82       287
 comp.os.ms-windows.misc       0.85      0.83      0.84       290
comp.sys.ibm.pc.hardware       0.63      0.86      0.72       285
   comp.sys.mac.hardware       0.93      0.84      0.88       312
          comp.windows.x       0.96      0.83      0.89       308
            misc.forsale       0.92      0.70      0.79       276
               rec.autos       0.93      0.93      0.93       304
         rec.motorcycles       0.94      0.96      0.95       279
      rec.sport.baseball       0.97      0.97      0.97       308
        rec.sport.hockey       0.95      0.98      0.97       309
               sci.crypt       0.81      0.98      0.88       290
         sci.electronics       0.91      0.80      0.85       304
