In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

from nltk.corpus import stopwords

import pandas as pd
import re

In [3]:
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

In [4]:
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.2, random_state=42)

In [5]:
stop_words = set(stopwords.words("english"))
def preprocess_input(text):
    text = " ".join([word for word in text.split() if word.lower() not in stop_words])
    return text

In [6]:
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=preprocess_input)),  # Step 1: TF-IDF Vectorization with preprocessing
    ('clf', MultinomialNB())                                     # Step 2: Naive Bayes Classifier
])

In [7]:
text_clf.fit(X_train, y_train)

In [8]:
predictions = text_clf.predict(X_test)

In [9]:
accuracy = text_clf.score(X_test, y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7018567639257295


In [10]:
news_articles = [
    "Researchers make progress in understanding the human brain's complex neural networks.",
    "NASA's Perseverance rover detects signs of ancient microbial life on Mars.",
    "Breakthrough in cancer research as a novel treatment shows promising results in early trials.",
    "Astronomers discover a new exoplanet in a distant star system, expanding our understanding of the cosmos.",
]

In [11]:
category_predicted = text_clf.predict(news_articles)

In [12]:
predicted_category_names = [newsgroups.target_names[i] for i in category_predicted]

# Create a DataFrame with the results
data = {
    "article": news_articles,
    "category": predicted_category_names
}

df = pd.DataFrame(data)
print(df)

                                             article                category
0  Researchers make progress in understanding the...  soc.religion.christian
1  NASA's Perseverance rover detects signs of anc...               sci.space
2  Breakthrough in cancer research as a novel tre...                 sci.med
3  Astronomers discover a new exoplanet in a dist...               sci.space
