In [1]:
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Fetch the 20 Newsgroups dataset (subset "all"), asking scikit-learn to
# remove headers, footers and quoted text from each post.
newsgroups = fetch_20newsgroups(
    subset="all",
    remove=("headers", "footers", "quotes"),
)

In [3]:
# Build the English stop-word set consulted by preprocess_input.
# On a fresh environment the NLTK "stopwords" corpus may not be installed, in
# which case the lookup raises LookupError; download it once and retry so the
# notebook survives Restart Kernel -> Run All.
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    stop_words = set(stopwords.words("english"))

In [4]:
def preprocess_input(text):
    """Remove stop words from ``text``.

    The text is split on whitespace, every token whose lowercase form is in
    the module-level ``stop_words`` set is discarded, and the remaining
    tokens are re-joined with single spaces. Surviving tokens are left
    untouched, so their original casing and any attached punctuation remain.

    Args:
        text: Document string to filter.

    Returns:
        The filtered text as one space-separated string.
    """
    kept_tokens = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [5]:
# Apply stop-word removal to every post in the dataset, preserving order.
preprocessed_texts = list(map(preprocess_input, newsgroups.data))

In [6]:
# Fit a TF-IDF vectorizer (default settings) on the stop-word-filtered posts
# and encode those same posts as the training feature matrix.
# The fitted `vectorizer` is reused by prediction() to encode new text.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(preprocessed_texts)

In [7]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF features, using
# newsgroups.target (indices into newsgroups.target_names) as the labels.
# NOTE(review): every loaded post is used for training; no held-out
# evaluation is performed in the cells visible here.
model = MultinomialNB()
model.fit(X_tfidf, newsgroups.target)

In [8]:
# Sample headlines that are passed to prediction() further down to try out
# the trained classifier.
news_articles = [
    "Researchers make progress in understanding the human brain's complex neural networks.",
    "NASA's Perseverance rover detects signs of ancient microbial life on Mars.",
    "Breakthrough in cancer research as a novel treatment shows promising results in early trials.",
    "Astronomers discover a new exoplanet in a distant star system, expanding our understanding of the cosmos.",
]

In [9]:
def prediction(news_article):
    """Predict the newsgroup name for a single piece of text.

    The text goes through ``preprocess_input``, is encoded with the
    module-level ``vectorizer`` and is classified by the module-level
    ``model``.

    Args:
        news_article: Text to classify.

    Returns:
        The entry of ``newsgroups.target_names`` selected by the predicted
        class index.
    """
    cleaned_text = preprocess_input(news_article)
    # transform() expects a collection of documents, hence the one-item list.
    features = vectorizer.transform([cleaned_text])
    label_index = model.predict(features)[0]
    return newsgroups.target_names[label_index]

In [10]:
# Classify each sample headline; results stay in the same order as
# news_articles.
category_predicted = [prediction(article) for article in news_articles]

In [11]:
# Pair each sample text with its predicted newsgroup.
# The text column is keyed "article"; it was previously misspelled "aricle",
# which leaked into the displayed table header. Any saved output rendered
# before this fix will show the old header until the notebook is re-run.
data = {
    "article": news_articles,
    "category": category_predicted,
}

# One row per sample text, in the same order as news_articles.
df = pd.DataFrame(data)

In [12]:
# Display the table of sample texts alongside their predicted newsgroups.
df

Unnamed: 0,aricle,category
0,Researchers make progress in understanding the...,sci.med
1,NASA's Perseverance rover detects signs of anc...,sci.space
2,Breakthrough in cancer research as a novel tre...,sci.med
3,Astronomers discover a new exoplanet in a dist...,sci.space
