In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
data = fetch_20newsgroups(subset='all')
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=42)

In [10]:
X_train, X_test

(["Subject: Re: Ed must be a Daemon Child!!\nFrom: REE700A@MAINE.MAINE.EDU\n <1993Apr2.003029.1962@adobe.com><1993Apr2.163021.17074@linus.mitre.org>\nOrganization: University of Maine System\nLines: 8\n\nEd's heading out on the highway?\n\nDid he finally buy a bike or is he a passanger?\n\nJeff Andle   DoD #3005  1976 KZ900     REE700A@MAINE.MAINE.EDU\n\nIntermittentNet access arranged through Bowdoin College.  Please reply\nvia e-mail, since a followup might expire before I see the Net again.\n",
  'From: hammerl@acsu.buffalo.edu (Valerie S. Hammerl)\nSubject: Re: Octopus in Detroit?\nOrganization: UB\nLines: 14\nNntp-Posting-Host: lictor.acsu.buffalo.edu\n\nJust a side note, squid/octopi made their way to the ice in Buffalo.\nI still don\'t understand why Buffalo, but maybe it\'s lucky.  :-)  btw,\nthey shovel them up from the ice here, using the shovel used to scoop\nup the snow the zamboni leaves as it leaves the ice.  Although Blue\ndid give some technical directions on its remova

In [3]:
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

In [4]:
model.fit(X_train, y_train)

0,1,2
,steps,"[('tfidfvectorizer', ...), ('multinomialnb', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [8]:
y_pred = model.predict(X_test)
y_pred

array([ 9, 12, 14, ...,  9,  3,  8])

In [9]:
print(classification_report(y_test, y_pred, target_names=data.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.88      0.72      0.79       198
           comp.graphics       0.86      0.79      0.82       245
 comp.os.ms-windows.misc       0.88      0.83      0.85       242
comp.sys.ibm.pc.hardware       0.66      0.86      0.75       238
   comp.sys.mac.hardware       0.95      0.84      0.89       250
          comp.windows.x       0.96      0.80      0.87       260
            misc.forsale       0.96      0.66      0.78       241
               rec.autos       0.89      0.93      0.91       244
         rec.motorcycles       0.91      0.95      0.93       219
      rec.sport.baseball       0.96      0.94      0.95       261
        rec.sport.hockey       0.90      0.98      0.94       245
               sci.crypt       0.78      0.98      0.87       251
         sci.electronics       0.92      0.80      0.86       249
                 sci.med       0.97      0.88      0.92       249
         

In [11]:
new_text = ["God is love and the Bible teaches us compassion."]
predicted_label = model.predict(new_text)[0]
print("Predicted category:", data.target_names[predicted_label])

Predicted category: soc.religion.christian


In [12]:
new_texts = [
    "I need help configuring my graphics card driver.",
    "The hockey match yesterday was thrilling!",
    "SpaceX is planning a new mission to Mars."
]

predicted_labels = model.predict(new_texts)

for text, label in zip(new_texts, predicted_labels):
    print(f"Text: {text[:50]}... → Predicted: {data.target_names[label]}")

Text: I need help configuring my graphics card driver.... → Predicted: comp.sys.ibm.pc.hardware
Text: The hockey match yesterday was thrilling!... → Predicted: rec.sport.hockey
Text: SpaceX is planning a new mission to Mars.... → Predicted: sci.space
