In [1]:
import pandas as pd
import numpy as np
from sklearn import naive_bayes, metrics, model_selection, preprocessing

In [2]:
# Read the news-aggregator CSV into a DataFrame and preview the first five rows.
data = pd.read_csv("uci-news-aggregator.csv")
data.head()

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [3]:
import string
# string.punctuation is a single str of ASCII punctuation characters;
# text_data_cleaning tests each token against it with `in`.
punct = string.punctuation

In [4]:
# Imports for text cleaning, feature extraction, modelling and evaluation
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from spacy.lang.en.stop_words import STOP_WORDS

In [5]:
# Only tokenisation and lemmas are consumed downstream (see text_data_cleaning),
# so skip the dependency parser and the named-entity recogniser: neither feeds
# the lemmatizer, and leaving them out makes every nlp() call much cheaper.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Kept as a list so any code that expects a list of stop words keeps working.
stopwords = list(STOP_WORDS)

In [6]:
def text_data_cleaning(sentence):
    """Tokenize ``sentence`` with spaCy and return its cleaned, lower-cased lemmas.

    Each token is replaced by its lemma, lower-cased and stripped of
    surrounding whitespace. Tokens whose lemma is the ``'-PRON-'``
    placeholder keep their lower-cased surface form instead. Stop words,
    punctuation and empty strings are then discarded.

    Parameters
    ----------
    sentence : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    normalized = [
        token.lower_ if token.lemma_ == '-PRON-' else token.lemma_.lower().strip()
        for token in nlp(sentence)
    ]
    # STOP_WORDS is the set the module-level `stopwords` list is built from;
    # testing against the set avoids a linear scan of that list per token.
    # `punct` is a str, so `in` is a substring test: it drops single
    # punctuation characters and also the empty string left behind when a
    # whitespace-only lemma is stripped.
    return [
        word for word in normalized
        if word not in STOP_WORDS and word not in punct
    ]

In [8]:
# Smoke-test the tokenizer on a sentence with leading spaces, stop words and mixed case.
sample_tokens = text_data_cleaning("  this is the best in the BASIM")
print(sample_tokens)

['good', 'basim']


In [5]:
# (rows, columns) of the loaded DataFrame.
data.shape

(422419, 8)

In [8]:
# Count missing values per column.
data.isna().sum()

ID           0
TITLE        0
URL          0
PUBLISHER    2
CATEGORY     0
STORY        0
HOSTNAME     0
TIMESTAMP    0
dtype: int64

In [16]:
# Model input: the TITLE column (headline text).
X = data['TITLE']
X

0         Fed official says weak data caused by weather,...
1         Fed's Charles Plosser sees high bar for change...
2         US open: Stocks fall after Fed official hints ...
3         Fed risks falling 'behind the curve', Charles ...
4         Fed's Plosser: Nasty Weather Has Curbed Job Gr...
                                ...                        
422414    Surgeons to remove 4-year-old's rib to rebuild...
422415    Boy to have surgery on esophagus after battery...
422416    Child who swallowed battery to have reconstruc...
422417    Phoenix boy undergoes surgery to repair throat...
422418    Phoenix boy undergoes surgery to repair throat...
Name: TITLE, Length: 422419, dtype: object

In [17]:
# Target: the CATEGORY column (single-letter labels such as 'b' and 'm').
y = data['CATEGORY']
y

0         b
1         b
2         b
3         b
4         b
         ..
422414    m
422415    m
422416    m
422417    m
422418    m
Name: CATEGORY, Length: 422419, dtype: object

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# TF-IDF vectorizer that tokenizes with the custom spaCy-based text_data_cleaning function.
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning)

In [13]:
# Ask for a headline to classify once the model has been trained.
prompt = 'enter news text relevant to business,science,entertainment, health: \n'
text = input(prompt)

enter news text relevant to business,science,entertainment, health: 
Bills sent back to premier for "violating" Article 46, which calls for keeping president in the loop on legislative proposals.


In [20]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes classifier with default hyper-parameters.
model = MultinomialNB()

In [21]:
# Chain the vectorizer and the classifier so raw titles can be passed straight in.
# Fitting takes a long time: every training title goes through the spaCy tokenizer.
clf = Pipeline([('tfidf',tfidf),('clf', model)])
clf.fit(X_train, y_train)

# predict() returns an array holding one label per input document.
pred = clf.predict([text])

# Human-readable name for each single-letter category code.
category_names = {
    'b': "Business News",
    't': "Science and Technology",
    'e': "Entertainment",
    'm': "Health",
}

# Read the single label out explicitly instead of relying on the truth value
# of an elementwise array comparison, and report unexpected labels rather
# than printing nothing.
label = pred[0]
print(category_names.get(label, f"Unknown category: {label}"))

Business News


In [22]:
# Score the fitted pipeline on the held-out split.
test_accuracy = accuracy_score(y_test, clf.predict(X_test))
print("Accuracy: \n", test_accuracy)

Accuracy: 
 0.9242578476397898


In [23]:
# Per-class precision, recall and F1 on the held-out split.
test_report = classification_report(y_test, clf.predict(X_test))
print(test_report)

              precision    recall  f1-score   support

           b       0.89      0.91      0.90     23414
           e       0.95      0.97      0.96     30353
           m       0.97      0.86      0.91      9024
           t       0.90      0.90      0.90     21693

    accuracy                           0.92     84484
   macro avg       0.93      0.91      0.92     84484
weighted avg       0.92      0.92      0.92     84484



In [26]:
import joblib
# Persist the fitted pipeline to disk.
# NOTE(review): the pipeline uses text_data_cleaning as its tokenizer, so that
# function presumably has to be importable wherever this file is loaded — confirm.
joblib.dump(clf,'news_classifier.pkl')

['news_classifier.pkl']