In [6]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_json('News_Category_Dataset_v2.json', lines=True)

In [3]:
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [4]:
df = df[['headline', 'category']]

In [5]:
df.isnull().sum()

headline    0
category    0
dtype: int64

In [7]:
df['category'] = df['category'].factorize()[0]

In [8]:
df.head()

Unnamed: 0,headline,category
0,There Were 2 Mass Shootings In Texas Last Week...,0
1,Will Smith Joins Diplo And Nicky Jam For The 2...,1
2,Hugh Grant Marries For The First Time At Age 57,1
3,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,1
4,Julianna Margulies Uses Donald Trump Poop Bags...,1


In [9]:
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

In [12]:
features = vectorizer.fit_transform(df.headline)

In [16]:
labels = df.category

In [17]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.15, random_state=42)


In [18]:
features

<200853x38707 sparse matrix of type '<class 'numpy.float64'>'
	with 1472632 stored elements in Compressed Sparse Row format>

In [20]:
model = LogisticRegression(random_state=0)

In [21]:
lr_model = model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [24]:
y_pred = model.predict(X_test)

In [25]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, y_pred))

[[ 248   20    0 ...    1    0    0]
 [  11 1851    1 ...    0    0    0]
 [   7   10   54 ...    0    0    0]
 ...
 [   3    6    0 ...   77    0    0]
 [   0    4    0 ...    1   36    0]
 [   0   23    0 ...    1    0   40]]


In [26]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.56      0.49      0.52       511
           1       0.56      0.77      0.65      2391
           2       0.51      0.17      0.25       325
           3       0.45      0.22      0.30       510
           4       0.61      0.86      0.72      4851
           5       0.37      0.21      0.26       378
           6       0.56      0.37      0.45       660
           7       0.43      0.32      0.37       538
           8       0.63      0.42      0.50       796
           9       0.78      0.64      0.70       955
          10       0.69      0.65      0.67       755
          11       0.52      0.45      0.48       867
          12       0.67      0.76      0.71      1512
          13       0.57      0.34      0.43       409
          14       0.61      0.37      0.46       333
          15       0.65      0.42      0.51       366
          16       0.73      0.41      0.52       337
          17       0.61    

In [27]:
import joblib

joblib.dump(lr_model, 'lr_model.pkl')

joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [28]:
lr_ = joblib.load('lr_model.pkl')
vec_ = joblib.load('vectorizer.pkl')

In [29]:
no_to_topic = {'CRIME': 0, 'ENTERTAINMENT': 1, 'WORLD NEWS': 2, 'IMPACT': 3, 'POLITICS': 4,
                            'WEIRD NEWS': 5, 'BLACK VOICES': 6, 'WOMEN': 7, 'COMEDY': 8, 'QUEER VOICES':9,
                            'SPORTS':10, 'BUSINESS': 11, 'TRAVEL': 12, 'MEDIA': 13, 'TECH': 14, 'RELIGION': 14,
                            'SCIENCE': 15, 'LATINO VOICES': 16, 'EDUCATION': 17, 'COLLEGE': 18, 'PARENTS': 19,
                            'ARTS & CULTURE': 20, 'STYLE': 21, 'GREEN': 22, 'TASTE': 23, 'HEALTHY LIVING': 24,
                            'THE WORLDPOST': 25, 'GOOD NEWS': 26, 'WORLDPOST': 27, 'FIFTY': 28, 'ARTS': 29,
                            'WELLNESS': 30, 'PARENTING': 31, 'HOME & LIVING': 32, 'STYLE & BEAUTY': 33,
                            'DIVORCE': 34, 'WEDDINGS': 35, 'FOOD & DRINK': 36, 'MONEY': 37, 'ENVIRONMENT': 38,
                            'CULTURE & ARTS': 39}

In [41]:
headline = 'US labor costs grew at fastest pace in two decades'
tokens = vec_.transform([headline])
probs = lr_.predict_proba(tokens)
probs = np.array(probs[0])
pred = probs.argsort()[-5:][::-1]

pos = []
for i in pred:
    val = list(no_to_topic.values()).index(i)
    pos.append(list(no_to_topic.keys())[val])

vals = np.sort(probs)[-5:][::-1]
for i in range(5):
    print(f'{i+1}-> Prediction: {pos[i]} with a probability of {round(vals[i], 2)*100}%.')

1-> Prediction: BUSINESS with a probability of 28.000000000000004%.
2-> Prediction: POLITICS with a probability of 14.000000000000002%.
3-> Prediction: HOME & LIVING with a probability of 8.0%.
4-> Prediction: ENVIRONMENT with a probability of 5.0%.
5-> Prediction: PARENTING with a probability of 5.0%.


In [31]:
df.info()

TypeError: Cannot interpret '<attribute 'dtype' of 'numpy.generic' objects>' as a data type