In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy

In [11]:
data = pd.read_json('datasets/News_Category_Dataset_v2.json', lines=True)
data.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [12]:
data.columns

Index(['category', 'headline', 'authors', 'link', 'short_description', 'date'], dtype='object')

In [13]:
data = data.drop(['authors', 'link', 'date'], axis=1)
data.head()

Unnamed: 0,category,headline,short_description
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,She left her husband. He killed their children...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Of course it has a song.
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,The actor and his longtime girlfriend Anna Ebe...
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,The actor gives Dems an ass-kicking for not fi...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,"The ""Dietland"" actress said using the bags is ..."


In [14]:
data['category'].value_counts()

POLITICS          32739
WELLNESS          17827
ENTERTAINMENT     16058
TRAVEL             9887
STYLE & BEAUTY     9649
PARENTING          8677
HEALTHY LIVING     6694
QUEER VOICES       6314
FOOD & DRINK       6226
BUSINESS           5937
COMEDY             5175
SPORTS             4884
BLACK VOICES       4528
HOME & LIVING      4195
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3651
WOMEN              3490
IMPACT             3459
DIVORCE            3426
CRIME              3405
MEDIA              2815
WEIRD NEWS         2670
GREEN              2622
WORLDPOST          2579
RELIGION           2556
STYLE              2254
SCIENCE            2178
WORLD NEWS         2177
TASTE              2096
TECH               2082
MONEY              1707
ARTS               1509
FIFTY              1401
GOOD NEWS          1398
ARTS & CULTURE     1339
ENVIRONMENT        1323
COLLEGE            1144
LATINO VOICES      1129
CULTURE & ARTS     1030
EDUCATION          1004
Name: category, 

In [15]:
len(data)

200853

In [16]:
# merge column headline and short desc
data['text'] = data['headline'].astype(str)+"-"+data['short_description']
data.drop(columns =['headline','short_description'],axis = 1, inplace=True)
data.astype(str)
data.head()

Unnamed: 0,category,text
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 5...
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...


In [17]:
from spacy.lang.en.stop_words import STOP_WORDS
from collections import Counter
import string

# Lemmatization
def lemmetization(doc):
    words = []
    for word in doc:
        if word.lemma_ != "-PRON-":
            temp = word.lemma_.lower().strip()
        else:
            temp = word.lower_

        words.append(temp)

    return words

# Remove punctuation and stopwords
def punc_removal(doc):
    words = []
    for word in doc:
        if word not in list(STOP_WORDS) and word not in string.punctuation:
            words.append(word)
    
    return words

save_words = []
def text_cleaning_english(text,mynlp):

    doc = mynlp(text)
    # lemmetization
    words = lemmetization(doc)
    # remove punctuation
    words = punc_removal(words)

    # save all words in the data file
    for word in words:
        save_words.append(word)

    # traverse in the string     
    complete_sentence = ' '.join([str(word) for word in words])

    return complete_sentence

def most_used_words():
    list = []
    common_words = Counter(save_words).most_common(5)
    for key, value in common_words:
        list.append({'category': key, 'value': value})

    return list

In [18]:
nlp = spacy.load("en_core_web_sm")
text_cleaning = lambda x: text_cleaning_english(x,mynlp=nlp)
data['cleaned_text'] = pd.DataFrame(data['text'].apply(text_cleaning))

common_words = most_used_words()
print(common_words)

[{'category': 'new', 'value': 18882}, {'category': 'trump', 'value': 17816}, {'category': 'year', 'value': 16079}, {'category': '--', 'value': 14813}, {'category': 'day', 'value': 13652}]


In [24]:
from sklearn.preprocessing import LabelEncoder

label_encoder_target = LabelEncoder().fit(data['category'])
encoded_target = label_encoder_target.transform(data['category'])

data = data.drop('category', axis=1)
new_target_df = pd.DataFrame(data=encoded_target,columns=['category'])
# ! ADD JOIN='INNER' TO PRESERVE DTYPE OF INT IF NOT IT WILL CHANGE TO OTHER DTYPE
data = pd.concat([data,new_target_df], axis=1, join='inner')
data.head()


Unnamed: 0,text,cleaned_text,ARTS,ARTS & CULTURE,BLACK VOICES,BUSINESS,COLLEGE,COMEDY,CRIME,CULTURE & ARTS,...,TASTE,TECH,THE WORLDPOST,TRAVEL,WEDDINGS,WEIRD NEWS,WELLNESS,WOMEN,WORLD NEWS,WORLDPOST
0,There Were 2 Mass Shootings In Texas Last Week...,2 mass shootings texas week 1 tv leave husband...,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Will Smith Joins Diplo And Nicky Jam For The 2...,smith joins diplo nicky jam 2018 world cup off...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Hugh Grant Marries For The First Time At Age 5...,hugh grant marries time age 57 actor longtime ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,jim carrey blasts castrato adam schiff democra...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Julianna Margulies Uses Donald Trump Poop Bags...,julianna margulies use donald trump poop bags ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# SPLIT TRAINING & TESTING DATA
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'],data['category'],test_size=0.2,shuffle=True, random_state=42)
print(X_train.shape, y_train.shape)

(160682,) (160682,)


In [23]:
y_train

66880           BUSINESS
68387            PARENTS
100514             GREEN
197218    STYLE & BEAUTY
144020          WELLNESS
               ...      
119879    HEALTHY LIVING
103694          RELIGION
131932    STYLE & BEAUTY
146867     HOME & LIVING
121958         GOOD NEWS
Name: category, Length: 160682, dtype: object

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC 
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score, precision_score, f1_score, recall_score

def sentiment_pipeline(data_train_input,data_train_target,model_type):
    # Classifier selection
    if model_type == "linear":
        classifier = LinearSVC()
    elif model_type == "logistic":
        classifier = LogisticRegression(max_iter=1000)
    elif model_type == "sgd":
        classifier = SGDClassifier()
    elif model_type == "naive_bayes":
        classifier = MultinomialNB()
    elif model_type == "xgboost":
        classifier = XGBClassifier(use_label_encoder=False,eta=0.1,gamma=0.3, n_estimators=100, learning_rate=0.5, min_child_weight=5, 
        max_depth=5, colsample_bytree=0.7,objective="multi:softmax", eval_metric="mlogloss",verbosity=0)

    tfidf = TfidfVectorizer()

    # Pipeline setup
    clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])

    model = clf.fit(data_train_input,data_train_target)

    return model

def sentiment_model_predict(model,data_test_input,data_test_target):
    data_prediction=model.predict(data_test_input)
    conf_matrix = confusion_matrix(data_test_target,data_prediction)
    acc_score = accuracy_score(data_test_target, data_prediction)
    pre_score = precision_score(data_test_target, data_prediction, average="macro")
    re_score = recall_score(data_test_target, data_prediction, average="macro")
    f_score = f1_score(data_test_target, data_prediction, average="macro")

    print("Accuracy : "+str(round(acc_score*100,2)))
    print("Precision : "+str(round(pre_score*100,2)))
    print("Recall : "+str(round(re_score*100,2)))
    print("F1-Score :"+str(round(f_score*100,2)))
    print(conf_matrix)

In [21]:
# Support Vector Classification
svm_model = sentiment_pipeline(X_train, y_train, 'linear')
sentiment_model_predict(svm_model,X_test,y_test)

Accuracy : 60.56
Precision : 51.88
Recall : 45.68
F1-Score :47.88
[[ 61  23   5 ...   5   3   3]
 [ 17  53   8 ...  16   0   2]
 [  2   4 380 ...   7   0   1]
 ...
 [  1   6   7 ... 209   1   2]
 [  0   0   1 ...   5 107  19]
 [  1   2   5 ...   0  17 162]]


In [22]:
# Xgboost
xg_model = sentiment_pipeline(X_train, y_train, 'xgboost')
sentiment_model_predict(xg_model,X_test,y_test)

ValueError: The label must consist of integer labels of form 0, 1, 2, ..., [num_class - 1].