In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from xgboost import XGBRFClassifier
from xgboost import XGBClassifier
from joblib import load, dump

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the raw news records. lines=True makes pandas parse the file as one JSON
# object per line rather than as a single JSON document. The path is relative, so
# the file must sit in the notebook's working directory.
dataset = pd.read_json("News_Category_Dataset_v2.json",lines=True)
dataset.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [3]:
dataset.shape

(200853, 6)

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200853 entries, 0 to 200852
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   category           200853 non-null  object        
 1   headline           200853 non-null  object        
 2   authors            200853 non-null  object        
 3   link               200853 non-null  object        
 4   short_description  200853 non-null  object        
 5   date               200853 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.2+ MB


In [5]:
# Class distribution of the target column. The counts are heavily skewed (POLITICS
# has roughly 30x the rows of EDUCATION) and several labels look like near-duplicates
# of each other ("WORLDPOST" / "THE WORLDPOST", "ARTS & CULTURE" / "CULTURE & ARTS",
# "PARENTING" / "PARENTS").
# NOTE(review): consider merging those labels before modelling -- confirm with the data owner.
dataset['category'].value_counts()

POLITICS          32739
WELLNESS          17827
ENTERTAINMENT     16058
TRAVEL             9887
STYLE & BEAUTY     9649
PARENTING          8677
HEALTHY LIVING     6694
QUEER VOICES       6314
FOOD & DRINK       6226
BUSINESS           5937
COMEDY             5175
SPORTS             4884
BLACK VOICES       4528
HOME & LIVING      4195
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3651
WOMEN              3490
IMPACT             3459
DIVORCE            3426
CRIME              3405
MEDIA              2815
WEIRD NEWS         2670
GREEN              2622
WORLDPOST          2579
RELIGION           2556
STYLE              2254
SCIENCE            2178
WORLD NEWS         2177
TASTE              2096
TECH               2082
MONEY              1707
ARTS               1509
FIFTY              1401
GOOD NEWS          1398
ARTS & CULTURE     1339
ENVIRONMENT        1323
COLLEGE            1144
LATINO VOICES      1129
CULTURE & ARTS     1030
EDUCATION          1004
Name: category, 

In [6]:
# Keep only the label and the headline text. Rebinding (instead of inplace=True)
# together with errors='ignore' makes this cell safe to re-run: the in-place
# version raised KeyError on a second execution because the columns were already gone.
dataset = dataset.drop(columns=['authors','link','date','short_description'], errors='ignore')

In [7]:
# Rename the headline column to 'Text', the name the sampling, cleaning and
# vectorising cells below read from. This mutates `dataset` in place.
dataset.rename({'headline':'Text'},axis=1,inplace=True)

In [8]:
from sklearn.utils import shuffle
def Imbalance_to_balance(df, No_of_sample, random_state=42):
    """Down-sample the large classes of `df` to a common size and shuffle the rows.

    Every value of the 'category' column that occurs in strictly more than
    `No_of_sample` rows contributes exactly `No_of_sample` randomly drawn rows;
    categories with `No_of_sample` rows or fewer are dropped entirely.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a 'category' column.
    No_of_sample : int
        Number of rows to keep per retained category.
    random_state : int, default 42
        Seed used both for the per-class draw and for the final shuffle, so
        repeated calls on the same input return the same frame.

    Returns
    -------
    pandas.DataFrame
        The sampled rows in shuffled order. Each class is sampled with
        ignore_index=True, so index labels run 0..No_of_sample-1 within every
        class and therefore repeat across classes. If no category is large
        enough, an empty frame with columns ['category', 'Text'] is returned.
    """
    # sort=False keeps categories in order of first appearance, matching a
    # walk over df['category'].unique().
    sampled = [
        group.sample(No_of_sample, random_state=random_state, ignore_index=True)
        for _, group in df.groupby('category', sort=False)
        if len(group) > No_of_sample
    ]

    if not sampled:
        # pd.concat refuses an empty list; hand back an empty, correctly shaped frame.
        return pd.DataFrame(columns=['category', 'Text'])

    # Concatenate once instead of growing a frame inside a loop.
    balanced = pd.concat(sampled, axis=0)

    # Seeded shuffle: row order drives the later factorize() ids and row slicing,
    # so it has to be reproducible across runs.
    return balanced.sample(frac=1, random_state=random_state)

In [9]:
# Keep only the categories with more than 8000 headlines and draw 8000 from each.
# Going by the value_counts() output above, that leaves six classes (POLITICS,
# WELLNESS, ENTERTAINMENT, TRAVEL, STYLE & BEAUTY, PARENTING); every other
# category is dropped from `df`.
df= Imbalance_to_balance(dataset,8000)

In [10]:
# Encode the category as an integer target. factorize() numbers the labels in
# order of first appearance, so the id <-> category mapping depends on the row
# order of `df` at this point -- derive the mapping from `df` rather than
# hard-coding it elsewhere.
df['CategoryId'] = df['category'].factorize()[0]
df.head()

Unnamed: 0,category,Text,CategoryId
2750,ENTERTAINMENT,Plies Gets Thrown Off Stage By Fan,0
5971,STYLE & BEAUTY,Fashionably Late Style Quiz: A Test On All The...,1
2737,PARENTING,Food Allergy Advice for Kids: Don't Delay Pean...,2
2379,PARENTING,Girls and Long Hair: What Message Are We Sending?,2
2942,WELLNESS,Journey to Well-Being: New Light on Yoga for D...,3


In [11]:
# Matches one HTML/XML-style tag such as <b> or </a>. The non-greedy `.*?` stops
# at the first closing '>' so text between two tags is preserved.
_TAG_PATTERN = re.compile(r'<.*?>')


def remove_tags(text):
  """Return `text` with every `<...>` tag removed.

  The previous pattern was the empty regex, which only matches empty strings,
  so the function returned its input unchanged and never stripped any markup.

  Parameters
  ----------
  text : str
      Raw text that may contain markup.

  Returns
  -------
  str
      `text` without tags; input without tags is returned unchanged.
  """
  return _TAG_PATTERN.sub('', text)
# First cleaning step: run remove_tags over every headline, overwriting 'Text'.
df['Text'] = df['Text'].apply(remove_tags)

In [12]:
def special_char(text):
  """Replace every non-alphanumeric character of `text` with a single space.

  Characters for which ``isalnum()`` is true are kept unchanged; each other
  character (punctuation, whitespace, symbols) becomes ' '. For a plain
  string the result therefore has the same length as the input.
  """
  return ''.join(ch if ch.isalnum() else ' ' for ch in text)
# Blank out punctuation and other non-alphanumeric characters in every headline.
df['Text'] = df['Text'].apply(special_char)

In [13]:
def convert_lower(text):
   """Return a lower-cased copy of `text`."""
   lowered = text.lower()
   return lowered
# Lower-case every headline so that later token comparisons are case-insensitive.
df['Text'] = df['Text'].apply(convert_lower)

In [14]:
# Cache for stop-word sets, keyed by language name. It is filled on first use so
# the NLTK corpus is read once rather than once per row.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
  """Return the NLTK English stop-word set, loading the corpus only on first call."""
  if 'english' not in _STOP_WORDS_CACHE:
    _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
  return _STOP_WORDS_CACHE['english']


def remove_stopwords(text):
  """Tokenise `text` and drop English stop words.

  Parameters
  ----------
  text : str
      Text to tokenise with ``word_tokenize``.

  Returns
  -------
  list of str
      The tokens of `text`, in order, that are not in the stop-word set. The
      membership test is an exact string comparison, so callers should
      lower-case the text first if case should not matter.
  """
  stop_words = _english_stop_words()
  return [token for token in word_tokenize(text) if token not in stop_words]
# NOTE: remove_stopwords returns a list, so after this cell 'Text' holds a list of
# tokens per row (not a string) until the lemmatising cell joins them back.
# Do not run this cell twice in a row without rebuilding `df`.
df['Text'] = df['Text'].apply(remove_stopwords)

In [15]:
def lemmatize_word(text):
  """Lemmatise each element of `text` and join the results with single spaces.

  Despite its name, `text` is iterated element by element, so it is expected
  to be a sequence of tokens; the return value is one space-separated string.
  """
  lemmatizer = WordNetLemmatizer()
  lemmas = []
  for token in text:
    lemmas.append(lemmatizer.lemmatize(token))
  return " ".join(lemmas)
# Expects 'Text' to hold token lists (the output of remove_stopwords) and turns
# each one back into a single string. If this cell is re-run once 'Text' already
# holds strings, lemmatize_word iterates over characters and silently produces
# space-separated letters -- rebuild `df` before re-running.
df['Text'] = df['Text'].apply(lemmatize_word)

In [16]:
df

Unnamed: 0,category,Text,CategoryId
2750,ENTERTAINMENT,ply get thrown stage fan,0
5971,STYLE & BEAUTY,fashionably late style quiz test style news mi...,1
2737,PARENTING,food allergy advice kid delay peanut egg,2
2379,PARENTING,girl long hair message sending,2
2942,WELLNESS,journey well new light yoga depression anxiety,3
...,...,...,...
1461,TRAVEL,get better seat next time fly,5
4849,WELLNESS,fuss dalai lama,3
3873,ENTERTAINMENT,chrissy metz kate standalone u episode story i...,0
4855,TRAVEL,every international dining rule need know one ...,5


In [17]:
# Work on the first 10,000 rows only. `df` was shuffled by Imbalance_to_balance,
# so this is a random subset and the classes are only approximately balanced in it.
# NOTE(review): presumably done to keep the dense feature matrix built below
# at a manageable size -- confirm.
df=df[:10000]

In [18]:
df['category'].unique()

array(['ENTERTAINMENT', 'STYLE & BEAUTY', 'PARENTING', 'WELLNESS',
       'POLITICS', 'TRAVEL'], dtype=object)

In [19]:
# Target vector: the integer class ids stored in the CategoryId column.
# (A previous `x = np.array(df.iloc[:,0].values)` assignment was removed: it was
# overwritten by the vectoriser output below before ever being read.)
y = np.array(df.CategoryId.values)

# Bag-of-words counts restricted to the 5000 most frequent terms. `.toarray()`
# converts the sparse count matrix into a dense (rows x 5000) array, which is
# memory-hungry.
# NOTE(review): the vocabulary is fitted on all rows before the train/test split,
# so held-out headlines influence which terms are kept -- consider fitting the
# vectoriser on the training rows only.
cv = CountVectorizer(max_features = 5000)
x = cv.fit_transform(df.Text).toarray()
print("X.shape = ",x.shape)
print("y.shape = ",y.shape)

X.shape =  (10000, 5000)
y.shape =  (10000,)


In [20]:
# Hold out 30% of the rows for evaluation; random_state=0 makes the split repeatable.
# NOTE(review): no stratify=y is passed, so class proportions may differ slightly
# between the training and test sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0, shuffle = True)
print(len(x_train))
print(len(x_test))

7000
3000


In [21]:
# Baseline: one-vs-rest logistic regression on the bag-of-words counts.
lr = OneVsRestClassifier(LogisticRegression()).fit(x_train, y_train)
y_pred = lr.predict(x_test)
# Test-set accuracy as a percentage, rounded to two decimals.
accuracy = round(100 * accuracy_score(y_test, y_pred), 2)
print(accuracy)

74.97


In [22]:
# One-vs-rest wrapper around a 100-tree, entropy-criterion random forest.
# One forest is trained per class, so this cell is considerably slower than the
# logistic-regression baseline.
rfc = OneVsRestClassifier(
    RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
).fit(x_train, y_train)
y_pred = rfc.predict(x_test)
# Test-set accuracy as a percentage, rounded to two decimals.
accuracy = round(100 * accuracy_score(y_test, y_pred), 2)
print(accuracy)