In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from tqdm import tqdm

import re
import nltk

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords

import spacy

import string
PUNCTUATION = string.punctuation

from collections import Counter
import math

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\33610\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\33610\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\33610\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
JSON_FILE_PATH = ""
JSON_FILE_NAME = "News_Category_Dataset_v3_balanced.json"

In [4]:
def extractJsonData(jsonData):
  return pd.read_json(jsonData, lines=True);

jsonFile = open(JSON_FILE_PATH + JSON_FILE_NAME);
df = extractJsonData(jsonFile);

In [5]:
df = df.drop('link', axis=1)
df = df.drop('authors', axis=1)
df = df.drop('date', axis=1)
df = df.drop('headline', axis=1)

In [6]:
class TextPreProcessor:
  def __init__(self, language="english", lemmatization = False, stopword = False, stemmatization = False, lower = False, ponct = False, emoji = False, symbols = False, numbers = False):
    if (lemmatization == True & stemmatization == True):
      raise Exception("Can not lemmatize and stem sentences at the same time.")

    self.lemmatization = lemmatization
    self.stemmatization = stemmatization
    self.lower = lower
    self.ponct = ponct
    self.emoji = emoji
    self.stopword = stopword
    self.symbols = symbols
    self.numbers = numbers
    self.lemmatizer = spacy.load('en_core_web_sm')
    self.stemmer = nltk.SnowballStemmer("english")
    self.REPLACE_BY_SPACE_RE = re.compile('[-+/(){}\[\]\|@,;]')
    self.BAD_SYMBOLS_RE = re.compile('[0-9] {,1}')
    self.STOPWORDS = set(stopwords.words('english'))

  def cleanText(self, text):
    if text == "":
      return ""

    def lower_case(text):
      return text.lower()

    def remove_punctuation(text):
      return text.translate(str.maketrans('', '', PUNCTUATION))

    def remove_symbols(dataframe):
      return self.REPLACE_BY_SPACE_RE.sub(' ', text)

    def remove_numbers(text):
      return self.BAD_SYMBOLS_RE.sub(' ', text)

    def remove_emoji(string):
      emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE)
      return emoji_pattern.sub(' ', string)

    def remove_stopwords(text):
      return " ".join([word for word in str(text).split() if word not in self.STOPWORDS])

    def lemmatize(text):
      tokens = []
      for token in self.lemmatizer(text):
        tokens.append(token.lemma_)
      return " ".join(tokens)

    def stemmatize(text):
      tokens = []
      for token in text.split(" "):
        tokens.append(self.stemmer.stem(token))
      return " ".join(tokens)
        
    if(self.lower == True):
      text = lower_case(text)
    if(self.numbers == True):
      text = remove_numbers(text)
    if(self.ponct == True):
      text = remove_punctuation(text)
    if(self.symbols == True):
      text = remove_symbols(text)
    if(self.emoji == True):
      text = remove_emoji(text)
    if(self.stopword == True):
      text = remove_stopwords(text)
    if(self.lemmatization == True):
      text = lemmatize(text)
    if(self.stemmatization == True):
      text = stemmatize(text)
    
    return text

In [7]:
class NGram:
    def __init__(self, n=1):
        self.n = n

    def ngram(self, text):
        words = []
        for word in text.split():
            words.append(word)
        temp = zip(*[words[i:] for i in range(0, self.n)])
        ans = [' '.join(n) for n in temp]
        return ans

In [39]:
class Nb_ngram:
    def __init__(self, textPreProcessor = TextPreProcessor(), ngram=1, preProcess = False):
        self.nbClass = 0
        self.isCompile = False
        self.isTrain = False
        self.BoT = dict()
        self.classesProb = []
        self.nGram = NGram(ngram)
        self.preProcess = preProcess
        self.textPreProcessor = textPreProcessor
    
    def get_classes_occurences(self, Y):
        classes = dict()
        for cl in Y:
            if cl not in classes:
                classes[cl] = 0
            classes[cl] += 1
        return classes
        
    def compile(self, X, Y):
        self.X = X
        self.Y = Y
        self.classes = np.unique(Y)
        self.nbClass = len(self.classes)
        
        if self.preProcess == True:
            print("Preprocessing text:")
            for i, sen in enumerate(tqdm(self.X)):
                self.X[i] = self.textPreProcessor.cleanText(sen)

        def oversampling(X, Y):
            average_input_size = 0
            for val in X:
                average_input_size += len(val.split(" "))
            average_input_size = int(average_input_size / len(X))
            
            max_size = 0
            classes_occ = self.get_classes_occurences(Y)
            for cl, occ in classes_occ.items():
                if occ > max_size:
                    max_size = occ

            for classe in np.unique(Y):
                classe_vocab = []
                for text, lb in zip(X, Y):
                    if classe == lb:
                        classe_vocab += self.nGram.ngram(text)
                classe_vocab = np.unique(classe_vocab)
                gen_nb = max_size - classes_occ[classe]
                print(f"Oversampling the classe {classe}:")
                arr = []
                classes = []
                for i in tqdm(range(gen_nb)):
                    random_tokens = np.random.choice(classe_vocab, size=int(average_input_size / self.nGram.n), replace=True)
                    new_string = ""
                    index = 0
                    for i in random_tokens:
                        new_string += i 
                        if index != max_size-1:
                            new_string += " "
                        index += 1
                    arr.append(new_string)
                    classes.append(classe)

                self.X = np.append(self.X, arr)
                self.Y = np.append(self.Y, classes)

        oversampling(self.X, self.Y)

        def create_bag_of_word(X, Y):
            bags_of_ngram = dict();
            for i in self.classes:
                bags_of_ngram[i] = dict()
            print("Creating bags of words...")
            for lab, sen in tqdm(zip(Y, X)):
                ngram_sentence = self.nGram.ngram(sen)
                for t in ngram_sentence:
                    if t not in bags_of_ngram[lab]:
                        bags_of_ngram[lab][t] = 0
                    bags_of_ngram[lab][t] += 1
            return bags_of_ngram

        self.BoT = create_bag_of_word(self.X, self.Y)

    def get_classes_probabilites_log(self, Y):
        def get_classes_proba_log(classes, nb_samples):
            classes_occ = dict()
            for cl, occ in classes.items():
                classes_occ[cl] = math.log(float(occ) / float(nb_samples))
            return classes_occ
        
        self.classes_proba = get_classes_proba_log( classes = self.get_classes_occurences(Y), nb_samples = len(Y) )

    def train(self):
        self.words_by_classes = dict();
        self.vocab = dict()
        print("Extracting vocab:")
        for cl, dic in self.BoT.items():
            if cl not in self.words_by_classes:
                self.words_by_classes[cl] = 0
            for tok, val in tqdm(dic.items()):
                self.words_by_classes[cl] += val
                self.vocab[tok] = 1
        print(len(self.vocab))
        self.vocab_len = len(self.vocab)

        self.get_classes_probabilites_log(self.Y)
        
        self.denominators = dict()
        print("Calculating classes denominators for probabilites:")
        for cl in tqdm(self.classes):
            self.denominators[cl]  = self.words_by_classes[cl] + self.vocab_len
        
        self.Y_info = [(self.BoT[cl], self.classes_proba[cl], self.denominators[cl]) for cl in self.classes] 
        self.Y_info = np.array(self.Y_info) 

    def predict(self, text):
        likelihood_prob = np.zeros(self.classes.shape[0])
        if self.preProcess == True:
            text = self.textPreProcessor.cleanText(text)
        for cl_i, cl in enumerate(self.classes):                 
            for tok in self.nGram.ngram(text):                        
                tok_counts = self.Y_info[cl_i][0].get(tok, 0) + 1 # We add 1 due to the formula to not get 0 probabilities                        
                tok_prob = tok_counts/float(self.Y_info[cl_i][2])                              
                likelihood_prob[cl_i] += math.log(tok_prob)
                                                
        post_prob = np.empty(self.classes.shape[0])
        for cl_i, cl in enumerate(self.classes):
            post_prob[cl_i] = likelihood_prob[cl_i] + self.Y_info[cl_i][1]                              
        
        return post_prob
        

In [46]:
textCleaner = TextPreProcessor( 
    lemmatization=False,
    lower=True,
    stopword = False, 
    stemmatization = True, 
    ponct = True, 
    emoji = True, 
    symbols = True, 
    numbers = True)
nb = Nb_ngram(ngram=3, textPreProcessor=textCleaner, preProcess=True)

In [47]:
train, test = train_test_split(
    df,
    test_size=0.2,
    shuffle=True, 
    stratify=df.category)

In [48]:
nb.compile(train.short_description.values, train.category.values)

Preprocessing text:


100%|██████████| 167621/167621 [00:28<00:00, 5808.87it/s]


Oversampling the classe ARTS & CULTURE:


100%|██████████| 25343/25343 [00:00<00:00, 39848.30it/s]


Oversampling the classe BUSINESS & FINANCES:


100%|██████████| 22283/22283 [00:00<00:00, 39720.27it/s]


Oversampling the classe COMEDY:


100%|██████████| 24161/24161 [00:00<00:00, 40200.27it/s]


Oversampling the classe CRIME:


100%|██████████| 25631/25631 [00:00<00:00, 40363.43it/s]


Oversampling the classe DIVORCE:


100%|██████████| 25740/25740 [00:00<00:00, 39845.13it/s]


Oversampling the classe EDUCATION:


100%|██████████| 26755/26755 [00:00<00:00, 39521.75it/s]


Oversampling the classe ENTERTAINMENT:


100%|██████████| 14592/14592 [00:00<00:00, 39330.95it/s]


Oversampling the classe ENVIRONMENT:


100%|██████████| 25228/25228 [00:00<00:00, 39604.51it/s]


Oversampling the classe FOOD & DRINK:


100%|██████████| 21732/21732 [00:00<00:00, 39300.32it/s]


Oversampling the classe GROUPS VOICES:


100%|██████████| 18833/18833 [00:00<00:00, 39904.16it/s]


Oversampling the classe HOME & LIVING:


100%|██████████| 25025/25025 [00:00<00:00, 40493.92it/s]


Oversampling the classe IMPACT:


100%|██████████| 25694/25694 [00:00<00:00, 40272.88it/s]


Oversampling the classe MEDIA:


100%|██████████| 26126/26126 [00:00<00:00, 38764.84it/s]


Oversampling the classe MISCELLANEOUS:


100%|██████████| 26242/26242 [00:00<00:00, 39515.56it/s]


Oversampling the classe PARENTING:


100%|██████████| 18284/18284 [00:00<00:00, 37311.85it/s]


Oversampling the classe POLITICS:


0it [00:00, ?it/s]


Oversampling the classe RELIGION:


100%|██████████| 26419/26419 [00:00<00:00, 40026.47it/s]


Oversampling the classe SCIENCE & TECH:


100%|██████████| 25033/25033 [00:00<00:00, 39610.02it/s]


Oversampling the classe SPORTS:


100%|██████████| 24419/24419 [00:00<00:00, 40096.69it/s]


Oversampling the classe STYLE & BEAUTY:


100%|██████████| 18827/18827 [00:00<00:00, 39060.18it/s]


Oversampling the classe TRAVEL:


100%|██████████| 20561/20561 [00:00<00:00, 37451.73it/s]


Oversampling the classe U.S. NEWS:


100%|██████████| 27379/27379 [00:00<00:00, 40800.35it/s]


Oversampling the classe WEDDINGS:


100%|██████████| 25559/25559 [00:00<00:00, 39751.02it/s]


Oversampling the classe WEIRD NEWS:


100%|██████████| 26259/26259 [00:00<00:00, 41026.56it/s]


Oversampling the classe WELLNESS:


100%|██████████| 8770/8770 [00:00<00:00, 37799.41it/s]


Oversampling the classe WOMEN:


100%|██████████| 25623/25623 [00:00<00:00, 40101.35it/s]


Oversampling the classe WORLD NEWS:


100%|██████████| 20848/20848 [00:00<00:00, 39408.43it/s]


Creating bags of words...


768987it [00:10, 73912.56it/s]


In [49]:
nb.train()

Extracting vocab:


100%|██████████| 294030/294030 [00:00<00:00, 1872406.54it/s]
100%|██████████| 331379/331379 [00:00<00:00, 1416527.91it/s]
100%|██████████| 274627/274627 [00:00<00:00, 1352625.85it/s]
100%|██████████| 274565/274565 [00:00<00:00, 1605726.07it/s]
100%|██████████| 299689/299689 [00:00<00:00, 871164.55it/s] 
100%|██████████| 289995/289995 [00:00<00:00, 1812224.74it/s]
100%|██████████| 273364/273364 [00:00<00:00, 1697938.05it/s]
100%|██████████| 297555/297555 [00:00<00:00, 1377416.90it/s]
100%|██████████| 298647/298647 [00:00<00:00, 1507993.50it/s]
100%|██████████| 318060/318060 [00:00<00:00, 1309101.94it/s]
100%|██████████| 288098/288098 [00:00<00:00, 789859.56it/s]
100%|██████████| 305496/305496 [00:00<00:00, 1558680.71it/s]
100%|██████████| 275773/275773 [00:00<00:00, 1641191.69it/s]
100%|██████████| 291912/291912 [00:00<00:00, 1586500.74it/s]
100%|██████████| 382683/382683 [00:00<00:00, 1530978.75it/s]
100%|██████████| 325936/325936 [00:00<00:00, 1490162.05it/s]
100%|██████████| 280257/2

7107966
Calculating classes denominators for probabilites:


100%|██████████| 27/27 [00:00<?, ?it/s]


In [54]:
def test_model(X,Y):
    true_positive = 0
    for lb, text in tqdm(zip(Y, X), total=len(Y)):
        predict = nb.classes[nb.predict(text).argmax()]
        if predict == lb:
            true_positive += 1

    return float(true_positive) / float(len(Y))

We achieve better accuracy when we do not remove the stop words. The accuracy is at its best when ngram=3.
The accuracy is still pretty low due to an umbalanced dataset...

In [55]:
accuracy = test_model(test.short_description.values, test.category.values)
print(f"Model accuracy is : {accuracy} %")

100%|██████████| 41906/41906 [00:39<00:00, 1064.19it/s]

Model accuracy is : 0.2937765475110963 %





In [52]:
nb.predict("FIFA has come under pressure from several European soccer federations who want to support a human rights campaign against discrimination at the World Cup.")

array([-346.57262498, -344.07186453, -348.42805305, -350.3331388 ,
       -346.3997935 , -350.68664783, -348.35660948, -346.78740049,
       -350.63540046, -337.94696869, -349.81514548, -345.5925091 ,
       -351.72785539, -345.67120572, -346.472421  , -337.15574614,
       -350.37001604, -348.40851679, -343.51189011, -348.60718015,
       -345.48115296, -350.65537456, -350.20389568, -350.99945142,
       -348.02089534, -345.00590415, -341.41783709])

In [53]:
print(df.short_description[10],df.category[10])

FIFA has come under pressure from several European soccer federations who want to support a human rights campaign against discrimination at the World Cup. WORLD NEWS


In [31]:
df.head()

Unnamed: 0,category,short_description
0,U.S. NEWS,Health experts said it is too early to predict...
1,U.S. NEWS,He was subdued by passengers and crew when he ...
2,COMEDY,"""Until you have a dog you don't understand wha..."
3,PARENTING,"""Accidentally put grown-up toothpaste on my to..."
4,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...


In [57]:
print(nb.X[300000], nb.Y[300000])

spotlight this week togeth to make import part of to think in to the day out of mess  EDUCATION
