In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from tqdm import tqdm

import re
import nltk

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords

import spacy

import string
PUNCTUATION = string.punctuation

from collections import Counter
import math

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\33610\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\33610\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\33610\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
JSON_FILE_PATH = ""
JSON_FILE_NAME = "News_Category_Dataset_v3_balanced.json"

In [4]:
def extractJsonData(jsonData):
  return pd.read_json(jsonData, lines=True);

jsonFile = open(JSON_FILE_PATH + JSON_FILE_NAME);
df = extractJsonData(jsonFile);

In [16]:
df = df.drop('link', axis=1)
df = df.drop('authors', axis=1)
df = df.drop('date', axis=1)
df = df.drop('headline', axis=1)

In [111]:
class TextPreProcessor:
  def __init__(self, language="english", lemmatization = False, stopword = False, stemmatization = False, lower = False, ponct = False, emoji = False, symbols = False, numbers = False):
    if (lemmatization == True & stemmatization == True):
      raise Exception("Can not lemmatize and stem sentences at the same time.")

    self.lemmatization = lemmatization
    self.stemmatization = stemmatization
    self.lower = lower
    self.ponct = ponct
    self.emoji = emoji
    self.stopword = stopword
    self.symbols = symbols
    self.numbers = numbers
    self.lemmatizer = spacy.load('en_core_web_sm')
    self.stemmer = nltk.SnowballStemmer("english")
    self.REPLACE_BY_SPACE_RE = re.compile('[-+/(){}\[\]\|@,;]')
    self.BAD_SYMBOLS_RE = re.compile('[0-9] {,1}')
    self.STOPWORDS = set(stopwords.words('english'))

  def cleanText(self, text):
    if text == "":
      return ""

    def lower_case(text):
      return text.lower()

    def remove_punctuation(text):
      return text.translate(str.maketrans('', '', PUNCTUATION))

    def remove_symbols(dataframe):
      return self.REPLACE_BY_SPACE_RE.sub(' ', text)

    def remove_numbers(text):
      return self.BAD_SYMBOLS_RE.sub(' ', text)

    def remove_emoji(string):
      emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE)
      return emoji_pattern.sub(' ', string)

    def remove_stopwords(text):
      return " ".join([word for word in str(text).split() if word not in self.STOPWORDS])

    def lemmatize(text):
      tokens = []
      for token in self.lemmatizer(text):
        tokens.append(token.lemma_)
      return " ".join(tokens)

    def stemmatize(text):
      tokens = []
      for token in text.split(" "):
        tokens.append(self.stemmer.stem(token))
      return " ".join(tokens)
        
    if(self.lower == True):
      text = lower_case(text)
    if(self.numbers == True):
      text = remove_numbers(text)
    if(self.ponct == True):
      text = remove_punctuation(text)
    if(self.symbols == True):
      text = remove_symbols(text)
    if(self.emoji == True):
      text = remove_emoji(text)
    if(self.stopword == True):
      text = remove_stopwords(text)
    if(self.lemmatization == True):
      text = lemmatize(text)
    if(self.stemmatization == True):
      text = stemmatize(text)
    
    return text

In [112]:
class NGram:
    def __init__(self, n=1):
        self.n = n
        self.lemm = spacy.load('en_core_web_sm')

    def ngram(self, text):
        words = []
        for word in text.split():
            words.append(word)
        temp = zip(*[words[i:] for i in range(0, self.n)])
        ans = [' '.join(n) for n in temp]
        return ans

In [113]:
class Nb_ngram:
    def __init__(self, textPreProcessor = TextPreProcessor(), ngram=1, preProcess = False):
        self.nbClass = 0
        self.isCompile = False
        self.isTrain = False
        self.BoT = dict()
        self.classesProb = []
        self.nGram = NGram(ngram)
        self.preProcess = preProcess
        self.textPreProcessor = textPreProcessor
        
    def compile(self, X, Y):
        self.X = X
        self.Y = Y
        self.classes = np.unique(Y)
        self.nbClass = len(self.classes)
        
        if self.preProcess == True:
            print("Preprocessing text:")
            for i, sen in enumerate(tqdm(self.X)):
                self.X[i] = self.textPreProcessor.cleanText(sen)

        def create_bag_of_word(X, Y):
            bags_of_ngram = dict();
            for i in self.classes:
                bags_of_ngram[i] = dict()
            print("Creating bags of words...")
            for lab, sen in tqdm(zip(Y, X)):
                ngram_sentence = self.nGram.ngram(sen)
                for t in ngram_sentence:
                    if t not in bags_of_ngram[lab]:
                        bags_of_ngram[lab][t] = 0
                    bags_of_ngram[lab][t] += 1
            return bags_of_ngram

        self.BoT = create_bag_of_word(self.X, self.Y)

    def get_classes_probabilites_log(self, Y):
        def get_classes_occurences(Y):
            classes = dict()
            for cl in Y:
                if cl not in classes:
                    classes[cl] = 0
                classes[cl] += 1
            return classes
        
        def get_classes_proba_log(classes, nb_samples):
            classes_occ = dict()
            for cl, occ in classes.items():
                classes_occ[cl] = math.log(float(occ) / float(nb_samples))
            return classes_occ
        
        self.classes_proba = get_classes_proba_log( classes = get_classes_occurences(Y), nb_samples = len(Y) )

    def train(self):
        self.words_by_classes = dict();
        self.vocab = []
        print("Extracting vocab:")
        for cl, dic in tqdm(self.BoT.items()):
            if cl not in self.words_by_classes:
                self.words_by_classes[cl] = 0
            for tok, val in dic.items():
                self.words_by_classes[cl] += val
                self.vocab.append(tok)

        self.vocab = np.unique(self.vocab)
        self.vocab_len = len(self.vocab)

        self.get_classes_probabilites_log(self.Y)
        
        self.denominators = dict()
        print("Calculating classes denominators for probabilites:")
        for cl in tqdm(self.classes):
            self.denominators[cl]  = self.words_by_classes[cl] + self.vocab_len
        
        self.Y_info = [(self.BoT[cl], self.classes_proba[cl], self.denominators[cl]) for cl in self.classes] 
        self.Y_info = np.array(self.Y_info) 

    def predict(self, text):
        likelihood_prob = np.zeros(self.classes.shape[0])
        if self.preProcess == True:
            test_str = self.textPreProcessor.cleanText(text)
        for cl_i, cl in enumerate(self.classes):                 
            for tok in self.nGram.ngram(text):                        
                tok_counts = self.Y_info[cl_i][0].get(tok, 0) + 1 # We add 1 due to the formula to not get 0 probabilities                        
                tok_prob = tok_counts/float(self.Y_info[cl_i][2])                              
                likelihood_prob[cl_i] += math.log(tok_prob)
                                                
        post_prob = np.empty(self.classes.shape[0])
        for cl_i, cl in enumerate(self.classes):
            post_prob[cl_i] = likelihood_prob[cl_i] + self.Y_info[cl_i][1]                              
        
        return post_prob
        

In [117]:
textCleaner = TextPreProcessor( 
    lemmatization=False,
    lower=True,
    stopword = False, 
    stemmatization = True, 
    ponct = True, 
    emoji = True, 
    symbols = True, 
    numbers = True)
nb = Nb_ngram(ngram=3, textPreProcessor=textCleaner, preProcess=True)

In [118]:
train, test = train_test_split(
    df,
    test_size=0.2,
    shuffle=True, 
    stratify=df.category)

In [119]:
nb.compile(train.short_description.values, train.category.values)

Preprocessing text:


100%|██████████| 167621/167621 [00:31<00:00, 5371.42it/s]


Creating bags of words...


167621it [00:02, 63978.71it/s]


In [120]:
nb.BoT

{'ARTS & CULTURE': {'ive been reread': 1,
  'been reread huckleberri': 1,
  'reread huckleberri finn': 1,
  'huckleberri finn sinc': 1,
  'finn sinc i': 1,
  'sinc i discov': 1,
  'i discov it': 1,
  'discov it as': 1,
  'it as the': 2,
  'as the sequel': 1,
  'the sequel to': 1,
  'sequel to tom': 1,
  'to tom sawyer': 1,
  'tom sawyer back': 1,
  'sawyer back in': 1,
  'back in junior': 1,
  'in junior high': 1,
  'junior high school': 1,
  'high school get': 1,
  'school get more': 1,
  'get more out': 1,
  'more out of': 1,
  'out of it': 2,
  'of it with': 1,
  'it with each': 1,
  'with each encount': 1,
  'each encount hear': 1,
  'encount hear hal': 1,
  'hear hal holbrook': 1,
  'hal holbrook last': 1,
  'holbrook last friday': 1,
  'last friday inspir': 1,
  'friday inspir me': 1,
  'inspir me to': 1,
  'me to share': 1,
  'to share a': 1,
  'share a few': 1,
  'a few keeper': 1,
  'how do you': 5,
  'do you draw': 1,
  'you draw the': 1,
  'draw the line': 1,
  'the line bet

In [121]:
nb.train()

Extracting vocab:


100%|██████████| 27/27 [00:00<00:00, 54.00it/s]


Calculating classes denominators for probabilites:


100%|██████████| 27/27 [00:00<?, ?it/s]


In [122]:
def test_model(X,Y):
    true_possitive = 0
    for lb, text in tqdm(zip(Y, X), total=len(Y)):
        predict = nb.classes[nb.predict(text).argmax()]
        if predict == lb:
            true_possitive += 1

    return float(true_possitive) / float(len(Y))

We achieve better accuracy when we do not remove the stop words. The accuracy is at its best when ngram=3.
The accuracy is still pretty low due to an umbalanced dataset...

In [123]:
accuracy = test_model(test.short_description.values, test.category.values)
print(f"Model accuracy is : {accuracy} %")

100%|██████████| 41906/41906 [00:41<00:00, 1021.65it/s]

Model accuracy is : 0.18049921252326637 %



