In [1]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np
import pandas as pd
import string
import re
import nltk
import spacy

In [2]:
def load_csv_dataset(path):
    """Function to load a dataset from a csv file

    Args:
        path (str): relative path to the csv file

    Returns:
        pd.DataFrame: the dataframe load
    """
    return pd.read_csv(path)

In [3]:
df = load_csv_dataset("train_40k.csv")

In [4]:
df.head()

Unnamed: 0,productId,Title,userId,Helpfulness,Score,Time,Text,Cat1,Cat2,Cat3
0,B000E46LYG,Golden Valley Natural Buffalo Jerky,A3MQDNGHDJU4MK,0/0,3.0,-1,The description and photo on this product need...,grocery gourmet food,meat poultry,jerky
1,B000GRA6N8,Westing Game,unknown,0/0,5.0,860630400,This was a great book!!!! It is well thought t...,toys games,games,unknown
2,B000GRA6N8,Westing Game,unknown,0/0,5.0,883008000,"I am a first year teacher, teaching 5th grade....",toys games,games,unknown
3,B000GRA6N8,Westing Game,unknown,0/0,5.0,897696000,I got the book at my bookfair at school lookin...,toys games,games,unknown
4,B00000DMDQ,I SPY A is For Jigsaw Puzzle 63pc,unknown,2/4,5.0,911865600,Hi! I'm Martine Redman and I created this puzz...,toys games,puzzles,jigsaw puzzles


In [5]:
df = df.drop("productId", axis=1)
df = df.drop("Title", axis=1)
df = df.drop("userId", axis=1)
df = df.drop("Helpfulness", axis=1)
df = df.drop("Score", axis=1)
df = df.drop("Time", axis=1)
df = df.drop("Cat2", axis=1)
df = df.drop("Cat3", axis=1)
df = df.rename(columns={"Text": "description", "Cat1": "label"})

In [6]:
df.head()

Unnamed: 0,description,label
0,The description and photo on this product need...,grocery gourmet food
1,This was a great book!!!! It is well thought t...,toys games
2,"I am a first year teacher, teaching 5th grade....",toys games
3,I got the book at my bookfair at school lookin...,toys games
4,Hi! I'm Martine Redman and I created this puzz...,toys games


In [7]:
lemmatizer = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
stemmer = nltk.SnowballStemmer("english")
remove_symbols = re.compile('[-+/(){}\[\]\|@,;]')
remove_numbers = re.compile('[0-9] {,1}')
PUNCTUATION = string.punctuation
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE)

In [8]:
def lemmatize_sentence(sentence):
    """Function to lemmatize a sentence

    Args:
        sentence (str): the string to lemmatize

    Returns:
        str: the lemmatized string
    """
    doc = lemmatizer(sentence)
    return " ".join([token.lemma_ for token in doc])

def text_preprocess(sentence):
    """Function to preprocess a sentence to remove punctuation, emoji, symbols and to lemmatize

    Args:
        sentence (str): sentence to be preprocess

    Returns:
        str: the new sentence
    """
    if isinstance(sentence, str):
        sentence = sentence.lower() ## Make the text lower case
        sentence = sentence.translate(str.maketrans('', '', PUNCTUATION)) ## Remove the punctuation
        sentence = emoji_pattern.sub(' ', sentence)
        sentence = remove_symbols.sub(' ', sentence)
        sentence = remove_numbers.sub(' ', sentence)
        sentence = lemmatize_sentence(sentence)
        return sentence
    Exception("sentence need to be a string.")
    

In [9]:
tqdm.pandas() ## To display a progress bar
df.description = df.description.progress_apply(lambda text : text_preprocess(text))

100%|██████████| 40000/40000 [04:18<00:00, 154.63it/s]


In [10]:
train, test = train_test_split(df, test_size=0.2, stratify=df.label)

In [11]:
def tokenize(sentence):
    return [word for word in sentence.split(" ") if word != ""]

In [118]:
class NaiveBayes:
    """Naive Bayes classe to implement naive bayes algorithm with nGram
    """
    def __init__(self, classes):
        """
        Args:
            classes (np.array): classes of the dataset
        """
        self.classes = np.unique(classes)
        self.nb_classes = len(classes)
        self.istrain = False
    
    def get_classes_occ(self, Y):
        self.classes_occ = dict()
        for y in Y:
            if y not in self.classes_occ:
                self.classes_occ[y] = 0
            self.classes_occ[y] += 1
    
    def get_word_occ(self, X, Y):
        self.word_occ_classes = dict()
        for cl in self.classes:
            self.word_occ_classes[cl] = dict()
        for x, y in tqdm(zip(X,Y), total=len(X)):
            for word in tokenize(x):
                if word not in self.word_occ_classes[y]:
                    self.word_occ_classes[y][word] = 0
                self.word_occ_classes[y][word] += 1
                
    def get_tf(self, word, label, len):
        N = len
        occ = self.word_classes_occ[label][word]
        return occ/N

    def get_idf(self, word):
        try:
            occ = self.words_occ[word] + 1
        except:
            occ = 1
        return np.log(float(len(self.X)) / float(self.word_occ[word]))
    
    def get_tf_idf(self, corpus, label):
        
        def get_class_words(label):
            words = 0
            for key, val in self.word_classes_occ[label].items():
                words += val
            return words
        
        def get_word_occ_data(word):
            words = 0
            for key, val in self.word_classes_occ.items():
                if word in val:
                    words += val[word]
            return words
        
        vec = np.zeros((len(self.vocab),))
        tokens = tokenize(corpus)
        tokens_len = len(tokens)
        print(f"tf idf of corpus {label}")
        for word in tqdm(np.unique(tokens)):
            tf = self.word_classes_occ[label][word] / get_class_words(label)
            idf = np.log(len(self.X) / get_word_occ_data(word))
            vec[self.word_index[word]] = tf * idf
        return vec
    
    def get_classes_proba(self, Y, classes_occ):
        self.classes_proba = dict()
        for cl, val in classes_occ.items():
            self.classes_proba[cl] = np.log(float(val) / float(len(Y)))
        
    def create_corpus(self, X, Y):
        self.corpus = dict()
        for x, y in zip(X, Y):
            if y not in self.corpus:
                self.corpus[y] = []
            self.corpus[y].append(x)
        for cl, arr in self.corpus.items():
            self.corpus[cl] = " ".join(self.corpus[cl])
            
    def train(self, X, Y):
        """Function to create the tf-idf for naive bayes algo

        Args:
            X (np.array): the text to process
            Y (np.array): the label for each text
        """
        if len(X) != len(Y):
            Exception("X and Y need to have the same length.")
        self.X = X #Store the dataset
        self.Y = Y #Store the dataset
        self.create_corpus(self.X, self.Y)
        self.get_classes_occ(self.Y)
        self.get_classes_proba(self.Y, self.classes_occ)
        
        self.vocab = dict()
        self.word_index = dict()
        self.word_occ = dict()
        self.word_classes_occ = dict()
        index = 0
        for label in self.classes:
            self.word_classes_occ[label] = dict()
        for sentence, label in tqdm(zip(self.X, self.Y), total=len(self.X)): ### Calculate vocab size
            for word in tokenize(sentence):
                if word not in self.vocab: ## Update the vocab
                    self.word_occ[word] = 0
                    self.vocab[word] = 1
                    self.word_index[word] = index
                    index += 1 
                if word not in self.word_classes_occ[label]:
                    self.word_classes_occ[label][word] = 0
                    
                self.word_classes_occ[label][word] += 1
                self.word_occ[word] += 1                   

        self.word_tf_idf_classes = dict()
        self.classes_tf_idf_total = dict()
        for y in self.classes:
            self.word_tf_idf_classes[y] = dict()
            self.classes_tf_idf_total[y] = 0
            vector = self.get_tf_idf(self.corpus[y],y)
            for i, value in tqdm(enumerate(vector), total=len(vector)):
                self.classes_tf_idf_total[y] += value
                self.word_tf_idf_classes[y][i] = value
                
        self.denominators = np.zeros((len(self.classes),))
        for i, cl in enumerate(self.classes):
            self.denominators[i] = self.classes_tf_idf_total[cl] + len(self.vocab)
        self.istrain = True         
    
    def predict(self, text):
        """Function to get the probabilities of each classes for a given sentence

        Args:
            text (str): a preprocess sentence to evaluate the classe
        
        Return:
            (np.array): an array containing the proba of each classes for the given sentence.
            The proba are given in log space.
        
        """
        if self.istrain != True:
            Exception("Model is not train")
        likelihood_prob = np.zeros(self.classes.shape[0]) ## Initialize proba at 0 for each class
        for i, y in enumerate(self.classes):
            for token in tokenize(text):
                """Calculate the proba for each token in the sentence.
                The token need to be in the vocab else it is ignore
                """
                if token in self.vocab: ### We ignore the word if not in the vocab
                    token_index = self.word_index[token]
                    token_tf_idf = 0
                    if token_index in self.word_tf_idf_classes[y]:
                        token_tf_idf = self.word_tf_idf_classes[y][token_index]
                    token_tf_idf += 1 ### Laplace
                    token_prob = float(token_tf_idf)/float(self.denominators[i]) ### Final proba of the token
                    likelihood_prob[i] += token_prob ### Calculating somme of proba of each token
        return likelihood_prob 

In [119]:
nb = NaiveBayes(df.label.values)

In [120]:
nb.train(train.description.values, train.label.values)

100%|██████████| 32000/32000 [00:01<00:00, 16039.79it/s]


tf idf of corpus baby products


100%|██████████| 12566/12566 [00:09<00:00, 1265.20it/s]
100%|██████████| 55537/55537 [00:00<00:00, 1790048.88it/s]


tf idf of corpus beauty


100%|██████████| 13446/13446 [00:11<00:00, 1218.93it/s]
100%|██████████| 55537/55537 [00:00<00:00, 1633536.66it/s]


tf idf of corpus grocery gourmet food


100%|██████████| 10411/10411 [00:06<00:00, 1662.30it/s]
100%|██████████| 55537/55537 [00:00<00:00, 1851735.45it/s]


tf idf of corpus health personal care


100%|██████████| 21579/21579 [00:32<00:00, 658.06it/s]
100%|██████████| 55537/55537 [00:00<00:00, 1981330.32it/s]


tf idf of corpus pet supplies


100%|██████████| 12861/12861 [00:09<00:00, 1379.52it/s]
100%|██████████| 55537/55537 [00:00<00:00, 1635107.58it/s]


tf idf of corpus toys games


100%|██████████| 21869/21869 [00:30<00:00, 713.25it/s]
100%|██████████| 55537/55537 [00:00<00:00, 1852147.71it/s]


In [124]:
def test_model(model, test):
    """Function to test our model performance

    Args:
        model (NaiveBayes): a Naive Bayes model
        test (pd.Dataframe): the test dataframe

    Returns:
        int: the accuracy of the model
    """
    success = 0
    for x_test, y_test in tqdm(zip(test.description.values, test.label.values), total=len(test.label)):
        if model.classes[model.predict(x_test).argmax()] == y_test:
            success += 1
    return (float(success) / len(test.label.values)) * 100.0

In [125]:
test_model(nb, test)

100%|██████████| 8000/8000 [00:04<00:00, 1992.02it/s]


34.5875