In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from tqdm import tqdm
import numpy as np
import pandas as pd
import string
import re
import nltk
import spacy

In [2]:
def load_csv_dataset(path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        path (str): location of the csv file, handed unchanged to
            ``pd.read_csv``

    Returns:
        pd.DataFrame: the parsed contents of the file
    """
    dataset = pd.read_csv(path)
    return dataset

In [3]:
# Input CSV, given relative to the notebook's working directory.
TRAIN_CSV_PATH = "train_40k.csv"
df = load_csv_dataset(TRAIN_CSV_PATH)

In [4]:
# Drop the columns that are not used by the cells below, then give the two
# remaining ones ("Text", "Cat1") the names the rest of the notebook expects.
unused_columns = [
    "productId", "Title", "userId", "Helpfulness",
    "Score", "Time", "Cat2", "Cat3",
]
df = (
    df.drop(columns=unused_columns)
      .rename(columns={"Text": "description", "Cat1": "label"})
)

In [5]:
# English Snowball stemmer from NLTK.
stemmer = nltk.SnowballStemmer("english")
# spaCy pipeline used for lemmatization; the parser and NER components are
# disabled at load time.
lemmatizer = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"],
)
# Symbols that get replaced by a space. Written as a raw string so that \[, \]
# and \| reach the regex engine as-is instead of being invalid Python string
# escapes (a SyntaxWarning on recent Python versions). The compiled pattern is
# unchanged.
remove_symbols = re.compile(r'[-+/(){}\[\]\|@,;]')
# One digit together with at most one space immediately after it.
remove_numbers = re.compile(r'[0-9] {,1}')
# ASCII punctuation characters, used to build a str.translate deletion table.
PUNCTUATION = string.punctuation
# One or more consecutive emoji / pictograph code points.
# NOTE(review): the last range (U+24C2 to U+1F251) is very wide and also covers
# many non-emoji characters (e.g. CJK text). Harmless for English-only input,
# but confirm before reusing this pattern on multilingual data.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)

In [6]:
def lemmatize_sentence(sentence):
    """Replace every token of a sentence by its lemma.

    The sentence is run through the module-level ``lemmatizer`` pipeline and
    the lemmas of the resulting tokens are joined back with single spaces.

    Args:
        sentence (str): the text to lemmatize

    Returns:
        str: the space-separated lemmas
    """
    lemmas = []
    for token in lemmatizer(sentence):
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

def text_preprocess(sentence):
    """Normalize a sentence: lower-case it, strip punctuation, emoji, symbols
    and digits, then lemmatize the result.

    Args:
        sentence (str): sentence to preprocess

    Returns:
        str: the cleaned, lemmatized sentence

    Raises:
        TypeError: if ``sentence`` is not a string
    """
    if not isinstance(sentence, str):
        # Fail loudly: silently returning None for bad input would only
        # surface later as an unrelated error further down the pipeline.
        raise TypeError("sentence need to be a string.")

    sentence = sentence.lower()  # make the text lower case
    # Punctuation is deleted outright (not replaced by a space), so tokens
    # joined only by punctuation are merged: "a,b" becomes "ab".
    sentence = sentence.translate(str.maketrans('', '', PUNCTUATION))
    sentence = emoji_pattern.sub(' ', sentence)
    # NOTE(review): if PUNCTUATION is string.punctuation, every character in
    # remove_symbols has already been deleted above, making this a no-op.
    sentence = remove_symbols.sub(' ', sentence)
    sentence = remove_numbers.sub(' ', sentence)
    return lemmatize_sentence(sentence)

In [7]:
# Hook tqdm into pandas so that progress_apply displays a progress bar.
tqdm.pandas()
df["description"] = df["description"].progress_apply(text_preprocess)

100%|██████████| 40000/40000 [03:51<00:00, 172.84it/s]


In [8]:
# Fixed seed so that the stratified 80/20 split, and every metric computed
# from it below, is the same on each Restart & Run All.
SPLIT_SEED = 42
train, test = train_test_split(
    df,
    test_size=0.2,
    stratify=df.label,
    random_state=SPLIT_SEED,
)

In [9]:
# Raw term counts (term frequency). The vocabulary is fitted on the training
# descriptions only and then reused to encode the test descriptions.
tf_vectorizer = CountVectorizer()

X_train_tf = tf_vectorizer.fit_transform(train["description"])
X_test_tf = tf_vectorizer.transform(test["description"])

In [10]:
# Multinomial Naive Bayes with default settings, trained on the count matrix.
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, train["label"])

In [11]:
# Predicted label for each document of the test matrix.
y_pred = naive_bayes_classifier.predict(X_test_tf)

In [14]:
# Row names for the report.
# NOTE(review): target_names are matched by position to the labels found in
# the data - confirm this order matches the sorted label values.
category_names = [
    'baby products', 'beauty', 'grocery gourmet food',
    'health personal care', 'pet supplies', 'toys games',
]
score1 = metrics.accuracy_score(test.label, y_pred)
report = metrics.classification_report(
    test.label,
    y_pred,
    target_names=category_names,
)
print(report)

                      precision    recall  f1-score   support

       baby products       0.79      0.82      0.80      1128
              beauty       0.87      0.78      0.82      1169
grocery gourmet food       0.86      0.70      0.77       723
health personal care       0.74      0.80      0.77      1955
        pet supplies       0.93      0.80      0.86       972
          toys games       0.85      0.93      0.89      2053

            accuracy                           0.82      8000
           macro avg       0.84      0.80      0.82      8000
        weighted avg       0.83      0.82      0.82      8000

