In [1]:
import joblib
import nltk
import numpy as np
import pandas as pd
import pickle
import sklearn
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow import keras

In [2]:
# File paths

# Multinomial Naive Bayes model file path
MODEL_DIR = "multi_mnb_model.joblib"

# Balanced datasets
BALANCED_TRAIN_DATASET = "../balanced_dataset.pickle"
BALANCED_TEST_DATASET = "../balanced_test_dataset.pickle"

# Preprocessed balanced data
PREPROCESSED_BAL_TRAIN_DATASET = "../preprocessed_train.pickle"
PREPROCESSED_BAL_TEST_DATASET = "../preprocessed_test.pickle"

In [3]:
# Function to load pickle file
# Params:
    # Str - @file_path: File path of pickle file
# Output:
    # Saved object in original file type (list/dataframe)
def load_pickle(file_path):
    return pickle.load(open(file_path, "rb"))

In [4]:
# Get preprocessed train dataset
bal_train_dataset = load_pickle(PREPROCESSED_BAL_TRAIN_DATASET)

# Get preprocessed test dataset
bal_test_dataset = load_pickle(PREPROCESSED_BAL_TEST_DATASET)

# Get train_y
bal_train_y = pd.read_pickle(BALANCED_TRAIN_DATASET)
bal_train_y = bal_train_y.drop(columns="comment_text")

# Get test_y
bal_test_y = pd.read_pickle(BALANCED_TEST_DATASET)
bal_test_y = bal_test_y.drop(columns="comment_text")

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

In [6]:
from functools import lru_cache
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [21]:
# Dummy function for TfidfVectorizer tokenizer
def fake_function(comments):
    return comments

# Pre-processing functions


# Function to clean comments in dataset
# Params: 
#   Pandas dataframe - @dataset: Data to be cleaned
# Output: 
#   List    - @comment_list: Cleaned comments (2D List)
def clean_data(dataset):

    # Remove punctuation
    regex_str = "[^a-zA-Z\s]"
    dataset['comment_text'] = dataset['comment_text'].replace(regex=regex_str, value="")

    # Remove extra whitespaces
    regex_space = "\s+"
    dataset['comment_text'] = dataset['comment_text'].replace(regex=regex_space, value=" ")

    # Strip whitespaces
    dataset['comment_text'] = dataset['comment_text'].str.strip()

    # Lowercase
    dataset['comment_text'] = dataset['comment_text'].str.lower()

    # Convert comment_text column into a list
    comment_list = dataset['comment_text'].tolist()

    return comment_list

# Function to get NLTK POS Tagger
# Params: 
#   Str - @word: Token
# Output
#   Dict - POS tagger
def nltk_get_wordnet_pos(word):
    
    tag = nltk.pos_tag([word])[0][1][0].upper()

    # Convert NLTK to wordnet POS notations

    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN) # Default to noun if not found

# Function to use NLTK lemmatizer
# Params: 2D List - Tokenized comments with stopwords removed
# Returns: 2D List - lemmatized tokens
def nltk_lemmatize(comment_stop):

    nltk.download('averaged_perceptron_tagger')
    comment_lemma = []
    lemmatizer = WordNetLemmatizer()
    lemmatizer_cache = lru_cache(maxsize=50000)(lemmatizer.lemmatize)

    for comment in comment_stop:
        temp = []
        temp.append([lemmatizer_cache(word, pos=nltk_get_wordnet_pos(word)) for word in comment])
        comment_lemma += temp

    return comment_lemma

# Function to remove NLTK stopwords
# Params: 
#   2D List - @comment_token:   cleaned & tokenized comments
# Output:
#   2D List - @comment_stop: cleaned tokens with stopwords removed
def nltk_stopwords(comment_token):
    # Stopwords in English only
    STOP_WORDS = set(stopwords.words('english'))

    # Remove stopwords
    comment_stop = []

    for comment in comment_token:
        
        temp_word = []

        for word in comment:
            
            if word not in STOP_WORDS:
                temp_word.append(word)

        comment_stop.append(temp_word)

    return comment_stop

# Function to tokenize comments using NLTK Word Tokenize
# Params: 
#   2D List - @text: cleaned comments
# Output: 
#   2D List - tokenized comments
def nltk_tokenize(text):
    return [word_tokenize(word) for word in text]

# Function for all pre-processing functions without saving as pickle file
# Params:
#   List  - @dataset: Dataset to be pre-processed (train/test)
# Output:
#   List - @comments_list: Preprocessed tokens (2D List)
def preprocess_data_without_pickle(dataset):

    # Prevent re-running on already preprocessed data
    if isinstance(dataset, pd.DataFrame): #if dataframe, data isn't preprocessed

        comments_list, row_count = clean_data(dataset)
        
        # NLTK Tokenize
        comments_list = nltk_tokenize(comments_list)

        # Remove NLTK stopwords
        comments_list = nltk_stopwords(comments_list)

        # NLTK Lemmatization
        comments_list = nltk_lemmatize(comments_list)
        
        return comments_list
    
    else:
        return dataset

In [36]:
pipe = Pipeline([ 
    ('tfidf', TfidfVectorizer(
        analyzer='word', 
        tokenizer=fake_function, 
        preprocessor=preprocess_data_without_pickle, 
        token_pattern=None,
        min_df=5, 
        norm='l2', 
        smooth_idf=True, 
        sublinear_tf=True)), 
    ('multi_mnb', MultiOutputClassifier(MultinomialNB(), n_jobs=-1))
    ])

pipe.fit(bal_train_dataset, bal_train_y)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(min_df=5,
                                 preprocessor=<function preprocess_data_without_pickle at 0x000001EA6AA70E50>,
                                 sublinear_tf=True, token_pattern=None,
                                 tokenizer=<function fake_function at 0x000001EA622E3790>)),
                ('multi_mnb',
                 MultiOutputClassifier(estimator=MultinomialNB(), n_jobs=-1))])

In [37]:
joblib.dump(pipe, 'multi_mnb_model.joblib', compress=1)

['multi_mnb_model.joblib']

In [7]:
pipe = joblib.load(MODEL_DIR)

In [39]:
cols = ['comment_text','toxic','severe_toxic','obscene','threat','insult','identity_hate']

In [40]:
# convert to df to feed into pipeline
def convert_for_pred(comment):

    temp_df = pd.DataFrame(columns=cols)

    new_row = {'comment_text':comment}

    for i in range(len(labels)):
        new_row[labels[i]] = 0

    temp_df = temp_df.append(new_row, ignore_index=True)

    return temp_df

In [29]:
df = pd.DataFrame(columns=['comment_text','toxic','severe_toxic','obscene','threat','insult','identity_hate'])
labels = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

In [41]:
test = "sad fuck two a b"

comment = convert_for_pred(test)
print(comment)
print("a", preprocess_data_without_pickle(comment))

       comment_text toxic severe_toxic obscene threat insult identity_hate
0  sad fuck two a b     0            0       0      0      0             0
a [['sad', 'fuck', 'two', 'b']]
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lamxw\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [26]:
print(comment)

       comment_text toxic severe_toxic obscene threat insult identity_hate
0  sad fuck two a b     0            0       0      0      0             0


In [28]:
print(pipe.predict(comment['comment_text']).tolist())

[[1, 0, 1, 0, 1, 0]]


In [38]:
prediction = pipe.predict(comment['comment_text']).tolist()
print(prediction)

[[1, 0, 1, 0, 1, 0]]


In [33]:
print(prediction[0])

[1, 0, 1, 0, 1, 0]


In [35]:
new_row = {'comment_text':comment}

for i in range(len(labels)):
    new_row[labels[i]] = prediction[0][i]

    #append row to the dataframe
df = df.append(new_row, ignore_index=True)
print(df)

                                        comment_text toxic severe_toxic  \
0         comment_text toxic severe_toxic obscene...     1            0   

  obscene threat insult identity_hate  
0       1      0      1             0  


In [25]:
y_pred = pipe.predict(bal_test_dataset[:2])
print(y_pred.tolist())

[[1, 1, 1, 0, 1, 0], [1, 0, 1, 0, 1, 1]]
