# Experiment 1
## Comparing Multiple Lemmatizations

In [None]:
import keras
import numpy as np
import pandas as pd
import pickle
import sklearn
import tensorflow as tf

In [None]:
# File paths

# Directory that holds the dataset files
DATA_DIR = "data"

# Balanced train / test datasets (pickled), located inside DATA_DIR
BALANCED_TRAIN_DATASET = f"{DATA_DIR}/balanced_dataset.pickle"
BALANCED_TEST_DATASET = f"{DATA_DIR}/balanced_test_dataset.pickle"

In [None]:
# Function to save data as a .pickle file
def save_pickle(data, folder, file_name):
    """Pickle `data` to `<folder>/<file_name>.pickle` and print a confirmation.

    Args:
        data: Object to serialize (e.g. a list or a DataFrame).
        folder: Directory the file is written into. It is not created here,
            so it must already exist.
        file_name: Name of the file, without the `.pickle` extension.
    """
    target_path = "{0}/{1}.pickle".format(folder, file_name)
    with open(target_path, 'wb') as out_file:
        pickle.dump(data, out_file)
    print(f"Saved data is stored in \'{folder}\' in the form of {file_name}.pickle")

# Function to load pickle file
def load_pickle(file_path):
    """Load and return the object stored in a pickle file.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The unpickled object, in the type it was saved as
        (e.g. list or DataFrame).
    """
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    # The context manager closes the handle even if unpickling fails.
    with open(file_path, "rb") as in_file:
        return pickle.load(in_file)

In [None]:
# Load datasets

# Balanced, unprocessed datasets (full frames, including comment_text)
bal_train_df = load_pickle(BALANCED_TRAIN_DATASET)
bal_test_df = load_pickle(BALANCED_TEST_DATASET)

# train_y / test_y: re-read each pickle and keep every column except
# comment_text, so these frames are independent of the *_df objects above.
bal_train_y = pd.read_pickle(BALANCED_TRAIN_DATASET).drop(columns="comment_text")
bal_test_y = pd.read_pickle(BALANCED_TEST_DATASET).drop(columns="comment_text")

In [None]:
# Pre-processing functions
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Function to clean comments in a dataset
def clean_data(train_dataset):
    """Normalize the `comment_text` column and return it as a list.

    Each comment has every character that is not an ASCII letter or
    whitespace removed, runs of whitespace collapsed to a single space,
    leading/trailing whitespace stripped, and is lowercased.

    Note: the cleaned text is written back into
    `train_dataset['comment_text']`, so the caller's DataFrame is modified.

    Args:
        train_dataset: pandas DataFrame with a `comment_text` column.

    Returns:
        List of cleaned comment strings, one per row.
    """
    comments = train_dataset['comment_text']

    # Raw strings: "\s" in a normal string literal is an invalid escape.
    # Drop punctuation, digits and any other non-letter characters.
    comments = comments.replace(regex=r"[^a-zA-Z\s]", value="")

    # Collapse runs of whitespace (including newlines) into one space
    comments = comments.replace(regex=r"\s+", value=" ")

    # Trim the ends, then lowercase
    comments = comments.str.strip().str.lower()

    # Preserve the original side effect of updating the caller's frame
    train_dataset['comment_text'] = comments

    return comments.tolist()

# Function to get the WordNet POS constant for a single token
def nltk_get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to `lemmatize()` for `word`.

    The word is tagged on its own with `nltk.pos_tag`, and the first letter
    of the resulting tag selects the WordNet constant.

    Args:
        word: A single token (string).

    Returns:
        One of `wordnet.ADJ`, `wordnet.NOUN`, `wordnet.VERB`, `wordnet.ADV`;
        `wordnet.NOUN` when the tag's first letter is not J, N, V or R.
    """
    # pos_tag returns [(word, tag)]; only the tag's first letter matters
    nltk_tag = nltk.pos_tag([word])[0][1]
    tag_initial = nltk_tag[0].upper()

    # Convert NLTK tag initials to WordNet POS notation
    wordnet_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if tag_initial in wordnet_by_initial:
        return wordnet_by_initial[tag_initial]
    # Default to noun if not found
    return wordnet.NOUN

# Function to use NLTK lemmatizer
def nltk_lemmatize(comment_stop):
    """Lemmatize every token of every comment with NLTK's WordNet lemmatizer.

    Args:
        comment_stop: 2D list - tokenized comments with stopwords removed.

    Returns:
        2D list with the same shape, each token replaced by its lemma.
    """
    # Imported locally because lru_cache is not imported at file level;
    # without it the original call raised NameError.
    from functools import lru_cache

    # Tagger data required by nltk.pos_tag (used in nltk_get_wordnet_pos)
    nltk.download('averaged_perceptron_tagger')
    lemmatizer = WordNetLemmatizer()

    # Cache per word. The POS tag is derived from the word alone, so caching
    # the tag lookup together with the lemmatization gives identical results
    # while skipping the tagger for repeated tokens.
    @lru_cache(maxsize=50000)
    def lemmatize_word(word):
        return lemmatizer.lemmatize(word, pos=nltk_get_wordnet_pos(word))

    return [[lemmatize_word(word) for word in comment] for comment in comment_stop]

# Function to remove NLTK stopwords
def nltk_stopwords(comment_token):
    """Drop English stopwords from each tokenized comment.

    Args:
        comment_token: 2D list - cleaned & tokenized comments.

    Returns:
        2D list with one inner list per comment, holding the tokens that are
        not in NLTK's English stopword list (order preserved).
    """
    # English stopwords only; a set gives constant-time membership tests
    english_stopwords = set(stopwords.words('english'))

    return [
        [token for token in tokens if token not in english_stopwords]
        for tokens in comment_token
    ]

# Function to tokenize comments using NLTK Word Tokenize
def nltk_tokenize(text):
    """Tokenize each comment with NLTK's `word_tokenize`.

    Args:
        text: List of cleaned comment strings.

    Returns:
        2D list - one list of tokens per comment, in input order.
    """
    return [word_tokenize(comment) for comment in text]

# Function for all pre-processing functions
def preprocess_data(dataset, file_name, folder=None):
    """Run the full pre-processing pipeline and save the result as a pickle.

    Steps: clean -> NLTK tokenize -> remove NLTK stopwords -> NLTK lemmatize.

    Args:
        dataset: pandas DataFrame to pre-process (train/test). Its
            `comment_text` column is modified by `clean_data`.
        file_name: File name (without extension) for the saved pickle.
        folder: Directory to save into. Defaults to `DATA_DIR`; the original
            code referenced an undefined `folder` name here.

    Output:
        Pickle file `<folder>/<file_name>.pickle`; nothing is returned.
    """
    # Resolve the default at call time so the signature does not depend on
    # DATA_DIR being defined before this function.
    if folder is None:
        folder = DATA_DIR

    comment_cleaned = clean_data(dataset)

    # NLTK Tokenize
    comment_token = nltk_tokenize(comment_cleaned)

    # Remove NLTK stopwords
    comment_stop = nltk_stopwords(comment_token)

    # NLTK Lemmatization
    comment_lemma = nltk_lemmatize(comment_stop)

    save_pickle(comment_lemma, folder, file_name)

In [None]:
# Prepare basic pre-processing steps until before lemmatization
# (clean -> tokenize -> remove stopwords), for both datasets.

# Train dataset
train_clean = clean_data(bal_train_df)
train_token = nltk_tokenize(train_clean)
train_stopwords = nltk_stopwords(train_token)

# Test dataset
test_clean = clean_data(bal_test_df)
test_token = nltk_tokenize(test_clean)
# Stopword removal must run on the tokenized comments; passing test_clean
# (plain strings) would filter character by character.
test_stopwords = nltk_stopwords(test_token)

## Spacy Lemmatization

In [None]:
import spacy

# Load the small version of spaCy's English language model ('en_core_web_sm')
# and bind the resulting pipeline to `nlp`.
nlp = spacy.load('en_core_web_sm')

Because spaCy tokenizes text automatically as part of its model pipeline, I have to replace its tokenizer with the tokenizer used in the rest of this experiment (NLTK), so that spaCy's own tokenization does not affect our results. To do this, I will need to adapt nltk_tokenize() so that it returns a spaCy Doc object.

In [None]:
# Custom tokenizer for spaCy that uses NLTK's word_tokenize
def nltk_tokenizer_for_spacy(text):
    """Tokenize `text` with NLTK and wrap the tokens in a spaCy `Doc`.

    Presumably meant to be installed as `nlp.tokenizer` so that spaCy
    lemmatizes NLTK's tokens instead of its own - confirm against the
    cell that uses it.

    Args:
        text: Raw text (string) to tokenize.

    Returns:
        A spaCy `Doc` built on `nlp.vocab` from the NLTK tokens.
    """
    # Imported locally: the file imports spacy but not Doc.
    from spacy.tokens import Doc

    tokens = word_tokenize(text)

    # The template's early `return tokens` made this line unreachable.
    return Doc(nlp.vocab, words=tokens)