In [88]:
# Import modules
import pickle

import gensim
import nltk
import numpy as np
import pandas as pd
import spacy
# Pre-processing imports
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from spacy.tokens import Doc, DocBin

In [70]:
# Constants

# Input files
TRAIN_DATASET = "train.csv"
TEST_DATA = "test.csv"
TEST_LABELS = "test_labels.csv"

# Column groups
DATA_FIELD = ["comment_text"]
LABEL_FIELDS = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
REDUNDANT_FIELDS = ["id"]

# English stopwords from NLTK, held in a set for fast membership checks
STOP_WORDS = set(stopwords.words("english"))


In [72]:
# Read in test data and its labels (stored in two separate files)
test_data = pd.read_csv(TEST_DATA)
test_labels = pd.read_csv(TEST_LABELS)

# Combine test data and labels into one data frame.
# NOTE(review): axis=1 concat aligns on row position, not on the id column,
# so this relies on both files listing the same samples in the same order.
test_dataset = pd.concat([test_data, test_labels], axis=1)

# Remove redundant id field from the combined test frame
test_dataset = test_dataset.drop(columns=REDUNDANT_FIELDS)

# Remove samples with any label equal to -1 in the test dataset; -1 is a
# placeholder for samples that were not assigned labels.
unlabelled = (test_dataset[LABEL_FIELDS] == -1).any(axis=1)
test_dataset = test_dataset.drop(index=test_dataset.index[unlabelled])

# Training data comes from a pre-built pickle rather than TRAIN_DATASET: the
# frame previously read from the CSV was overwritten by this load before it
# was ever used, so that dead read has been removed.
# NOTE(review): presumably a class-balanced subset of TRAIN_DATASET produced
# by another notebook -- confirm its provenance.
# SECURITY: unpickling can execute arbitrary code; only load trusted files.
train_dataset = pd.read_pickle("balanced_dataset.pickle")

# TODO (later): split into x_train = train_dataset[DATA_FIELD] and
# y_train = train_dataset[LABEL_FIELDS] once modelling starts.

print(train_dataset.count())

comment_text     7132
toxic            7132
severe_toxic     7132
obscene          7132
threat           7132
insult           7132
identity_hate    7132
dtype: int64


In [73]:
# Normalise the comment text in four steps, applied in this order:
#   1. delete every character that is not an ASCII letter or whitespace
#      (this removes punctuation, but also digits and non-ASCII letters)
#   2. collapse each run of whitespace into a single space
#   3. strip leading/trailing whitespace
#   4. lowercase
# The patterns are raw strings: "\s" inside a normal string literal is an
# invalid escape sequence, which Python warns about and will eventually reject.
regex_str = r"[^a-zA-Z\s]"
regex_space = r"\s+"

train_dataset['comment_text'] = (
    train_dataset['comment_text']
    .str.replace(regex_str, "", regex=True)
    .str.replace(regex_space, " ", regex=True)
    .str.strip()
    .str.lower()
)

In [74]:
# Tokenize function
def tokenize(text):
    """Tokenize every document in ``text``.

    ``text`` is an iterable of strings, one per document. The result is a
    list with one entry per document, in the same order, each entry being
    whatever ``word_tokenize`` returns for that document.
    """
    return list(map(word_tokenize, text))

In [75]:
# Pull the cleaned comment_text column out of the frame as a plain list
comment_column = train_dataset.loc[:, 'comment_text']
comment_list = comment_column.tolist()

# Tokenize each comment into a list of word tokens
comment_token = tokenize(comment_list)

In [76]:
# Gensim N-grams
# Train a phrase (bigram) detector on the tokenized comments
bigram = Phrases(
    comment_token,
    min_count=5,
    threshold=100,
)

# Wrap the trained detector in a Phraser, which is what gets applied to
# token lists further down
bigram_model = Phraser(bigram)

In [77]:
# Remove stopwords token by token.
# The previous version filtered `comment_list`, i.e. it compared each WHOLE
# comment string against STOP_WORDS. That never removed a stopword from inside
# a comment, and it would drop any comment that consisted of a single stopword,
# silently breaking the 1:1 alignment between comments and label rows.
comment_token_stop = [
    [word for word in tokens if word not in STOP_WORDS]
    for tokens in comment_token
]

# Stopword-free comments re-joined into strings (one per original comment)
comment_stop = [" ".join(tokens) for tokens in comment_token_stop]

# Create Gensim n-grams from the stopword-free tokens
# NOTE(review): comment_bigrams is not consumed by any later cell visible
# here (lemmatisation runs on comment_token_stop) -- confirm whether the
# bigrams were meant to feed the lemmatiser.
comment_bigrams = [bigram_model[tokens] for tokens in comment_token_stop]

In [81]:
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag (upper-cased) is inspected: J -> adjective,
    V -> verb, R -> adverb. Nouns and every other tag map to noun.
    """
    _token, treebank_tag = nltk.pos_tag([word])[0]
    initial = treebank_tag[0].upper()

    # Translate the NLTK tag family into WordNet's POS notation
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and any unrecognised tag both fall back to noun
    return wordnet.NOUN

In [82]:
# Fetch the NLTK resources this cell needs: the POS tagger used by
# get_wordnet_pos() and the WordNet corpus used by the lemmatizer.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# `lemmatizer` was not defined in any visible cell, so this cell only worked
# with leftover kernel state; create it here so Restart & Run All succeeds.
lemmatizer = WordNetLemmatizer()

# Lemmatise every token of every comment, keeping one token list per comment.
# A word's lemma depends only on the word itself (it is POS-tagged in
# isolation), so each distinct word is tagged and lemmatised once and reused.
lemma_cache = {}
comment_lemma = []
for comment in comment_token_stop:
    lemmas = []
    for word in comment:
        if word not in lemma_cache:
            lemma_cache[word] = lemmatizer.lemmatize(word, pos=get_wordnet_pos(word))
        lemmas.append(lemma_cache[word])
    comment_lemma.append(lemmas)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vinal\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [85]:
# Each comment must still have exactly one token list; otherwise the lemmas
# no longer line up with the label rows of train_dataset.
assert len(comment_lemma) == len(train_dataset), (
    f"{len(comment_lemma)} lemmatised comments for {len(train_dataset)} dataset rows"
)
print(len(comment_lemma))

7132


In [90]:
# Save lemmatised tokens.
# The context manager guarantees the file is flushed and closed even if
# pickling fails; the bare open(...) used before never closed its handle.
with open("comment_lemma.pickle", "wb") as lemma_file:
    pickle.dump(comment_lemma, lemma_file)