In [29]:
# Import modules
import pandas as pd
import gensim
import spacy
import nltk
import numpy as np
# Pre-processing imports
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from spacy.tokens import Doc, DocBin

In [2]:
# Constants
TRAIN_DATASET = "train.csv"
TEST_DATA = "test.csv"
TEST_LABELS = "test_labels.csv"
DATA_FIELD = ["comment_text"]
LABEL_FIELDS = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
REDUNDANT_FIELDS = ["id"]
STOP_WORDS = set(stopwords.words('english')) # Stopwords in English only


In [3]:
# Read in training dataset
train_dataset = pd.read_csv(TRAIN_DATASET)

# # Split training_data into x_train and y_train -- SAVE FOR LATER
# x_train = training_data[DATA_FIELD]
# y_train = training_data[LABEL_FIELDS]

# Read in test data
test_data = pd.read_csv(TEST_DATA)
test_labels = pd.read_csv(TEST_LABELS)

# Combine test data and labels into one data frame
test_dataset = pd.concat([test_data, test_labels], axis=1)

# Remove redundant id field from both datasets
train_dataset = train_dataset.drop(columns=REDUNDANT_FIELDS)
test_dataset = test_dataset.drop(columns=REDUNDANT_FIELDS)

# Remove samples with labels containing -1 in test dataset, this 
# is a place holder for samples that were not assigned labels.
test_dataset = test_dataset.drop(test_dataset[(test_dataset.toxic == -1) |
                                              (test_dataset.severe_toxic == -1) |
                                              (test_dataset.obscene == -1) |
                                              (test_dataset.threat == -1) |
                                              (test_dataset.insult == -1) |
                                              (test_dataset.identity_hate == -1)].index)

print(train_dataset.head())

                                        comment_text  toxic  severe_toxic  \
0  Explanation\r\nWhy the edits made under my use...      0             0   
1  D'aww! He matches this background colour I'm s...      0             0   
2  Hey man, I'm really not trying to edit war. It...      0             0   
3  "\r\nMore\r\nI can't make any real suggestions...      0             0   
4  You, sir, are my hero. Any chance you remember...      0             0   

   obscene  threat  insult  identity_hate  
0        0       0       0              0  
1        0       0       0              0  
2        0       0       0              0  
3        0       0       0              0  
4        0       0       0              0  


In [4]:
# Remove punctuation (everything except letters and whitespaces)
regex_str = "[^a-zA-Z\s]"
train_dataset['comment_text'] = train_dataset['comment_text'].replace(regex=regex_str, value="")

# Remove extra whitespaces
regex_space = "\s+"
train_dataset['comment_text'] = train_dataset['comment_text'].replace(regex=regex_space, value=" ")

# Strip whitespaces
train_dataset['comment_text'] = train_dataset['comment_text'].str.strip()

# Lowercase
train_dataset['comment_text'] = train_dataset['comment_text'].str.lower()
#print(train_dataset['comment_text'].head())

In [5]:
# Tokenize function
def tokenize(text):
    return [word_tokenize(word) for word in text]

In [49]:
# Convert comment_text column into a list
comment_list = train_dataset['comment_text'].tolist()

# Tokenize
comment_token = tokenize(comment_list)

In [50]:
# Gensim N-grams
# Create bigram model
bigram = Phrases(comment_token, min_count=5, threshold=100)
bigram_model = Phraser(bigram)

In [51]:
# Remove stopwords
comment_stop = [word for word in comment_list if word not in STOP_WORDS]

# Tokenize stopwords removed
comment_token_stop = tokenize(comment_stop)

# Create Gensim n-grams
comment_bigrams = [bigram_model[word] for word in comment_token_stop]

In [52]:
# Prepare bigrams for lemmatization
# Initialize spacy
nlp = spacy.load("en_core_web_sm", disable=['tokenizer', 'parser', 'ner'])

# Convert bigrams to string tokens
# REDUNDANT WTFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
#comment_str = ["".join(str(word)) for word in comment_bigrams]

In [53]:
comment_token_stop = np.asarray(comment_token_stop)
comment_token_stop = comment_token_stop.flatten()
print(comment_token_stop)

[list(['explanation', 'why', 'the', 'edits', 'made', 'under', 'my', 'username', 'hardcore', 'metallica', 'fan', 'were', 'reverted', 'they', 'werent', 'vandalisms', 'just', 'closure', 'on', 'some', 'gas', 'after', 'i', 'voted', 'at', 'new', 'york', 'dolls', 'fac', 'and', 'please', 'dont', 'remove', 'the', 'template', 'from', 'the', 'talk', 'page', 'since', 'im', 'retired', 'now'])
 list(['daww', 'he', 'matches', 'this', 'background', 'colour', 'im', 'seemingly', 'stuck', 'with', 'thanks', 'talk', 'january', 'utc'])
 list(['hey', 'man', 'im', 'really', 'not', 'trying', 'to', 'edit', 'war', 'its', 'just', 'that', 'this', 'guy', 'is', 'constantly', 'removing', 'relevant', 'information', 'and', 'talking', 'to', 'me', 'through', 'edits', 'instead', 'of', 'my', 'talk', 'page', 'he', 'seems', 'to', 'care', 'more', 'about', 'the', 'formatting', 'than', 'the', 'actual', 'info'])
 ...
 list(['spitzer', 'umm', 'theres', 'no', 'actual', 'article', 'for', 'prostitution', 'ring', 'crunch', 'captain']

In [54]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stem_tokens = [lemmatizer.lemmatize(x) for x in comment_token_stop[0]]
print(stem_tokens)

['explanation', 'why', 'the', 'edits', 'made', 'under', 'my', 'username', 'hardcore', 'metallica', 'fan', 'were', 'reverted', 'they', 'werent', 'vandalism', 'just', 'closure', 'on', 'some', 'gas', 'after', 'i', 'voted', 'at', 'new', 'york', 'doll', 'fac', 'and', 'please', 'dont', 'remove', 'the', 'template', 'from', 'the', 'talk', 'page', 'since', 'im', 'retired', 'now']


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vinal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [55]:
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    # now we need to convert from nltk to wordnet POS notations (for compatibility reasons)
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN) # return and default to noun if not found

In [58]:
print(comment_token_stop[0])

['explanation', 'why', 'the', 'edits', 'made', 'under', 'my', 'username', 'hardcore', 'metallica', 'fan', 'were', 'reverted', 'they', 'werent', 'vandalisms', 'just', 'closure', 'on', 'some', 'gas', 'after', 'i', 'voted', 'at', 'new', 'york', 'dolls', 'fac', 'and', 'please', 'dont', 'remove', 'the', 'template', 'from', 'the', 'talk', 'page', 'since', 'im', 'retired', 'now']


In [None]:
nltk.download('averaged_perceptron_tagger')
comment_lemma = []
for comment in comment_token_stop:
    temp = []
    temp.append([lemmatizer.lemmatize(word, pos=get_wordnet_pos(word)) for word in comment])
    comment_lemma += temp
print(comment_lemma)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vinal\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
print(comment_lemma[1])