In [1]:
# Import modules
import pandas as pd
import gensim
import itertools
from tabulate import tabulate
# Pre-processing imports
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#from nltk import ngrams
#from nltk.stem.porter import PorterStemmer
#from nltk.stem import WordNetLemmatizer
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.utils import lemmatize

In [2]:
# Constants
# Input CSV files, given as bare file names (resolved against the working directory).
TRAIN_DATASET = "train.csv"
TEST_DATA = "test.csv"
TEST_LABELS = "test_labels.csv"

# Column groups used when selecting or dropping fields.
DATA_FIELD = ["id", "comment_text"]
LABEL_FIELDS = [
    "id",
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
REDUNDANT_FIELDS = ["id"]  # columns dropped from both datasets after loading

# English-only stopword list from the NLTK corpus, stored as a set.
STOP_WORDS = set(stopwords.words('english'))

In [3]:
# Read in training dataset
train_dataset = pd.read_csv(TRAIN_DATASET)

# # Split training_data into x_train and y_train -- SAVE FOR LATER
# x_train = training_data[DATA_FIELD]
# y_train = training_data[LABEL_FIELDS]

# Read in test data
test_data = pd.read_csv(TEST_DATA)
test_labels = pd.read_csv(TEST_LABELS)

# Combine test data and labels into one data frame.
# Join on the shared "id" key rather than concatenating by row position, so
# every comment is paired with its own labels even if the two files differ in
# order or length. validate="one_to_one" raises if an id is duplicated in
# either file instead of silently multiplying rows.
test_dataset = test_data.merge(test_labels, on="id", how="inner", validate="one_to_one")

# Remove redundant id field from both datasets
train_dataset = train_dataset.drop(columns=REDUNDANT_FIELDS)
test_dataset = test_dataset.drop(columns=REDUNDANT_FIELDS)

# Remove samples with labels containing -1 in test dataset, this
# is a place holder for samples that were not assigned labels.
# The label columns are derived from LABEL_FIELDS (minus the dropped id field)
# so the filter stays in sync with the constants cell.
label_columns = [field for field in LABEL_FIELDS if field not in REDUNDANT_FIELDS]
unlabeled = (test_dataset[label_columns] == -1).any(axis=1)
# .copy() gives an independent frame, so later column assignments do not
# trigger SettingWithCopyWarning.
test_dataset = test_dataset.loc[~unlabeled].copy()

print(train_dataset.head())

                                        comment_text  toxic  severe_toxic  \
0  Explanation\r\nWhy the edits made under my use...      0             0   
1  D'aww! He matches this background colour I'm s...      0             0   
2  Hey man, I'm really not trying to edit war. It...      0             0   
3  "\r\nMore\r\nI can't make any real suggestions...      0             0   
4  You, sir, are my hero. Any chance you remember...      0             0   

   obscene  threat  insult  identity_hate  
0        0       0       0              0  
1        0       0       0              0  
2        0       0       0              0  
3        0       0       0              0  
4        0       0       0              0  


In [4]:
# Normalise the comment text in four steps: remove non-letters, collapse
# whitespace, strip the ends, lowercase.
# The patterns are raw strings so that "\s" reaches the regex engine verbatim;
# in a normal string literal "\s" is an invalid Python escape sequence and
# triggers a warning on recent interpreters.

# Remove punctuation (everything except letters and whitespaces).
# Characters are deleted, not replaced by a space, so "don't" becomes "dont".
regex_str = r"[^a-zA-Z\s]"

# Remove extra whitespaces (any whitespace run becomes a single space)
regex_space = r"\s+"

train_dataset['comment_text'] = (
    train_dataset['comment_text']
    .str.replace(regex_str, "", regex=True)
    .str.replace(regex_space, " ", regex=True)
    .str.strip()   # Strip whitespaces
    .str.lower()   # Lowercase
)
#print(train_dataset['comment_text'].head())

In [32]:
# Pull the cleaned comment_text column out of the frame as a plain Python list,
# one entry per row.
comment_list = train_dataset.loc[:, 'comment_text'].tolist()

# Flatten list
#comment_flat = list(itertools.chain.from_iterable(comment_list))

In [33]:
#print(comment_flat[1])

In [34]:
# Tokenize
#def tokenize(text):
#    return [[word_tokenize(word) for word in comment] for comment in text]
#comment_token = [word_tokenize(word) for word in comment_stop]
# Apply word_tokenize to every comment string, giving one token list per comment.
comment_token = list(map(word_tokenize, comment_list))
#print(train_dataset['comment_text'].head())
#comment_token = tokenize(comment_list)
#print(comment_token[0])

In [35]:
# Gensim N-grams
# Fit a bigram phrase model on the tokenized comments, then wrap it in a
# Phraser; later cells index into bigram_model to rewrite token lists.
phrase_options = {"min_count": 5, "threshold": 100}
bigram = Phrases(comment_token, **phrase_options)
bigram_model = Phraser(bigram)
print(bigram)

Phrases<2394719 vocab, min_count=5, threshold=100, max_vocab_size=40000000>


In [44]:
# Remove stopwords
# Each element of comment_list is a whole comment string, so the stopword test
# must be applied to the words inside each comment. Testing the comment itself
# against STOP_WORDS removes nothing from within comments and drops any comment
# that is exactly one stopword, which would misalign the rows with their labels.
# Every comment is split on whitespace, filtered, and re-joined, so comment_stop
# stays a list of strings with one entry per original comment (possibly "").
comment_stop = [
    " ".join(word for word in comment.split() if word not in STOP_WORDS)
    for comment in comment_list
]

In [45]:
# Tokenize the stopword-filtered comments: one token list per entry of comment_stop.
comment_token_stop = list(map(word_tokenize, comment_stop))

# Pass each token list through the fitted bigram model to build the Gensim n-grams.
comment_bigrams = [bigram_model[tokens] for tokens in comment_token_stop]

In [47]:
# N-grams (Bigrams)
# NLTK bigrams: disabled for now; it is not yet clear how to apply ngrams() to the comments in list form
# train_dataset['comment_text'] = train_dataset['comment_text'].apply(lambda x: list(ngrams(x, 2)))


In [48]:
#porter = PorterStemmer()
#train_dataset['comment_text'] = train_dataset['comment_text'].apply(lambda x: [porter.stem(token) for token in x])

In [49]:
#print(train_dataset['comment_text'].head())

In [50]:
# Lemmatize
#lemmatizer = WordNetLemmatizer()
#train_dataset['comment_text'] = train_dataset['comment_text'].apply(lambda x: [lemmatizer.lemmatize(bigram) for bigram in x])
#train_dataset['comment_text'] = [lemmatizer.lemmatize(token) for row in train_dataset['comment_text'] for bigram in row for token in bigram]
#def lemmatize(text):
#    return [lemmatize(word) for word in text:]
#[word for word in item if word not in STOP_WORDS] for item in comment_list
#comment_lemmatized = 