In [1]:
# Import modules
import pandas as pd
from tabulate import tabulate

In [2]:
# ---- Notebook configuration -------------------------------------------------
# CSV files read by the loading cell.
TRAIN_DATASET = "train.csv"
TEST_DATA = "test.csv"
TEST_LABELS = "test_labels.csv"

# Column holding the raw comment text (kept as a list so it selects a DataFrame).
DATA_FIELD = ["comment_text"]

# One binary label column per toxicity category.
LABEL_FIELDS = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Columns dropped from the datasets right after loading.
REDUNDANT_FIELDS = ["id"]

In [3]:
# Read in training dataset (comment text and label columns live in the same file)
train_dataset = pd.read_csv(TRAIN_DATASET)

# # Split train_dataset into x_train and y_train -- SAVE FOR LATER
# x_train = train_dataset[DATA_FIELD]
# y_train = train_dataset[LABEL_FIELDS]

# Read in test data; the comments and their labels ship as two separate files
test_data = pd.read_csv(TEST_DATA)
test_labels = pd.read_csv(TEST_LABELS)

# Combine test data and labels into one data frame.
# Join on the "id" key instead of concatenating side by side: a positional
# concat silently pairs a comment with the wrong labels if the two files are
# ordered differently or have different lengths. validate= makes duplicated
# ids raise instead of multiplying rows.
# NOTE(review): assumes both test files carry an "id" column -- confirm.
test_dataset = test_data.merge(test_labels, on="id", validate="one_to_one")

# Remove redundant id field from both datasets
train_dataset = train_dataset.drop(columns=REDUNDANT_FIELDS)
test_dataset = test_dataset.drop(columns=REDUNDANT_FIELDS)

# Remove samples with labels containing -1 in test dataset; -1 is a
# placeholder for samples that were not assigned labels. A row is dropped
# when any label column holds -1. LABEL_FIELDS is used so the filter stays in
# sync with the constants cell instead of repeating the column names.
unlabelled = (test_dataset[LABEL_FIELDS] == -1).any(axis=1)
test_dataset = test_dataset.drop(index=test_dataset.index[unlabelled])

print(train_dataset.head())

                                        comment_text  toxic  severe_toxic  \
0  Explanation\r\nWhy the edits made under my use...      0             0   
1  D'aww! He matches this background colour I'm s...      0             0   
2  Hey man, I'm really not trying to edit war. It...      0             0   
3  "\r\nMore\r\nI can't make any real suggestions...      0             0   
4  You, sir, are my hero. Any chance you remember...      0             0   

   obscene  threat  insult  identity_hate  
0        0       0       0              0  
1        0       0       0              0  
2        0       0       0              0  
3        0       0       0              0  
4        0       0       0              0  


In [4]:
import utils

# Size handed to build_model. The summary below reports 2,560,000 embedding
# parameters at width 128 (= 20000 x 128), so this is presumably the
# vocabulary size -- TODO confirm against utils.build_model.
vocab_size = 20000

model = utils.build_model(vocab_size)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         2560000   
_________________________________________________________________
dropout (Dropout)            (None, None, 128)         0         
_________________________________________________________________
conv1d (Conv1D)              (None, None, 128)         114816    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 128)         114816    
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               16512     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0

In [5]:
# Load the datasets through the project's utils helper; the two returned values
# are bound to x and y.
# NOTE(review): the preview of x printed in the next cell still shows the label
# columns next to comment_text -- confirm in utils what x and y each hold.
from utils import read_datasets
x, y = read_datasets()

In [6]:
# Preview the first rows of x (head() with its default row count).
print(x.head())

                                          comment_text  toxic  severe_toxic  \
6         COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK      1             1   
42   You are gay or antisemmitian? \n\nArchangel WH...      1             0   
55   Stupid peace of shit stop deleting my stuff as...      1             1   
105         A pair of jew-hating weiner nazi schmucks.      1             0   

     obscene  threat  insult  identity_hate  
6          1       0       1              0  
42         1       0       1              1  
55         1       0       1              0  
79         0       1       0              0  
105        1       0       1              1  


In [7]:
# Run the project's pre-processing helper on x; its return value is not captured.
# NOTE(review): the message it prints suggests the results are written to
# pickle files, and the log shows it fetching an NLTK tagger -- confirm both
# in utils.preprocess_data.
from utils import preprocess_data
preprocess_data(x)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lamxw\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
['cocksucker', 'before', 'you', 'piss', 'around', 'on', 'my', 'work']
Pre-processed data is in the form of comment_testname.pickle
