 # Individual Experimentation
 Name: Vinal Asodia <br>
 Username: va00191 <br>
 URN: 6539526
 
#### Import Modules

In [1]:
import pickle
import pandas as pd
import numpy as np
import utils
import keras
import h5py
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Conv1D, Dropout, Conv1D, GlobalMaxPooling1D, Embedding, LSTM, SpatialDropout1D

#### Define Constants

In [2]:
TRAIN_DATASET = "train.csv"
TEST_DATA = "test.csv"
TEST_LABELS = "test_labels.csv"
REDUNDANT_FIELDS = ["id"]
DATA_FIELD = ["comment_text"]
LABEL_FIELDS = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]

NUM_WORDS = 20000
EMBEDDING_DIM = 100
MAX_WORD = 200

BATCH_SIZE = 32
EPOCHS = 10

N_SPLITS = 10
N_REPEATS = 3
RANDOM_STATE = None

# Data Preparation

#### Read in the dataset
The dataset can be download here (https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge) <br>
The dataset comes in 3 csv files, the training dataset, the test comments and the test labels

In [3]:
# Read in training dataset
train = pd.read_csv(TRAIN_DATASET)

# Read in test data and labels
test_data = pd.read_csv(TEST_DATA)
test_labels = pd.read_csv(TEST_LABELS)

# Combine test data and labels into one data frame
test = pd.concat([test_data, test_labels], axis=1)

# Remove redundant id field from both datasets
train = train.drop(columns=REDUNDANT_FIELDS)
test = test.drop(columns=REDUNDANT_FIELDS)

# Remove samples with labels containing -1 in test dataset, this 
# is a place holder for samples that were not assigned labels.
test = test.drop(test[(test.toxic == -1) |
                      (test.severe_toxic == -1) |
                      (test.obscene == -1) |
                      (test.threat == -1) |
                      (test.insult == -1) |
                      (test.identity_hate == -1)].index)

#### Normalise and Clean Dataset

In [4]:
# Remove punctuation
regex_str = "[^a-zA-Z\s]"
train['comment_text'] = train['comment_text'].replace(regex=regex_str, value="")

# Remove extra whitespaces
regex_space = "\s+"
train['comment_text'] = train['comment_text'].replace(regex=regex_space, value=" ")

# Strip whitespaces
train['comment_text'] = train['comment_text'].str.strip()

# Lowercase
train['comment_text'] = train['comment_text'].str.lower()

# Convert comment_text column into a list
comment_list = train['comment_text'].tolist()

print(len(comment_list))

159571


In [6]:
comments = [sentence.split() for sentence in comment_list]
tokenised_comment = [word for sentence in comments for word in sentence]

['explanation', 'why', 'the', 'edits', 'made', 'under', 'my', 'username', 'hardcore', 'metallica', 'fan', 'were', 'reverted', 'they', 'werent', 'vandalisms', 'just', 'closure', 'on', 'some', 'gas', 'after', 'i', 'voted', 'at', 'new', 'york', 'dolls', 'fac', 'and', 'please', 'dont', 'remove', 'the', 'template', 'from', 'the', 'talk', 'page', 'since', 'im', 'retired', 'now', 'daww', 'he', 'matches', 'this', 'background', 'colour', 'im', 'seemingly', 'stuck', 'with', 'thanks', 'talk', 'january', 'utc', 'hey', 'man', 'im', 'really', 'not', 'trying', 'to', 'edit', 'war', 'its', 'just', 'that', 'this', 'guy', 'is', 'constantly', 'removing', 'relevant', 'information', 'and', 'talking', 'to', 'me', 'through', 'edits', 'instead', 'of', 'my', 'talk', 'page', 'he', 'seems', 'to', 'care', 'more', 'about', 'the', 'formatting', 'than', 'the', 'actual', 'info', 'more', 'i', 'cant', 'make', 'any', 'real', 'suggestions', 'on', 'improvement', 'i', 'wondered', 'if', 'the', 'section', 'statistics', 'shoul

#### Remove Stopwords

In [13]:
# Remove stopwords, using stopword list from nltk
stopword_list = set(stopwords.words('english'))
removed_stopwords = [word for word in tokenised_comment if word not in stopword_list]

5463485
10445524


#### Visualise the initial class balance

#### Balance the classes

#### Visualise the new class balance

#### Visualise the top words for each label

#### Create N-Grams

#### Visualise the Top N-Grams

#### Tokenise Dataset

#### Lemmatise Dataset

In [6]:
train,test = utils.read_datasets()
x_train = pickle.load(open("comment_lemma.pickle", "rb"))
y_train = train[LABEL_FIELDS]
y_train = y_train.to_numpy()

tokenizer = Tokenizer(NUM_WORDS)
tokenizer.fit_on_texts(x_train)
corpus = tokenizer.word_index
reverse_corpus = dict(map(reversed, corpus.items()))

x_sequences_train = tokenizer.texts_to_sequences(x_train)
x_padded_train = keras.preprocessing.sequence.pad_sequences(x_sequences_train, maxlen= 150)
x_padded_train = np.array(x_padded_train)

In [4]:
type(y_train)

numpy.ndarray

# LSTM Model

In [12]:
model = Sequential()
model.add(Embedding(NUM_WORDS, EMBEDDING_DIM, input_length=200))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(6, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 64

history = model.fit(x_padded_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


 # Leave One Out Cross Validation (LOOCV)
 Too many samples to use LOOCV, takes roughly 4.5 minutes for 1 sample.... for roughly 7200 samples it would take 32,400 minutes, which 540 hours, which is 22.5 days..... yeah no
 
 One work around could be lower the number of epochs and increase the batch size, but maybe just move onto other cross validation techniques.

# Experiment Setup 1: Cross Validation

In [12]:
accr, losses = [], []
y_pred,y_true = [], []
modelKFOLD = utils.build_model(NUM_WORDS)
kf = RepeatedKFold(n_splits=N_SPLITS,n_repeats=N_REPEATS,random_state=RANDOM_STATE)

for train_index, val_index in kf.split(x_padded_train):
    X, X_val = x_padded_train[train_index], x_padded_train[val_index]
    y, y_val = y_train[train_index], y_train[val_index]
    modelKFOLD.fit(X, y, batch_size=BATCH_SIZE, epochs=EPOCHS)
    pred = modelKFOLD.evaluate(X_val,y_val)
    prediction = modelKFOLD.predict(X_val)
    print("Loss: ",pred[0])
    print("Accuracy: ",pred[1])
    losses.append(pred[0])
    accr.append(pred[1])
    y_pred.append(prediction)
    y_true.append(y_val)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss:  0.44302573800086975
Accuracy:  0.4719887971878052
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss:  0.24055352807044983
Accuracy:  0.4453781545162201
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss:  0.14551974833011627
Accuracy:  0.4193548262119293
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss:  0.13599838316440582
Accuracy:  0.4684431850910187
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss:  0.12497672438621521
Accuracy:  0.45301541686058044
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss:  0.11378948390483856
Accuracy:  0.44460028

In [16]:
print(y_pred[0][0].round())
print(y_true[0][0])

[1. 0. 1. 0. 1. 1.]
[1 1 1 0 1 0]


In [14]:
print(sum(losses)/len(losses))
print(sum(accr)/len(accr))

0.12279526169101397
0.43713180522123973


In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# For each class
precision = dict()
recall = dict()
average_precision = dict()
for i in range(n_classes):
    precision[i], recall[i], _ = precision_recall_curve(Y_test[:, i],
                                                        y_score[:, i])
    average_precision[i] = average_precision_score(Y_test[:, i], y_score[:, i])