### 6481784 - Group 7 Individual Experimentation



In [1]:
# Imports
import pickle
import keras
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Dropout, Conv1D, GlobalMaxPooling1D, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Constants
EPOCHS = 30
INIT_LR = 1e-3

In [2]:
# Load in pre-processed dataset
x_text_train = pickle.load(open('../comment_lemma.pickle', 'rb'))
x_text_test_unprocessed = pickle.load(open('../balanced_test_dataset.pickle', 'rb'))
y_train = pd.read_pickle('../balanced_dataset.pickle').drop(columns='comment_text')
y_test = pd.read_pickle('../balanced_test_dataset.pickle').drop(columns='comment_text')

In [3]:
x_text_test = pickle.load(open('test_dataset.pickle', 'rb'))

In [4]:
num_words = 20000
max_len = 200

tokenizer = Tokenizer(num_words)
tokenizer.fit_on_texts(x_text_train)
corpus = tokenizer.word_index
reverse_corpus = dict(map(reversed, corpus.items()))
x_sequences_train = tokenizer.texts_to_sequences(x_text_train)
X_t = keras.preprocessing.sequence.pad_sequences(x_sequences_train, maxlen=max_len)
X_t = np.array(X_t)
np.random.shuffle(X_t)

In [5]:
x_sequences_test = tokenizer.texts_to_sequences(x_text_test)
X_te = keras.preprocessing.sequence.pad_sequences(x_sequences_test, maxlen=max_len)
X_te = np.array(X_te)
np.random.shuffle(X_te)

In [6]:
val_split = 0.2
num_validation_samples = int(val_split*X_t.shape[0])
x_train = X_t[: -num_validation_samples]
y_train = y_train[: -num_validation_samples]
x_val = X_t[-num_validation_samples: ]
y_val = y_train[-num_validation_samples: ]

### Base Model Definition

In [7]:
model = Sequential()

model.add(Embedding(num_words, 128))
model.add(Dropout(0.4))
model.add(Conv1D(128, 7, padding="valid", activation="relu", strides=3))
model.add(Conv1D(128, 7, padding="valid", activation="relu", strides=3))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(6, activation='sigmoid'))

#adam = tf.keras.optimizers.Adam(lr=INIT_LR, decay=INIT_LR / EPOCHS)

model.compile(loss='binary_crossentropy',
            optimizer='adam',
            metrics=['accuracy'])

### Base Model Results

In [8]:
model.fit(x_train,y_train, epochs=15, batch_size=60,  validation_data=(x_val, y_val))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x1b44fc49a60>

### Experiment Setup 1 - Loss Function

### Experiment Setup 1 Results

In [9]:
y_pred = model.predict(X_te)
y_pred = np.array(y_pred)
y_pred = y_pred.astype(float)

In [10]:
y_true = y_train[["toxic","severe_toxic","obscene","threat","insult","identity_hate"]]
y_true = y_true.to_numpy()

In [16]:
index = 257
print(y_true[index])

print(y_pred[index])

[1 1 1 0 1 1]
[0.55544668 0.34327281 0.58660328 0.10950568 0.5551545  0.25217932]
