 # Cross Validation Experiment
 
Import Modules

In [1]:
import pickle
import numpy as np
import utils
import keras
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

Constants

In [2]:
DATA_FIELD = ["comment_text"]
LABEL_FIELDS = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
NUM_WORDS = 20000

BATCH_SIZE = 60
EPOCHS = 5

N_SPLITS = 10
N_REPEATS = 3
RANDOM_STATE = None

Initialise Data

In [3]:
train,test = utils.read_datasets()
x_train = pickle.load(open("comment_lemma.pickle", "rb"))
y_train = train[LABEL_FIELDS]
y_train = y_train.to_numpy()

tokenizer = Tokenizer(NUM_WORDS)
tokenizer.fit_on_texts(x_train)
corpus = tokenizer.word_index
reverse_corpus = dict(map(reversed, corpus.items()))

x_sequences_train = tokenizer.texts_to_sequences(x_train)
x_padded_train = keras.preprocessing.sequence.pad_sequences(x_sequences_train, maxlen= 150)
x_padded_train = np.array(x_padded_train)

In [4]:
type(y_train)

numpy.ndarray

 # Leave One Out Cross Validation (LOOCV)
 Too many samples to use LOOCV, takes roughly 4.5 minutes for 1 sample.... for roughly 7200 samples it would take 32,400 minutes, which 540 hours, which is 22.5 days..... yeah no
 
 One work around could be lower the number of epochs and increase the batch size, but maybe just move onto other cross validation techniques.

# K Fold Cross Validation

In [5]:
accr, losses = [], []
y_pred,y_true = [], []
modelKFOLD = utils.build_model(NUM_WORDS)
kf = RepeatedKFold(n_splits=N_SPLITS,n_repeats=N_REPEATS,random_state=RANDOM_STATE)

for train_index, val_index in kf.split(x_padded_train):
    X, X_val = x_padded_train[train_index], x_padded_train[val_index]
    y, y_val = y_train[train_index], y_train[val_index]
    modelKFOLD.fit(X, y, batch_size=BATCH_SIZE, epochs=EPOCHS)
    pred = modelKFOLD.evaluate(X_val,y_val)
    prediction = modelKFOLD.predict(X_val)
    print("Loss: ",pred[0])
    print("Accuracy: ",pred[1])
    losses.append(pred[0])
    accr.append(pred[1])
    y_pred.append(prediction)
    y_true.append(y_val)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss:  0.3910101354122162
Accuracy:  0.48739495873451233
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss:  0.2911985516548157
Accuracy:  0.47478991746902466
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss:  0.21693377196788788
Accuracy:  0.5077139139175415
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss:  0.17573943734169006
Accuracy:  0.4642356336116791
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss:  0.1392611414194107
Accuracy:  0.4053295850753784
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss:  0.12998297810554504
Accuracy:  0.4151472747325897
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss:  0.12596812844276428
Accuracy:  0.3870967626571655
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss:  0.11601629853248596
Accuracy:  0.32258063554763794
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss:  0.10793799161911011
Accuracy:  0.33660587668418884
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 

In [6]:
print(y_pred[0][0])
print(y_true[0][0])

[0.6390166  0.39182433 0.52366614 0.12164617 0.52126926 0.31050414]
[1 0 1 0 1 1]


In [7]:
print(sum(losses)/len(losses))
print(sum(accr)/len(accr))

0.1300998183588187
0.37983258763949074
