In [1]:
import numpy as np
import pandas as pd
from preprocess import preprocess  # local file. restart kernel if this changed, it won't be re-imported otherwise
from sklearn.model_selection import train_test_split

## Read the first few rows during crude developing:
train = pd.read_csv('data/train.csv', nrows=1000).fillna(' ')
test = pd.read_csv('data/test.csv', nrows=1000).fillna(' ')

## These lines load all data:
#train = pd.read_csv('data/train.csv').fillna(' ')
#test = pd.read_csv('data/test.csv').fillna(' ')

[train, test, train_text, test_text, all_text, class_names] = preprocess(train, test)

# TODO why does a train dev split break the code?
# train, dev = train_test_split(train, test_size=0.1, random_state=42)

In [2]:
train.shape

(1000, 8)

In [49]:
train[class_names].mean()  # imbalanced => baseline of 96% accuracy for "predict all as okay"
# preds: [0.07195416, 0.00958922, 0.03911009, 0.00467693, 0.04485368, 0.00369948]

toxic            0.105
severe_toxic     0.009
obscene          0.049
threat           0.004
insult           0.054
identity_hate    0.009
dtype: float64

In [3]:
train.iloc[0:5]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# dev.iloc[0:5]

In [5]:
test.shape

(1000, 2)

In [6]:
import glove as glv
g_vectors, words_to_index, index_to_words = glv.loadGloveModel("data/glove.6B.50d.txt")

Loading Glove Model
Done. 400000  words loaded!


In [7]:
for v in words_to_index:
    if words_to_index[v] < 0:
        print("OMG")

In [8]:
g_vectors["hello"]

array([-0.38497 ,  0.80092 ,  0.064106, -0.28355 , -0.026759, -0.34532 ,
       -0.64253 , -0.11729 , -0.33257 ,  0.55243 , -0.087813,  0.9035  ,
        0.47102 ,  0.56657 ,  0.6985  , -0.35229 , -0.86542 ,  0.90573 ,
        0.03576 , -0.071705, -0.12327 ,  0.54923 ,  0.47005 ,  0.35572 ,
        1.2611  , -0.67581 , -0.94983 ,  0.68666 ,  0.3871  , -1.3492  ,
        0.63512 ,  0.46416 , -0.48814 ,  0.83827 , -0.9246  , -0.33722 ,
        0.53741 , -1.0616  , -0.081403, -0.67111 ,  0.30923 , -0.3923  ,
       -0.55002 , -0.68827 ,  0.58049 , -0.11626 ,  0.013139, -0.57654 ,
        0.048833,  0.67204 ])

In [9]:
max_len = train['comment_text'].str.split().apply(len).max()
max_len

1052

In [10]:
train[class_names].iloc[0]

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 0, dtype: int64

In [11]:
train['comment_text'].iloc[0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [12]:
train['comment_text'].iloc[0].lower().split()

['explanation',
 'why',
 'the',
 'edits',
 'made',
 'under',
 'my',
 'username',
 'hardcore',
 'metallica',
 'fan',
 'were',
 'reverted?',
 'they',
 "weren't",
 'vandalisms,',
 'just',
 'closure',
 'on',
 'some',
 'gas',
 'after',
 'i',
 'voted',
 'at',
 'new',
 'york',
 'dolls',
 'fac.',
 'and',
 'please',
 "don't",
 'remove',
 'the',
 'template',
 'from',
 'the',
 'talk',
 'page',
 'since',
 "i'm",
 'retired',
 'now.89.205.38.27']

In [13]:
train['comment_text'].shape

(1000,)

In [14]:
from lstm_utils import sentences_to_indices, pretrained_embedding_layer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [56]:
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, GRU, Activation
from keras.layers.embeddings import Embedding

def TheModel(input_shape, g_vectors, words_to_index):
    """
    Function creating the GRU model
    
    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    model -- a model instance in Keras
    """
    
    # Define sentence_indices as the input of the graph, it should be of shape input_shape and dtype 'int32' (as it contains indices).
    sentence_indices = Input(shape = input_shape, dtype = 'int32')
    
    # Create the embedding layer pretrained with GloVe Vectors (≈1 line)
    embedding_layer = pretrained_embedding_layer(g_vectors, words_to_index)
    
    # Propagate sentence_indices through your embedding layer, you get back the embeddings
    embeddings = embedding_layer(sentence_indices)   
    
    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    # Be careful, the returned output should be a batch of sequences.
    # X = LSTM(units = 128, return_sequences = True)(embeddings)
    X = GRU(units = 128, kernel_initializer = 'glorot_uniform', recurrent_initializer = 'glorot_uniform')(embeddings)
    # Add dropout with a probability of 0.5
    X = Dropout(rate = 0.5)(X)
    # Propagate X trough another LSTM layer with 128-dimensional hidden state
    # Be careful, the returned output should be a single hidden state, not a batch of sequences.
    #X = LSTM(units = 128)(X)
    # Add dropout with a probability of 0.5
    #X = Dropout(rate = 0.5)(X)
    # Propagate X through a Dense (=FC) layer
    X = Dense(units = 6)(X)  # 6 units because 
    # Add a sigmoid activation because we have multilabel classification
    X = Activation("sigmoid", name = "sigmoid_activation")(X)
    
    # Create Model instance which converts sentence_indices into X.
    model = Model(inputs = sentence_indices, outputs = X)
    
    return model

In [57]:
model = TheModel((max_len, ), g_vectors, words_to_index)

In [58]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 1052)              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 1052, 50)          20000050  
_________________________________________________________________
gru_2 (GRU)                  (None, 128)               68736     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 774       
_________________________________________________________________
sigmoid_activation (Activati (None, 6)                 0         
Total params: 20,069,560
Trainable params: 69,510
Non-trainable params: 20,000,050
___________________________________________________________

In [59]:
# https://github.com/keras-team/keras/issues/2166
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [60]:
Y_train = train.as_matrix(columns = class_names)

In [61]:
X_train_indices = sentences_to_indices(train['comment_text'], words_to_index, max_len)

In [62]:
print(X_train_indices.shape)
print(Y_train.shape)

(1000, 1052)
(1000, 6)


In [63]:
model.fit(X_train_indices, Y_train, epochs = 3, batch_size = 32, shuffle=True)

# Plot loss function during training:
# https://www.kaggle.com/parasjindal96/basic-deep-learning-tutorial-using-keras

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f84e4545d30>

In [64]:
from keras.utils import plot_model
plot_model(model, to_file = "keras-model.png", show_shapes = True)

In [65]:
train_loss, train_acc = model.evaluate(X_train_indices, Y_train)
train_acc



0.9616666622161866

In [66]:
pred_train = model.predict(X_train_indices)

In [67]:
pred_train

array([[0.09977868, 0.01022341, 0.0461725 , 0.00577286, 0.04667644,
        0.01186479],
       [0.09977868, 0.01022341, 0.0461725 , 0.00577286, 0.04667645,
        0.01186479],
       [0.09977868, 0.01022341, 0.04617248, 0.00577286, 0.04667645,
        0.01186479],
       ...,
       [0.09977868, 0.01022341, 0.04617248, 0.00577286, 0.04667645,
        0.01186479],
       [0.09977868, 0.01022341, 0.04617248, 0.00577286, 0.04667645,
        0.01186479],
       [0.09977868, 0.01022341, 0.0461725 , 0.00577286, 0.04667644,
        0.01186479]], dtype=float32)

In [23]:
## max_len_dev = dev['comment_text'].str.split().apply(len).max()
#max_len_dev = max_len
#
#X_dev_indices = sentences_to_indices(dev['comment_text'], words_to_index, max_len_dev)
#
#Y_dev = dev.as_matrix(columns = class_names)
#
#loss, acc = model.evaluate(X_dev_indices, Y_dev)
#print()
#print("Test accuracy = ", acc)

In [24]:
## This code allows you to see the mislabelled examples
#C = 5
#y_test_oh = np.eye(C)[Y_test.reshape(-1)]
#X_test_indices = sentences_to_indices(X_test, word_to_index, maxLen)
#pred = model.predict(X_test_indices)
#for i in range(len(X_test)):
#    x = X_test_indices
#    num = np.argmax(pred[i])
#    if(num != Y_test[i]):
#        print('Expected emoji:'+ label_to_emoji(Y_test[i]) + ' prediction: '+ X_test[i] + label_to_emoji(num).strip())

In [40]:
# max_len_test = test['comment_text'].str.split().apply(len).max()
max_len_test = max_len

X_test_indices = sentences_to_indices(test['comment_text'], words_to_index, max_len_test)

Y_test = test.as_matrix(columns = class_names)

preds = model.predict(X_test_indices, verbose = 1)



In [39]:
X_test_indices[0:2]

array([[394017,  77181, 194532, ...,      0,      0,      0],
       [     0, 154323, 307569, ...,      0,      0,      0]], dtype=int32)

In [34]:
model.predict(X_test_indices[0:2,:])

array([[0.07195413, 0.00958922, 0.03911008, 0.00467692, 0.04485367,
        0.00369947],
       [0.07195416, 0.00958922, 0.03911009, 0.00467693, 0.04485369,
        0.00369947]], dtype=float32)

In [26]:
preds

array([[0.07195413, 0.00958922, 0.03911008, 0.00467692, 0.04485367,
        0.00369947],
       [0.07195416, 0.00958922, 0.03911009, 0.00467693, 0.04485368,
        0.00369948],
       [0.07195413, 0.00958922, 0.03911008, 0.00467692, 0.04485368,
        0.00369948],
       ...,
       [0.07195413, 0.00958922, 0.03911008, 0.00467692, 0.04485368,
        0.00369947],
       [0.07195416, 0.00958922, 0.03911008, 0.00467692, 0.04485368,
        0.00369948],
       [0.07195413, 0.00958922, 0.03911007, 0.00467692, 0.04485368,
        0.00369947]], dtype=float32)

In [27]:
import pandas as pd

for i in range(len(class_names)):
    class_name = class_names[i]
    test[class_name] = preds[:, i]
    
test.iloc[0:10]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,0.071954,0.009589,0.03911,0.004677,0.044854,0.003699
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,0.071954,0.009589,0.03911,0.004677,0.044854,0.003699
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",0.071954,0.009589,0.03911,0.004677,0.044854,0.003699
3,00017563c3f7919a,":If you have a look back at the source, the in...",0.071954,0.009589,0.03911,0.004677,0.044854,0.003699
4,00017695ad8997eb,I don't anonymously edit articles at all.,0.071954,0.009589,0.03911,0.004677,0.044854,0.003699
5,0001ea8717f6de06,Thank you for understanding. I think very high...,0.071954,0.009589,0.03911,0.004677,0.044854,0.003699
6,00024115d4cbde0f,Please do not add nonsense to Wikipedia. Such ...,0.071954,0.009589,0.03911,0.004677,0.044854,0.003699
7,000247e83dcc1211,:Dear god this site is horrible.,0.071954,0.009589,0.03911,0.004677,0.044854,0.003699
8,00025358d4737918,""" \n Only a fool can believe in such numbers. ...",0.071954,0.009589,0.03911,0.004677,0.044854,0.003699
9,00026d1092fe71cc,== Double Redirects == \n\n When fixing double...,0.071954,0.009589,0.03911,0.004677,0.044854,0.003699


In [37]:
np.var(test['toxic'])  # oh-oh :(

1.9401147e-16