In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Try with a pretrained embedding layer instead of a trainable; see if the same stuff gets returned

def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    vocab_len = len(word_to_index) + 1
    emb_dim = word_to_vec_map["cucumber"].shape[0]
    emb_matrix = np.zeros((vocab_len, emb_dim))
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]
    embedding_layer = Embedding(vocab_len, emb_dim, trainable = False)
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)

In [3]:
train = pd.read_csv('data/train.csv', nrows = 10000)
test = pd.read_csv('data/test.csv', nrows = 100)

In [4]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_train = train["comment_text"]
list_sentences_test = test["comment_text"]

In [6]:
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

In [7]:
maxlen = 200
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [8]:
totalNumWords = [len(one_comment) for one_comment in list_tokenized_train]

In [9]:
# original kernel model, with trainable embedding layer

#inp = Input(shape=(maxlen, ))
#embed_size = 128
#
#x = Embedding(max_features, embed_size)(inp)
#x = LSTM(60, return_sequences=True, name='lstm_layer')(x)
#x = GlobalMaxPool1D()(x)
#x = Dropout(0.1)(x)
#x = Dense(50, activation="relu")(x)
#x = Dropout(0.1)(x)
#x = Dense(6, activation="sigmoid")(x)
#model = Model(inputs=inp, outputs=x)
#model.compile(loss='binary_crossentropy',
#                  optimizer='adam',
#                  metrics=['accuracy'])

In [10]:
# same architecture, but pretrained embedding layer. still works

#inp = Input(shape=(maxlen, ))
#
#x = embedding_layer(inp)
#
#x = LSTM(60, return_sequences=True, name='lstm_layer')(x)
#x = GlobalMaxPool1D()(x)
#x = Dropout(0.1)(x)
#x = Dense(50, activation="relu")(x)
#x = Dropout(0.1)(x)
#x = Dense(6, activation="sigmoid")(x)
#model = Model(inputs=inp, outputs=x)
#model.compile(loss='binary_crossentropy',
#                  optimizer='adam',
#                  metrics=['accuracy'])

In [20]:
# simpler model.

inp = Input(shape=(maxlen, ))

x = embedding_layer(inp)

x = LSTM(64, name='lstm_layer')(x)
#x = GlobalMaxPool1D()(x)
x = Dropout(0.5)(x)
#x = Dense(50, activation="relu")(x)
#x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

In [21]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 50)           20000050  
_________________________________________________________________
lstm_layer (LSTM)            (None, 64)                29440     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 390       
Total params: 20,029,880
Trainable params: 29,830
Non-trainable params: 20,000,050
_________________________________________________________________


In [26]:
X_t

array([[   0,    0,    0, ..., 3859, 2208, 1068],
       [   0,    0,    0, ...,  607, 5886,  184],
       [   0,    0,    0, ...,    1,  636,  501],
       ...,
       [   0,    0,    0, ...,   10,    1, 1276],
       [   0,    0,    0, ...,  342,  805,   93],
       [   0,    0,    0, ..., 6732,   43,   35]], dtype=int32)

In [22]:
model.fit(X_t, y, batch_size=32, epochs=1) # , validation_split=0.1)

Epoch 1/1


<keras.callbacks.History at 0x7f71a6fbbc50>

In [23]:
#model.evaluate(X_t, y)

In [24]:
model.predict(X_t[0:7, :])

array([[0.10401345, 0.01069206, 0.06057782, 0.00652343, 0.05809954,
        0.01343273],
       [0.08496071, 0.00940252, 0.03739399, 0.00542659, 0.04178619,
        0.00842119],
       [0.09762777, 0.01010141, 0.04026822, 0.00606823, 0.0437963 ,
        0.00893358],
       [0.09526028, 0.00911768, 0.04494273, 0.0050593 , 0.04245529,
        0.00884103],
       [0.06532637, 0.00677277, 0.03399978, 0.0041446 , 0.0353527 ,
        0.00665119],
       [0.06090439, 0.00713453, 0.0333062 , 0.00472817, 0.02773847,
        0.00712927],
       [0.08270717, 0.00923342, 0.04934716, 0.00476087, 0.04595398,
        0.0085892 ]], dtype=float32)

In [16]:
# model.fit(X_t, y, batch_size=batch_size, epochs=epochs, validation_split=0.1)

In [17]:
#pred = model.predict(pad_sequences(list_tokenized_test, maxlen=maxlen))

In [18]:
#submit = pd.DataFrame(test['id'].values, columns=['id'])
#preds = pd.DataFrame(pred, columns=list_classes)
#submit = pd.concat([submit, preds], axis = 1)
#submit

In [19]:
#submit.to_csv('data/submission_04.csv', index = False)