In [12]:
import numpy as np
import pandas as pd
import re
from os.path import join, isfile
from tqdm import tqdm
from sklearn import metrics

In [6]:
max_features=100000
maxlen=150
embed_size=300

file_embedding_txt = join('E://DM//NLP//WordVec', 'glove.840B.300d.txt')
path_tmp = join('E:\\DM\\NLP\\TMP_MEMORY','Toxic_Comment_Classification')
file_model = join(path_tmp, 'weights_base.best.hdf5')
file_embedding_matrix = join(path_tmp, 'embedding_matrix.hkl')
file_train = join(path_tmp, 'train_clean.csv')
file_test = join(path_tmp, 'test_clean.csv')
file_submission = join(path_tmp, 'submission.csv')

In [3]:
labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

def GetData(file_train, file_test):
    train = pd.read_csv(file_train, encoding='utf-8')
    test = pd.read_csv(file_test, encoding='utf-8')
    train["comment_text"].fillna('fillna')
    test["comment_text"].fillna('fillna')
    X_train = train["comment_text"].apply(str)
    X_test = test["comment_text"].apply(str)

    y_train = train[labels]
    merge = pd.concat([X_train, X_test]).reset_index(drop=True)
    merge = merge.astype('str')
    return X_train, X_test, y_train, merge


X_train, X_test, y_train, merge = GetData(file_train, file_test)




In [4]:
X_train.describe()

count     159571
unique    157881
top          nan
freq          52
Name: comment_text, dtype: object

In [5]:
from keras.preprocessing import text, sequence

tokenizer = text.Tokenizer(num_words=max_features, lower=True)
copus = list(X_train) + list(X_test)
copus = [str(x) for x in copus]
tokenizer.fit_on_texts(copus)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_pad = sequence.pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = sequence.pad_sequences(X_test_seq, maxlen=maxlen)

length_seq = [len(document) for document in X_train_seq[:10]]
print('length of X_train_seq:' + str(length_seq))
print('shape of X_train_pad: ' + str(X_train_pad.shape))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


length of X_train_seq:[26, 12, 22, 57, 5, 6, 4, 6, 45, 4]
shape of X_train_pad: (159571, 150)


In [8]:
def ReadWord2Vec():
    embeddings_index = {}
    with open(file_embedding_txt, encoding='utf8') as f:
        for line in tqdm(f):
            values = line.rstrip().rsplit(' ')
            assert len(values)>=300
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

def CreateEmbeddingMatrix():
    embeddings_index = ReadWord2Vec()
    word_index = tokenizer.word_index
    num_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, embed_size))
    print(len(embeddings_index))
    print(len(word_index))
    print(max_features)
    print(embedding_matrix.shape)
    
    num_missed = 0
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
        else:
            num_missed +=1
    print('number of words that are not found in the embedding_vector: ' + str(num_missed))
    return embedding_matrix


    


2196018it [04:29, 8136.49it/s] 


2196017
291655
100000
(100000, 300)
number of words that are not found in the embedding_vector: 21248


### How to store Large numpy arrays on disk in python
* stackoverflow question: https://stackoverflow.com/questions/9619199/best-way-to-preserve-numpy-arrays-on-disk
* pickle VS. hdf5: https://shocksolution.com/2010/01/10/storing-large-numpy-arrays-on-disk-python-pickle-vs-hdf5adsf/
* hickle github: https://github.com/telegraphic/hickle
* HDF is good for storing large numpy arrays
* hickle aims to be exactly the same in usage as pickle, but using HDF for data storage

```python
import os
import hickle as hkl
import numpy as np
    
# Create a numpy array of data
array_obj = np.ones(32768, dtype='float32')
    
# Dump to file
hkl.dump(array_obj, 'test.hkl', mode='w')
    
# Dump data, with compression
hkl.dump(array_obj, 'test_gzip.hkl', mode='w', compression='gzip')
  
# Compare filesizes
print('uncompressed: %i bytes' % os.path.getsize('test.hkl'))
print('compressed:   %i bytes' % os.path.getsize('test_gzip.hkl'))
    
# Load data
array_hkl = hkl.load('test_gzip.hkl')
    
# Check the two are the same file
assert array_hkl.dtype == array_obj.dtype
assert np.all((array_hkl, array_obj))
```


In [20]:
import hickle as hkl
def GetEmbeddingMatrix():
    if isfile(file_embedding_matrix):
        embedding_matrix = hkl.load(file_embedding_matrix)
        hkl.dump(embedding_matrix, file_embedding_matrix, mode='w')
    else:
        embedding_matrix = CreateEmbeddingMatrix()
    
    return embedding_matrix

# embedding_matrix = GetEmbeddingMatrix()


In [14]:
from keras.callbacks import Callback
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))

In [21]:
# A blog about LSTM-CNNs:
# http://konukoii.com/blog/2018/02/19/twitter-sentiment-analysis-using-combined-lstm-cnn-models/
from keras.layers import Dense,Input,Bidirectional,Activation,Conv1D,GRU, Dropout,Embedding
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model
from keras.optimizers import Adam

def GetModel():
    sequence_input = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix],trainable = False)(sequence_input)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(128, return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
    x = Conv1D(64, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform")(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, max_pool]) 
    # x = Dense(128, activation='relu')(x)
    # x = Dropout(0.1)(x)
    preds = Dense(6, activation="sigmoid")(x)
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',optimizer=Adam(lr=1e-3),metrics=['accuracy'])
    return model

model = GetModel()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 150, 300)     30000000    input_2[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 150, 300)     0           embedding_2[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 150, 256)     329472      spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
conv1d_1 (

In [22]:
epochs = 2
batch_size = 128

In [23]:
from sklearn.model_selection import train_test_split
X_tra, X_val, y_tra, y_val = train_test_split(X_train_pad, y_train, train_size=0.9, random_state=233)



In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
checkpoint = ModelCheckpoint(file_model, monitor='val_acc', save_best_only=True, mode='max')
early = EarlyStopping(monitor="val_acc", mode="max", patience=5)
ra_val = RocAucEvaluation(validation_data=(X_val, y_val), interval = 1)
callbacks_list = [ra_val,checkpoint, early]

In [None]:
# model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),callbacks = callbacks_list,verbose=1)    


In [60]:
model.load_weights(file_model)
print('Predicting....')
y_pred = model.predict(X_test_pad,batch_size=1024)
submission = pd.read_csv('data/sample_submission.csv')
submission[labels] = y_pred
submission.to_csv(file_submission, index=False)
print('Done')

Predicting....
Done


### Error Analysis

In [None]:
# this is the original data
path_train_origin = 'data/train.csv'
path_test_origin = 'data/test.csv'
X_train_origin, X_test_origin, y_train, merge_origin = GetData(path_train_origin, path_test_origin)

In [70]:
X = X_train_pad
y = y_train

y_pred = model.predict(X, batch_size=1024)
target_class = 0 # toxic
target_result = y_pred[:, target_class]
target_gt = y[labels[target_class]]


In [98]:
def Cost(fpr, tpr):
    a = 1-tpr
    b = fpr
    cost = a + 30*b
    return cost

# get the threshold that maximize the F1 score
def GetBestThreshold(gt, pred):

    fpr, tpr, thresholds = metrics.roc_curve(gt, pred)
    cost = np.inf
    threshold = None
    for idx in range(len(thresholds)):
        current_cost = Cost(fpr[idx], tpr[idx])
        if current_cost < cost:
            cost = current_cost
            threshold = thresholds[idx]
    return threshold, cost

threshold, _ = GetBestThreshold(target_gt, target_result)
print(threshold)

0.86334765


In [99]:
print(metrics.roc_auc_score(target_gt, target_result))
target_pred = target_result>threshold
mask = target_gt!=target_pred
error = np.sum(mask)
total = len(mask)
print(error)
print(total)
print((total-error)/total)

0.9806415376334906
6734
159571
0.9577993495058625


In [107]:
#X_train, X_test, y_train, merge = GetData(file_train, file_test)
misclassified_gt = target_gt[mask]
misclassified_x =  X_train[mask]
#X_train[:5]
Error0 = misclassified_x[misclassified_gt==1] # misclassify toxic as healthy
Error1 = misclassified_x[misclassified_gt==0] # misclassify healthy as toxic

In [116]:
for x in range(20):
    current = Error1.index[x]
    print(Error1[current])
    print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
    print(X_train_origin[current])
    print('######################################')

a bisexual like a homosexual a heterosexual defined sexual activity much like a year old boy attracted a girl sexually never sex still straight a person actually sexually attracted aroused sex well opposite sex bisexual
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
A Bisexual, like a homosexual or a heterosexual, is not defined by sexual activity. (Much like a 15 year old boy who is attracted to a girl sexually but has never had sex is still straight). A person who is actually sexually attracted/aroused by the same sex as well as the opposite sex is bisexual.
######################################
know sex foetus
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
"know the sex of the foetus"""
######################################
reply loser un defines vietnam part southeast asia far i know vietnam part asean used part french indochina laos shit countries anyway culture always influenced sea han chinese proper yangtze han chinese fringe indigenous tribes guangzhou guangxi admit vietnamese a bunch wanna

Ram it up ur ass very hard till ur eyes water
######################################
ms jackson lyrics andre yeah one right goes baby s mamas mamas mamas mamas baby mamas mamas yeah go like chorus andre i sorry ms jackson oooh i real never meant make daughter cry i apologize a trillion times i sorry ms jackson oooh i real never meant make daughter cry i apologize a trillion times big boi baby s drama mama like doin things like havin boys come neighborhood studio tryin fight need get a piece american pie take bite house i disconnect cable turn lights let know grandchild a baby a paycheck private school daycare shit medical bills i pay i love mom everything see i t one laid wanna rib start a custody war lawyers stay never got a chance hear side story divided fish fries cookouts child s birthday i t invited despite i show utmost respect i fall defend lady i call yeah chorus i sorry ms jackson oooh i real never meant make daughter cry i apologize a trillion times i sorry ms jackson oooh i 

######################################
