In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import time
from tqdm import tqdm
import gensim
import re
import gc

from keras.layers import Dense, Input, CuDNNLSTM, CuDNNGRU, Embedding, SpatialDropout1D
from keras.layers import Bidirectional, GlobalMaxPooling1D, Concatenate 
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
# from keras.engine import InputSpec, Layer
from keras import initializers, optimizers, layers
# from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import spacy
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.stem.lancaster import LancasterStemmer
lc = LancasterStemmer()
from nltk.stem import SnowballStemmer
sb = SnowballStemmer('english')

In [None]:
import warnings 
# Suppress every warning for the rest of the session. NOTE(review): this also
# hides deprecation/syntax warnings raised by later cells, so consider
# narrowing the filter while debugging.
warnings.filterwarnings('ignore')

In [None]:
# Global hyper-parameters read by the cells below.
max_length = 55          # tokens per comment: pad_sequences maxlen and the model's input width
embedding_size = 300     # embedding dimension passed to build_model
learning_rate = 0.0005   # Adam learning rate used in build_model
batch_size = 512         # batch size for both model.fit and model.predict
num_epoch = 4            # total epochs per model (run as num_epoch-1 epochs, then 1 more)

In [None]:
# Load the fastText wiki-news vectors; the next cell only uses the vocabulary
# order (index2word) to build the spell-checker's rank table.
spell_model = gensim.models.KeyedVectors.load_word2vec_format('../input/wikinews300d1mvec/wiki-news-300d-1M.vec')

In [None]:
# Map every vocabulary word to its position in the embedding file.
# The spell checker (see P) treats a smaller position as "more probable".
words = spell_model.index2word
w_rank = {word: i for i, word in enumerate(words)}
WORDS = w_rank

In [None]:
# Only the WORDS rank table is needed from here on; drop the model object
# and ask the garbage collector to run.
del spell_model
gc.collect()

def words(text):
    """Return the lower-cased runs of word characters found in ``text``.

    Example: 'https://docs.python.org' -> ['https', 'docs', 'python', 'org']
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\w+', lowered)]

def P(word):
    """Score ``word`` for ranking spelling candidates (higher is better).

    The score is the negated rank stored in WORDS, so a word with a smaller
    rank scores higher. A word missing from WORDS is treated as rank 0.
    """
    rank = WORDS.get(word, 0)
    return -rank

def known(words):
    """Return, as a set, the members of ``words`` that are keys of WORDS."""
    return {candidate for candidate in words if candidate in WORDS}

def edits1(word):
    """Return the set of strings exactly one simple edit away from ``word``.

    Edits considered at every split position: deleting one character,
    swapping two adjacent characters, replacing one character with a
    lowercase ASCII letter, and inserting a lowercase ASCII letter.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertion is possible at every position, including the very end.
        for ch in alphabet:
            results.add(head + ch + tail)
        if not tail:
            continue
        rest = tail[1:]
        # Deletion of the character right after the cut.
        results.add(head + rest)
        # Replacement of that same character.
        for ch in alphabet:
            results.add(head + ch + rest)
        # Transposition needs at least two characters after the cut.
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])
    return results

def edits2(word):
    """Return a generator over strings two simple edits away from ``word``.

    The first-hop edit set is computed immediately; second-hop edits are
    produced lazily, and duplicates are not removed.
    """
    first_hop = edits1(word)
    return (twice for once in first_hop for twice in edits1(once))
    
def candidates(word):
    """Return the possible spelling corrections for ``word``.

    Preference order: the word itself if it is known, otherwise the known
    words one edit away, otherwise the unchanged word as a one-element list.
    """
    exact = known([word])
    if exact:
        return exact
    nearby = known(edits1(word))
    if nearby:
        return nearby
    return [word]

def correction(word):
    """Return the candidate for ``word`` with the highest P score."""
    options = candidates(word)
    best = max(options, key=P)
    return best

def singlify(word):
    """Collapse every run of a repeated letter to one letter.

    Example: 'apple' -> 'aple'
    """
    collapsed = []
    for letter in word:
        # Keep a letter only when it starts a new run.
        if not collapsed or collapsed[-1] != letter:
            collapsed.append(letter)
    return ''.join(collapsed)

In [None]:
def load_glove(word_dict, lemma_dict):
    """Build an embedding matrix for ``word_dict`` from the GloVe 840B 300d file.

    Args:
        word_dict: mapping of token text -> row index (indices start at 1).
        lemma_dict: mapping of token text -> lemma; must contain every key
            of ``word_dict`` that reaches the lemma lookup step.

    Returns:
        (embedding_matrix, nb_words), where ``embedding_matrix`` is a float32
        array of shape (nb_words, 300) and ``nb_words == len(word_dict) + 1``.
        Row 0 is never written and stays all zeros. Tokens with no match get
        a vector filled with -1.

    For each token the first variant found in the embedding file wins, tried
    in this order: as-is, lower, upper, capitalized, Porter stem, Lancaster
    stem, Snowball stem, lemma, and finally (only for tokens longer than one
    character) the spell-checker's correction.
    """
    EMBEDDING_FILE = '../input/glove840b300dtxt/glove.840B.300d.txt'
    embed_size = 300

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Use a context manager so the handle is closed once the index is built,
    # and pin the encoding so the result does not depend on the platform default.
    with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
        # embeddings_index = {',': 300d vec, 'this': 300d vec, ...}
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file)

    nb_words = len(word_dict) + 1
    embedding_matrix = np.zeros((nb_words, embed_size), dtype=np.float32)
    unknown_vector = np.full((embed_size,), -1, dtype=np.float32)

    # Ordered, lazily evaluated spellings: a stemmer only runs when every
    # cheaper variant before it has missed.
    variants = (
        lambda k: k,
        lambda k: k.lower(),
        lambda k: k.upper(),
        lambda k: k.capitalize(),
        ps.stem,
        lc.stem,
        sb.stem,
        lambda k: lemma_dict[k],
    )

    for key in tqdm(word_dict):
        embedding_vector = None
        for variant in variants:
            embedding_vector = embeddings_index.get(variant(key))
            if embedding_vector is not None:
                break
        # Last resort: spell-correct the token (skipped for single characters).
        if embedding_vector is None and len(key) > 1:
            embedding_vector = embeddings_index.get(correction(key))
        if embedding_vector is None:
            embedding_vector = unknown_vector
        embedding_matrix[word_dict[key]] = embedding_vector
    return embedding_matrix, nb_words

In [None]:
def load_fasttext(word_dict, lemma_dict):
    """Build an embedding matrix for ``word_dict`` from the fastText wiki-news file.

    Args:
        word_dict: mapping of token text -> row index (indices start at 1).
        lemma_dict: mapping of token text -> lemma; must contain every key
            of ``word_dict`` that reaches the lemma lookup step.

    Returns:
        (embedding_matrix, nb_words), where ``embedding_matrix`` is a float32
        array of shape (nb_words, 300) and ``nb_words == len(word_dict) + 1``.
        Row 0 is never written and stays all zeros. Tokens with no match get
        a vector filled with -1.

    For each token the first variant found in the embedding file wins, tried
    in this order: as-is, lower, upper, capitalized, Porter stem, Lancaster
    stem, Snowball stem, lemma, and finally (only for tokens longer than one
    character) the spell-checker's correction.
    """
    EMBEDDING_FILE = '../input/wikinews300d1mvec/wiki-news-300d-1M.vec'
    embed_size = 300

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Use a context manager so the handle is closed once the index is built,
    # and pin the encoding so the result does not depend on the platform default.
    with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
        # Lines of 100 characters or fewer are skipped (presumably the short
        # header line of the .vec format).
        # embeddings_index = {',': 300d vec, 'this': 300d vec, ...}
        embeddings_index = dict(
            get_coefs(*o.split(" ")) for o in embedding_file if len(o) > 100
        )

    nb_words = len(word_dict) + 1
    embedding_matrix = np.zeros((nb_words, embed_size), dtype=np.float32)
    unknown_vector = np.full((embed_size,), -1, dtype=np.float32)

    # Ordered, lazily evaluated spellings: a stemmer only runs when every
    # cheaper variant before it has missed.
    variants = (
        lambda k: k,
        lambda k: k.lower(),
        lambda k: k.upper(),
        lambda k: k.capitalize(),
        ps.stem,
        lc.stem,
        sb.stem,
        lambda k: lemma_dict[k],
    )

    for key in tqdm(word_dict):
        embedding_vector = None
        for variant in variants:
            embedding_vector = embeddings_index.get(variant(key))
            if embedding_vector is not None:
                break
        # Last resort: spell-correct the token (skipped for single characters).
        if embedding_vector is None and len(key) > 1:
            embedding_vector = embeddings_index.get(correction(key))
        if embedding_vector is None:
            embedding_vector = unknown_vector
        embedding_matrix[word_dict[key]] = embedding_vector
    return embedding_matrix, nb_words

In [None]:
def load_para(word_dict, lemma_dict):
    """Build an embedding matrix for ``word_dict`` from the Paragram SL999 file.

    Args:
        word_dict: mapping of token text -> row index (indices start at 1).
        lemma_dict: mapping of token text -> lemma; must contain every key
            of ``word_dict`` that reaches the lemma lookup step.

    Returns:
        (embedding_matrix, nb_words), where ``embedding_matrix`` is a float32
        array of shape (nb_words, 300) and ``nb_words == len(word_dict) + 1``.
        Row 0 is never written and stays all zeros. Tokens with no match get
        a vector filled with -1.

    For each token the first variant found in the embedding file wins, tried
    in this order: as-is, lower, upper, capitalized, Porter stem, Lancaster
    stem, Snowball stem, lemma, and finally (only for tokens longer than one
    character) the spell-checker's correction.
    """
    EMBEDDING_FILE = '../input/paragram-300-sl999/paragram_300_sl999/paragram_300_sl999/paragram_300_sl999.txt'
    embed_size = 300

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Use a context manager so the handle is closed once the index is built.
    # Undecodable bytes are dropped (errors='ignore'), as before.
    with open(EMBEDDING_FILE, encoding='utf-8', errors='ignore') as embedding_file:
        # Lines of 100 characters or fewer are skipped.
        # embeddings_index = {',': 300d vec, 'this': 300d vec, ...}
        embeddings_index = dict(
            get_coefs(*o.split(" ")) for o in embedding_file if len(o) > 100
        )

    nb_words = len(word_dict) + 1
    embedding_matrix = np.zeros((nb_words, embed_size), dtype=np.float32)
    unknown_vector = np.full((embed_size,), -1, dtype=np.float32)

    # Ordered, lazily evaluated spellings: a stemmer only runs when every
    # cheaper variant before it has missed.
    variants = (
        lambda k: k,
        lambda k: k.lower(),
        lambda k: k.upper(),
        lambda k: k.capitalize(),
        ps.stem,
        lc.stem,
        sb.stem,
        lambda k: lemma_dict[k],
    )

    for key in tqdm(word_dict):
        embedding_vector = None
        for variant in variants:
            embedding_vector = embeddings_index.get(variant(key))
            if embedding_vector is not None:
                break
        # Last resort: spell-correct the token (skipped for single characters).
        if embedding_vector is None and len(key) > 1:
            embedding_vector = embeddings_index.get(correction(key))
        if embedding_vector is None:
            embedding_vector = unknown_vector
        embedding_matrix[word_dict[key]] = embedding_vector
    return embedding_matrix, nb_words

In [None]:
def build_model(embedding_matrix, nb_words, embedding_size=300):
    """Assemble and compile the binary classifier.

    Architecture: frozen embedding -> spatial dropout (0.3) ->
    bidirectional CuDNNLSTM(256) -> bidirectional CuDNNGRU(128); both
    recurrent outputs are global-max-pooled, concatenated, and fed to a
    single sigmoid unit.

    Args:
        embedding_matrix: pre-trained weights for the Embedding layer.
        nb_words: vocabulary size (rows of ``embedding_matrix``).
        embedding_size: embedding dimension (columns of ``embedding_matrix``).

    Returns:
        A Keras Model compiled with Adam (global ``learning_rate``),
        binary cross-entropy loss and an accuracy metric. The input width
        comes from the global ``max_length``.
    """
    token_ids = Input(shape=(max_length,))
    embedding_layer = Embedding(
        nb_words,
        embedding_size,
        weights=[embedding_matrix],
        trainable=False,  # keep the pre-trained vectors fixed
    )
    embedded = SpatialDropout1D(0.3)(embedding_layer(token_ids))

    lstm_seq = Bidirectional(CuDNNLSTM(256, return_sequences=True))(embedded)
    gru_seq = Bidirectional(CuDNNGRU(128, return_sequences=True))(lstm_seq)

    # Pool each recurrent output over the time axis, LSTM first, then GRU.
    pooled = [GlobalMaxPooling1D()(seq) for seq in (lstm_seq, gru_seq)]
    merged = Concatenate()(pooled)
    predictions = Dense(1, activation='sigmoid')(merged)

    model = Model(inputs=token_ids, outputs=predictions)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model
    

In [None]:
# English contraction -> expanded form, applied by clean() to each
# space-separated token. Matching is exact and case-sensitive: only the
# spellings listed here (lower-case, plus the capitalised "I..." forms)
# are expanded.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", 
                       "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                       "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", 
                       "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", 
                       "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", 
                       "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", 
                       "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", 
                       "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
                       "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                       "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", 
                       "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", 
                       "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", 
                       "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", 
                       "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", 
                       "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
                       "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", 
                       "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
                       "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  
                       "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", 
                       "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", 
                       "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", 
                       "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", 
                       "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
                       "y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", 
                       "you're": "you are", "you've": "you have" }

In [None]:
# Obfuscated / misspelled profanity variants that clean() collapses into the
# single token 'fuck'.
# NOTE(review): clean() compares individual space-separated tokens against
# this list, so entries that themselves contain a space (e.g. 'blow job',
# 'f u c k', 's hit') can never match there.
replace_with_fuck = ['4r5e',  '5h1t', '5hit', 'ass-fucker', 'assfucker', 'assfukka', 'asswhole', 'a_s_s', 
                     'b!tch', 'b17ch', 'blow job', 'boiolas', 'bollok', 'boooobs', 'booooobs', 'booooooobs',
                     'bunny fucker', 'buttmuch', 'c0cksucker', 'carpet muncher', 'cl1t', 'cockface', 'cockmunch',
                     'cockmuncher', 'cocksuka', 'cocksukka', 'cokmuncher', 'coksucka', 'cunillingus', 'cuntlick',
                     'cuntlicker', 'cuntlicking', 'cyalis', 'cyberfuc', 'cyberfuck', 'cyberfucked', 'cyberfucker',
                     'cyberfuckers', 'cyberfucking', 'dirsa', 'dlck', 'dog-fucker', 'donkeyribber', 'ejaculatings',
                     'ejakulate', 'f u c k', 'f u c k e r', 'f4nny', 'faggitt', 'faggs', 'fannyflaps', 
                     'fannyfucker', 'fanyy', 'fingerfucker', 'fingerfuckers', 'fingerfucks', 'fistfuck', 'fistfucked',
                     'fistfucker', 'fistfuckers', 'fistfucking', 'fistfuckings', 'fistfucks', 'fuckingshitmotherfucker',
                     'fuckwhit', 'fudge packer', 'fudgepacker', 'fukwhit', 'fukwit', 'fux0r', 'f_u_c_k', 'god-dam',
                     'kawk', 'knobead', 'knobed', 'knobend', 'knobjocky', 'knobjokey', 'kondum', 'kondums', 'kummer',
                     'kumming', 'kums', 'kunilingus', 'l3itch', 'm0f0', 'm0fo', 'm45terbate', 'ma5terb8', 'ma5terbate',
                     'master-bate', 'masterb8', 'masterbat3', 'masterbations', 'mof0', 'mothafuck', 'mothafuckaz',
                     'mothafucked', 'mothafucking', 'mothafuckings', 'mothafucks', 'mother fucker', 'motherfucked',
                     'motherfuckings', 'motherfuckka', 'motherfucks', 'muthafecker', 'muthafuckker', 'n1gga', 'n1gger',
                     'nigg3r', 'nigg4h', 'nob jokey', 'nobjocky', 'nobjokey', 'penisfucker', 'phuked', 'phuking',
                     'phukked', 'phukking', 'phuks', 'phuq', 'pigfucker', 'pimpis', 'pissflaps', 'rimjaw', 's hit',
                     'scroat', 'sh!t', 'shitdick', 'shitfull', 'shitings', 'shittings', 's_h_i_t', 't1tt1e5', 
                     't1tties', 'teez', 'tittie5', 'tittiefucker', 'tittywank', 'tw4t', 'twathead', 'twunter',
                     'v14gra', 'v1gra', 'w00se', 'whoar']

In [None]:
def clean(text):
    """Normalise one comment: expand contractions, then unify profanity variants.

    The text is split on single spaces. Each token that is a key of
    ``contraction_mapping`` is replaced by its expansion; the result is
    re-split and every token listed in ``replace_with_fuck`` is replaced by
    'fuck'. Matching is exact and case-sensitive, and entries of
    ``replace_with_fuck`` that contain a space can never match a token.

    Args:
        text: the raw comment string.

    Returns:
        The cleaned comment string, tokens re-joined with single spaces.
    """
    # Membership tests against the ~170-entry list were a linear scan per
    # token; build a frozenset once and cache it on the function object.
    # Re-run this cell after editing replace_with_fuck to refresh the cache.
    try:
        profanity = clean._profanity
    except AttributeError:
        profanity = clean._profanity = frozenset(replace_with_fuck)

    expanded = ' '.join(contraction_mapping.get(t, t) for t in text.split(" "))
    return ' '.join('fuck' if t in profanity else t for t in expanded.split(" "))

In [None]:
# Read the competition CSVs, binarise the target at 0.5, clean the comment
# text, and stack train + test comments so they share one vocabulary.
start_time = time.time()
print("Loading data...")
train = pd.read_csv(
    '../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv'
).fillna(' ')
test = pd.read_csv(
    '../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv'
).fillna(' ')

# A comment counts as toxic when its target score is at least 0.5.
train['target'] = (train['target'] >= 0.5).values
train_text = train['comment_text'].apply(clean)
test_text = test['comment_text'].apply(clean)

# Train rows come first; num_train_data is later used to split them apart again.
text_list = pd.concat([train_text, test_text])
y = train['target'].values
num_train_data = len(y)
print('--- %s seconds ---' % (time.time() - start_time))

# The raw frames are no longer needed.
del train, test, train_text, test_text
gc.collect()

In [None]:
# import json

# file = open('../input/word-dict-and-lemma-dict/word_dict.txt', 'r') 
# js = file.read()
# word_dict = json.loads(js)   
# file.close() 

# file = open('../input/word-dict-and-lemma-dict/lemma_dict.txt', 'r') 
# js = file.read()
# lemma_dict = json.loads(js)   
# file.close() 

In [None]:
print('Spacy NLP...')
# Load the large English model with the parser, NER and tagger components turned off.
# NOTE(review): with 'tagger' disabled, token.pos_ is presumably empty, which
# would make the PUNCT filter in the vocabulary loop a no-op — confirm.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner', 'tagger'])
# Register IS_STOP so a token is flagged when its lower-cased text is in spaCy's English stop-word list.
nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)

In [None]:
######################
# Tokenise every comment and build, in one pass:
#   word_dict      token text -> integer id (ids start at 1; 0 is never assigned)
#   lemma_dict     token text -> lemma of its first occurrence
#   word_sequences one list of ids per comment, in text_list order
word_dict = {}
lemma_dict = {}
word_sequences = []
word_index = 1
docs = nlp.pipe(text_list, n_threads=3)
# Take the progress-bar total from the data instead of a hard-coded row count.
for doc in tqdm(docs, total=len(text_list)):
    word_seq = []
    for token in doc:
        # Compare by value: the original `is not "PUNCT"` tested object
        # identity, which does not reliably detect equal strings.
        if token.pos_ == 'PUNCT':
            continue
        token_text = token.text
        if token_text not in word_dict:
            word_dict[token_text] = word_index
            word_index += 1
            lemma_dict[token_text] = token.lemma_
        word_seq.append(word_dict[token_text])
    word_sequences.append(word_seq)
    
######################
# word_dict = {'The': 1, 'is': 2....}
# lemma_dict = {'The': 'the', 'is': 'is'...}
# word_seq = [1, 2, ...]
# word_sequences = [[1, 2, ...], [...], ...]
# 1902194it

In [None]:
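# doc = [token1, token2, ...]   (each item yielded by nlp.pipe is one Doc of tokens)
# token.text          the token's text
# token.idx           character offset of the token within its document
# token.lemma_        lemma (base form)
# token.is_punct      whether the token is punctuation
# token.is_space      whether the token is whitespace
# token.shape_        orthographic shape, e.g. Xxx, xxxx, dd
# token.pos_          coarse-grained part of speech, e.g. NOUN, PUNCT
# token.tag_          fine-grained part of speech, e.g. NN, VBD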

In [None]:
# text_list was built as train comments followed by test comments, so the
# first num_train_data sequences belong to the training set.
train_word_sequences = word_sequences[:num_train_data]
test_word_sequences = word_sequences[num_train_data:]
# Release the spaCy pipeline and the combined sequence list.
del(docs, nlp, word_sequences)
gc.collect()

In [None]:
# Force every sequence to exactly max_length ids; shorter sequences are
# padded at the end (padding='post').
# NOTE(review): truncation of longer sequences uses the pad_sequences
# default — check the Keras docs for which end is dropped.
train_word_sequences = pad_sequences(train_word_sequences, maxlen=max_length, padding='post')
test_word_sequences = pad_sequences(test_word_sequences, maxlen=max_length, padding='post')
# print(train_word_sequences[0])
# print(test_word_sequences[0])

# The raw text is no longer needed once the id sequences exist.
del(text_list)
gc.collect()

In [None]:
# Model 1 of 3: build the GloVe embedding matrix and time how long it takes.
start_time = time.time()
print("Loading embedding matrix...")
embedding_matrix_glove, nb_words = load_glove(word_dict, lemma_dict)
# Disabled variant that concatenated GloVe and fastText into one 600d matrix:
# embedding_matrix_fasttext, nb_words = load_fasttext(word_dict, lemma_dict)
# embedding_matrix = np.concatenate((embedding_matrix_glove, embedding_matrix_fasttext), axis=1)
print("--- %s seconds ---" % (time.time() - start_time))
# embedding matrix 536300 * 600

In [None]:
print('Start training ...')
# Build the classifier on top of the frozen GloVe embeddings and print its layer summary.
model = build_model(embedding_matrix_glove, nb_words, embedding_size)
model.summary()

In [None]:
# Two-stage schedule: train num_epoch-1 epochs and take a test prediction
# weighted 0.3, then train one more epoch and add a prediction weighted 0.7.
pred_prob1 = np.zeros((len(test_word_sequences),), dtype=np.float32)
for stage_epochs, stage_weight in ((num_epoch - 1, 0.3), (1, 0.7)):
    model.fit(train_word_sequences, y, batch_size=batch_size, epochs=stage_epochs)
    stage_pred = model.predict(test_word_sequences, batch_size=batch_size, verbose=2)
    pred_prob1 += stage_weight * np.squeeze(stage_pred)

In [None]:
# Reset the Keras backend session and drop the GloVe model and matrix
# before the next model is built; pred_prob1 is kept for the final blend.
K.clear_session()
del(model, embedding_matrix_glove)
gc.collect()

In [None]:
# Model 2 of 3: build the fastText embedding matrix and time how long it takes.
start_time = time.time()
print("Loading embedding matrix...")
embedding_matrix_fasttext, nb_words = load_fasttext(word_dict, lemma_dict)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
print('Start training ...')
# Build the classifier on top of the frozen fastText embeddings and print its layer summary.
model = build_model(embedding_matrix_fasttext, nb_words, embedding_size)
model.summary()

In [None]:
# Two-stage schedule: train num_epoch-1 epochs and take a test prediction
# weighted 0.3, then train one more epoch and add a prediction weighted 0.7.
pred_prob2 = np.zeros((len(test_word_sequences),), dtype=np.float32)
for stage_epochs, stage_weight in ((num_epoch - 1, 0.3), (1, 0.7)):
    model.fit(train_word_sequences, y, batch_size=batch_size, epochs=stage_epochs)
    stage_pred = model.predict(test_word_sequences, batch_size=batch_size, verbose=2)
    pred_prob2 += stage_weight * np.squeeze(stage_pred)

In [None]:
# Reset the Keras backend session and drop the fastText model and matrix
# before the next model is built; pred_prob2 is kept for the final blend.
K.clear_session()
del(model, embedding_matrix_fasttext)
gc.collect()

In [None]:
# Model 3 of 3: build the Paragram embedding matrix and time how long it takes.
start_time = time.time()
print("Loading embedding matrix...")
embedding_matrix_para, nb_words = load_para(word_dict, lemma_dict)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
print('Start training ...')
# Build the classifier on top of the frozen Paragram embeddings and print its layer summary.
model = build_model(embedding_matrix_para, nb_words, embedding_size)
model.summary()

In [None]:
# Two-stage schedule: train num_epoch-1 epochs and take a test prediction
# weighted 0.3, then train one more epoch and add a prediction weighted 0.7.
pred_prob3 = np.zeros((len(test_word_sequences),), dtype=np.float32)
for stage_epochs, stage_weight in ((num_epoch - 1, 0.3), (1, 0.7)):
    model.fit(train_word_sequences, y, batch_size=batch_size, epochs=stage_epochs)
    stage_pred = model.predict(test_word_sequences, batch_size=batch_size, verbose=2)
    pred_prob3 += stage_weight * np.squeeze(stage_pred)

In [None]:
# import pynvml
# pynvml.nvmlInit()
# # the 0 passed below is the GPU index
# handle = pynvml.nvmlDeviceGetHandleByIndex(0)
# meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
# print(meminfo.total/1024/1024)
# print(meminfo.used/1024/1024)
# print(meminfo.free/1024/1024)


In [None]:
# Final prediction = unweighted mean of the three models' blended test
# predictions, written in the sample submission's layout (indexed by 'id').
# NOTE(review): assumes sample_submission.csv rows are in the same order as
# test.csv, since predictions are assigned positionally — confirm.
submission = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv', index_col='id')
submission['prediction'] = (pred_prob1 + pred_prob2 + pred_prob3) / 3
# # submission.reset_index(drop=False, inplace=True)
submission.to_csv('submission.csv')