# Quora Insincere Questions Classification
> Detect toxic content to improve online conversations

In [1]:
import os
import sys
import gc
import glob
import operator 
import time
import re
import random
import functools
import numpy as np
import pandas as pd
from string import punctuation
from collections import Counter, OrderedDict
from tqdm import tqdm
tqdm.pandas()

import tensorflow as tf
import keras
from keras import Model
from keras.layers import *
from keras.layers.merge import _Merge
from keras.models import *
from keras.initializers import *
from keras.optimizers import *
from keras.callbacks import *
from keras.regularizers import *
from keras import backend as K
from keras.legacy import interfaces
from keras.engine.topology import Layer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.generic_utils import serialize_keras_object
from keras.utils.generic_utils import deserialize_keras_object
from keras.utils import multi_gpu_model

from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

Using TensorFlow backend.


In [11]:
test = pd.read_csv('../input/test.csv')
sub = pd.read_csv("../input/submission.csv")

## Config
[ref: 如何在 Keras 开发过程中获取可复现的结果？](https://keras.io/zh/getting-started/faq/#how-can-i-obtain-reproducible-results-using-keras-during-development)

[ref: How can I obtain reproducible results using Keras during development?](https://keras.io/getting-started/faq/#how-can-i-obtain-reproducible-results-using-keras-during-development)

In [2]:
SEED = 2018
# python
os.environ['PYTHONHASHSEED'] = str(SEED)
# random
np.random.seed(SEED)
tf.set_random_seed(SEED)
# tf
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
session_conf.gpu_options.allow_growth = True
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)
# data
max_features = 95000
maxlen = 72
cv = True

## Load Data

In [3]:
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
print("Train shape: ", train.shape)
print("Test shape: ", test.shape)
sub = test[['qid']]

Train shape:  (1306122, 3)
Test shape:  (56370, 2)


In [4]:
# # 数据采样
# train_pos = train[train['target']==1]
# print("train_pos shape: ", train_pos.shape)
# train_neg = train[train['target']==0]
# print("train_neg shape: ", train_neg.shape)
# train_neg = train_neg.sample(frac=0.2)
# print("train_neg shape: ", train_neg.shape)
# train = pd.concat([train_pos, train_neg])
# train = train.sample(frac=1)
# print("Train shape: ", train.shape)
# train.head()

## Load Embedding

In [None]:
def load_emb(filename):
    def get_coefs(word, *arr): 
        return word, np.asarray(arr, dtype='float32')
    if "wiki-news-300d-1M.vec" in filename:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(filename) if len(o)>100)
    else:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(filename, encoding='latin'))
    return embeddings_index

glove = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
para = "../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt"
# fasttext = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
print("Extracting embedding")
embeddings_index_glove = load_emb(glove)
embeddings_index_para = load_emb(para)
# embeddings_index_fasttext = load_emb(fasttext)

In [7]:
# 所有词
def build_vocab(texts):
    sentences = texts.apply(lambda x: x.split()).values
    vocab = {}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab


def check_coverage(vocab, embeddings_index):
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word in vocab.keys():
        try:
            known_words[word] = embeddings_index[word]
            nb_known_words += vocab[word]
        except:
            unknown_words[word] = vocab[word]
            nb_unknown_words += vocab[word]
    # 词向量中有的词占总词的种数的比例（比较种类）
    print('Found embeddings for {:.3%} of vocab'.format(len(known_words) / len(vocab)))
    # 整个文本中，知道词的数目占总次数的比例（比较数目）
    print('Found embeddings for  {:.3%} of all text'.format(nb_known_words / (nb_known_words + nb_unknown_words)))
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]
    return unknown_words


def add_lower(embedding, vocab):
    count = 0
    for word in vocab:
        if word in embedding and word.lower() not in embedding:  
            embedding[word.lower()] = embedding[word]
            count += 1
    print(f"Added {count} words to embedding")

## Preprocess

In [8]:
# Get set of all punctuations in dataset
tmp = []
for x in train.question_text:
    for c in x:
        if not c.isalnum():
            tmp.append(c)
for x in test.question_text:
    for c in x:
        if not c.isalnum():
            tmp.append(c)
puncs = set(tmp) - set(' ')
unpunc = puncs - set(punctuation)

In [9]:
contraction = { "ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because",
                "could've": "could have", "couldn't": "could not", "didn't": "did not",
                "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not",
                "haven't": "have not", "haven ' t""he'd": "he would","he'll": "he will", "he's": "he is",
                "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have",
                "i'm": "i am", "i've": "i have", "i'd": "i would", "i'd've": "i would have",
                "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have",
                "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will",
                "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have",
                "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
                "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not",
                "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
                "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have",
                "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have",
                "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is",
                "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is",
                "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
                "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will",
                "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not",
                "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is",
                "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did",
                "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have",
                "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have",
                "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have",
                "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
                "y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will",
                "you'll've": "you will have", "you're": "you are", "you've": "you have" }

punc = {"ँ": "", "◦": "", "̆": "", "✏": "", "": "", "ี": "", "♡": "o", "△": "", "⇒": "", "": "", "＄": " dollar ",
        "→": "", "͚": "", "️": "", "⟩": "", "¡": "i", "್": "", "‬": "", "̘": "", "ា": "", "¿": "?", "⧼": "",
        "": "", "®": " r ", "ौ": "", "∼": "", "َ": "", "ూ": "", "”": "'", "̙": "", "⋅": "", "̷": "", "̓": "", "、": "",
        "⬇": "", "̔": "", "∗": "*", "͕": "", "͡": "", "̿": "", "‌": "", "͜": "", "̦": "", "": "", "♨": "", "̮": "",
        "ௌ": "", "»": " ", "➡": "", "̼": "", "̌": "", "̢": "", "？": "?", "": "", "ৃ": "", "ం": "", "⊥": "",
        "̧": "", "ਾ": "", "》": " ", "ਂ": "", "ិ": "", "∨": "", "ী": "", "े": "", "⧽": "", "⁡": "", "ु": "",
        "ٌ": "", "₦": " naira ", "̸": "", "़": "", "̃": "", "": "", "͎": "", "∧": "", "，": "", "÷": "/", "،": "",
        "↓": "", "✔": "", "⁠": "", "¶": "", "ೋ": "", "͖": "", "ে": "", "☝": "", "«": " ", "": "", "ं": "",
        "《": " ", "ॉ": "", "）": "", "͉": "", "⟨": "", "": "", "ْ": "", "‏": "", "₱": " peso ", "°": "",
        "͋": "", "✌": "", "্": "", "᠌": "", "♣": "", "×": "x", "ো": "", "؟": "?", "˜": "", "̩": "", "̱": "",
        "̺": "", "͔": "", "▾": "", "⎛": "", "ొ": "", "்": "", "̊": "", "̥": "", "ੁ": "", "่": "", "﻿": "", "˚": "",
        "ా": "", "ા": "", "™": " tm ", "ِ": "", "∈": "", "⃗": "", "≅": "=", "̵": "", "♭": "", "ಾ": "", "；": ".",
        "̒": "", "ி": "", "´": "'", "＞": ">", "̣": "", "ุ": "", "ّ": "", "▒": "", "।": "", "–": "-", "∖": "",
        "̰": "", "ॄ": "", "‘": "'", "̶": "-", "ो": "", "！": "!", "☺": "", "̎": "", "″": "", "＝": "=", "˂": "",
        "਼": "", "ः": "", "ֿ": "", "♏": "", "¦": "", "̝": "", "̈": "", "́": "", "‐": "-", "“": "'", "ാ": "",
        "≤": "<=", "ੀ": "", "": "", "\n": "", "◌": "", "ृ": "", "ு": "", "ा": "", "¥": " yen ", "‑": "-",
        "￼": "", "": "", "्": "", "̭": "", "": "", "¬": "", "͌": "", "̍": "", "„": "", "ី": "", "•": "", "↑": "",
        "͘": "", "": "", "͇": "", "̫": "", "ா": "", "͛": "", "︠": "", "⁻": "-", "᾽": "", "ি": "", "̟": "", "│": "|",
        "̕": "", "͊": "", "̑": "", "‎": "", "☁": "", "ಿ": "", "ी": "", "̀": "", "়": "", "̐": "", "☉": "", "": "",
        "⚧": "", "£": " pound ", "・": "", "⋯": "...", "−": "-", "∅": " ", "¸": ",", "̋": "", "̲": "", "⎝": "",
        "͆": "", "〗": "]", "／": "", "ั": "", "：": "", "ோ": "", "̽": "", "©": " c ", "": "", "്": "", "ು": "",
        "ు": "", "్": "", "ि": "", "⊨": "", "̈́": "", "̚": "", "̖": "", "̡": "", "·": ".", "✅": "", "ͅ": "",
        "ੰ": "", "̾": "", "…": "", "＾": "^", "≈": "=", "—": "-", "♀": "", "❤": "", "્": "", "ା": "", "¢": "",
        "⎞": "", "ె": "", "​": "", "̻": "", "（": "", "‪": "", "≠": "!=", "ॢ": "", "ં": "", "〖": "[", "­": "", "∂": "",
        "̬": "", "͐": "", "": "", "₊": "+", "℅": "%", "̛": "", "‰": "", "ਿ": "", "͈": "", "́": "", "͂": "", "̞": "",
        "ి": "", "้": "", "̗": "", "ു": "", "": "", "’": "'", "া": "", "ើ": "", "": "", "ះ": "", "」": "]", "︡": "",
        "ू": "", "̳": "", "ை": "", "⊂": "", "∇": "", "≥": ">=", "̄": "", "₹": " e ", "̜": "", "̴": "", "℃": "",
        "±": "+", "⌚": " time ", "≡": "", "̹": "", "̯": "", "′": "", "ీ": "", "ូ": "", "－": " ", "「": "[", "̀": "",
        "¨": "'", "ॣ": "", "⦁": "", "€": " euro ", "❓": "?", "ู": "", "͗": "", "̅": "", "̂": "", "͠": "", "̤": "",
        "្": "", "̉": "", "₩": "", "": "", "̪": "", "ै": "", "∘": "", "ៃ": "", "͑": "", "ំ": "", "͒": "", "☹": "",
        "͝": "", "‛": "'", "⎠": "", "¯": "", "。": ".", "∆": "", "ി": "", "̓": "", "∝": "", "†": "", "≱": "", "²": "2",
        "`": "'", 'à': 'a', '³': '3', 'π': 'pi', "₁": "1", "₃": "3", "₆": "6", "¼": "1/4", "⁷": "7", "¾": "3/4",
        "⁵": "5", "₅": "5", "½": "1/2", "₄": "4", "⅔": "2/3", "₂": "2", "¹": "1"}

mispell = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling',
           'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor',
           'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu': 'youtube ',
           'qoura': 'quora', 'quorans': 'quora users', 'quoran': 'quora user', 'sallary': 'salary', 'whta': 'what',
           'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much',
           'howmany': 'how many', 'whydo': 'why do', 'doi': 'do i', 'thebest': 'the best', 'howdoes': 'how does',
           'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating',
           'pennis': 'penis', 'etherium': 'ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data',
           '2k15': '2015', '2k16': '2016', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend',
           'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization',
           'demonitization': 'demonetization', 'demonetisation': 'demonetization', 'pokémon': 'pokemon',
           'nanodegree': 'nano degree', 'brexit': 'british exit', 'cryptocurrencies': 'crypto currencies',
           'coinbase': 'coin base', 'oneplus': 'one plus', 'redmi': 'red mi', 'GDPR': 'general data protection regulation',
           'DCEU': 'dc extended universe', 'litecoin': 'lite coin', 'unacademy': 'non academy', 'altcoin': 'bitcoin alternative',
           'altcoins': 'bitcoin alternative', 'sjw': 'social justice warriors', 'sjws': 'social justice warriors',
           'fiancé': 'fiance', 'microservices': 'micro services', 'bitconnect': 'bit connect', 'codeforces': 'code forces',
           'wannacry': 'wanna cry', 'onedrive': 'one drive', 'airpods': 'air pods', 'twinflame': 'twin flame',
           'undergraduation': 'under graduation', 'cos2x': 'cos 2 x', 'yourquote': 'your quote', 'xiomi': 'xiaomi',
           'undertale': 'under tale', 'genderfluid': 'gender fluid', 'são': 'sao', 'chapterwise': 'chapter wise',
           'deepmind': 'deep mind', '': '', 'arrowverse': 'arrow verse', 'overbrace': ' ', 'tensorflow': 'tensor flow',
           'hackerrank': 'hacker rank', 'microservice': 'micro service', 'reactjs': 'react js', 'hackerearth': 'hacker earth',
           'fiancée': 'fiance', 'blockchains': 'block chains', 'beyoncé': 'beyonce', 'neuralink': 'neura link',
           'openai': 'open ai', 'zoomcar': 'zoom car', 'hyperconjugation': 'hyper conjugation', 'autoencoder': 'auto encoder',
           'webassembly': 'web assembly', 'quoras': 'quora', 'digilocker': 'digi locker', 'oversmart': 'over smart',
           'cryptocoins': 'crypto coins', 'crytocurrencies': 'cryto currencies', 'cyrptocurrency': 'cyrpto currency',
           'café': 'cafe', 'whatapp': 'whatsapp', 'gaslighter': 'gas lighter', 'darkweb': 'dark web', 'webnovel': 'web novel'}

In [10]:
def replace_quote(text):
    quote = ['´', '‘', '’', "`"]
    for s in quote:
        text = text.replace(s, "'")
    return text
                      
def re_mapping(mapping):
    res = re.compile('(%s)' % '|'.join(mapping.keys()))
    return res

mapping = dict(set(contraction.items()) | set(mispell.items()))
re_map = re_mapping(mapping)
def replace_mapping(text):
    def replace(match):
        return mapping[match.group(0)]
    return re_map.sub(replace, text)

re_punc = re_mapping(punc)
def replace_punc(text):
    def replace(match):
        return punc[match.group(0)]
    return re_punc.sub(replace, text)

def sep_punc(x):
    for p in puncs:
        x = x.replace(p, f' {p} ')
    return x

def replace_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

def add_features(df):
    df['question_text'] = df['question_text'].progress_apply(lambda x: str(x))
    df['num_chars'] = df['question_text'].progress_apply(len)
    df['num_words'] = df.question_text.str.count('\S+')

    df['num_capital'] = df['question_text'].progress_apply(lambda x: sum(1 for c in x if c.isupper()))
    df['capital_rate'] = df['num_capital'] / df['num_words']

    df['num_uniquewords'] = df['question_text'].progress_apply(lambda x: len(set(x.split())))
    df['unique_rate'] = df['num_uniquewords'] / df['num_words']

    df["num_titlewords"] = df["question_text"].progress_apply(lambda x: len([w for w in x.split() if w.istitle()]))
    df['title_rate'] = df['num_titlewords'] / df['num_words']
    
    df["num_upperwords"] = df["question_text"].progress_apply(lambda x: len([w for w in x.split() if w.isupper()]))
    df['upper_rate'] = df['num_upperwords'] / df['num_words']
    
    df["num_exc"] = df["question_text"].progress_apply(lambda x: x.count("!")).astype('uint16')
    df["num_q"] = df['question_text'].progress_apply(lambda x: x.count("?")).astype('uint16')
    df["num_,"] = df['question_text'].progress_apply(lambda x: x.count(",")).astype('uint16')
    df["num_."] = df['question_text'].progress_apply(lambda x: x.count(".")).astype('uint16')
    df["mean_word_len"] = df["question_text"].progress_apply(lambda x: np.mean([len(w) for w in x.split()]))
    df["max_word_len"] = df['question_text'].progress_apply(lambda x: max([len(w) for w in x.split()]))

    df["num_unpunc"] = df["question_text"].progress_apply(lambda x: sum(x.count(p) for p in unpunc)).astype('uint16')
    df["num_punc"] = df["question_text"].progress_apply(lambda x: sum(x.count(p) for p in punctuation)).astype('uint16')
    df["num_mispell"] = df["question_text"].progress_apply(lambda x: sum(x.count(p) for p in mispell)).astype('uint16')
    
#     for s in [",", ";", '"', "...", "?", "!", ".", ":", "*", "-"]:
#         df[s] = df["question_text"].progress_apply(lambda x: np.mean([len(w) for w in x.split(s)]))
    return df

In [11]:
# 0.689
# feature_cols = ['capital_rate', 'unique_rate', 'title_rate', 'upper_rate']

# 0.694
# feature_cols = ['num_chars', 'num_words', 'num_capital', 'num_uniquewords', "num_titlewords", "num_upperwords",
#                 "num_exc", "num_q", "mean_word_len", "max_word_len", "num_unpunc", "num_punc", "num_mispell"]
# feature_cols += [",", ";", '"', "...", "?", "!", ".", ":", "*", "-"]

# 0.697
# feature_cols = ['num_chars', 'num_words', 'num_capital', 'num_uniquewords', "num_titlewords",
#                 "num_upperwords", "num_exc", "num_q", "mean_word_len", "max_word_len"]

# 0.698
# feature_cols = ["num_exc", "num_q", "mean_word_len", "max_word_len"]

# 0.699
# feature_cols = ['capital_rate', 'num_chars', 'num_words', "max_word_len", "mean_word_len",
#                 'num_capital', "num_punc", 'num_uniquewords', "num_q", "num_unpunc"]

# 0.697
# feature_cols = ['capital_rate', 'unique_rate', 'num_chars', 'num_words', "max_word_len", "mean_word_len",
#                 'num_capital', "num_punc", 'num_uniquewords', "num_q", "num_unpunc", "num_exc", "num_mispell"]

# 0.697
# feature_cols = ['capital_rate',  'num_chars', 'num_words', "max_word_len", "mean_word_len", 'num_capital',
#                 "num_punc", 'num_uniquewords', "num_q", "num_unpunc", "num_exc", "num_mispell"]

# 0.701
feature_cols = ['capital_rate',  'num_chars', 'num_words', "max_word_len", "mean_word_len",
                'num_capital', "num_punc", 'num_uniquewords', "num_q", "num_unpunc", "num_exc"]

# Add features
train = add_features(train)
test = add_features(test)

features = train[feature_cols].fillna(0)
test_features = test[feature_cols].fillna(0)
ss = StandardScaler()
ss.fit(np.vstack((features, test_features)))
features = ss.transform(features)
test_features = ss.transform(test_features)
print("Add features done")

100%|██████████| 1306122/1306122 [00:01<00:00, 1294245.98it/s]
100%|██████████| 1306122/1306122 [00:00<00:00, 1363556.60it/s]
100%|██████████| 1306122/1306122 [00:05<00:00, 219464.58it/s]
100%|██████████| 1306122/1306122 [00:02<00:00, 439676.27it/s]
100%|██████████| 1306122/1306122 [00:03<00:00, 429827.37it/s]
100%|██████████| 1306122/1306122 [00:02<00:00, 472993.50it/s]
100%|██████████| 1306122/1306122 [00:01<00:00, 1073922.44it/s]
100%|██████████| 1306122/1306122 [00:01<00:00, 1106692.92it/s]
100%|██████████| 1306122/1306122 [00:01<00:00, 1098404.29it/s]
100%|██████████| 1306122/1306122 [00:01<00:00, 1103008.75it/s]
100%|██████████| 1306122/1306122 [00:12<00:00, 102693.65it/s]
100%|██████████| 1306122/1306122 [00:03<00:00, 435253.45it/s]
100%|██████████| 1306122/1306122 [01:05<00:00, 19958.70it/s]
100%|██████████| 1306122/1306122 [00:07<00:00, 172517.81it/s]
100%|██████████| 1306122/1306122 [00:26<00:00, 49469.87it/s]
100%|██████████| 56370/56370 [00:00<00:00, 1229180.75it/s]
100%|██

Add features done


In [8]:
train['question_text'].tolist()+test['question_text'].tolist()

['How did Quebec nationalists see their province as a nation in the 1960s?',
 'Do you have an adopted dog, how would you encourage people to adopt and not shop?',
 'Why does velocity affect time? Does velocity affect space geometry?',
 'How did Otto von Guericke used the Magdeburg hemispheres?',
 'Can I convert montra helicon D to a mountain bike by just changing the tyres?',
 'Is Gaza slowly becoming Auschwitz, Dachau or Treblinka for Palestinians?',
 'Why does Quora automatically ban conservative opinions when reported, but does not do the same for liberal views?',
 'Is it crazy if I wash or wipe my groceries off? Germs are everywhere.',
 'Is there such a thing as dressing moderately, and if so, how is that different than dressing modestly?',
 'Is it just me or have you ever been in this phase wherein you became ignorant to the people you once loved, completely disregarding their feelings/lives so you get to have something go your way and feel temporarily at ease. How did things chan

In [12]:
# Before preprocess:
print("Before preprocess:")
vocab = build_vocab(train['question_text'].tolist()+test['question_text'].tolist())
print("glove:")
oov_glove = check_coverage(vocab, embeddings_index_glove)
print("para:")
oov_para = check_coverage(vocab, embeddings_index_para)
# print("fasttext:")
# oov_fasttext = check_coverage(vocab, embeddings_index_fasttext)

# Lower
train["question_text"] = train["question_text"].str.lower()
test["question_text"] = test["question_text"].str.lower()
print("Lower done")

# Add lower word to embedding:
add_lower(embeddings_index_glove, vocab)
add_lower(embeddings_index_para, vocab)
# add_lower(embeddings_index_fasttext, vocab)

# Replace quote
train['question_text'] = train['question_text'].progress_apply(lambda x: replace_quote(x))
test['question_text'] = test['question_text'].progress_apply(lambda x: replace_quote(x))
print("Replace quote done")

# Replace mapping(contraction & mispell)
train['question_text'] = train['question_text'].progress_apply(lambda x: replace_mapping(x))
test['question_text'] = test['question_text'].progress_apply(lambda x: replace_mapping(x))
print("Replace mapping done")

# Replace punc(效果变差)
# train['question_text'] = train['question_text'].progress_apply(lambda x: replace_punc(x))
# test['question_text'] = test['question_text'].progress_apply(lambda x: replace_punc(x))
# print("Replace punc done")

# Sep punc
train['question_text'] = train['question_text'].progress_apply(lambda x: sep_punc(x))
test['question_text'] = test['question_text'].progress_apply(lambda x: sep_punc(x))
print("Sep punc done")

# Replace numbers
train['question_text'] = train['question_text'].progress_apply(lambda x: replace_numbers(x))
test['question_text'] = test['question_text'].progress_apply(lambda x: replace_numbers(x))
print("Replace numbers done")

# After preprocess:
print("After preprocess:")
vocab = build_vocab(train['question_text'])
print("glove:")
oov_glove = check_coverage(vocab, embeddings_index_glove)
print("para:")
oov_para = check_coverage(vocab, embeddings_index_para)
# print("fasttext:")
# oov_fasttext = check_coverage(vocab, embeddings_index_fasttext)

Before preprocess:
glove:
Found embeddings for 33.024% of vocab
Found embeddings for  88.148% of all text
para:
Found embeddings for 19.541% of vocab
Found embeddings for  72.206% of all text
Lower done
Added 14725 words to embedding


  5%|▌         | 68715/1306122 [00:00<00:01, 687147.38it/s]

Added 0 words to embedding


100%|██████████| 1306122/1306122 [00:01<00:00, 814891.26it/s]
100%|██████████| 56370/56370 [00:00<00:00, 741222.46it/s]
  0%|          | 1616/1306122 [00:00<01:20, 16157.90it/s]

Replace quote done


100%|██████████| 1306122/1306122 [01:09<00:00, 18840.36it/s]
100%|██████████| 56370/56370 [00:02<00:00, 18932.24it/s]
  0%|          | 1301/1306122 [00:00<01:40, 13002.01it/s]

Replace mapping done


100%|██████████| 1306122/1306122 [01:26<00:00, 15176.92it/s]
100%|██████████| 56370/56370 [00:03<00:00, 15193.24it/s]
  1%|          | 10721/1306122 [00:00<00:12, 107208.31it/s]

Sep punc done


100%|██████████| 1306122/1306122 [00:10<00:00, 124877.63it/s]
100%|██████████| 56370/56370 [00:00<00:00, 124566.42it/s]


Replace numbers done
After preprocess:
glove:
Found embeddings for 69.050% of vocab
Found embeddings for  99.428% of all text
para:
Found embeddings for 73.524% of vocab
Found embeddings for  99.481% of all text


## Save preprocess result

In [13]:
# with open("tmp/clean_text.txt", "w", encoding="UTF-8") as f:
#     for s in train["question_text"]:
#         f.write(s + "\n")
# train.to_csv("tmp/preprocess.csv")

In [14]:
def get_maxlen(df):
    """获取训练集单个序列的最大长度
    """
    assert df is not None, "df can not be None"
    maxlen = 0
    for d in df:
        maxlen = max(maxlen, len(d.split()))
    return maxlen

def get_avglen(df):
    """获取训练集单个序列的平均长度
    """
    assert df is not None, "df can not be None"
    tmp = [len(d.split()) for d in df]
    counter = Counter(tmp)
    res = OrderedDict(sorted(counter.items(), key=lambda t: t[0]))
    return np.mean(tmp), res

# 最大长度 word level
print("maxlen_word_train:", get_maxlen(train.question_text))
print("maxlen_word_testa:", get_maxlen(test.question_text))
# 平均长度 word level
print("avglen_word_train:\n", get_avglen(train.question_text))
print("avglen_word_testa:\n", get_avglen(test.question_text))

maxlen_word_train: 602
maxlen_word_testa: 398
avglen_word_train:
 (14.729379031974043, OrderedDict([(1, 5), (2, 19), (3, 49), (4, 5348), (5, 20390), (6, 39603), (7, 68448), (8, 96094), (9, 113196), (10, 115721), (11, 109959), (12, 96637), (13, 84339), (14, 71963), (15, 61554), (16, 52403), (17, 44853), (18, 38151), (19, 32967), (20, 28461), (21, 24846), (22, 21667), (23, 19106), (24, 17205), (25, 15305), (26, 13820), (27, 12519), (28, 11251), (29, 10161), (30, 8936), (31, 7938), (32, 7033), (33, 6104), (34, 5222), (35, 4489), (36, 4044), (37, 3558), (38, 3167), (39, 2945), (40, 2663), (41, 2433), (42, 2341), (43, 2094), (44, 2042), (45, 1950), (46, 1801), (47, 1601), (48, 1510), (49, 1318), (50, 1162), (51, 989), (52, 872), (53, 796), (54, 607), (55, 519), (56, 441), (57, 333), (58, 249), (59, 200), (60, 152), (61, 125), (62, 65), (63, 52), (64, 41), (65, 36), (66, 36), (67, 21), (68, 9), (69, 13), (70, 14), (71, 11), (72, 10), (73, 7), (74, 13), (75, 9), (76, 10), (77, 4), (78, 11), (

In [15]:
def load_single_split(val_size=0.1):
    train_df, val_df = train_test_split(train, test_size=val_size, random_state=SEED)
    X_train = train_df["question_text"].values
    X_val = val_df["question_text"].values
    T_X = test["question_text"].values

    tokenizer = Tokenizer(num_words=max_features, filters='')
    tokenizer.fit_on_texts(list(X_train))
    X_train = tokenizer.texts_to_sequences(X_train)
    X_val = tokenizer.texts_to_sequences(X_val)
    T_X = tokenizer.texts_to_sequences(T_X)

    X_train = pad_sequences(X_train, maxlen=maxlen)
    X_val = pad_sequences(X_val, maxlen=maxlen)
    T_X = pad_sequences(T_X, maxlen=maxlen)

    Y_train = train_df['target'].values
    Y_val = val_df['target'].values  
    
    # shuffle
    train_idx = np.random.permutation(len(X_train))
    val_idx = np.random.permutation(len(X_val))
    X_train = X_train[train_idx]
    X_val = X_val[val_idx]
    Y_train = Y_train[train_idx]
    Y_val = Y_val[val_idx]
    return X_train, X_val, T_X, Y_train, Y_val, tokenizer.word_index

if cv:
    X = train["question_text"].fillna("_na_").values
    T_X = test["question_text"].fillna("_na_").values
    tokenizer = Tokenizer(num_words=max_features, filters='')
    tokenizer.fit_on_texts(list(X))
    X = tokenizer.texts_to_sequences(X)
    X = pad_sequences(X, maxlen=maxlen)
    T_X = tokenizer.texts_to_sequences(T_X)
    T_X = pad_sequences(T_X, maxlen=maxlen)
    Y = train['target'].values
    word_index = tokenizer.word_index
    print("len(word_index):", len(word_index))
else:
    X_train, X_val, T_X, Y_train, Y_val, word_index = load_single_split(val_size=0.1)

len(word_index): 187252


In [16]:
# save
# np.save("tmp/X", X)
# np.save("tmp/Y", Y)
# np.save("tmp/T_X", T_X)
# np.save("tmp/features", features)
# np.save("tmp/test_features", test_features)
# np.save("tmp/word_index", word_index)

# load
# X = np.load("tmp/X.npy")
# Y = np.load("tmp/Y.npy")
# T_X = np.load("tmp/T_X.npy")
# features = np.load("tmp/features.npy")
# test_features = np.load("tmp/test_features.npy")
# word_index = np.load("tmp/word_index.npy")

In [17]:
del train, test
gc.collect()

14

## Rebuild Embedding

In [18]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

def build_emb(embeddings_index, max_features, word_index):
    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    emb_size = all_embs.shape[1]

    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, emb_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

def build_emb_smart(emb1, emb2, max_features, word_index, emb_size=300):
    nb_words = min(max_features, len(word_index))

    all_emb1 = np.stack(emb1.values())
    emb1_mean, emb1_std = all_emb1.mean(), all_emb1.std()
    all_emb2 = np.stack(emb2.values())
    emb2_mean, emb2_std = all_emb2.mean(), all_emb2.std()

    embedding_matrix1 = np.random.normal(emb1_mean, emb1_std, (nb_words, emb_size))
    embedding_matrix2 = np.random.normal(emb2_mean, emb2_std, (nb_words, emb_size))
    embedding_matrix = np.mean([embedding_matrix1, embedding_matrix2], axis=0)

    for word, i in word_index.items():
        if i >= max_features: continue
        emb1_vector = emb1.get(word)
        emb2_vector = emb2.get(word)
        if emb1_vector is not None and emb2_vector is not None:
            embedding_matrix[i] = np.mean([emb1_vector, emb2_vector], axis=0)
        else:
            if emb1_vector is not None:
                embedding_matrix[i] = emb1_vector
            elif emb2_vector is not None:
                embedding_matrix[i] = emb2_vector
    return embedding_matrix

In [19]:
start_time = time.time()

emb_glove = build_emb(embeddings_index_glove, max_features, word_index)
emb_para = build_emb(embeddings_index_para, max_features, word_index)
# emb_fasttext = build_emb(embeddings_index_fasttext, max_features, word_index)

emb = np.mean([emb_glove, emb_para], axis=0)
# emb = np.mean([emb_glove, emb_para, emb_fasttext], axis=0)

# emb = build_emb_smart(embeddings_index_glove, embeddings_index_para, max_features, word_index)

total_time = (time.time() - start_time) / 60
print("Took {:.2f} minutes".format(total_time))
print(np.shape(emb))

Took 0.19 minutes
(95000, 300)


## Layer Zoo

In [20]:
def squash(x, axis=-1):
    s_squared_norm = K.sum(K.square(x), axis, keepdims=True)
    scale = K.sqrt(s_squared_norm + K.epsilon())
    return x / scale


class Capsule(Layer):
    def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Capsule, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(1, input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     # shape=self.kernel_size,
                                     initializer='glorot_uniform',
                                     trainable=True)
        else:
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(input_num_capsule,
                                            input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)

    def call(self, u_vecs):
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
        for i in range(self.routings):
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.routings - 1:
                b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        return (None, self.num_capsule, self.dim_capsule)


class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim


class JoinAttention(_Merge):
    def __init__(self, step_dim, hid_size,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        Keras Layer that implements an Attention mechanism according to other vector.
        Supports Masking.
        # Input shape, list of
            2D tensor with shape: `(samples, features_1)`.
            3D tensor with shape: `(samples, steps, features_2)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Example:
            en = LSTM(64, return_sequences=False)(input)
            de = LSTM(64, return_sequences=True)(input2)
            output = JoinAttention(64, 20)([en, de])
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.hid_size = hid_size
        super(JoinAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        if not isinstance(input_shape, list):
            raise ValueError('A merge layer [JoinAttention] should be called '
                             'on a list of inputs.')
        if len(input_shape) != 2:
            raise ValueError('A merge layer [JoinAttention] should be called '
                             'on a list of 2 inputs. '
                             'Got ' + str(len(input_shape)) + ' inputs.')
        if len(input_shape[0]) != 2 or len(input_shape[1]) != 3:
            raise ValueError('A merge layer [JoinAttention] should be called '
                             'on a list of 2 inputs with first ndim 2 and second one ndim 3. '
                             'Got ' + str(len(input_shape)) + ' inputs.')

        self.W_en1 = self.add_weight((input_shape[0][-1], self.hid_size),
                                 initializer=self.init,
                                 name='{}_W0'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.W_en2 = self.add_weight((input_shape[1][-1], self.hid_size),
                                 initializer=self.init,
                                 name='{}_W1'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.W_de = self.add_weight((self.hid_size,),
                                 initializer=self.init,
                                 name='{}_W2'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)

        if self.bias:
            self.b_en1 = self.add_weight((self.hid_size,),
                                     initializer='zero',
                                     name='{}_b0'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
            self.b_en2 = self.add_weight((self.hid_size,),
                                     initializer='zero',
                                     name='{}_b1'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
            self.b_de = self.add_weight((input_shape[1][1],),
                                     initializer='zero',
                                     name='{}_b2'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b_en1 = None
            self.b_en2 = None
            self.b_de = None

        self._reshape_required = False
        self.built = True

    def compute_output_shape(self, input_shape):
        return input_shape[1][0], input_shape[1][-1]

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, inputs, mask=None):
        en = inputs[0]
        de = inputs[1]
        de_shape = K.int_shape(de)
        step_dim = de_shape[1]

        hid_en = K.dot(en, self.W_en1)
        hid_de = K.dot(de, self.W_en2)
        if self.bias:
            hid_en += self.b_en1
            hid_de += self.b_en2
        hid = K.tanh(K.expand_dims(hid_en, axis=1) + hid_de)
        eij = K.reshape(K.dot(hid, K.reshape(self.W_de, (self.hid_size, 1))), (-1, step_dim))
        if self.bias:
            eij += self.b_de[:step_dim]

        a = K.exp(eij - K.max(eij, axis=-1, keepdims=True))

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask[1], K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = de * a
        return K.sum(weighted_input, axis=1)


def dot_product(x, kernel):
    """
    Wrapper for dot product operation, in order to be compatible with both
    Theano and Tensorflow
    Args:
        x (): input
        kernel (): weights
    Returns:
    """
    if K.backend() == 'tensorflow':
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)


class AttentivePooling(Layer):
    def __init__(self, W_regularizer=None, b_regularizer=None, **kwargs):
        self.supports_masking = False
        # self.mask =mask
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        super(AttentivePooling, self).__init__(**kwargs)

    def build(self, input_shape):

        n_in = input_shape[2]
        n_out = 1
        lim = np.sqrt(6. / (n_in + n_out))
        # tanh initializer xavier
        self.W = K.random_uniform_variable((n_in, n_out), -lim, lim,
                                           name='{}_W'.format(self.name))
        self.b = K.zeros((n_out,), name='{}_b'.format(self.name))
        self.trainable_weights = [self.W, self.b]
        self.regularizer = []
        if self.W_regularizer is not None:
            self.add_loss(self.W_regularizer(self.W))
        if self.b_regularizer is not None:
            self.add_loss(self.b_regularizer(self.b))
        self.build = True

    def call(self, inputs, mask=None):

        memory = inputs
        print('memory shape', K.int_shape(memory))
        gi = K.tanh(K.dot(memory, self.W) + self.b)  # 32 *6 *1
        gi = K.sum(gi, axis=-1)  # 32 *6
        alfa = K.softmax(gi)
        self.alfa = alfa
        output = K.sum(memory * K.expand_dims(alfa, axis=-1), axis=1)  # sum(32 *6 *310)
        print('output shape', K.int_shape(output))
        return output

    def compute_output_shape(self, input_shape):
        shape = input_shape
        shape = list(shape)

        return (shape[0], shape[2])

    def compute_mask(self, inputs, mask=None):
        return None


class AttentionWithContext(Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Supports Masking.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    How to use:
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Note: The layer has been tested with Keras 2.0.6
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
        # next add a Dense layer (for classification/regression) or whatever...
    """
 
    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
 
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
 
        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
 
        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)
 
        self.bias = bias
        super(AttentionWithContext, self).__init__(**kwargs)
 
    def build(self, input_shape):
        assert len(input_shape) == 3
 
        self.W = self.add_weight((input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight((input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
 
        self.u = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)
 
        super(AttentionWithContext, self).build(input_shape)
 
    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None
 
    def call(self, x, mask=None):
        uit = dot_product(x, self.W)
 
        if self.bias:
            uit += self.b
 
        uit = K.tanh(uit)
        ait = dot_product(uit, self.u)
 
        a = K.exp(ait)
 
        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())
 
        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
 
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)
 
    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]


class DropConnect(Wrapper):
    def __init__(self, layer, prob=1., **kwargs):
        self.prob = prob
        self.layer = layer
        super(DropConnect, self).__init__(layer, **kwargs)
        if 0. < self.prob < 1.:
            self.uses_learning_phase = True

    def build(self, input_shape):
        if not self.layer.built:
            self.layer.build(input_shape)
            self.layer.built = True
        super(DropConnect, self).build()

    def compute_output_shape(self, input_shape):
        return self.layer.compute_output_shape(input_shape)

    def call(self, x):
        if 0. < self.prob < 1.:
            self.layer.kernel = K.in_train_phase(K.dropout(self.layer.kernel, self.prob), self.layer.kernel)
            self.layer.bias = K.in_train_phase(K.dropout(self.layer.bias, self.prob), self.layer.bias)
        return self.layer.call(x)


class TargetedDropout(Layer):
    """See: https://openreview.net/pdf?id=HkghWScuoQ"""

    def __init__(self,
                 drop_rate,
                 target_rate,
                 **kwargs):
        """Initialize the layer.
        :param drop_rate: Dropout rate.
        :param target_rate: Targeting proportion.
        :param kwargs: Arguments for parent class.
        """
        super(TargetedDropout, self).__init__(**kwargs)
        self.supports_masking = True
        self.drop_rate = drop_rate
        self.target_rate = target_rate

    def get_config(self):
        config = {
            'drop_rate': self.drop_rate,
            'target_rate': self.target_rate,
        }
        base_config = super(TargetedDropout, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def compute_mask(self, inputs, mask=None):
        return mask

    def compute_output_shape(self, input_shape):
        return input_shape

    def _compute_target_mask(self, inputs, mask=None):
        input_shape = K.shape(inputs)
        input_type = K.dtype(inputs)
        mask_threshold = K.constant(1e8, dtype=input_type)
        channel_num = input_shape[-1]
        channel_dim = K.prod(input_shape[:-1])
        masked_inputs = inputs
        if mask is not None:
            masked_inputs = K.switch(
                K.cast(mask, K.floatx()) > 0.5,
                masked_inputs,
                K.ones_like(masked_inputs, dtype=input_type) * mask_threshold
            )
        norm = K.abs(masked_inputs)
        channeled_norm = K.transpose(K.reshape(norm, (channel_dim, channel_num)))
        weight_num = K.sum(
            K.reshape(K.cast(masked_inputs < mask_threshold, K.floatx()), (channel_dim, channel_num)),
            axis=0,
        )
        indices = K.stack(
            [
                K.arange(channel_num, dtype='int32'),
                K.cast(self.target_rate * weight_num, dtype='int32') - 1,
            ],
            axis=-1,
        )
        threshold = -K.tf.gather_nd(K.tf.nn.top_k(-channeled_norm, k=K.max(indices[:, 1]) + 1).values, indices)
        threshold = K.reshape(K.tile(threshold, [channel_dim]), input_shape)
        target_mask = K.switch(
            norm <= threshold,
            K.ones_like(inputs, dtype=K.floatx()),
            K.zeros_like(inputs, dtype=K.floatx()),
        )
        return target_mask

    def call(self, inputs, mask=None, training=None):
        target_mask = self._compute_target_mask(inputs, mask=mask)

        def dropped_mask():
            drop_mask = K.switch(
                K.random_uniform(K.shape(inputs)) < self.drop_rate,
                K.ones_like(inputs, K.floatx()),
                K.zeros_like(inputs, K.floatx()),
            )
            return target_mask * drop_mask

        def pruned_mask():
            return target_mask

        mask = K.in_train_phase(dropped_mask, pruned_mask, training=training)
        outputs = K.switch(
            mask > 0.5,
            K.zeros_like(inputs, dtype=K.dtype(inputs)),
            inputs,
        )
        return outputs


class DotProdSelfAttention(Layer):
    """The self-attention layer as in 'Attention is all you need'.
    paper reference: https://arxiv.org/abs/1706.03762
    
    """
    def __init__(self, units,
                 activation=None,
                 use_bias=False,
                 kernel_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 **kwargs):
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        super(DotProdSelfAttention, self).__init__(*kwargs)
        self.units = units
        self.activation = activations.get(activation)
        self.use_bias = use_bias
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.bias_initializer = initializers.get(bias_initializer)
        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)
        self.kernel_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)
        self.input_spec = InputSpec(min_ndim=2)
        self.supports_masking = True

    def build(self, input_shape):
        assert len(input_shape) == 3
        input_dim = input_shape[-1]
        # We assume the output-dim of Q, K, V are the same
        self.kernels = dict.fromkeys(['Q', 'K', 'V'])
        for key, _ in self.kernels.items():
            self.kernels[key] = self.add_weight(shape=(input_dim, self.units),
                                                initializer=self.kernel_initializer,
                                                name='kernel_{}'.format(key),
                                                regularizer=self.kernel_regularizer,
                                                constraint=self.kernel_constraint)
        if self.use_bias:
            raise NotImplementedError
        super(DotProdSelfAttention, self).build(input_shape)
        
    def call(self, x):
        Q = K.dot(x, self.kernels['Q'])
        K_mat = K.dot(x, self.kernels['K'])
        V = K.dot(x, self.kernels['V'])
        attention = K.batch_dot(Q, K.permute_dimensions(K_mat, [0, 2, 1]))
        d_k = K.constant(self.units, dtype=K.floatx())
        attention = attention / K.sqrt(d_k)
        attention = K.batch_dot(K.softmax(attention, axis=-1), V)
        return attention
    
    def compute_output_shape(self, input_shape):
        assert input_shape and len(input_shape) >= 2
        assert input_shape[-1]
        output_shape = list(input_shape)
        output_shape[-1] = self.units
        return tuple(output_shape)


def encoder(input_tensor, emb_size, n_heads=4):
    """One encoder as in Attention Is All You Need
    """
    # Sub-layer 1
    # Multi-Head Attention
    multiheads = []
    d_v = emb_size // n_heads
    for i in range(n_heads):
        multiheads.append(DotProdSelfAttention(d_v)(input_tensor))
    multiheads = concatenate(multiheads, axis=-1)
    multiheads = Dense(emb_size)(multiheads)
    multiheads = Dropout(0.1)(multiheads)
    
    # Residual Connection
    res_con = add([input_tensor, multiheads])
    # Didn't use layer normalization, use Batch Normalization instead here
    res_con = BatchNormalization(axis=-1)(res_con)
    
    # Sub-layer 2
    # 2 Feed forward layer
    ff1 = Dense(64, activation='relu')(res_con)
    ff2 = Dense(emb_size)(ff1)
    output = add([res_con, ff2])
    output = BatchNormalization(axis=-1)(output)
    
    return output


def positional_signal(hidden_size: int, length: int, min_timescale: float=1.0, max_timescale: float=1e4):
    """
    Helper function, constructing basic positional encoding.
    The code is partially based on implementation from Tensor2Tensor library
    https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/layers/common_attention.py
    """
    if hidden_size % 2 != 0:
        raise ValueError(
            f"The hidden dimension of the model must be divisible by 2."
            f"Currently it is {hidden_size}")
    position = K.arange(0, length, dtype=K.floatx())
    num_timescales = hidden_size // 2
    log_timescale_increment = K.constant(
        (np.log(float(max_timescale) / float(min_timescale)) / (num_timescales - 1)),
        dtype=K.floatx())
    inv_timescales = (min_timescale * K.exp(K.arange(num_timescales, dtype=K.floatx()) * -log_timescale_increment))
    scaled_time = K.expand_dims(position, 1) * K.expand_dims(inv_timescales, 0)
    signal = K.concatenate([K.sin(scaled_time), K.cos(scaled_time)], axis=1)
    return K.expand_dims(signal, axis=0)


class AddPositionalEncoding(Layer):
    """
    Injects positional encoding signal described in section 3.5 of the original
    paper "Attention is all you need". Also a base class for more complex
    coordinate encoding described in "Universal Transformers".
    """

    def __init__(self, min_timescale: float=1.0, max_timescale: float=1.0e4, **kwargs):
        self.min_timescale = min_timescale
        self.max_timescale = max_timescale
        self.signal = None
        super().__init__(**kwargs)

    def get_config(self):
        config = super().get_config()
        config['min_timescale'] = self.min_timescale
        config['max_timescale'] = self.max_timescale
        return config

    def build(self, input_shape):
        _, length, hidden_size = input_shape
        self.signal = positional_signal(
            hidden_size, length, self.min_timescale, self.max_timescale)
        return super().build(input_shape)

    def call(self, inputs, **kwargs):
        return inputs + self.signal


class AttentionWeightedAverage(Layer):
    """
    Computes a weighted average of the different channels across timesteps.
    Uses 1 parameter pr. channel to compute the attention value for a single timestep.
    """

    def __init__(self, return_attention=False, **kwargs):
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(** kwargs)

    def build(self, input_shape):
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        self.trainable_weights = [self.W]
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None


class KMaxPooling(Layer):
    """
    K-max pooling layer that extracts the k-highest activations from a sequence (2nd dimension).
    TensorFlow backend.
    """

    def __init__(self, k=2, **kwargs):
        super().__init__(**kwargs)
        self.input_spec = InputSpec(ndim=3)
        self.k = k

    def compute_output_shape(self, input_shape):
        return (input_shape[0], (input_shape[2] * self.k))

    def call(self, inputs):
        # swap last two dimensions since top_k will be applied along the last dimension
        shifted_input = tf.transpose(inputs, [0, 2, 1])

        # extract top_k, returns two tensors [values, indices]
        top_k = tf.nn.top_k(shifted_input, k=self.k, sorted=True, name=None)[0]

        # return flattened output
        return Flatten()(top_k)

    def get_config(self):
        config = {'k': self.k}
        base_config = super(KMaxPooling, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


class SeqSelfAttention(Layer):

    ATTENTION_TYPE_ADD = 'additive'
    ATTENTION_TYPE_MUL = 'multiplicative'

    def __init__(self,
                 units=32,
                 attention_width=None,
                 attention_type=ATTENTION_TYPE_ADD,
                 return_attention=False,
                 history_only=False,
                 kernel_initializer='glorot_normal',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 use_additive_bias=True,
                 use_attention_bias=True,
                 attention_activation=None,
                 attention_regularizer_weight=0.0,
                 **kwargs):
        """Layer initialization.
        For additive attention, see: https://arxiv.org/pdf/1806.01264.pdf
        :param units: The dimension of the vectors that used to calculate the attention weights.
        :param attention_width: The width of local attention.
        :param attention_type: 'additive' or 'multiplicative'.
        :param return_attention: Whether to return the attention weights for visualization.
        :param history_only: Only use historical pieces of data.
        :param kernel_initializer: The initializer for weight matrices.
        :param bias_initializer: The initializer for biases.
        :param kernel_regularizer: The regularization for weight matrices.
        :param bias_regularizer: The regularization for biases.
        :param kernel_constraint: The constraint for weight matrices.
        :param bias_constraint: The constraint for biases.
        :param use_additive_bias: Whether to use bias while calculating the relevance of inputs features
                                  in additive mode.
        :param use_attention_bias: Whether to use bias while calculating the weights of attention.
        :param attention_activation: The activation used for calculating the weights of attention.
        :param attention_regularizer_weight: The weights of attention regularizer.
        :param kwargs: Parameters for parent class.
        """
        self.supports_masking = True
        self.units = units
        self.attention_width = attention_width
        self.attention_type = attention_type
        self.return_attention = return_attention
        self.history_only = history_only
        if history_only and attention_width is None:
            self.attention_width = int(1e10)

        self.use_additive_bias = use_additive_bias
        self.use_attention_bias = use_attention_bias
        self.kernel_initializer = keras.initializers.get(kernel_initializer)
        self.bias_initializer = keras.initializers.get(bias_initializer)
        self.kernel_regularizer = keras.regularizers.get(kernel_regularizer)
        self.bias_regularizer = keras.regularizers.get(bias_regularizer)
        self.kernel_constraint = keras.constraints.get(kernel_constraint)
        self.bias_constraint = keras.constraints.get(bias_constraint)
        self.attention_activation = keras.activations.get(attention_activation)
        self.attention_regularizer_weight = attention_regularizer_weight
        self._backend = keras.backend.backend()

        if attention_type == SeqSelfAttention.ATTENTION_TYPE_ADD:
            self.Wx, self.Wt, self.bh = None, None, None
            self.Wa, self.ba = None, None
        elif attention_type == SeqSelfAttention.ATTENTION_TYPE_MUL:
            self.Wa, self.ba = None, None
        else:
            raise NotImplementedError('No implementation for attention type : ' + attention_type)

        super(SeqSelfAttention, self).__init__(**kwargs)

    def get_config(self):
        config = {
            'units': self.units,
            'attention_width': self.attention_width,
            'attention_type': self.attention_type,
            'return_attention': self.return_attention,
            'history_only': self.history_only,
            'use_additive_bias': self.use_additive_bias,
            'use_attention_bias': self.use_attention_bias,
            'kernel_initializer': keras.regularizers.serialize(self.kernel_initializer),
            'bias_initializer': keras.regularizers.serialize(self.bias_initializer),
            'kernel_regularizer': keras.regularizers.serialize(self.kernel_regularizer),
            'bias_regularizer': keras.regularizers.serialize(self.bias_regularizer),
            'kernel_constraint': keras.constraints.serialize(self.kernel_constraint),
            'bias_constraint': keras.constraints.serialize(self.bias_constraint),
            'attention_activation': keras.activations.serialize(self.attention_activation),
            'attention_regularizer_weight': self.attention_regularizer_weight,
        }
        base_config = super(SeqSelfAttention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def build(self, input_shape):
        if isinstance(input_shape, list):
            input_shape = input_shape[0]
        if self.attention_type == SeqSelfAttention.ATTENTION_TYPE_ADD:
            self._build_additive_attention(input_shape)
        elif self.attention_type == SeqSelfAttention.ATTENTION_TYPE_MUL:
            self._build_multiplicative_attention(input_shape)
        super(SeqSelfAttention, self).build(input_shape)

    def _build_additive_attention(self, input_shape):
        feature_dim = input_shape[2]

        self.Wt = self.add_weight(shape=(feature_dim, self.units),
                                  name='{}_Add_Wt'.format(self.name),
                                  initializer=self.kernel_initializer,
                                  regularizer=self.kernel_regularizer,
                                  constraint=self.kernel_constraint)
        self.Wx = self.add_weight(shape=(feature_dim, self.units),
                                  name='{}_Add_Wx'.format(self.name),
                                  initializer=self.kernel_initializer,
                                  regularizer=self.kernel_regularizer,
                                  constraint=self.kernel_constraint)
        if self.use_additive_bias:
            self.bh = self.add_weight(shape=(self.units,),
                                      name='{}_Add_bh'.format(self.name),
                                      initializer=self.bias_initializer,
                                      regularizer=self.bias_regularizer,
                                      constraint=self.bias_constraint)

        self.Wa = self.add_weight(shape=(self.units, 1),
                                  name='{}_Add_Wa'.format(self.name),
                                  initializer=self.kernel_initializer,
                                  regularizer=self.kernel_regularizer,
                                  constraint=self.kernel_constraint)
        if self.use_attention_bias:
            self.ba = self.add_weight(shape=(1,),
                                      name='{}_Add_ba'.format(self.name),
                                      initializer=self.bias_initializer,
                                      regularizer=self.bias_regularizer,
                                      constraint=self.bias_constraint)

    def _build_multiplicative_attention(self, input_shape):
        feature_dim = input_shape[2]

        self.Wa = self.add_weight(shape=(feature_dim, feature_dim),
                                  name='{}_Mul_Wa'.format(self.name),
                                  initializer=self.kernel_initializer,
                                  regularizer=self.kernel_regularizer,
                                  constraint=self.kernel_constraint)
        if self.use_attention_bias:
            self.ba = self.add_weight(shape=(1,),
                                      name='{}_Mul_ba'.format(self.name),
                                      initializer=self.bias_initializer,
                                      regularizer=self.bias_regularizer,
                                      constraint=self.bias_constraint)

    def call(self, inputs, mask=None, **kwargs):
        if isinstance(inputs, list):
            inputs, positions = inputs
            positions = K.cast(positions, 'int32')
            mask = mask[1]
        else:
            positions = None

        input_len = K.shape(inputs)[1]

        if self.attention_type == SeqSelfAttention.ATTENTION_TYPE_ADD:
            e = self._call_additive_emission(inputs)
        elif self.attention_type == SeqSelfAttention.ATTENTION_TYPE_MUL:
            e = self._call_multiplicative_emission(inputs)

        if self.attention_activation is not None:
            e = self.attention_activation(e)
        e = K.exp(e - K.max(e, axis=-1, keepdims=True))
        if self.attention_width is not None:
            ones = tf.ones((input_len, input_len))
            if self.history_only:
                local = tf.matrix_band_part(
                    ones,
                    K.minimum(input_len, self.attention_width - 1),
                    0,
                )
            else:
                local = tf.matrix_band_part(
                    ones,
                    K.minimum(input_len, self.attention_width // 2),
                    K.minimum(input_len, (self.attention_width - 1) // 2),
                )
            e = e * K.expand_dims(local, 0)
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            mask = K.expand_dims(mask)
            e = K.permute_dimensions(K.permute_dimensions(e * mask, (0, 2, 1)) * mask, (0, 2, 1))

        # a_{t} = \text{softmax}(e_t)
        s = K.sum(e, axis=-1)
        s = K.tile(K.expand_dims(s, axis=-1), K.stack([1, 1, input_len]))
        a = e / (s + K.epsilon())

        # l_t = \sum_{t'} a_{t, t'} x_{t'}
        v = K.batch_dot(a, inputs)
        if self.attention_regularizer_weight > 0.0:
            self.add_loss(self._attention_regularizer(a))

        if positions is not None:
            pos_num = K.shape(positions)[1]
            batch_indices = K.tile(K.expand_dims(K.arange(K.shape(inputs)[0]), axis=-1), K.stack([1, pos_num]))
            pos_indices = K.stack([batch_indices, positions], axis=-1)
            v = tf.gather_nd(v, pos_indices)
            a = tf.gather_nd(a, pos_indices)

        if self.return_attention:
            return [v, a]
        return v

    def _call_additive_emission(self, inputs):
        input_shape = K.shape(inputs)
        batch_size, input_len = input_shape[0], input_shape[1]

        # h_{t, t'} = \tanh(x_t^T W_t + x_{t'}^T W_x + b_h)
        q, k = K.dot(inputs, self.Wt), K.dot(inputs, self.Wx)
        q = K.tile(K.expand_dims(q, 2), K.stack([1, 1, input_len, 1]))
        k = K.tile(K.expand_dims(k, 1), K.stack([1, input_len, 1, 1]))
        if self.use_additive_bias:
            h = K.tanh(q + k + self.bh)
        else:
            h = K.tanh(q + k)

        # e_{t, t'} = W_a h_{t, t'} + b_a
        if self.use_attention_bias:
            e = K.reshape(K.dot(h, self.Wa) + self.ba, (batch_size, input_len, input_len))
        else:
            e = K.reshape(K.dot(h, self.Wa), (batch_size, input_len, input_len))
        return e

    def _call_multiplicative_emission(self, inputs):
        # e_{t, t'} = x_t^T W_a x_{t'} + b_a
        e = K.batch_dot(K.dot(inputs, self.Wa), K.permute_dimensions(inputs, (0, 2, 1)))
        if self.use_attention_bias:
            e = e + self.ba
        return e

    def compute_output_shape(self, input_shape):
        if isinstance(input_shape, list):
            input_shape, pos_shape = input_shape
            output_shape = (input_shape[0], pos_shape[1], input_shape[2])
        else:
            output_shape = input_shape
        if self.return_attention:
            attention_shape = (input_shape[0], output_shape[1], input_shape[1])
            return [output_shape, attention_shape]
        return output_shape

    def compute_mask(self, inputs, mask=None):
        if isinstance(inputs, list):
            mask = mask[1]
        if self.return_attention:
            return [mask, None]
        return mask

    def _attention_regularizer(self, attention):
        batch_size = K.cast(K.shape(attention)[0], K.floatx())
        input_len = K.shape(attention)[-1]
        return self.attention_regularizer_weight * K.sum(K.square(K.batch_dot(
            attention,
            K.permute_dimensions(attention, (0, 2, 1))) - tf.eye(input_len))) / batch_size

    @staticmethod
    def get_custom_objects():
        return {'SeqSelfAttention': SeqSelfAttention}


def focal_loss(gamma=2., alpha=.25):
    def focal_loss_fixed(y_true, y_pred):
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
        return -K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1))-K.sum((1-alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
    return focal_loss_fixed

## Evaluation standards

In [21]:
def f1_smart(y_true, y_pred):
    args = np.argsort(y_pred)
    tp = y_true.sum()
    fs = (tp - np.cumsum(y_true[args[:-1]])) / np.arange(y_true.shape[0] + tp - 1, tp, -1)
    res_idx = np.argmax(fs)
    return 2 * fs[res_idx], (y_pred[args[res_idx]] + y_pred[args[res_idx + 1]]) / 2

def threshold_search(y_true, y_pred):
    best_threshold = 0
    best_score = 0
    for threshold in [i * 0.01 for i in range(100)]:
        score = f1_score(y_true=y_true, y_pred=(y_pred > threshold).astype(int))
        if score > best_score:
            best_threshold = threshold
            best_score = score
    return best_score, best_threshold

def f1(y_true, y_pred):
    '''
    metric from here 
    https://stackoverflow.com/questions/43547402/how-to-calculate-f1-macro-in-keras
    '''
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1, batch_size=1024):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data
        self.max_score = 0
        self.not_better_count = 0
        self.batch_size = batch_size

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, batch_size=self.batch_size, verbose=2)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))
            if (score > self.max_score):
                print("*** New High Score (previous: %.6f) \n" % self.max_score)
                model.save_weights("best_weights.h5")
                self.max_score=score
                self.not_better_count = 0
            else:
                self.not_better_count += 1
                if self.not_better_count > 3:
                    print("Epoch %05d: early stopping, high score = %.6f" % (epoch,self.max_score))
                    self.model.stop_training = True


def as_keras_metric(method):
    @functools.wraps(method)
    def wrapper(self, args, **kwargs):
        """ Wrapper for turning tensorflow metrics into keras metrics """
        value, update_op = method(self, args, **kwargs)
        K.get_session().run(tf.local_variables_initializer())
        with tf.control_dependencies([update_op]):
            value = tf.identity(value)
        return value
    return wrapper

## Callbacks

In [22]:
class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or 
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.
    For more detail, please see paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode paramater is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn == None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr != None:
            self.base_lr = new_base_lr
        if new_max_lr != None:
            self.max_lr = new_max_lr
        if new_step_size != None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs={}):
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        
        K.set_value(self.model.optimizer.lr, self.clr())

        
class WarmUp(Callback):
    def __init__(self):
        self.num_passed_batchs = 0
        self.warmup_epochs = 1
    def on_batch_begin(self, batch, logs=None):
        # params 是模型自动传递给 Callback 的一些参数
        if self.params['steps'] == None:
            self.steps_per_epoch = np.ceil(1. * self.params['samples'] / self.params['batch_size'])
        else:
            self.steps_per_epoch = self.params['steps']
        if self.num_passed_batchs < self.steps_per_epoch * self.warmup_epochs:
            # 前 1 个 epoch 中，学习率线性地从 0 增加到 0.001
            K.set_value(self.model.optimizer.lr,
                        0.001 * (self.num_passed_batchs + 1) / self.steps_per_epoch / self.warmup_epochs)
            self.num_passed_batchs += 1

## Optimizer

In [23]:
class AdamW(Optimizer):
    """Adam optimizer.
    Default parameters follow those provided in the original paper.
    # Arguments
        lr: float >= 0. Learning rate.
        beta_1: float, 0 < beta < 1. Generally close to 1.
        beta_2: float, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.
        decay: float >= 0. Learning rate decay over each update.
        weight_decay: float >= 0. Decoupled weight decay over each update.
    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
        - [Optimization for Deep Learning Highlights in 2017](http://ruder.io/deep-learning-optimization-2017/index.html)
        - [Fixing Weight Decay Regularization in Adam](https://arxiv.org/abs/1711.05101)
    """

    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4,  # decoupled weight decay (1/6)
                 epsilon=1e-8, decay=0., **kwargs):
        super(AdamW, self).__init__(**kwargs)
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.lr = K.variable(lr, name='lr')
            self.init_lr = lr # decoupled weight decay (2/6)
            self.beta_1 = K.variable(beta_1, name='beta_1')
            self.beta_2 = K.variable(beta_2, name='beta_2')
            self.decay = K.variable(decay, name='decay')
            self.wd = K.variable(weight_decay, name='weight_decay') # decoupled weight decay (3/6)
        self.epsilon = epsilon
        self.initial_decay = decay

    @interfaces.legacy_get_updates_support
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]
        wd = self.wd # decoupled weight decay (4/6)

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))
        eta_t = lr / self.init_lr # decoupled weight decay (5/6)

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - eta_t * wd * p # decoupled weight decay (6/6)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'decay': float(K.get_value(self.decay)),
                  'weight_decay': float(K.get_value(self.wd)),
                  'epsilon': self.epsilon}
        base_config = super(AdamW, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


class SGDW(Optimizer):
    """Stochastic gradient descent optimizer.
    Includes support for momentum,
    learning rate decay, and Nesterov momentum.
    # Arguments
        lr: float >= 0. Learning rate.
        momentum: float >= 0. Parameter updates momentum.
        decay: float >= 0. Learning rate decay over each update.
        nesterov: boolean. Whether to apply Nesterov momentum.
        weight_decay: float >= 0. Decoupled weight decay over each update.
    # References
        - [Optimization for Deep Learning Highlights in 2017](http://ruder.io/deep-learning-optimization-2017/index.html)
        - [Fixing Weight Decay Regularization in Adam](https://arxiv.org/abs/1711.05101)
    """

    def __init__(self, lr=0.01, momentum=0., decay=0., weight_decay=1e-4, # decoupled weight decay (1/6)
                 nesterov=False, **kwargs):
        super(SGDW, self).__init__(**kwargs)
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.lr = K.variable(lr, name='lr')
            self.init_lr = lr # decoupled weight decay (2/6)
            self.momentum = K.variable(momentum, name='momentum')
            self.decay = K.variable(decay, name='decay')
            self.wd = K.variable(weight_decay, name='weight_decay') # decoupled weight decay (3/6)
        self.initial_decay = decay
        self.nesterov = nesterov

    @interfaces.legacy_get_updates_support
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]
        wd = self.wd  # decoupled weight decay (4/6)

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))
        eta_t = lr / self.init_lr # decoupled weight decay (5/6)
        
        # momentum
        shapes = [K.int_shape(p) for p in params]
        moments = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + moments
        for p, g, m in zip(params, grads, moments):
            v = self.momentum * m - lr * g  # velocity
            self.updates.append(K.update(m, v))

            if self.nesterov:
                new_p = p + self.momentum * v - lr * g  - eta_t * wd * p  # decoupled weight decay (6/6)
            else:
                new_p = p + v - lr * wd * p # decoupled weight decay

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'momentum': float(K.get_value(self.momentum)),
                  'decay': float(K.get_value(self.decay)),
                  'nesterov': self.nesterov}
        base_config = super(SGDW, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

## Models

In [33]:
class GruCapsule():
    def model(self, embedding_matrix, maxlen, max_features):
        inp = Input(shape=(maxlen,))
        x_emb = Embedding(max_features,
                          embedding_matrix.shape[1],
                          weights=[embedding_matrix],
                          trainable=False)(inp)
        x = SpatialDropout1D(0.2)(x_emb)
        x = Bidirectional(CuDNNLSTM(80,
                                   return_sequences=True,
                                   kernel_initializer=glorot_normal(seed=2018),
                                   recurrent_initializer=orthogonal(gain=1.0, seed=2018)
                                  ))(x)
        
        x = Capsule(num_capsule=10, dim_capsule=10, routings=4, share_weights=True)(x)
        x = Flatten()(x)
        x = Dense(32, activation='relu', kernel_initializer=glorot_normal(seed=2018))(x)
        x = Dropout(0.1)(x)
        output = Dense(1, activation="sigmoid")(x)
        
        model = Model(inputs=inp, outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam')
        return model


class GCDC():
    def model(self, embedding_matrix, maxlen, max_features):
        inp_seq = Input(shape=(maxlen,), name='seq')
        inp_feature = Input(shape=(len(feature_cols),), name='feature')
        emb_size = embedding_matrix.shape[1]
        x = Embedding(max_features, emb_size, weights=[embedding_matrix], trainable=False)(inp_seq)
        x = SpatialDropout1D(0.2)(x)
        x = Bidirectional(CuDNNLSTM(96, 
                                    return_sequences=True, 
                                    kernel_initializer=glorot_normal(seed=2018), 
                                    recurrent_initializer=orthogonal(gain=1.0, seed=2018)))(x)

        x_1 = Attention(maxlen)(x)
        # x_1 = DropConnect(Dense(32, activation="relu", kernel_initializer=glorot_normal(seed=SEED)), prob=0.1)(x_1)

        x_2 = Capsule(num_capsule=10, dim_capsule=10, routings=4, share_weights=True)(x)
        x_2 = Flatten()(x_2)
        # x_2 = DropConnect(Dense(32, activation="relu", kernel_initializer=glorot_normal(seed=SEED)), prob=0.1)(x_2)

        # x_3 = DropConnect(Dense(32, activation="relu", kernel_initializer=glorot_normal(seed=SEED)), prob=0.1)(inp_feature)
        # x = concatenate([x_1, x_2, x_3])
        
        x = concatenate([x_1, x_2, inp_feature])
        x = Dense(32, activation='relu', kernel_initializer=glorot_normal(seed=SEED))(x)
        x = Dropout(0.1)(x)

        output = Dense(1, activation="sigmoid")(x)
        model = Model(inputs=[inp_seq, inp_feature], outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam')
        return model


class StackLstm():
    def model(self, embedding_matrix, maxlen, max_features):
        inp = Input(shape=(maxlen,))
        emb_size = embedding_matrix.shape[1]
        x_emb = Embedding(max_features, emb_size, weights=[embedding_matrix], trainable=False)(inp)
        x = SpatialDropout1D(0.2)(x_emb)
        x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
        x = Dropout(0.1)(x)
        x = Bidirectional(CuDNNLSTM(32, return_sequences=True))(x)
        x = Dropout(0.1)(x)
        x = Bidirectional(CuDNNLSTM(16, return_sequences=False))(x)
        output = Dense(1, activation="sigmoid")(x)

        model = Model(inputs=inp, outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam')
        return model


class StackGru():
    def model(self, embedding_matrix, maxlen, max_features):
        inp = Input(shape=(maxlen,))
        emb_size = embedding_matrix.shape[1]
        x_emb = Embedding(max_features, emb_size, weights=[embedding_matrix], trainable=False)(inp)
        x = SpatialDropout1D(0.2)(x_emb)  
        x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
        x = Dropout(0.1)(x)
        x = Bidirectional(CuDNNGRU(32, return_sequences=True))(x)
        x = Dropout(0.1)(x)
        x = Bidirectional(CuDNNGRU(16, return_sequences=False))(x)
        output = Dense(1, activation="sigmoid")(x)

        model = Model(inputs=inp, outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam')
        return model


class CNN2D():
    def model(self, embedding_matrix, maxlen, max_features):
        filter_sizes = [1,2,3,5]
        num_filters = 36
        inp = Input(shape=(maxlen,))
        emb_size = embedding_matrix.shape[1]
        x_emb = Embedding(max_features, emb_size, weights=[embedding_matrix])(inp)
        x = Reshape((maxlen, emb_size, 1))(x_emb)
        max_pool = []
        avg_pool = []
        attn = []
        for i in range(len(filter_sizes)):
            conv = Conv2D(num_filters, kernel_size=(filter_sizes[i], emb_size),
                                         kernel_initializer='he_normal', activation='elu')(x)
            max_pool.append(MaxPooling2D(pool_size=(maxlen - filter_sizes[i] + 1, 1))(conv))
        z = Concatenate(axis=1)(max_pool)
        z = Flatten()(z)
        z = Dropout(0.1)(z)
        output = Dense(1, activation="sigmoid")(z)
        model = Model(inputs=inp, outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam')
        return model


class LstmEAtn():
    def model(self, embedding_matrix, maxlen, max_features):
        inp = Input(shape=(maxlen,))
        emb_size = embedding_matrix.shape[1]
        x_emb = Embedding(max_features, emb_size, weights=[embedding_matrix], trainable=False)(inp)
        x = SpatialDropout1D(0.2)(x_emb)
        x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
        y = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)

        atn_1 = AttentionWithContext()(x)
        atn_2 = AttentionWithContext()(y)
        atn_pool = AttentivePooling()(y)
        avg_pool = GlobalAveragePooling1D()(y)
        max_pool = GlobalMaxPooling1D()(y)
        x = concatenate([atn_1, atn_2, atn_pool, avg_pool, max_pool])
        x = Dense(16, activation="relu")(x)
        x = Dropout(0.1)(x)
        output = Dense(1, activation="sigmoid")(x)    

        model = Model(inputs=inp, outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam')
        return model


class TDLstmAtn():
    def model(self, embedding_matrix, maxlen, max_features):
        inp = Input(shape=(maxlen,))
        emb_size = embedding_matrix.shape[1]
        x_emb = Embedding(max_features, emb_size, weights=[embedding_matrix], trainable=False)(inp)
        x = SpatialDropout1D(0.2)(x_emb)
        x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
        y = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)

        atn_1 = Attention(maxlen)(x)
        atn_2 = Attention(maxlen)(y)
        avg_pool = GlobalAveragePooling1D()(y)
        max_pool = GlobalMaxPooling1D()(y)

        x = concatenate([atn_1, atn_2, avg_pool, max_pool])
        x = TargetedDropout(drop_rate=0.5, target_rate=0.2)(x)
        x = BatchNormalization()(x)
        x = Dense(32, activation="relu", kernel_initializer=glorot_normal(seed=SEED))(x)
        x = TargetedDropout(drop_rate=0.5, target_rate=0.2)(x)
        x = BatchNormalization()(x)
        output = Dense(1, activation="sigmoid")(x)    

        model = Model(inputs=inp, outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam')
        return model


class PRNN():
    def model(self, embedding_matrix, maxlen, max_features):
        inp = Input(shape=(maxlen,))
        emb_size = embedding_matrix.shape[1]
        x_emb = Embedding(max_features, emb_size, weights=[embedding_matrix], trainable=False)(inp)
        x = SpatialDropout1D(0.2)(x_emb)
        x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
        y = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)

        last = Lambda(lambda t: t[:, -1])(x)
        atten_1 = Attention(maxlen)(x)
        atten_2 = Attention(maxlen)(y)
        max_pool = GlobalMaxPooling1D()(y)
        avg_pool = GlobalAveragePooling1D()(y)
        x = concatenate([last, atten_1, atten_2, avg_pool, max_pool], axis=1)
        x = Dense(16, activation="relu")(x)
        x = Dropout(0.1)(x)
        output = Dense(1, activation="sigmoid")(x)

        model = Model(inputs=inp, outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam')
        return model


class Transformer():
    def model(self, embedding_matrix, maxlen, max_features, n_encoder=1):
        inp = Input(shape=(maxlen,))
        emb_size = embedding_matrix.shape[1]
        x_emb = Embedding(max_features, emb_size, weights=[embedding_matrix], trainable=False)(inp)
        # Add positional encoding
        x = AddPositionalEncoding()(x_emb)
        for i in range(n_encoder):
            x = encoder(x, emb_size)

        x = concatenate([x_emb, x])
        x = SpatialDropout1D(0.2)(x)
        x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
        atn = Attention(maxlen)(x)
        avg_pool = GlobalAveragePooling1D()(x)
        max_pool = GlobalMaxPooling1D()(x)
        x = concatenate([atn, avg_pool, max_pool])
        x = Dense(16, activation="relu")(x)
        x = Dropout(0.1)(x)
        output = Dense(1, activation="sigmoid")(x)

        model = Model(inputs=inp, outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam')
        return model


class LstmCNN():
    def model(self, embedding_matrix, maxlen, max_features):
        inp = Input(shape=(maxlen,), dtype='int32')
        emb_size = embedding_matrix.shape[1]
        conv_filters = 32
        x = Embedding(max_features, emb_size, weights=[embedding_matrix], trainable=False)(inp)
        x = SpatialDropout1D(0.2)(x)
        x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
        x = Dropout(0.1)(x)
        x = Reshape((2 * maxlen, 64, 1))(x)
        x = Conv2D(conv_filters, (3, 3))(x)
        x = MaxPool2D(pool_size=(2, 2))(x)
        x = Flatten()(x)
        output = Dense(1, activation="sigmoid")(x)

        model = Model(inputs=inp, outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam')
        return model


class HybridCnnGru():
    def model(self, emb, maxlen, max_features):
        inp = Input(shape=(maxlen,))
        x_emb = Embedding(max_features,
                        emb.shape[1],
                        weights=[emb],
                        trainable=False)(inp)
        x = SpatialDropout1D(0.2)(x_emb)

        x_gru = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
        x_gru_capsule = Capsule(num_capsule=5, dim_capsule=5, routings=4,
                          share_weights=True)(x_gru)
        x_gru_capsule = Flatten()(x_gru_capsule)
        x_gru_attention = Attention(maxlen)(x_gru)
        x_gru_conc = Concatenate()([x_gru_capsule, x_gru_attention])

        x_conv_1 = Conv1D(64, kernel_size=1, strides=1, padding="same", kernel_initializer="he_uniform")(x)
        x_conv_2 = Conv1D(64, kernel_size=2, strides=1, padding="same", kernel_initializer="he_uniform")(x)
        x_conv_3 = Conv1D(64, kernel_size=3, strides=1, padding="same", kernel_initializer="he_uniform")(x)
        x_conv_5 = Conv1D(64, kernel_size=5, strides=1, padding="same", kernel_initializer="he_uniform")(x)

        x_conv = Concatenate()([x_conv_1, x_conv_2, x_conv_3, x_conv_5])
        x_conv_capsule = Capsule(num_capsule=5, dim_capsule=5, routings=4,
                          share_weights=True)(x_conv)
        x_conv_capsule = Flatten()(x_conv_capsule)
        x_conv_attention = Attention(maxlen)(x_conv)
        x_conv_conc = Concatenate()([x_conv_capsule, x_conv_attention])

        x = Concatenate()([x_conv_conc, x_gru_conc])
        x = Dropout(0.1)(x)
        x = Dense(32)(x)
        x = BatchNormalization()(x)
        x = PReLU()(x)
        x = Dropout(0.1)(x)
        x = Dense(32)(x)
        x = BatchNormalization()(x)
        x = PReLU()(x)
        x = Dropout(0.1)(x)

        output = Dense(1, activation="sigmoid")(x)
        model = Model(inputs=inp, outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam')
        return model


class LstmAtn():
    def model(self, embedding_matrix, maxlen, max_features):
        inp = Input(shape=(maxlen,))
        emb_size = embedding_matrix.shape[1]
        x_emb = Embedding(max_features, emb_size, weights=[embedding_matrix], trainable=False)(inp)
        x = SpatialDropout1D(0.2)(x_emb)
        x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
        y = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)

        atn_1 = Attention(maxlen)(x)
        atn_2 = Attention(maxlen)(y)
        avg_pool = GlobalAveragePooling1D()(y)
        max_pool = GlobalMaxPooling1D()(y)

        x = concatenate([atn_1, atn_2, avg_pool, max_pool])
        x = Dense(16, activation="relu")(x)
        x = Dropout(0.1)(x)
        output = Dense(1, activation="sigmoid")(x)    

        model = Model(inputs=inp, outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model


class LstmFAtn():
    def model(self, embedding_matrix, maxlen, max_features):
        inp_seq = Input(shape=(maxlen,), name='seq')
        inp_feature = Input(shape=(len(feature_cols),), name='feature')
        emb_size = embedding_matrix.shape[1]
        x_emb = Embedding(max_features, emb_size, weights=[embedding_matrix], trainable=False)(inp_seq)
        x = SpatialDropout1D(0.2)(x_emb)
        x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
        y = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)

        atn_1 = Attention(maxlen)(x)
        atn_2 = Attention(maxlen)(y)
        avg_pool = GlobalAveragePooling1D()(y)
        max_pool = GlobalMaxPooling1D()(y)
        x = concatenate([atn_1, atn_2, avg_pool, max_pool, inp_feature])
        x = DropConnect(Dense(32, activation="relu", kernel_initializer=glorot_normal(seed=SEED)), prob=0.2)(x)
#         x = Dense(32, activation='relu', kernel_initializer=glorot_normal(seed=SEED))(x)
#         x = Dropout(0.1)(x)

        output = Dense(1, activation="sigmoid")(x)
        model = Model(inputs=[inp_seq, inp_feature], outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam')
        # model.compile(loss=[focal_loss(alpha=.25, gamma=2)], optimizer='adam')
        return model


class LstmAtnContext():
    def model(self, embedding_matrix, maxlen, max_features):
        inp_seq = Input(shape=(maxlen,), name='seq')
        inp_feature = Input(shape=(len(feature_cols),), name='feature')
        emb_size = embedding_matrix.shape[1]
        x_emb = Embedding(max_features, emb_size, weights=[embedding_matrix], trainable=False)(inp_seq)
        x = SpatialDropout1D(0.2)(x_emb)
        x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
        y = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)

        atn_1 = AttentionWithContext()(x)
        atn_2 = AttentionWithContext()(y)
        avg_pool = GlobalAveragePooling1D()(y)
        max_pool = GlobalMaxPooling1D()(y)

        x = concatenate([atn_1, atn_2, avg_pool, max_pool, inp_feature])
        x = Dense(32, activation='relu', kernel_initializer=glorot_normal(seed=SEED))(x)
        x = Dropout(0.1)(x)
        output = Dense(1, activation="sigmoid")(x)    

        model = Model(inputs=[inp_seq, inp_feature], outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam')
        return model


class MM():
    def model(self, models, maxlen):
        inp = Input(shape=(maxlen,))
        outputs = []
        for m in models:
            y = m(inp)
            outputs.append(y)
        output = Average()(outputs)

        model = Model(inputs=inp, outputs=output)
        model.compile(loss='binary_crossentropy', optimizer='adam')
        return model

## Train

In [34]:
clr = CyclicLR(base_lr=0.001, max_lr=0.002, step_size=300., mode='exp_range', gamma=0.99994)
warmup = False
save_best_only = False
feature_input = True
models = []

## CV

In [35]:
def train_pred(model, epochs, X_train, X_val, T_X, Y_train, Y_val, mm=False):
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, min_lr=0.0001, verbose=2)
    roc_auc = RocAucEvaluation(validation_data=(X_val, Y_val), interval=1, batch_size=1024)
    # tensorboard --logdir=./log/run
    tb = TensorBoard(log_dir='./log/run')
    if save_best_only:
        filepath = "best_weights.h5"
        logloss = ModelCheckpoint(filepath, monitor='val_loss', verbose=2, save_best_only=True, mode='min')
    else:
        filepath = "model_{epoch:02d}.hdf5"
        logloss = ModelCheckpoint(filepath, monitor='val_loss', verbose=2, save_weights_only=True)
    if warmup:
        warm_up = WarmUp()
        callbacks = [logloss, warm_up, reduce_lr, tb]
        # callbacks = [roc_auc, warm_up, reduce_lr, tb]
    else:
        callbacks = [logloss, reduce_lr, tb]
        # callbacks = [roc_auc, reduce_lr, tb]

    history = model.fit(X_train, Y_train, batch_size=512, epochs=epochs, validation_data=(X_val, Y_val), verbose=2, callbacks=callbacks)
    best_loss = np.min(history.history['val_loss'])

    if save_best_only:
        model.load_weights(filepath)
        pred_val_y = np.squeeze(model.predict(X_val, batch_size=1024, verbose=2))
        pred_test_y = np.squeeze(model.predict(T_X, batch_size=1024, verbose=2))
    else:
        model.load_weights(f'model_0{epochs-1}.hdf5')
        y_pred1 = np.squeeze(model.predict(X_val, batch_size=1024, verbose=2))
        y_test1 = np.squeeze(model.predict(T_X, batch_size=1024, verbose=2))
        model.load_weights(f'model_0{epochs}.hdf5')
        y_pred2 = np.squeeze(model.predict(X_val, batch_size=1024, verbose=2))
        y_test2 = np.squeeze(model.predict(T_X, batch_size=1024, verbose=2))
        pred_val_y = (y_pred1 + y_pred2) / 2
        pred_test_y = (y_test1 + y_test2) / 2

    best_score, best_thresh = f1_smart(Y_val, pred_val_y)
    # best_score, best_thresh = threshold_search(Y_val, pred_val_y)
    print('Optimal F1: {:.4f} at threshold: {:.4f}'.format(best_score, best_thresh))
    if mm:
        models.append(model)
    else:
        del model
        gc.collect()
        K.clear_session()
        tf.reset_default_graph()
    return pred_val_y, pred_test_y, best_score, best_thresh, best_loss

def train_single(m, emb, epochs, name):
    print('\n', name)
    model = m().model(emb, maxlen, max_features)
    pred_val_y, pred_test_y, best_score, best_thresh, best_loss = train_pred(model, epochs, X_train, X_val, T_X, Y_train, Y_val)
    return [pred_val_y, pred_test_y, best_score, best_thresh, best_loss, name]

def train_mm(m, emb, epochs, name):
    print('\n', name)
    model = m().model(emb, maxlen, max_features)
    train_pred(model, epochs, X_train, X_val, T_X, Y_train, Y_val, mm=True)

if cv:
    kfolds, epochs = 5, 5
    run = 5
    kf = StratifiedKFold(n_splits=kfolds, random_state=26, shuffle=True).split(X, Y)
    loss = []
    thresh = []
    train_meta = np.zeros(Y.shape)
    test_meta = np.zeros(T_X.shape[0])
    if feature_input:
        x_test = [T_X, test_features]
    else:
        x_test = T_X

    for i, (train_idx, valid_idx) in enumerate(kf):
        X_train, X_val, Y_train, Y_val = X[train_idx], X[valid_idx], Y[train_idx], Y[valid_idx]
        if feature_input:
            features_train = features[train_idx]
            features_val= features[valid_idx]
            x_train = [X_train, features_train]
            x_val = [X_val, features_val]
        else:
            x_train = X_train
            x_val = X_val

        model = LstmFAtn().model(emb, maxlen, max_features)
        if i == 0: print(model.summary())
        pred_val_y, pred_test_y, best_score, best_thresh, best_loss = train_pred(model, epochs, x_train, x_val, x_test, Y_train, Y_val)
        loss.append(best_loss)
        thresh.append(best_thresh)
        train_meta[valid_idx] = pred_val_y
        test_meta += pred_test_y / run
        if i == run - 1:
            break
    
    if run == kfolds:
        best_score, best_thresh = f1_smart(np.squeeze(Y), train_meta)
        # best_score, best_thresh = threshold_search(np.squeeze(Y), train_meta)
        print('Optimal F1: {:.4f} at threshold: {:.4f}'.format(best_score, best_thresh))
    else:
        threshold = np.mean(thresh)
    print('mean_thresh: {:.4f} and mean_loss: {:.4f}'.format(np.mean(thresh), np.mean(loss)))
    test_meta = test_meta.reshape((-1, 1))
    pred_test_y = (test_meta > best_thresh).astype(int)
else:
    train_conf = [
        [LstmAtn, emb, 4, 'LstmAtn_emb_mean'],
        [PRNN, emb, 4, 'PRNN_emb_mean'],
        [LstmAtn, emb_glove, 4, 'LstmAtn_emb_glove'],
        [LstmEAtn, emb, 4, 'LstmEAtn_emb_mean'],
        [StackLstm, emb, 4, 'StackLstm_emb_mean'],
    #     [GCDC, emb, 5, 'GCDC_emb_mean'],
    #     [GruCapsule, emb, 5, 'GruCapsule_emb_mean'],
    ]
    
    #     # 根据10次平均值决定模型权重
    #     scores, losses = [], []
    #     for i in range(10):
    #         outputs = []
    #         for item in train_conf:
    #             outputs.append(train_single(*item))
    #         score = [output[2] for output in outputs]
    #         loss = [output[4] for output in outputs]
    #         print(score, loss)
    #         scores.append(score)
    #         losses.append(loss)
    #     score = np.mean(scores, axis=0)
    #     loss = np.mean(losses, axis=0)
    #     print(score, loss)

    ensemble_w = True
    weights = [0.3, 0.2, 0.2, 0.15, 0.15]
    outputs = []
    for item in train_conf:
        outputs.append(train_single(*item))

    if ensemble_w:
        y_pred = np.sum([outputs[i][0] * weights[i] for i in range(len(outputs))], axis=0)
        test_pred = np.sum([outputs[i][1] * weights[i] for i in range(len(outputs))], axis=0)
    else:
        y_pred = np.mean([outputs[i][0] for i in range(len(outputs))], axis=0)
        test_pred = np.mean([outputs[i][1] for i in range(len(outputs))], axis=0)

    best_score, best_thresh = f1_smart(Y_val, y_pred)
    # best_score, best_thresh = threshold_search(Y_val, y_pred)
    print('Optimal F1: {:.4f} at threshold: {:.4f}'.format(best_score, best_thresh))
    # threshold = 0.34
    pred_test_y = (test_pred > best_thresh).astype(int)

sub['prediction'] = pred_test_y
sub.to_csv("submission.csv", index=False)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
seq (InputLayer)                (None, 72)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 72, 300)      28500000    seq[0][0]                        
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 72, 300)      0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 72, 128)      187392      spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
bidirectio

Model | Structure | Dropout | Embedding | Epochs | Fold | replace_punc | maxlen | max_features | batch_size | F1_local | thresh_local | loss_local | F1_online | thresh_online | loss_online | F1_submit | time/epoch(local-online) | notebook
:---- | :-------- | :------ | :-- | :----- | :--- | :----- | :----------- | :--------- | :--------- | :----- | :----- | :--------- | :----- | :----- | :------ | :------ | :------ | :------:
LstmAtn | Lstm(64)Gru(64)Dense(16) | 0.2/0.1 | mean_gp | 5 | 5 | True | 72 | 95000 | 512 | 0.6864 | 0.3620 | 0.0970 | 0.6874 | 0.3570 | 0.0968 | 0.692 | 90s-240s | qiqc_bp(v1)
LstmFAtn | Lstm(64)Gru(64)Dense(16) | 0.2/0.1 | mean_gp | 5 | 5 | False | 72_4(continous) | 95000 | 512 | 0.6889 | 0.3583 | 0.0963 | 0.6874 | 0.3506 | 0.0967 | 0.689 | 90s-242s | qiqc_bp(v16)
LstmFAtn | Lstm(64)Gru(64)Dense(32) | 0.2/0.1 | mean_gp | 5 | 5 | False | 72_8(cc) | 95000 | 512 | 0.6895 | 0.3949 | 0.0960 | 0.6906 | 0.3630 | 0.0959 | 0.694 | 90s-242s | qiqc_bp(v14)
LstmFAtn | Lstm(64)Gru(64)Dense(32) | 0.2/0.1 | mean_gp | 5 | 5 | False | 72_4(categorical) | 95000 | 512 | 0.6905 | 0.3708 | 0.0961 | 0.6903 | 0.3764 | 0.0960 | 0.698 | 90s-244s | qiqc_bp(v17)
LstmFAtn | Lstm(64)Gru(64)Dense(16) | 0.2/0.1 | mean_gp | 5 | 5 | False | 72_4(categorical) | 95000 | 512 | 0.6887 | 0.3897 | 0.0961 | 0. | 0. | 0. | 0. | 90s-240s | 
LstmFAtn | Lstm(64)Gru(64)Dense(32) | 0.2/0.1 | mean_gp | 5 | 5 | False | 72_10(random) | 95000 | 512 | 0.6888 | 0.3916 | 0.0963 | 0.6877 | 0.4023 | 0.0963 | 0.697 | 90s-244s | qiqc_bp(v22)
LstmFAtn | Lstm(64)Gru(64)Dense(32)seed(26） | 0.2/0.1 | mean_gp | 5 | 5 | False | 70_23(categorical) | 90000 | 512 | 0.6907 | 0.3795 | 0.0958 | 0.6884 | 0.3496 | 0.0961 | 0.694 | 90s-244s | qiqc_bp(v25)
LstmFAtn | Lstm(64)Gru(64)Dense(32) | 0.2/0.1 | mean_gp | 5 | 5 | False | 72_10(sort) | 95000 | 512 | 0.6903 | 0.3585 | 0.0956 | 0.6901 | 0.3837 | 0.0959 | 0.699 | 90s-244s | qiqc_bp(v27)
LstmFAtn | Lstm(64)Gru(64)Dense(32) | 0.2/0.1 | mean_gp | 5 | 5 | False | 72_13(sort) | 95000 | 512 | 0.6906 | 0.3558 | 0.0957 | 0.6903 | 0.3772 | 0.0957 | 0.697 | 90s-244s | qiqc_bp(v28)
LstmFAtn | Lstm(64)Gru(64)Dense(32) | 0.2/0.1 | mean_gp | 5 | 5 | False | 72_12(sort) | 95000 | 512 | 0.6902 | 0.3488 | 0.0959 | 0.6919 | 0.3861 | 0.0956 | 0.697 | 90s-244s | qiqc_bp(v29)
LstmFAtn | Lstm(64)Gru(64)Dense(32) | 0.2/0.1 | mean_gp | 5 | 5 | False | 72_11(sort) | 95000 | 512 | 0.6906 | 0.3562 | 0.0957 | 0.6908 | 0.3793 | 0.0955 | 0.700 | 90s-250s | qiqc_bp(v31)
LstmFAtn | Lstm(64)Gru(64)Dense(32)pool3y | 0.2/0.1 | mean_gp | 5 | 5 | False | 72_11(sort) | 95000 | 512 | 0.6897 | 0.3707 | 0.0962 | 0. | 0. | 0. | 0. | 94s-250s | qiqc_bp(v)
LstmFAtn | Lstm(64)Gru(64)Dense(32)pool3x | 0.2/0.1 | mean_gp | 5 | 5 | False | 72_11(sort) | 95000 | 512 | 0.6903 | 0.3610 | 0.0957 | 0. | 0. | 0. | 0. | 94s-250s | qiqc_bp(v)
LstmFAtn | Lstm(64)Gru(64)Dense(32)FL | 0.2/0.1 | mean_gp | 5 | 5 | False | 72_11(sort) | 95000 | 512 | 0.6861 | 0.3550 | 5.0649 | 0.6882 | 0.3584 | 5.0502 | 0.694 | 90s-239s/7057s | qiqc_bp(v32)
LstmFAtn | Lstm(64)Gru(64)Dense(32)FL | 0.2/0.1 | mean_gp | 5 | 5 | False | 72_11(sort) | 90000 | 512 | 0.8424 | 0.3737 | 11.2354 | 0. | 0. | 0. | 0. | 22s-s | qiqc_bp(v) 数据采样
LstmFAtn | Lstm(64)Gru(64)Dense(32) | 0.2/0.1 | mean_gp | 5 | 5 | False | 72_11(sort) | 90000 | 512 | 0.8451 | 0.3820 | 0.1983 | 0. | 0. | 0. | 0. | 23s-s | qiqc_bp(v) 数据采样
LstmFAtn | Lstm(64)Gru(64)Dense(32) | 0.2/0.1 | mean_gp | 5 | 5 | False | 72_13(sort) | 95000 | 512 | 0.6916 | 0.3811 | 0.0957 | 0.6909 | 0.3766 | 0.0958 | 0.699 | 90s-250s | qiqc_bp(v38)
GCDC | Lstm(96)Dense(32) | 0.2/0.1 | mean_gp | 5 | 5 | False | 72_13(sort) | 95000 | 512 | 0.6903 | 0.3502 | 0.0966 | 0. | 0. | 0. | 0. | 90s-244s | qiqc_bp(v)
LstmFAtn | Lstm(64)Gru(64)Dense(32) | 0.2/0.1 | mean_gp_add_lower | 5 | 5 | False | 72_13(sort) | 95000 | 512 | 0.6925 | 0.3592 | 0.0958 | 0. | 0. | 0. | 0. | 90s-250s | qiqc_bp(v)
LstmFAtn | Lstm(64)Gru(64)Dense(32) | 0.2/0.1 | mean_gp_add_lower | 5 | 5 | False | 72_11(sort) | 95000 | 512 | 0. | 0. | 0. | 0.6911 | 0.3817 | 0.0956 | 0.698 | 90s-250s | qiqc_bp(v)
LstmFAtn | Lstm(64)Gru(64)Dense(32) | 0.2/0.1 | mean_gp_smart | 5 | 5 | False | 72_13(sort) | 95000 | 512 | 0.6923 | 0.3358 | 0.0961 | 0. | 0. | 0. | 0. | 90s-250s | qiqc_bp(v)
LstmFAtn | Lstm(64)Gru(64)Dense(32)SEED(2019) | 0.2/0.1 | mean_gp_add_lower | 5 | 5 | False | 72_13(sort) | 95000 | 512 | 0.6930 | 0.3515 | 0.0961 | 0.6906 | 0.3532 | 0.0963 | 0.700 | 90s-250s | qiqc_bp(v)
LstmFAtn | Lstm(64)Gru(64)Dense(32)SEED(2019) | 0.2/0.1 | mean_all_add_lower | 5 | 5 | False | 72_11(sort) | 95000 | 512 | 0.6919 | 0.3539 | 0.0961 | 0. | 0. | 0. | 0. | 90s-250s | qiqc_bp(v)
LstmFAtn | Lstm(64)Gru(64)Dense(32) | 0.2/DC0.1 | mean_gp_add_lower | 5 | 5 | False | 72_11(sort) | 95000 | 512 | 0. | 0. | 0. | 0. | 0. | 0. | 0. | 90s-250s | qiqc_bp(v)
LstmFAtn | Lstm(64)Gru(64)Dense(32) | 0.2/DC0.2 | mean_gp_add_lower | 5 | 5 | False | 72_11(sort) | 95000 | 512 | 0. | 0. | 0. | 0. | 0. | 0. | 0. | 90s-250s | qiqc_bp(v)