In [None]:
import os
import gc
import re
import random
import time
import numpy as np
import pandas as pd

import torch
print("torch version:", torch.__version__)
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tqdm import tqdm
tqdm.pandas()
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from string import punctuation

In [None]:
def set_seed_torch(seed=2019):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and every CUDA device),
    and configures cuDNN to pick deterministic kernels.

    Args:
        seed: Integer seed shared by all generators.
    """
    # Exported for child processes; the hash seed of the already-running
    # interpreter is fixed at start-up and is not changed by this line.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not just the current one (safe to call without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run.
    torch.backends.cudnn.benchmark = False

In [None]:
# Global settings shared by the preprocessing, embedding and model cells.
emb_size = 300  # embedding width; matches the 300-d vector files loaded below
max_features = 95000  # vocabulary cap given to the Tokenizer and nn.Embedding
maxlen = 72  # every question is padded/truncated to this many tokens
batch_size = 512  # mini-batch size for all DataLoaders
SEED = 2019
set_seed_torch(seed=SEED)
cv = True  # True: tokenize the full training set for k-fold CV; False: single hold-out split

In [None]:
# Load the competition data (paths are relative to the kernel's input folder).
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
print("Train shape: ", train.shape)
print("Test shape: ", test.shape)
# Take an explicit copy: a `prediction` column is assigned to `sub` at the end
# of the notebook, and assigning into a slice of `test` would trigger pandas'
# SettingWithCopyWarning.
sub = test[['qid']].copy()

In [None]:
# Collect every non-alphanumeric character that occurs in either dataset.
# The set is filled directly so that no intermediate list holding one entry per
# character occurrence has to be built just to be deduplicated.
puncs = set()
for questions in (train.question_text, test.question_text):
    for x in questions:
        puncs.update(c for c in x if not c.isalnum())
# The space is the token separator, not a punctuation mark.
puncs.discard(' ')

In [None]:
# Lower-case English contractions and their expansions.
# "he'd" is its own entry: a missing separator previously glued it to a stray
# "haven ' t" fragment, producing the unusable key "haven ' the'd".
contraction = { "ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because",
                "could've": "could have", "couldn't": "could not", "didn't": "did not",
                "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not",
                "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is",
                "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have",
                "i'm": "i am", "i've": "i have",
                "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will",
                "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have",
                "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
                "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not",
                "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
                "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have",
                "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have",
                "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is",
                "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is",
                "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
                "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will",
                "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not",
                "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is",
                "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did",
                "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have",
                "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have",
                "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have",
                "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
                "y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will",
                "you'll've": "you will have", "you're": "you are", "you've": "you have" }

# Known misspellings, spelling variants and fused/brand words with the
# replacement text used during cleaning.
# The table must not contain an empty key: '' would add an empty alternative to
# the replacement regex and makes str.count('') return len(text) + 1.
# NOTE(review): 'GDPR' and 'DCEU' are upper-case while replace_mapping is
# applied to lower-cased text, so those two keys cannot match there - confirm
# whether they should be lower-case.
mispell = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling',
           'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor',
           'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu': 'youtube ',
           'qoura': 'quora', 'quorans': 'quora users', 'quoran': 'quora user', 'sallary': 'salary', 'whta': 'what',
           'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much',
           'howmany': 'how many', 'whydo': 'why do', 'doi': 'do i', 'thebest': 'the best', 'howdoes': 'how does',
           'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating',
           'pennis': 'penis', 'etherium': 'ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data',
           '2k15': '2015', '2k16': '2016', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend',
           'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization',
           'demonitization': 'demonetization', 'demonetisation': 'demonetization', 'pokémon': 'pokemon',
           'nanodegree': 'nano degree', 'brexit': 'british exit', 'cryptocurrencies': 'crypto currencies',
           'coinbase': 'coin base', 'oneplus': 'one plus', 'redmi': 'red mi', 'GDPR': 'general data protection regulation',
           'DCEU': 'dc extended universe', 'litecoin': 'lite coin', 'unacademy': 'non academy', 'altcoin': 'bitcoin alternative',
           'altcoins': 'bitcoin alternative', 'sjw': 'social justice warriors', 'sjws': 'social justice warriors',
           'fiancé': 'fiance', 'microservices': 'micro services', 'bitconnect': 'bit connect', 'codeforces': 'code forces',
           'wannacry': 'wanna cry', 'onedrive': 'one drive', 'airpods': 'air pods', 'twinflame': 'twin flame',
           'undergraduation': 'under graduation', 'cos2x': 'cos 2 x', 'yourquote': 'your quote', 'xiomi': 'xiaomi',
           'undertale': 'under tale', 'genderfluid': 'gender fluid', 'são': 'sao', 'chapterwise': 'chapter wise',
           'deepmind': 'deep mind', 'arrowverse': 'arrow verse', 'overbrace': ' ', 'tensorflow': 'tensor flow',
           'hackerrank': 'hacker rank', 'microservice': 'micro service', 'reactjs': 'react js', 'hackerearth': 'hacker earth',
           'fiancée': 'fiance', 'blockchains': 'block chains', 'beyoncé': 'beyonce', 'neuralink': 'neura link',
           'openai': 'open ai', 'zoomcar': 'zoom car', 'hyperconjugation': 'hyper conjugation', 'autoencoder': 'auto encoder',
           'webassembly': 'web assembly', 'quoras': 'quora', 'digilocker': 'digi locker', 'oversmart': 'over smart',
           'cryptocoins': 'crypto coins', 'crytocurrencies': 'cryto currencies', 'cyrptocurrency': 'cyrpto currency',
           'café': 'cafe', 'whatapp': 'whatsapp', 'gaslighter': 'gas lighter', 'darkweb': 'dark web', 'webnovel': 'web novel'}

# sp = [",", ";", '"', "...", "?", "!", ".", ":", "*", "-"]

In [None]:
def replace_quote(text):
    """Return ``text`` with acute accents, curly single quotes and backticks
    replaced by a plain ASCII apostrophe."""
    apostrophe = "'"
    table = {ord(mark): apostrophe for mark in ('´', '‘', '’', "`")}
    return text.translate(table)
                      
def re_mapping(mapping):
    """Compile a single regex that matches any key of ``mapping`` literally.

    Keys are escaped so regex metacharacters in them are matched verbatim, and
    are ordered longest-first so that a key which is a prefix of another
    (``"i'd"`` vs ``"i'd've"``) cannot shadow the longer one.  Empty keys are
    ignored because an empty alternative would match at every position.

    Args:
        mapping: Dict (or any iterable of strings) whose keys are the
            substrings to find.

    Returns:
        A compiled pattern with one capturing group around the alternation.
        If there are no usable keys the pattern never matches.
    """
    keys = sorted((key for key in mapping if key), key=lambda key: (-len(key), key))
    if not keys:
        # An empty alternation would match the empty string everywhere.
        return re.compile('(?!)')
    res = re.compile('(%s)' % '|'.join(re.escape(key) for key in keys))
    return res

# Merge both lookup tables with a plain dict merge: key order is then the
# insertion order (contractions first, then misspellings) and, for a key that
# appears in both, the misspelling entry wins.  Going through a set of items
# would make the key order - and with it the order of the regex alternatives -
# depend on string hash randomisation and differ between runs.
mapping = {**contraction, **mispell}
re_map = re_mapping(mapping)
def replace_mapping(text):
    """Replace every occurrence of a ``mapping`` key in ``text`` by its value.

    Args:
        text: The string to clean (the notebook lower-cases it beforehand).

    Returns:
        The string with all matches of ``re_map`` substituted.
    """
    def replace(match):
        return mapping[match.group(0)]
    return re_map.sub(replace, text)

def sep_punc(x):
    """Pad every character of the module-level ``puncs`` set found in ``x``
    with a space on each side and return the result."""
    spaced = x
    for mark in puncs:
        spaced = spaced.replace(mark, ' ' + mark + ' ')
    return spaced

def replace_numbers(x):
    """Mask runs of digits with ``#`` placeholders.

    Runs of five or more digits become ``#####``; the remaining runs of four,
    three and two digits become the same number of ``#``.  Single digits are
    left untouched.
    """
    masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    # Order matters: longer runs have to be masked before shorter ones.
    for pattern, mask in masks:
        x = re.sub(pattern, mask, x)
    return x

def add_features(df):
    """Append hand-crafted text statistics to ``df`` and return it.

    ``question_text`` is cast to ``str`` first and every other column is
    derived from it.  ``df`` is modified in place.

    Columns added: ``num_chars``, ``num_words``, ``num_capital``,
    ``capital_rate``, ``num_uniquewords``, ``unique_rate``, ``num_titlewords``,
    ``title_rate``, ``num_upperwords``, ``upper_rate``, ``num_exc``, ``num_q``,
    ``mean_word_len``, ``max_word_len``, ``num_unpunc``, ``num_punc`` and
    ``num_mispell``.

    A text without any word (empty or whitespace only) gets ``mean_word_len``
    and ``max_word_len`` of 0 instead of a NaN / ``ValueError``; its ``*_rate``
    columns are NaN because they divide by ``num_words == 0``.

    Args:
        df: DataFrame with a ``question_text`` column.

    Returns:
        The same DataFrame object with the feature columns added.
    """
    # Loop-invariant lookups, built once instead of once per row.
    unusual_puncs = puncs - set(punctuation)
    # An empty key would make str.count return len(text) + 1.
    mispell_keys = [key for key in mispell if key]

    def word_lengths(text):
        # Length of every whitespace-separated token.
        return [len(w) for w in text.split()]

    def mean_word_len(text):
        lengths = word_lengths(text)
        return np.mean(lengths) if lengths else 0.0

    df['question_text'] = df['question_text'].progress_apply(lambda x: str(x))
    df['num_chars'] = df['question_text'].progress_apply(len)
    # Raw string: '\S' in a normal string literal is an invalid escape sequence.
    df['num_words'] = df.question_text.str.count(r'\S+')

    df['num_capital'] = df['question_text'].progress_apply(lambda x: sum(1 for c in x if c.isupper()))
    df['capital_rate'] = df['num_capital'] / df['num_words']

    df['num_uniquewords'] = df['question_text'].progress_apply(lambda x: len(set(x.split())))
    df['unique_rate'] = df['num_uniquewords'] / df['num_words']

    df["num_titlewords"] = df["question_text"].progress_apply(lambda x: len([w for w in x.split() if w.istitle()]))
    df['title_rate'] = df['num_titlewords'] / df['num_words']

    df["num_upperwords"] = df["question_text"].progress_apply(lambda x: len([w for w in x.split() if w.isupper()]))
    df['upper_rate'] = df['num_upperwords'] / df['num_words']

    df["num_exc"] = df["question_text"].progress_apply(lambda x: x.count("!")).astype('uint16')
    df["num_q"] = df['question_text'].progress_apply(lambda x: x.count("?")).astype('uint16')
    df["mean_word_len"] = df["question_text"].progress_apply(mean_word_len)
    # default=0 keeps max() from raising on a text that contains no words.
    df["max_word_len"] = df['question_text'].progress_apply(lambda x: max(word_lengths(x), default=0))

    df["num_unpunc"] = df["question_text"].progress_apply(lambda x: sum(x.count(p) for p in unusual_puncs)).astype('uint16')
    df["num_punc"] = df["question_text"].progress_apply(lambda x: sum(x.count(p) for p in punctuation)).astype('uint16')
    df["num_mispell"] = df["question_text"].progress_apply(lambda x: sum(x.count(p) for p in mispell_keys)).astype('uint16')

    return df

In [None]:
# Columns produced by add_features that are standardised and fed to the model
# as dense side inputs.  The remaining add_features columns (e.g. unique_rate,
# title_rate, upper_rate, num_mispell) are computed but not listed here.
feature_cols = ['capital_rate',  'num_chars', 'num_words', "max_word_len", "mean_word_len",
                'num_capital', "num_punc", 'num_uniquewords', "num_q", "num_unpunc", "num_exc"]

In [None]:
# Hand-crafted features are computed first, on the raw text (before
# lower-casing), so the capitalisation statistics are meaningful.
train, test = add_features(train), add_features(test)

features = train[feature_cols].fillna(0)
test_features = test[feature_cols].fillna(0)
# The scaler is fitted on train and test rows together.
ss = StandardScaler()
ss.fit(np.vstack((features, test_features)))
features, test_features = ss.transform(features), ss.transform(test_features)
print("Add features done")

# Lower-case both frames.
for frame in (train, test):
    frame["question_text"] = frame["question_text"].str.lower()
print("Lower done")

# Remaining cleaning steps, applied in this exact order to train and then test:
# quote normalisation, contraction/misspelling mapping, punctuation spacing,
# digit masking.
for label, step in (
    ("Replace quote", replace_quote),
    ("Replace mapping", replace_mapping),
    ("Sep punc", sep_punc),
    ("Replace numbers", replace_numbers),
):
    for frame in (train, test):
        frame['question_text'] = frame['question_text'].progress_apply(step)
    print(f"{label} done")

In [None]:
def load_single_split(val_size=0.1):
    """Build padded token sequences for a single train/validation split.

    The module-level ``train`` frame is split with ``train_test_split``
    (seeded with ``SEED``), a tokenizer is fitted on the training part only,
    and the training, validation and ``test`` texts are encoded and padded to
    ``maxlen``.  Training and validation rows are then shuffled.

    Args:
        val_size: Fraction of ``train`` held out for validation.

    Returns:
        ``(X_train, X_val, T_X, Y_train, Y_val, word_index)``.
    """
    train_df, val_df = train_test_split(train, test_size=val_size, random_state=SEED)

    tokenizer = Tokenizer(num_words=max_features, filters='')
    tokenizer.fit_on_texts(list(train_df["question_text"].values))

    def encode(texts):
        # texts -> integer sequences -> fixed-length padded matrix
        return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)

    X_train = encode(train_df["question_text"].values)
    X_val = encode(val_df["question_text"].values)
    T_X = encode(test["question_text"].values)

    # shuffle (the training permutation is drawn before the validation one)
    train_order = np.random.permutation(len(X_train))
    val_order = np.random.permutation(len(X_val))

    Y_train = train_df['target'].values[train_order]
    Y_val = val_df['target'].values[val_order]
    return X_train[train_order], X_val[val_order], T_X, Y_train, Y_val, tokenizer.word_index

if not cv:
    # Single hold-out split.
    X_train, X_val, T_X, Y_train, Y_val, word_index = load_single_split(val_size=0.1)
else:
    # Cross-validation: fit the tokenizer on the whole training set and encode
    # train and test texts as padded index sequences.
    Y = train['target'].values
    X = train["question_text"].fillna("_na_").values
    tokenizer = Tokenizer(num_words=max_features, filters='')
    tokenizer.fit_on_texts(list(X))
    word_index = tokenizer.word_index
    X = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=maxlen)
    T_X = pad_sequences(
        tokenizer.texts_to_sequences(test["question_text"].fillna("_na_").values),
        maxlen=maxlen,
    )
    print("len(word_index):", len(word_index))

In [None]:
# The DataFrames are not read again after tokenisation and feature scaling;
# drop them and force a collection before the embedding files are loaded.
del train, test
gc.collect()

In [None]:
def get_coefs(word, *arr):
    """Pair ``word`` with its vector: all remaining positional arguments
    converted to one float32 NumPy array."""
    vector = np.asarray(arr, dtype=np.float32)
    return (word, vector)

def _load_embedding_matrix(max_features, word_index, vec_path):
    """Build an embedding matrix from a text file of ``word v1 v2 ...`` lines.

    Rows for words missing from the file keep a random initialisation drawn
    from a normal distribution with the mean and standard deviation of all
    vectors in the file.

    Args:
        max_features: Maximum number of rows (vocabulary cap).
        word_index: Dict mapping a word to its integer row index.  Index
            ``len(word_index)`` is treated as valid, i.e. indices are expected
            to start at 1 as in the tokenizer's ``word_index``.
        vec_path: Path of the embedding file, read as latin-1, one
            space-separated entry per line.

    Returns:
        Array of shape ``(min(max_features, len(word_index) + 1), dim)``.
    """
    embeddings_index = {}
    # `with` guarantees the (large) file is closed once it has been parsed.
    with open(vec_path, encoding='latin') as vec_file:
        for line in vec_file:
            word, *coefs = line.rstrip("\n").split(" ")
            if not coefs:
                # Blank or malformed line without any vector component.
                continue
            embeddings_index[word] = np.asarray(coefs, dtype='float32')

    # np.stack needs a real sequence, not a dict view.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    emb_dim = all_embs.shape[1]

    # +1: the largest index equals len(word_index), so one extra row is needed
    # whenever the vocabulary is smaller than max_features.
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, emb_dim))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


def load_glove(max_features, word_index, vec_path):
    """Load the GloVe vectors at ``vec_path`` into an embedding matrix.

    See ``_load_embedding_matrix`` for the arguments and the return value.
    """
    return _load_embedding_matrix(max_features, word_index, vec_path)


def load_para(max_features, word_index, vec_path):
    """Load the paragram vectors at ``vec_path`` into an embedding matrix.

    See ``_load_embedding_matrix`` for the arguments and the return value.
    """
    return _load_embedding_matrix(max_features, word_index, vec_path)

In [None]:
# Load both pretrained embedding matrices and report how long it took.
start_time = time.time()
emb_glove = load_glove(max_features, word_index, "../input/embeddings/glove.840B.300d/glove.840B.300d.txt")
emb_para =load_para(max_features, word_index, "../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt")
total_time = (time.time() - start_time) / 60
print("Took {:.2f} minutes".format(total_time))
# Element-wise average of the two matrices; `emb` becomes the (frozen)
# embedding weight of the model defined below.
emb = np.mean([emb_glove, emb_para], axis=0)
print(np.shape(emb))

In [None]:
class Attention(nn.Module):
    """Attention pooling over the step axis of a ``(batch, step, feature)`` tensor.

    Every step is scored with a learned weight vector (plus an optional
    per-step bias), the scores go through ``tanh`` and ``exp`` and are
    normalised over the steps; the output is the weighted sum of the step
    vectors.

    Args:
        feature_dim: Size of the last axis of the input.
        step_dim: Number of steps (size of axis 1) of the input.
        bias: Whether to add a learned bias of shape ``(step_dim,)``.
        **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)

        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0

        weight = torch.zeros(feature_dim, 1)
        nn.init.xavier_uniform_(weight)
        self.weight = nn.Parameter(weight)

        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))

    def forward(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1.

        Args:
            x: Tensor that can be viewed as ``(batch, step_dim, feature_dim)``.
            mask: Optional tensor broadcastable to ``(batch, step_dim)``;
                the un-normalised weights are multiplied by it.

        Returns:
            Tensor of shape ``(batch, feature_dim)``.
        """
        feature_dim = self.feature_dim
        step_dim = self.step_dim

        # One scalar score per (sample, step).
        eij = torch.mm(
            x.contiguous().view(-1, feature_dim),
            self.weight
        ).view(-1, step_dim)

        if self.bias:
            eij = eij + self.b

        eij = torch.tanh(eij)
        a = torch.exp(eij)

        if mask is not None:
            a = a * mask

        # The epsilon belongs in the denominator: it guards against a zero sum
        # (e.g. a fully masked row) instead of being added to the weights.
        a = a / (torch.sum(a, 1, keepdim=True) + 1e-10)

        weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input, 1)

In [None]:
class LstmAtn(nn.Module):
    """Bi-LSTM -> Bi-GRU text classifier with attention and pooled features.

    The frozen embedding layer is initialised from the module-level ``emb``
    matrix.  The attention outputs of both recurrent layers, the mean and max
    pooling of the GRU output and the dense hand-crafted features are
    concatenated and passed through a small MLP that emits one logit per
    sample.
    """

    def __init__(self):
        super(LstmAtn, self).__init__()

        hidden_size = 64

        self.embedding = nn.Embedding(max_features, emb_size)
        self.embedding.weight = nn.Parameter(torch.tensor(emb, dtype=torch.float32))
        self.embedding.weight.requires_grad = False

        self.embedding_dropout = nn.Dropout2d(0.2)
        self.lstm = nn.LSTM(emb_size, hidden_size, bidirectional=True, batch_first=True)
        self.gru = nn.GRU(hidden_size*2, hidden_size, bidirectional=True, batch_first=True)

        self.lstm_attention = Attention(hidden_size*2, maxlen)
        self.gru_attention = Attention(hidden_size*2, maxlen)

        # 4 blocks of size hidden_size*2 plus the hand-crafted features.
        self.linear = nn.Linear(hidden_size*8+len(feature_cols), 32)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.out = nn.Linear(32, 1)

    def forward(self, inp):
        """Compute one logit per sample.

        Args:
            inp: Pair ``(tokens, feats)``; ``tokens`` is a long tensor of token
                ids, ``feats`` is array-like with ``len(feature_cols)`` values
                per sample.

        Returns:
            Tensor of shape ``(batch, 1)`` with un-normalised logits.
        """
        x_emb = self.embedding(inp[0])
        # Only the axis added for Dropout2d is removed again; a bare squeeze()
        # would also drop the batch axis when a batch holds a single sample.
        # NOTE(review): with the extra leading axis Dropout2d sees the batch
        # axis as its channel axis - confirm this is the intended pattern.
        x = self.embedding_dropout(torch.unsqueeze(x_emb, 0)).squeeze(0)

        lstm, _ = self.lstm(x)
        gru, _ = self.gru(lstm)

        lstm_atn = self.lstm_attention(lstm)
        gru_atn = self.gru_attention(gru)

        avg_pool = torch.mean(gru, 1)
        max_pool, _ = torch.max(gru, 1)
        # Put the features on whatever device the token batch lives on instead
        # of hard-coding CUDA.
        f = torch.as_tensor(inp[1], dtype=torch.float, device=x_emb.device)
        conc = torch.cat((lstm_atn, gru_atn, avg_pool, max_pool, f), 1)
        conc = self.relu(self.linear(conc))
        conc = self.dropout(conc)
        out = self.out(conc)
        return out

In [None]:
class MyDataset(Dataset):
    """Wrap an indexable dataset of ``(data, target)`` pairs so that every
    item also carries the index it was fetched with."""

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        sample = self.dataset[index]
        data, target = sample
        return (data, target, index)

def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``; element-wise for arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def f1_smart(y_true, y_pred):
    """Find the F1-maximising cut over the sorted scores in one vectorised pass.

    Args:
        y_true: 1-D NumPy array of labels; its sum is used as the number of
            positives (assumes 0/1 labels - TODO confirm against callers).
        y_pred: 1-D NumPy array of scores with the same length as ``y_true``.

    Returns:
        Tuple ``(best_f1, threshold)`` where ``threshold`` is the midpoint of
        the two neighbouring scores around the best cut.
    """
    # Indices that sort the scores in ascending order.
    args = np.argsort(y_pred)
    # Total number of positive labels.
    tp = y_true.sum()
    # For a cut after sorted position k (k = 0 .. n-2):
    #   numerator   = positives remaining above the cut,
    #   denominator = tp + number of samples above the cut.
    # fs is therefore half of the F1 score obtained at each cut.
    fs = (tp - np.cumsum(y_true[args[:-1]])) / np.arange(y_true.shape[0] + tp - 1, tp, -1)
    res_idx = np.argmax(fs)
    # 2 * fs is the F1 score; the threshold lies between the two adjacent scores.
    return 2 * fs[res_idx], (y_pred[args[res_idx]] + y_pred[args[res_idx + 1]]) / 2

def threshold_search(y_true, y_pred):
    """Grid-search the thresholds 0.00, 0.01, ..., 0.99 for the best F1 score.

    Returns:
        ``(best_score, best_threshold)``; the lowest threshold wins on ties,
        and ``(0, 0)`` is returned when no threshold scores above zero.
    """
    best_score, best_threshold = 0, 0
    for step in range(100):
        candidate = step * 0.01
        hard_labels = (y_pred > candidate).astype(int)
        current = f1_score(y_true=y_true, y_pred=hard_labels)
        if not current > best_score:
            continue
        best_score, best_threshold = current, candidate
    return best_score, best_threshold

In [None]:
# Stratified k-fold training.  Out-of-fold predictions are collected in
# `train_preds`; test predictions are averaged over the folds.
kfolds, epochs = 5, 7
kf = StratifiedKFold(n_splits=kfolds, random_state=26, shuffle=True).split(X, Y)
train_preds = np.zeros((len(X)))
test_preds = np.zeros((len(T_X)))
# Best validation loss of every fold.  The list lives outside the fold loop so
# that the mean reported at the end covers all folds, not only the last one.
losses = []
filepath = "best_loss.pth"

x_test_cuda = torch.tensor(T_X, dtype=torch.long).cuda()
test = TensorDataset(x_test_cuda)
test_loader = DataLoader(test, batch_size=batch_size, shuffle=False)

for fold, (train_idx, valid_idx) in enumerate(kf):
    x_train_fold = torch.tensor(X[train_idx], dtype=torch.long).cuda()
    y_train_fold = torch.tensor(Y[train_idx, np.newaxis], dtype=torch.float32).cuda()
    x_val_fold = torch.tensor(X[valid_idx], dtype=torch.long).cuda()
    y_val_fold = torch.tensor(Y[valid_idx, np.newaxis], dtype=torch.float32).cuda()
    features_train = features[train_idx]
    features_val = features[valid_idx]

    model = LstmAtn()
    model.cuda()

    loss_fn = torch.nn.BCEWithLogitsLoss(reduction="mean")
    optimizer = torch.optim.Adam(model.parameters())
    scheduler = ReduceLROnPlateau(optimizer,
                                  mode='min',
                                  factor=0.5,
                                  patience=1,
                                  verbose=True,
                                  min_lr=0.0001)
    # MyDataset adds the sample index to every item so the matching rows of
    # the feature matrix can be looked up for each batch.
    train = MyDataset(TensorDataset(x_train_fold, y_train_fold))
    valid = MyDataset(TensorDataset(x_val_fold, y_val_fold))

    train_loader = DataLoader(train, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid, batch_size=batch_size, shuffle=False)

    print(f'Fold {fold + 1}')
    # Start from +inf so the first epoch always produces a checkpoint.
    best_loss = float('inf')
    for epoch in range(epochs):
        start_time = time.time()

        model.train()
        avg_loss = 0.
        for x_batch, y_batch, index in train_loader:
            f = features_train[index]
            y_pred = model([x_batch, f])
            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)

        model.eval()
        valid_preds_fold = np.zeros((x_val_fold.size(0)))
        avg_val_loss = 0.
        # No autograd graph is needed for validation.
        with torch.no_grad():
            for step, (x_batch, y_batch, index) in enumerate(valid_loader):
                f = features_val[index]
                y_pred = model([x_batch, f]).detach()
                avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
                valid_preds_fold[step * batch_size:(step + 1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t time={:.2f}s'.format(
            epoch + 1, epochs, avg_loss, avg_val_loss, elapsed_time))
        # choose model with best loss or roc_auc
        if best_loss > avg_val_loss:
            best_loss = avg_val_loss
            torch.save(model.state_dict(), filepath)
        scheduler.step(avg_val_loss)

    losses.append(best_loss)
    # NOTE(review): the best checkpoint is saved above but not restored, so the
    # test predictions (like valid_preds_fold) come from the last epoch.
    # model.load_state_dict(torch.load(filepath))

    # The model is still in eval mode from the last validation pass.
    test_preds_fold = np.zeros(len(T_X))
    with torch.no_grad():
        for step, (x_batch,) in enumerate(test_loader):
            f = test_features[step * batch_size:(step + 1) * batch_size]
            y_pred = model([x_batch, f]).detach()
            test_preds_fold[step * batch_size:(step + 1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
    train_preds[valid_idx] = valid_preds_fold
    test_preds += test_preds_fold / kfolds

    del model
    gc.collect()

In [None]:
# Pick the decision threshold that maximises F1 on the out-of-fold predictions.
best_score, best_thresh = f1_smart(Y, train_preds)
# Alternative: grid search over fixed thresholds.
# best_score, best_thresh = threshold_search(Y, train_preds)
print('Optimal F1: {:.4f} at threshold: {:.4f}'.format(best_score, best_thresh))
print('mean_loss: {:.4f}'.format(np.mean(losses)))
# Binarise the fold-averaged test probabilities with that threshold and write
# the submission file.
sub['prediction'] = (test_preds > best_thresh).astype(int)
sub.to_csv("submission.csv", index=False)