In [1]:
import re
import time
import gc
import random
import os
import math
import shutil

import numpy as np
import pandas as pd

from tqdm import tqdm

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score

import gensim

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim.optimizer import Optimizer

In [2]:
notebookstart= time.time()

In [3]:
embed_size = 300 # how big is each word vector
max_features = 120000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 65 # max number of words in a question to use

n_splits = 4
batch_size = 2048
train_epochs = 5
LR = 0.001
WD = 0.0001

SEED = 916 #916

In [4]:
def seed_torch(seed=916):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [5]:
import psutil
from multiprocessing import Pool

num_partitions = 20  # number of partitions to split dataframe
num_cores = psutil.cpu_count()  # number of cores on your machine

print('number of cores:', num_cores)

def df_parallelize_run(df, func):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

number of cores: 2


In [6]:
def clean_text(x, maxlen=None):
    puncts = ['ь','ɾ','█','℅ι','_','#','ò','ᡤ','▲','ุ','\\','ύ','═','―','ʍ','∧','β','‑','²','▬','®','●','φ','б',
          '‡','ϕ','„','ấ','ʻ','＝','¯','£','⁴','˂','∖','ñ','ɛ','ø','č','и','చ','＾','ô','－','ʋ','Ü','ⁿ','∆','±',
          'Ó','Â','ế','™','’','⊕','Ż','π','Ľ','ψ','△','ʌ','ě','♭','$','ộ','ᠰ','ε','ʏ','à','օ','ú','п','ö','া',
          'о','ệ','(','»','ü',')','∑','ን','Γ','￼','ĺ','ా','Ō','ç','Ο','⅔','∪','ɨ','♫','ʖ','⟨','∫','ϵ','：','、',
          '╩','´','ỵ','℅','∠','∩','%','ː','с','→','ạ','͡','¬','ķ','з','@','ה','℃','í','ட','☹','Č','"','ῆ','Я','⃗',
          '”','╚','▒','ד','←','Ã','✌','़','П','Å','ן','λ','ā','§','ἰ','ֿ','↑','ì','∂','…','.','“','ɒ','›','≥','−',
          'ㅜ','ል','≤','¼','¶','æ','ß','∨','χ','║','ū','⧼','°','ả','†','Š','ə','，','¤','ు','г','ሮ','ō','★','ɑ',
          '「','~','ł','...', 'ి','ý','•','⊆','÷','ч',';','Í','Ø','،','ο','[','≠','י','=','≅','ụ','Σ','╗','ù','!',
          'â','︡','ె','¹','é','ν','ኤ','∞','⋅','♥','︠','}','ɦ','ﬁ','·','?','Ñ','▀','✅','ễ','ợ',']','✓','ת','▾','▓',
          ':','ᡠ','│','ή','የ','（','ş','³','>','è','ξ','¦','ğ','л','ᠨ','¨','ὤ','ف','ኢ','Ž','∘','ς','₹','͜','‹','α',
          'న','ᡳ','ɖ','╣','ʀ','ו','■','×','☺','►','ኝ','º','አ','⧽','ɔ','＞','ᠠ','▄','⊨','╔','√','å','ć','¿','ీ','Φ',
          'İ','░','兰','ζ','š','В','క','・','{','）','ã','½','Δ','©','ن','<','τ','⁷','^',"'",'‘','ś','σ','ḵ','¢',
          '′','ద','х','ం','⌚','ర','/','ք','ũ','∈','ê','ሁ','Ź','—','á','̃','*','θ','–','î','ι','⁰','ṭ','ɸ','ä','¸',
          'ռ','É','ḥ','♪','ï','`','д','∀','ʿ','❤','|','ő','☆','ș','О','€','¥','+','ó','▪','⟩','⇒','≈','─','▼','ž',
          'Ἀ','я','Ā','↓','ℇ','ᡵ','-','µ','«','δ','μ','ῥ','ʃ','♨','″','&','Ω','♦','ʊ','గ',',','¾','∙','η','ī','ċ',
          '╦','ẽ','⁴λ']
    x = str(x)
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in puncts[:maxlen]:
        if punct in x:  # add this line
            x = x.replace(punct, f' {punct} ')
    return x


def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

mispell_dict = {
                'wasnt':'was not','Doesnt':'Does not','Couldn':'Could not','Whatare':'What are','Howdo':'How do',
                'Didnt':'Did not','Howmany':'How many','Howcan':'How can','Isnt':'Is not',
                'Shouldnt':'Should not','howto':'how to','Cannot':'Can not','doI':'do I',
                "whatis":"what is",'Whatis':'What is','hasnt':'has not','practise':'practice','behaviours':'behaviors',
                'colour':'color',
                'centre':'center','favourite':'favorite','favour':'favor','travelling':'traveling','counselling':'counseling',
                'theatre':'theater','cancelled':'canceled','labour':'labor','organisation':'organization','organised':'organized',
                'wwii':'world war 2','citicise':'criticize','youtu ':'youtube ','qoura':'Quora','sallary':'salary',
                'whta': 'what','Whta':'What','narcisist': 'narcissist','howdo':'how do','whatare':'what are','howcan':'how can',
                'howmuch':'how much','Howmuch':'How much','howmany':'how many','whydo':'why do','Whydo':'Why do',
                'doI': 'do I','theBest':'the best','socialising':'socializing','visualise':'visualize',
                'howdoes':'how does','mastrubation':'masturbation','mastrubate':'masturbate','masterbation':'masturbation',
                'masterbate':'masturbate',"mastrubating":'masturbating','masterbating':'masturbating',
                'labelled':'labeled','civilisation':'civilization','customised':'customized','polarisation':'polarization',
                'pennis':'penis','etherium':'ethereum','narcissit': 'narcissist','bigdata':'big data',
                #'2k17':'2017','2k18':'2018',
                'qouta':'quota','exboyfriend':'ex boyfriend','airhostess':'air hostess',"whst":'what',"Whst":'what',
                'watsapp': 'WhatsApp','demonitisation':'demonetization','demonitization':'demonetization',
                'didnt':'did not','doesnt':'does not','isnt':'is not','shouldnt':'should not',
                'aeroplane':'airplane','aeroplanes':'airplanes','rumours':'rumors','armour':'armor','odour':'odor',
                "tryin'":"trying",'quorans':'Quora users','Quorans':'Quora users',
                "quoran":"Quora user",'recognise':'recognize',
                'cryptocurrencies':'crypto currency','cryptocurrency':'crypto currency','aluminium':'aluminum',
                'friendzoned':'friend zoned','legalised':'legalized','intership':'internship',
                "brexit":'leave EU',"Brexit":'leave EU',"blockchain":"Blockchain",'licence':'license','cheque':'check',
                'practising':'practicing','wwwyoutubecom':'youtube','worshipping':'worshiping','apologise':'apologize',
                'neighbouring':'neighboring','jewellery':'jewelry','neighbour':'neighbor','behavioural':'behavioral',
                'neighbourhood':'neighborhood','counselling':'counseling','h1b':'H1B','civilised':'civilized',
                'blockchains':'Blockchain', 'demonetisation':'demonetization','bitcoins':'Bitcoin','ethereum':'Ethereum',
                'Bitcoins':'Bitcoin','criticised':'criticized','rumour':'rumor','organising':'organizing',
                'spoilt':'spoiled','flavoured':'flavored','dysfuntional':'dysfunctional','schizoids':'schizoid',
                'wierdest':'weirdest','intercaste':'inter_caste','JCPOA':'Joint Comprehensive Plan of Action',
                'bhakts':'Bhakti','Bhakts':'Bhakti','honours':'honors','learnt':'learned','selfie':'self snap','selfies':'self snap',
                'organise':'organize','neurotypicals':'normal','criticise':'criticize',
                'trumpism':'Trump principle','Trumpism':'Trump principle','tamilans':'Tamilians','acturial':'actuarial',
                'judgement':'judgment','licences':'licenses','legalise':'legalize','undergraduation':'undergraduate',
                'centralised':'centralized','biharis':'Biharis','tumour':'tumor','labelling':'labeling',
                'whyis':'why is','airpods':'AirPods','zhihu':'Zhihua','globalisation':'globalization',
                'sjws':'SJW', 'neuralink':'neurolink','fullform':'full-form',
                'cisgender':'normal gender','friendzone':'friend zone','colonisation':'colonization','nationalised':'nationalized',
                'xiomi':'Xiaomi', 'rohingya':'Rohingya','despacito':'desposito',
                'fortnite':'Fortnite','bittrex':'Bittrex',
                'reactjs':'javascript','nodejs':'javascript','programr':'programer',
                'hyperloop':'Hyperloop','aadhaar':'Aadhaar','baahubali':'Bahubali','Baahubali':'Bahubali',
                'snapchat':'Instagram','Snapchat':'Instagram','SnapChat':'Instagram','realise':'realize',
                'defence':'defense','offence':'offense',
                'btech':'bachelor_degree','Btech':'bachelor_degree','BTECH':'bachelor_degree','mtech':'master_degree',
                'behaviour':'behavior','anaesthesia':'Anaesthesia','incels':'involuntary celibate',
                "‘":"'", "´":"'", "—":"-", "₹":"rupee",
                "–":"-", "’":"'", "_":"-", "`":"'", '“':'"', '”':'"', '“':'"', 
                '•':'.', '−':'-', 
                'rahul':'Rahul','upvotes':'up vote', 'upvote':'up vote','upvoted':'up voted', 'downvote':'down vote',
                'downvotes':'down votes','downvoted':'down voted','mhtcet':'MHT_CET','MHTCET':'MHT_CET',
                'Qoura':'Quora','Quoras':'Quora','Quara':'Quora','∞':'infinity',
                'wwwquoracom':'Quora','nootropics':'smart drugs','Nootropics':'smart drugs',
                '\ ufeff':" ", 'करना':"",
                '∅':'phi','Doklam':'standoff',
                'donald':'Donald','trump':'Trump','TRUMP':'Trump','drumpf':'serial liar','Drumpf':'serial liar',
                'hillary':'Hillary', 'clinton':'Clinton','Trumpcare':'Trump plan','Trumpers':'Trump supporter',
                'narendra':'Narendra','gandhi':'Gandhi',
                'barack':'Barack','obama':'Obama','hussein':'Hussein',
                'obamacare':'Obama plan','Obamacare':'Obama plan','aadhar':'Aadhaar',
                'AlShamsi':'Al_Shamsi','auschwitz':'Auschwitz','muhammed':'Muhammed','aAadhaar':'Aadhaar',
                'whatsapp':'WhatsApp', 'instagram':'Instagram','sjw':'SJW','aadhaar':'Aadhaar','AAadhaar':'Aadhaar',
                'bahubali':'Bahubali', 'bitcoin':'Bitcoin', 'bhakti':'Bhakti', 'javascript':'JavaScript','kotlin':'Java',
                'madheshi':'Madheshi', 'quora':'Quora','SJWs':'SJW','femdom':'female dominance',
                'crossdress':'transvestite','paedophile':'pedophile'
               }


def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


mispellings, mispellings_re = _get_mispell(mispell_dict)


def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]
    return mispellings_re.sub(replace, text)

In [7]:
def preprocess(text):
    """
    preprocess text main steps
    """
    text = clean_text(text)
    text = clean_numbers(text)
    text = replace_typical_misspell(text)
    
    return text


def text_clean_wrapper(df):
    df["question_text"] = df["question_text"].apply(preprocess)
    return df


In [8]:
def add_features(df):
    
    df['question_text'] = df['question_text'].apply(lambda x:str(x))
    df['total_length'] = df['question_text'].apply(len)
    df['capitals'] = df['question_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['caps_vs_length'] = df.apply(lambda row: float(row['capitals'])/float(row['total_length']),axis=1)
    df['num_words'] = df.question_text.str.count('\S+')
    df['num_unique_words'] = df['question_text'].apply(lambda comment: len(set(w for w in comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']  

    return df


def load_and_prec():
    train_df = pd.read_csv("../input/train.csv")
    test_df = pd.read_csv("../input/test.csv")
    print("Train shape : ",train_df.shape)
    print("Test shape : ",test_df.shape)
    
    train_df = df_parallelize_run(train_df, text_clean_wrapper)
    test_df = df_parallelize_run(test_df, text_clean_wrapper)
    
    ###################### Add Features ###############################
    #  https://github.com/wongchunghang/toxic-comment-challenge-lstm/blob/master/toxic_comment_9872_model.ipynb
    train = add_features(train_df)
    test = add_features(test_df)

    features = train[['caps_vs_length', 'words_vs_unique']].fillna(0)
    test_features = test[['caps_vs_length', 'words_vs_unique']].fillna(0)

    ss = StandardScaler()
    ss.fit(np.vstack((features, test_features)))
    features = ss.transform(features)
    test_features = ss.transform(test_features)
    ###########################################################################
    
    ## fill up the missing values
    train_X = train_df["question_text"].fillna("_##_").values
    test_X = test_df["question_text"].fillna("_##_").values
    
    ## Tokenize the sentences
    tokenizer = Tokenizer(num_words=max_features, lower=False, filters=[])
    tokenizer.fit_on_texts(list(train_X))
    train_X = tokenizer.texts_to_sequences(train_X)
    test_X = tokenizer.texts_to_sequences(test_X)

    ## Pad the sentences 
    train_X = pad_sequences(train_X, maxlen=maxlen)
    test_X = pad_sequences(test_X, maxlen=maxlen)

    ## Get the target values
    train_y = train_df['target'].values
    
    #shuffling the data
    np.random.seed(SEED)
    trn_idx = np.random.permutation(len(train_X))

    train_X = train_X[trn_idx]
    train_y = train_y[trn_idx]
    
    return train_X, test_X, train_y, tokenizer.word_index, features, test_features

In [9]:
def load_glove(word_index, max_words=max_features, embed_size=300):
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    emb_mean, emb_std = -0.005838493338505765, 0.48782081729236354 #-0.005838499, 0.48782197 

    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_words, embed_size))
    with open(EMBEDDING_FILE, encoding="utf8") as f: #, 'r'
        for line in f:
            word, vec = line.split(' ', 1)
            if word not in word_index:
                continue
            i = word_index[word]
            if i >= max_words:
                continue
            embedding_vector = np.asarray(vec.split(' '))[:300] #, dtype='float32'
            if len(embedding_vector) == 300:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix


def load_para(word_index):
    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore') if len(o)>100)

    all_embs = np.stack(embeddings_index.values())
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    print('mean: ', emb_mean, 'std: ', emb_std)
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word.lower())
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    
    return embedding_matrix


def load_w2v(word_index):
    word2vec = gensim.models.KeyedVectors.load_word2vec_format(
        '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin', binary=True)
    print('vocab:',len(word2vec.vocab))
    all_embs = word2vec.vectors
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    print(emb_mean,emb_std)
    print(max_features,' from ',len(word_index.items()))
    # num_words = min(num_words, len(tokenizer.word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, embed_size))
    
    # embedding_matrix = np.zeros((num_words, dim))
    count = 0
    for word, i in word_index.items():
        if i>=max_features:
            break
        if word in word2vec.vocab:
            embedding_matrix[i] = word2vec.word_vec(word)
        else:
            count += 1
    del word2vec
    print('embedding matrix size:',embedding_matrix.shape)
    print('Number of words not in vocab:',count)
    return embedding_matrix


def load_fasttext(word_index, max_words=max_features, embed_size=300):
    EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    emb_mean, emb_std = -0.0033469954096391496, 0.10985541316945975 #-0.0033469985, 0.109855495

    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_words, embed_size))
    with open(EMBEDDING_FILE, encoding="utf8") as f: #, 'r'
        for line in f:
            word, vec = line.split(' ', 1)
            if word not in word_index:
                continue
            i = word_index[word]
            if i >= max_words:
                continue
            embedding_vector = np.asarray(vec.split(' '))[:300] #, dtype='float32'
            if len(embedding_vector) == 300:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [10]:
class LockedDropout(nn.Module):
    """ LockedDropout applies the same dropout mask to every time step.

    **Thank you** to Sales Force for their initial implementation of :class:`WeightDrop`. Here is
    their `License
    <https://github.com/salesforce/awd-lstm-lm/blob/master/LICENSE>`__.

    Args:
        p (float): Probability of an element in the dropout mask to be zeroed.
    """

    def __init__(self, p=0.5):
        self.p = p
        super().__init__()

    def forward(self, x):
        """
        Args:
            x (:class:`torch.FloatTensor` [batch size, sequence length, rnn hidden size]): Input to
                apply dropout too.
        """
        if not self.training or not self.p:
            return x
        x = x.clone()
        mask = x.new_empty(1, x.size(1), x.size(2), requires_grad=False).bernoulli_(1 - self.p)
        mask = mask.div_(1 - self.p)
        mask = mask.expand_as(x)
        return x * mask


    def __repr__(self):
        return self.__class__.__name__ + '(' \
            + 'p=' + str(self.p) + ')'

In [11]:
class Linear(nn.Module):
    ''' Simple Linear layer with xavier init '''

    def __init__(self, d_in, d_out, bias=True):
        super(Linear, self).__init__()
        self.linear = nn.Linear(d_in, d_out, bias=bias)
        nn.init.kaiming_uniform_(self.linear.weight, mode='fan_in', nonlinearity='leaky_relu')
    
    def forward(self, x):
        return self.linear(x)

In [12]:
class RCNN(nn.Module):
    def __init__(self, batch_size=None, output_size=1, hidden_size=100, vocab_size=max_features, embedding_length=embed_size):
        super(RCNN, self).__init__()
        
        """
        Arguments
        ---------
        batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator
        output_size : 2 = (pos, neg)
        hidden_sie : Size of the hidden_state of the LSTM
        vocab_size : Size of the vocabulary containing unique words
        embedding_length : Embedding dimension of GloVe word embeddings
        weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 
        """

        self.batch_size = batch_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length
        
        self.word_embeddings = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32), sparse=False)
        self.lockeddropout = LockedDropout(p=0.2)
        self.lstm = nn.LSTM(embedding_length, hidden_size, bidirectional=True, batch_first=False)
        self.gru = nn.GRU(hidden_size*2, hidden_size, bidirectional=True, batch_first=False)
        self.input = nn.Linear(embedding_length, 150)
        self.W_s1 = nn.Linear(hidden_size*2+150, 150, bias=True)
        self.W_s2 = nn.Linear(150, 65, bias=True)
        self.lrelu = torch.nn.LeakyReLU()
        self.W2 = Linear(2*hidden_size+150, 100)
        self.maxpool = nn.AdaptiveMaxPool1d(1)
        self.dropout_2 = nn.Dropout(0.2)
        self.label = Linear(102, output_size)
        
        
    def attention_net(self, lstm_output):

        """
        Now we will use self attention mechanism to produce a matrix embedding of the input sentence in which every row represents an
        encoding of the inout sentence but giving an attention to a specific part of the sentence. We will use 30 such embedding of 
        the input sentence and then finally we will concatenate all the 30 sentence embedding vectors and connect it to a fully 
        connected layer of size 2000 which will be connected to the output layer of size 2 returning logits for our two classes i.e., 
        pos & neg.
        Arguments
        ---------
        lstm_output = A tensor containing hidden states corresponding to each time step of the LSTM network.
        ---------
        Returns : Final Attention weight matrix for all the 30 different sentence embedding in which each of 30 embeddings give
                  attention to different parts of the input sentence.
        Tensor size : lstm_output.size() = (batch_size, num_seq, 2*hidden_size)
                      attn_weight_matrix.size() = (batch_size, 30, num_seq)
        """
        attn_weight_matrix = self.W_s2(torch.tanh(self.W_s1(lstm_output))) #/ self.temper
        attn_weight_matrix = attn_weight_matrix.permute(0, 2, 1)
        attn_weight_matrix = F.softmax(attn_weight_matrix, dim=2)

        return attn_weight_matrix
    

    def forward(self, input_sentence, batch_size=None):

        """ 
        Parameters
        ----------
        input_sentence: input_sentence of shape = (batch_size, num_sequences)
        batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1)
        
        Returns
        -------
        Output of the linear layer containing logits for positive & negative class which receives its input as the final_hidden_state of the LSTM
        final_output.shape = (batch_size, output_size)
        """
        """
        The idea of the paper "Recurrent Convolutional Neural Networks for Text Classification" is that we pass the embedding vector
        of the text sequences through a bidirectional LSTM and then for each sequence, our final embedding vector is the concatenation of 
        its own GloVe embedding and the left and right contextual embedding which in bidirectional LSTM is same as the corresponding hidden
        state. This final embedding is passed through a linear layer which maps this long concatenated encoding vector back to the hidden_size
        vector. After this step, we use a max pooling layer across all sequences of texts. This converts any varying length text into a fixed
        dimension tensor of size (batch_size, hidden_size) and finally we map this to the output layer.
        """
        input = self.word_embeddings(input_sentence[0]) # embedded input of shape = (batch_size, num_sequences, embedding_length)
        input = self.lockeddropout(input) 
        input = input.permute(1, 0, 2) # input.size() = (num_sequences, batch_size, embedding_length)
        if batch_size is None:
            h_0 = Variable(torch.empty(2, len(input[1]), self.hidden_size).cuda()) # Initial hidden state of the LSTM
            h_0 = nn.init.xavier_uniform_(h_0, gain=5/3)
            c_0 = Variable(torch.empty(2, len(input[1]), self.hidden_size).cuda()) # Initial cell state of the LSTM
            c_0 = nn.init.xavier_uniform_(c_0, gain=5/3)
        else:
            h_0 = Variable(torch.empty(2, batch_size, self.hidden_size).cuda())
            h_0 = nn.init.xavier_uniform_(h_0, gain=5/3)
            c_0 = Variable(torch.empty(2, batch_size, self.hidden_size).cuda())
            c_0 = nn.init.xavier_uniform_(c_0, gain=5/3)

        output, (final_hidden_state,_) = self.lstm(input, (h_0, c_0))
        output, _ = self.gru(output, final_hidden_state)
        
        input = torch.tanh(self.input(input))
        final_encoding = torch.cat((output, input), 2) 
        final_encoding = final_encoding.permute(1, 0, 2) 
        attn_weight_matrix = self.attention_net(final_encoding) 
        final_encoding = torch.matmul(attn_weight_matrix, final_encoding)
        final_encoding = self.lrelu(final_encoding)        
        y = self.W2(final_encoding)
        y = y.permute(0, 2, 1)
        y = self.maxpool(y)
        f = torch.tensor(input_sentence[1], dtype=torch.float).cuda(non_blocking=True)
        y = y.squeeze(2)
        y = torch.cat((y, f),1)
        y = self.dropout_2(y)
        logits = self.label(y)

        return logits


In [13]:
class AdamW(Optimizer):
    """Implements Adam algorithm.
    It has been proposed in `Adam: A Method for Stochastic Optimization`_.
    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad)
        super(AdamW, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(AdamW, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                amsgrad = group['amsgrad']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsgrad:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # if group['weight_decay'] != 0:
                #     grad = grad.add(group['weight_decay'], p.data)

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                # p.data.addcdiv_(-step_size, exp_avg, denom)
                p.data.add_(-step_size,  torch.mul(p.data, group['weight_decay']).addcdiv_(1, exp_avg, denom) )

        return loss

In [14]:
# code inspired from: https://github.com/anandsaha/pytorch.cyclic.learning.rate/blob/master/cls.py
class CyclicLR(object):
    def __init__(self, optimizer, base_lr=1e-3, max_lr=6e-3,
                 step_size=2000, mode='triangular', gamma=1.,
                 scale_fn=None, scale_mode='cycle', last_batch_iteration=-1):

        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer

        if isinstance(base_lr, list) or isinstance(base_lr, tuple):
            if len(base_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} base_lr, got {}".format(
                    len(optimizer.param_groups), len(base_lr)))
            self.base_lrs = list(base_lr)
        else:
            self.base_lrs = [base_lr] * len(optimizer.param_groups)

        if isinstance(max_lr, list) or isinstance(max_lr, tuple):
            if len(max_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} max_lr, got {}".format(
                    len(optimizer.param_groups), len(max_lr)))
            self.max_lrs = list(max_lr)
        else:
            self.max_lrs = [max_lr] * len(optimizer.param_groups)

        self.step_size = step_size

        if mode not in ['triangular', 'triangular2', 'exp_range'] \
                and scale_fn is None:
            raise ValueError('mode is invalid and scale_fn is None')

        self.mode = mode
        self.gamma = gamma

        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = self._triangular_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = self._triangular2_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = self._exp_range_scale_fn
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode

        self.batch_step(last_batch_iteration + 1)
        self.last_batch_iteration = last_batch_iteration

    def batch_step(self, batch_iteration=None):
        if batch_iteration is None:
            batch_iteration = self.last_batch_iteration + 1
        self.last_batch_iteration = batch_iteration
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr

    def _triangular_scale_fn(self, x):
        return 1.

    def _triangular2_scale_fn(self, x):
        return 1 / (2. ** (x - 1))

    def _exp_range_scale_fn(self, x):
        return self.gamma**(x)

    def get_lr(self):
        step_size = float(self.step_size)
        cycle = np.floor(1 + self.last_batch_iteration / (2 * step_size))
        x = np.abs(self.last_batch_iteration / step_size - 2 * cycle + 1)

        lrs = []
        param_lrs = zip(self.optimizer.param_groups, self.base_lrs, self.max_lrs)
        for param_group, base_lr, max_lr in param_lrs:
            base_height = (max_lr - base_lr) * np.maximum(0, (1 - x))
            if self.scale_mode == 'cycle':
                lr = base_lr + base_height * self.scale_fn(cycle)
            else:
                lr = base_lr + base_height * self.scale_fn(self.last_batch_iteration)
            lrs.append(lr)
        return lrs

In [15]:
start_time = time.time()

train_X, test_X, train_y, word_index, features, test_features = load_and_prec()
embedding_matrix_1 = load_glove(word_index)
embedding_matrix_2 = load_para(word_index)
embedding_matrix_3 = load_w2v(word_index)
embedding_matrix_4 = load_fasttext(word_index)

total_time = (time.time() - start_time) / 60
print("Took {:.2f} minutes".format(total_time))

Train shape :  (1306122, 3)
Test shape :  (56370, 2)




mean:  -0.0053247833 std:  0.49346462
vocab: 3000000
-0.003527845 0.13315111
120000  from  230120
embedding matrix size: (120000, 300)
Number of words not in vocab: 21494
Took 7.82 minutes


In [16]:
del word_index, mispell_dict, mispellings, mispellings_re

In [17]:
embedding_matrix = np.concatenate((embedding_matrix_1, embedding_matrix_2, embedding_matrix_3, embedding_matrix_4), axis=1)
del embedding_matrix_1, embedding_matrix_2, embedding_matrix_3, embedding_matrix_4 
print(np.shape(embedding_matrix))

pca = PCA(n_components=300, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=SEED)
pca.fit(embedding_matrix)
embedding_matrix = pca.transform(embedding_matrix)

print(np.shape(embedding_matrix))

gc.collect()

(120000, 1200)
(120000, 300)


0

In [18]:
splits = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=916).split(train_X, train_y))

In [19]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [20]:
def z_score(array, threshold):
    x = array - threshold
    tmp = np.sum(x ** 2)
    tmp = np.sqrt(tmp / len(array))
    x = x / tmp
    return x

In [21]:
from sklearn.metrics import roc_curve, precision_recall_curve, classification_report


def threshold_search(y_true, y_proba, plot=False):
    precision, recall, thresholds = precision_recall_curve(y_true, y_proba)
    thresholds = np.append(thresholds, 1.001) 
    F = 2 / (1/precision + 1/recall)
    best_score = np.max(F)
    best_th = thresholds[np.argmax(F)]
    if plot:
        plt.plot(thresholds, F, '-b')
        plt.plot([best_th], [best_score], '*r')
        plt.show()
    search_result = {'threshold': best_th , 'f1': best_score}
    return search_result 

In [22]:
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')

        
def load_checkpoint(checkpoint_path, model, optimizer):
    state = torch.load(checkpoint_path)
    model.load_state_dict(state['state_dict'])
    optimizer.load_state_dict(state['optimizer'])
    print('model loaded from %s' % checkpoint_path)

In [23]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self,dataset):
        self.dataset = dataset

    def __getitem__(self, index):
        data, target = self.dataset[index]

        return data, target, index
    def __len__(self):
        return len(self.dataset)

In [24]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, logits=True, pos_weight=None, reduction='elementwise_mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.logits = logits
        self.reduction = reduction
        self.pos_weight = pos_weight

    def forward(self, inputs, targets):
        if self.logits:
            BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none', pos_weight=self.pos_weight)
        else:
            BCE_loss = F.binary_cross_entropy(inputs, targets, reduction='none', pos_weight=self.pos_weight)
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss

        if self.reduction is None:
            return F_loss
        else:
            return torch.mean(F_loss)

In [25]:
train_preds = np.zeros((len(train_X)))
train_preds2 = np.zeros((len(train_X)))
test_preds = np.zeros((len(test_X)))

seed_torch(SEED)
torch.backends.cudnn.benchmark=True

x_test_cuda = torch.tensor(test_X, dtype=torch.long).cuda()
test = torch.utils.data.TensorDataset(x_test_cuda)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

for i, (train_idx, valid_idx) in enumerate(splits):
    
    best_prec1 = 0
    
    x_train_fold = torch.tensor(train_X[train_idx], dtype=torch.long)#.cuda()
    y_train_fold = torch.tensor(train_y[train_idx, np.newaxis], dtype=torch.float32)#.cuda()
    kfold_X_features = torch.tensor(features[train_idx.astype(int)], dtype=torch.long)#.cuda()
    kfold_X_valid_features = torch.tensor(features[valid_idx.astype(int)], dtype=torch.long)#.cuda()
    x_val_fold = torch.tensor(train_X[valid_idx], dtype=torch.long)#.cuda()
    y_val_fold = torch.tensor(train_y[valid_idx, np.newaxis], dtype=torch.float32)#.cuda()

    model = RCNN()
    model.cuda()
    
    class_weight = torch.FloatTensor([1.25]).cuda()    
    gamma = 1.3
    loss_fn = FocalLoss(gamma=gamma, pos_weight=class_weight)
    
    step_size = 300
    base_lr, max_lr = 0.001, 0.003   
    optimizer = AdamW(model.parameters(), lr=max_lr, betas=(0.9, 0.98), weight_decay=0)
    ################################################################################################
    scheduler = CyclicLR(optimizer, base_lr=base_lr, max_lr=max_lr,
               step_size=step_size, mode='exp_range', gamma=0.99994)
    ###############################################################################################
    
    train = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
    valid = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)
    
    train = MyDataset(train)
    valid = MyDataset(valid)
    
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True, drop_last=True,
                                              pin_memory=True, num_workers=4)
    valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False,
                                              pin_memory=True, num_workers=4)
    
    print(f'Fold {i + 1}')
    
    for epoch in range(train_epochs):
        start_time = time.time()
        
        model.train()
        avg_loss = 0.
        
        for i, (x_batch, y_batch, index) in enumerate(train_loader):
            f = kfold_X_features[index]
            y_pred = model([x_batch.cuda(non_blocking=True), f])
            
            if scheduler:
                scheduler.batch_step()
            
            loss = loss_fn(y_pred, y_batch.cuda(non_blocking=True))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)
        
        model.eval()
        valid_preds_fold = np.zeros((x_val_fold.size(0)))
        test_preds_fold = np.zeros(len(test_X))
        valid_true_fold = np.zeros((x_val_fold.size(0)))

        valid_preds_tmp = np.zeros((x_val_fold.size(0)))
        valid_true_tmp = np.zeros((x_val_fold.size(0)))
        
        avg_val_loss = 0.
        
        for i, (x_batch, y_batch, index) in enumerate(valid_loader):
            f = kfold_X_valid_features[index]
            y_pred = model([x_batch.cuda(non_blocking=True),f]).detach()
            
            avg_val_loss += loss_fn(y_pred, y_batch.cuda(non_blocking=True)).item() / len(valid_loader)
            valid_preds_tmp[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
            valid_true_tmp[i * batch_size:(i+1) * batch_size] = y_batch.cpu().numpy()[:, 0]

        search_result = threshold_search(valid_true_tmp, valid_preds_tmp)
        val_f1, val_threshold = search_result['f1'], search_result['threshold']

        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t val_f1={:.4f} best_t={:.2f} \t time={:.2f}s'.format(
            epoch + 1, train_epochs, avg_loss, avg_val_loss, val_f1, val_threshold, elapsed_time))
        
        prec1 = val_f1
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer' : optimizer.state_dict(),
        }, is_best)
        print('Best: ', best_prec1)
    
    del x_train_fold, y_train_fold, train_loader
    
    checkpoint_path = 'model_best.pth.tar'
    model = RCNN()
    optimizer = AdamW(model.parameters(), lr=LR, betas=(0.9, 0.98), weight_decay=0)
    load_checkpoint(checkpoint_path, model, optimizer)
    model.cuda()
    model.eval()
    
    for i, (x_batch, y_batch, index) in enumerate(valid_loader):
        f = kfold_X_valid_features[index]
        y_pred = model([x_batch.cuda(non_blocking=True),f]).detach()

        valid_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
        valid_true_fold[i * batch_size:(i+1) * batch_size] = y_batch.cpu().numpy()[:, 0]

    search_result = threshold_search(valid_true_fold, valid_preds_fold)
    print(search_result)
    train_preds2[valid_idx] = z_score(valid_preds_fold, search_result['threshold'])
    
    del x_val_fold, y_val_fold, valid_loader
    
    for i, (x_batch,) in enumerate(test_loader):
        f = test_features[i * batch_size:(i+1) * batch_size]
        y_pred = model([x_batch, f]).detach()
        test_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

    train_preds[valid_idx] = valid_preds_fold
    test_preds_fold = z_score(test_preds_fold, search_result['threshold'])
    test_preds += test_preds_fold / len(splits)

Fold 1


  


Epoch 1/5 	 loss=0.0651 	 val_loss=0.0552 	 val_f1=0.6615 best_t=0.44 	 time=298.69s
Best:  0.6615169505122973
Epoch 2/5 	 loss=0.0534 	 val_loss=0.0520 	 val_f1=0.6802 best_t=0.44 	 time=298.85s
Best:  0.6801935454420961
Epoch 3/5 	 loss=0.0496 	 val_loss=0.0513 	 val_f1=0.6867 best_t=0.50 	 time=298.66s
Best:  0.6866682017038914
Epoch 4/5 	 loss=0.0472 	 val_loss=0.0505 	 val_f1=0.6913 best_t=0.49 	 time=298.76s
Best:  0.6912983906551172
Epoch 5/5 	 loss=0.0447 	 val_loss=0.0511 	 val_f1=0.6952 best_t=0.53 	 time=298.67s
Best:  0.6951837185744545
model loaded from model_best.pth.tar
{'threshold': 0.5285874605178833, 'f1': 0.695209677792427}
Fold 2
Epoch 1/5 	 loss=0.0654 	 val_loss=0.0541 	 val_f1=0.6668 best_t=0.43 	 time=298.59s
Best:  0.6668197685100128
Epoch 2/5 	 loss=0.0535 	 val_loss=0.0518 	 val_f1=0.6788 best_t=0.47 	 time=298.75s
Best:  0.6787645495371644
Epoch 3/5 	 loss=0.0498 	 val_loss=0.0508 	 val_f1=0.6848 best_t=0.50 	 time=298.76s
Best:  0.6848091549787174
Epoch 4/5

In [26]:
search_result = threshold_search(train_y, train_preds)
search_result

  


{'threshold': 0.4996938705444336, 'f1': 0.6929971922982039}

In [27]:
search_result = threshold_search(train_y, train_preds2)
search_result

  


{'threshold': 0.0, 'f1': 0.6951061266104366}

In [28]:
sub = pd.read_csv('../input/sample_submission.csv')
sub.prediction = test_preds > 0
sub.to_csv("submission.csv", index=False)

In [29]:
print("Notebook Runtime: %0.2f Minutes"%((time.time() - notebookstart)/60))

Notebook Runtime: 110.63 Minutes


In [30]:
!rm model_best.pth.tar
!rm checkpoint.pth.tar
!head submission.csv

qid,prediction
00014894849d00ba98a9,False
000156468431f09b3cae,False
000227734433360e1aae,False
0005e06fbe3045bd2a92,False
00068a0f7f41f50fc399,False
000a2d30e3ffd70c070d,False
000b67672ec9622ff761,False
000b7fb1146d712c1105,False
000d665a8ddc426a1907,False
