## Packages and constant declaration

In [1]:
import torch

In [1]:
import time

In [2]:
import numpy as np  #linear algebra
import pandas as pd #Only CSV IO
import os
import re
from torch.utils import data #dataloader of batch
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.autograd import Variable #We can ask require grad or not
from torch.optim.optimizer import Optimizer
from torch.optim import lr_scheduler
import torch.nn.functional as F
from torchvision import transforms

Using TensorFlow backend.


In [3]:
import matplotlib.pyplot as plt

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
from sklearn.preprocessing import StandardScaler # Simplify the preprocess
from multiprocessing import pool #multiprocessing, creating a pool parallel
from functools import partial 
#freeze some arguments of a function to be a new function
from sklearn.decomposition import PCA


In [49]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1" # easy to locate traceback
embedding_size = 300 # embedding vector length
max_word = 200000 # How many unique words to use 
max_q = 35 # max number of words in a question
maxq=35
batch_size = 4096#Batch size
n_epochs = 20 # epochs
latent = 400 # The dim of the latent variable

In [7]:
def seed(seed=1000):# Give random seed to everything
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed()

## Data preprocessing

In [10]:
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean(x): #process the punction
    x=str(x)
    for punct in puncts:
        if punct in x:
            x=x.replace(punct,f'{punct}')
    return x

def clean_numbers(x): # Replace the number with #, make all numbers the same
    if bool(re.search(r'\d',x)):
        x=re.sub('[0-9]{5,}','#####',x)
        x=re.sub('[0-9]{4}','####',x)
        x=re.sub('[0-9]{3}','###',x)
        x=re.sub('[0-9]{2}','##',x)
    return x

# The common abbreviation for some phrase
fullversiondict={"ain't": "is not", "aren't": "are not","can't": "cannot",
                 "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  
                 "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                 "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you",
                 "how'll": "how will","how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am",
                 "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", 
                 "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
                 "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
                 "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", 
                 "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
                 "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not","shouldn't've": "should not have", 
                 "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have",
                 "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", 
                 "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will",
                 "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", 
                 "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
                 "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", 
                 "you're": "you are", "you've": "you have", 'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 
                 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do',
                 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018',
                 'qouta': 'quota','exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}
def _get_full(fullversiondict):
    full_re = re.compile('(%s)'%'|'.join(fullversiondict.keys()))
    return fullversiondict , full_re # To make things we want to replace single string

fullversiondict,full_re=_get_full(fullversiondict)
def replacetext(text):
    def replace(match): 
        return fullversiondict[match.group(0)]
    return full_re.sub(replace,text) # The full.re here is match, can replace any string now


In [11]:
def read_pre():
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    print('Train shape:',train_df.shape)
    print('Test shape:',test_df.shape)
    # Lower case
    train_df['question_text'] = train_df['question_text'].apply(lambda x: x.lower())
    test_df['question_text'] = test_df['question_text'].apply(lambda x: x.lower())
    # Clean punction
    train_df["question_text"] = train_df["question_text"].apply(lambda x: clean(x))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: clean(x))
    #clean numbers
    train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_numbers(x))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_numbers(x))
    #replace abbreviation
    train_df["question_text"] = train_df["question_text"].apply(lambda x: replacetext(x))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: replacetext(x))
    ## fill up the missing values using next valid value
    train_X = train_df["question_text"].fillna(method="bfill").values
    test_X = test_df["question_text"].fillna(method="bfill").values
    
    ####################
    #Tokenize the sentences
    tokenizer = Tokenizer(num_words = max_word)
    tokenizer.fit_on_texts(list(train_X)) # only use words in training set
    train_X =B tokenizer.texts_to_sequences(train_X)
    test_X = tokenizer.texts_to_sequences(test_X)
    word_index = tokenizer.word_index
    nb_words = min(max_word, len(word_index))
        
    #Pad the sentences, to make equal length of sentences
    train_X = pad_sequences(train_X, maxlen=maxq)
    test_X = pad_sequences(test_X, maxlen=maxq)
     
    #Similar to one-hot encoding   
    #train_X=np.delete(train_X,[np.where(np.sum(train_X,axis=1)==0)])
    #test_X=np.delete(test_X,[np.where(np.sum(test_X,axis=1)==0)])
    
    #Target
    train_y = train_df['target'].values
    return train_X, test_X, train_y, word_index
    

In [12]:
class SpatialDropout(nn.Dropout2d):
    def forward(self, x):
        x = x.unsqueeze(2)    # (N, T, 1, K)
        x = x.permute(0, 3, 2, 1)  # (N, K, 1, T)
        x = super(SpatialDropout, self).forward(x)  # (N, K, 1, T), some features are masked
        x = x.permute(0, 3, 2, 1)  # (N, T, 1, K)
        x = x.squeeze(2)  # (N, T, K)
        return x
#Data augmentation

In [13]:
# the pretrained embedding model
def load_glove(word_index):
    FILE= 'embeddings/glove.840B.300d//glove.840B.300d.txt'
    def get_coefs(word,*arr):return word, np.asarray(arr, dtype='float32')[:embedding_size]
    embeddings_index = dict(get_coefs(*o.split(' '))for o in open(FILE) if len(o)>300)
    #construct word array pair dictionary
    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    nb_words = min(max_word, len(word_index))
    #Initialize a matrix using random value, in case that some words don't exist in our embedding
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words+1,embedding_size))
    #Associate the word in our training set with the embedding model
    for word, i in word_index.items():
        if i>= max_word: continue
        embedding_vector = embeddings_index.get(word) # get the vector for this word form dictionary
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    return (embedding_matrix)
##Do same thing to other types of embedding



In [14]:
train_X, test_X, train_y,  word_index = read_pre()
embedding_matrix_1 = load_glove(word_index)

Train shape: (1306122, 3)
Test shape: (375806, 2)


  import sys


In [32]:
train_X[train_y==1,:]

array([[   0,    0,    0, ...,    6,    1,   93],
       [   0,    0,    0, ...,  417,  613, 2319],
       [   0,    0,    0, ...,   47, 1263, 2080],
       ...,
       [   0,    0,    0, ..., 8179,   11, 3190],
       [ 120,  175,   60, ...,   60,   30, 4979],
       [   0,    0,    0, ...,   83,   30,  114]], dtype=int32)

In [2]:
train_XX = train_X
train_yy = train_y
for i in range(11):
    train_XX = np.concatenate((train_XX,train_X[train_y==1,:]))
    train_yy = np.concatenate((train_yy,train_y[train_y==1]))

NameError: name 'train_X' is not defined

In [None]:
# Define attention mechanism
class Attention(nn.Module):
    def __init__(self,feature_dim,next_dim,bias=True,**kwargs):
        super(Attention,self).__init__(**kwargs)
        self.masking = True
        self.biasbool = bias
        self.feature_dim = feature_dim #The input dim
        self.next_dim = next_dim #The output dim--word in a Q
        weight = torch.zeros(feature_dim,1) #A vector
        nn.init.xavier_uniform_(weight)
        self.weight = nn.Parameter(weight)
        if bias:
            self.bias = nn.Parameter(torch.zeros(next_dim))
    
    def forward(self, x, mask=None):
        feature_dim = self.feature_dim
        next_dim = self.next_dim
        eij = torch.mm(x.contiguous().view(-1,feature_dim),#(batch*time, feature)
                      self.weight).view(-1,next_dim)#(feature,time)
        if self.biasbool:
            eij = eij+self.bias #linear combination, vector
        eij = torch.tanh(eij) #nonlinear units
        a = torch.exp(eij) #softmax 
        
        if mask is not None:
            a = a * mask
        a = a / torch.sum(a, 1, keepdim=True) + 1e-10
        # a batch size by time t tensor, weight of different time step of 
        #print('weight',a.shape)
        weighted_input = x * torch.unsqueeze(a, -1)#a batch size by time t by inputsize tensor,
        #print('weighted',weighted_input.shape)
        #print(torch.sum(weighted_input, 1).shape)
        return torch.sum(weighted_input, 1)
    # sum to a batch size by input size feature, weighted sum of different t
        

In [3]:
class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        
        hidden_size = 70
        
        self.embedding = nn.Embedding(max_word, embedding_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix_1, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        #The embedding vectors here are not parameter needing update
        self.embedding_dropout = SpatialDropout(0.15)
        #Using embedding  and data augmentation
        
        self.lstm = nn.LSTM(embedding_size, hidden_size,num_layers=2, bidirectional=True, batch_first=True)
        #One layer of bidirectional LSTM, output is two times its output dim
        self.gru = nn.GRU(hidden_size * 2, hidden_size,num_layers=2, bidirectional=True, batch_first=True)
        #One layer of bidirectional Gated recurrent units, output is two times its output dim
        self.lstm_attention = Attention(hidden_size * 2, max_q)
        # Attention mechanism to encode the information from previous net
        self.gru_attention = Attention(hidden_size * 2, max_q)
        self.norm1 = nn.BatchNorm1d(max_q)
        self.norm2 = nn.BatchNorm1d(max_q)
        self.linear = nn.Linear(560, 40)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.15)
        self.out = nn.Linear(40, 1) # Because we just two labels
    
    def forward(self, x):
        h_embedding = self.embedding(x)
        h_embedding = self.embedding_dropout(h_embedding)
        #Data augmentation
        #print('after embedding',h_embedding.shape)
        h_lstm, _ = self.lstm(h_embedding)
        #print('lstm',h_lstm.shape)
        lstm_n = self.norm1(h_lstm)
        h_gru, _ = self.gru(h_lstm)
        gru_n = self.norm2(h_gru)
        #print('gru',h_gru.shape)
        h_lstm_att = self.lstm_attention(lstm_n)
        #print('attlstm',h_lstm_att.shape)
        h_gru_att = self.gru_attention(gru_n)#Encoder-decoder
        #print('attgru',h_gru_att.shape)
        # global average pooling
        avg_pool = torch.mean(h_gru, 1)
        #print('avgpool',avg_pool.shape)
        # global max pooling
        max_pool, _ = torch.max(h_gru, 1)
        #print('maxpool',max_pool.shape) 
        conc = torch.cat((h_lstm_att#The weighted output of lstm in terms of t, (batch, hiddensize*2)
                          , h_gru_att#The weighted output of gru in terms of t (batch, hiddensize*2)
                          , avg_pool#The average_pooling and maxpooling of gru in terms of t, (batch, hiddensize*2)
                          , max_pool), 1)
        # The first two terms detect the importance of t, the last two detect all time information
        #print('cat',conc.shape)
        conc = self.relu(self.linear(conc)) # Because our label is not time series,
                                  #it makes sense to extract suitable local or global infromation
        conc = self.dropout(conc)
        out = self.out(conc)
        out = out
        return out


NameError: name 'nn' is not defined

In [44]:
splits1=list(StratifiedKFold(n_splits=10, shuffle=True,
                            random_state=400).split(train_XX,train_yy))
testidx = splits1[1][1]
train_idx1 = splits1[1][0]
train_X1 = train_XX[train_idx1,:]
train_y1 = train_yy[train_idx1]
splits2=list(StratifiedKFold(n_splits=9, shuffle=True,
                            random_state=300).split(train_X1,train_y1))

In [None]:
seed(100)

traidx = splits2[0][0]
validx = splits2[0][1]
    #Train test idx
trainfoldx = torch.tensor(train_X1[traidx], dtype=torch.long).to(device)
trainfoldy = torch.tensor(train_y1[traidx], dtype=torch.float32).to(device)
x_val_fold = torch.tensor(train_X1[validx], dtype=torch.long).to(device)
y_val_fold = torch.tensor(train_y1[validx, np.newaxis], dtype=torch.float32).to(device)
    #Our model
mbatch_cost = []
model = NeuralNet()
model.to(device)
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.0001)
    #Dataloader
train = torch.utils.data.TensorDataset(trainfoldx,trainfoldy)
valid = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)
train_loader = torch.utils.data.DataLoader(dataset=train , batch_size=batch_size,shuffle=True)
valid_loader = torch.utils.data.DataLoader(dataset=valid, batch_size=batch_size, shuffle=False)
    

for epoch in range(n_epochs):
    start = time.time()
    model.train()
    grand_loss = 0
    for  batch_idx, (x_batch, y_batch) in enumerate(train_loader):
        adjust_learning_rate(optimizer,epoch,0.0001,0.5)
        y_pred = model(x_batch)
        loss = loss_fn(torch.squeeze(y_pred), y_batch)
            
            #backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        mbatch_cost.append(loss.item())
        grand_loss += loss.item()/len(train_loader)
        if not batch_idx % 50:
            print ('Epoch: %03d/%03d | Batch %03d/%03d | Loss: %.4f' 
                   %(epoch+1, n_epochs, batch_idx, len(train_loader), loss.item()))
           
                     #evaluation          
    model.eval()
        
    valid_pred = np.array(list())
       
        
    val_loss = 0
        
    for i, (x_batch, y_batch) in enumerate(valid_loader):
        y_pred = model(x_batch).detach()
        val_loss += loss_fn(y_pred,y_batch).item()/len(valid_loader)
        valid_pred = np.append(valid_pred, np.array(torch.sigmoid(y_pred.cpu()))[:, 0])
    epoch_time=time.time()-start
    valid_acc = ((valid_pred>0.5).astype(int)==np.array(y_val_fold.squeeze().cpu())).sum()/len(y_val_fold)
    print('Epoch {}/{} \t train_loss={:.4f} \t val_loss={:.4f}\t val_accu={:.4f} \t time={:.2f}s'.format(epoch + 1, n_epochs, grand_loss, val_loss,valid_acc, epoch_time))

    
    
 


Epoch: 001/020 | Batch 000/429 | Loss: 0.7049
Epoch: 001/020 | Batch 050/429 | Loss: 0.5957
Epoch: 001/020 | Batch 100/429 | Loss: 0.3827
Epoch: 001/020 | Batch 150/429 | Loss: 0.3534
Epoch: 001/020 | Batch 200/429 | Loss: 0.3267
Epoch: 001/020 | Batch 250/429 | Loss: 0.3160
Epoch: 001/020 | Batch 300/429 | Loss: 0.2967
Epoch: 001/020 | Batch 350/429 | Loss: 0.3177
Epoch: 001/020 | Batch 400/429 | Loss: 0.3193
Epoch 1/20 	 train_loss=0.3812 	 val_loss=0.2925	 val_accu=0.8838 	 time=122.03s
Epoch: 002/020 | Batch 000/429 | Loss: 0.3113


# Test

In [None]:
testfoldx = torch.tensor(train_X[testidx], dtype=torch.long).to(device)
testfoldy = torch.tensor(train_y[testidx], dtype=torch.float32).to(device)
test = torch.utils.data.TensorDataset(testfoldx,testfoldy)
test_loader = torch.utils.data.DataLoader(dataset=test , batch_size=batch_size,shuffle=True)

In [None]:

test_pred = np.array(list())
test_loss = 0
for i, (x_batch, y_batch) in enumerate(valid_loader):
    y_pred = model(x_batch).detach()
    test_loss += loss_fn(y_pred,y_batch).item()/len(valid_loader)
    test_pred = np.append(test_pred, np.array(torch.sigmoid(y_pred.cpu()))[:, 0])
test_acc = ((test_pred>0.5).astype(int)==np.array(testfoldy.squeeze().cpu())).sum()/len(testfoldy)


In [None]:
test_loss

In [None]:
test_acc

In [None]:
model = NeuralNet()
model.to(device)

In [None]:
X1=train_X[1:2,]

In [None]:
model(torch.tensor(X1).long().to(device))

In [None]:
plt.plot(mbatch_cost)

In [None]:
plt.plot(mbatch_cost)