In [94]:
import torch.autograd as autograd
import torch
from torch.autograd import Variable
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
import codecs
import random
from sklearn import metrics

from gensim.models import Word2Vec
from data_process import DataHandle, get_task_data, get_word2idx
from training_lib import get_model_inputs, trainer,generate_testing_result
# !pip install tqdm
USE_GPU = True

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cpu


In [95]:
print('Libraries imported!')
def myaccuracy(output, target):
    print('get acc')
    average = 'macro'
    length = len(target)
    if output.shape[-1] != 1:
        output = torch.argmax(output, dim=1).float()

    target = target.float()
    output = output.view(target.shape)
    predict = torch.round(output)
    correct = (predict == target).float()
    acc = correct.sum() / length

    y_true = np.array(target)
    y_pred = predict.detach().numpy()

    matrix = metrics.confusion_matrix(y_true, y_pred)
    f1_score = metrics.f1_score(y_true, y_pred, average=average)
    return acc, f1_score, matrix
def word2vec_embedding(tokenized_corpus, embed_size=50, min_count=1, window=5):
    sentences = tokenized_corpus
    model = Word2Vec(sentences, min_count=min_count, window=window, size=embed_size)
    # model.build_vocab(sentences)  # prepare the model vocabulary
    # train word vectors
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    # add the first vector as pading
    embed_vectors = np.vstack([np.zeros((1, embed_size)), model.wv.vectors])
    vocabulary = ['<pad>'] + model.wv.index2word
    return embed_vectors, vocabulary
# we fix the seeds to get consistent results
SEED = 234
torch.manual_seed(SEED)
np.random.seed(SEED)
class LSTM(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_classes):
        super(LSTM,self).__init__()
        # embedding (lookup layer) layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim,hidden_dim,1,bidirectional=True)
        self.bn2 = nn.BatchNorm1d(hidden_dim*2*2)
        # output layer
        self.hidden_dim=hidden_dim
        self.out = nn.Linear(hidden_dim*2*2, num_classes)
#         self.hidden = self.init_hidden()

    def forward(self, x):
        embedded = self.embedding(x)
#         print(embedded)
        states, hidden = self.lstm(embedded.permute([1, 0, 2]))
        encoding = torch.cat([states[0],states[-1]], dim=1)
        out = F.sigmoid(self.out(self.bn2(encoding)))

        
        return out
    
class GRU(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_classes):
        super(GRU,self).__init__()
        # embedding (lookup layer) layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # hidden layer
#         self.gru = nn.GRU(embedding_dim,hidden_dim,1,bidirectional=True,dropout = 0.2)
        self.gru = nn.GRU(embedding_dim,hidden_dim,1,bidirectional=True)
        self.bn2 = nn.BatchNorm1d(hidden_dim*2*2)
        self.hidden_dim=hidden_dim
        self.out = nn.Linear(hidden_dim*2*2, num_classes)

#         self.hidden = self.init_hidden()

    def forward(self, x):
        embedded = (self.embedding(x))
        states, hidden = self.gru(embedded.permute([1, 0, 2]))#(sentenceL,BatchL,WordEemL)
        encoding = F.leaky_relu(torch.cat([states[0],states[-1]], dim=1),0.1)
        out = self.out(self.bn2(encoding))
        out=F.sigmoid(out)     
        return out
    
def accuracy(output, target):
    predict = torch.round(torch.sigmoid(output))
#     print(predict)
    correct = (predict == target).float()
    acc = correct.sum() / len(correct)
    return acc

def embed_GRU_model(embedding, Vocabulary=100, EMBEDDING_DIM=50, HIDDEN_DIM=50, OUTPUT_DIM=1, lr=0.001,task='a'):
    # embedding: np.array
    # we define our embedding dimension (dimensionality of the output of the first layer)
    # Hidden_dim: dimensionality of the output of the second hidden layer
    # OUTPUT_dim: the outut dimension is the number of classes, 1 for binary classification
    assert embedding.shape[1] == EMBEDDING_DIM
    model = GRU(vocab_size=Vocabulary,hidden_dim=HIDDEN_DIM, embedding_dim=EMBEDDING_DIM, num_classes=OUTPUT_DIM)
    model.embedding.weight.data.copy_(torch.from_numpy(embedding))
    model.embedding.weight.require_grad = False
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
    # we use the Binary cross-entropy loss with sigmoid (applied to logits)
    # Recall we did not apply any activation to our output layer, we need to make our outputs look like probality.
    if task == 'c':
        w = torch.Tensor([1.0,2.0,8.0])
        loss_fn = nn.NLLLoss(weight=w)
    if task =='b':
#         w = torch.Tensor([9.0,1.0])
        loss_fn = nn.BCELoss(weight=None)
    if task =='a':
        loss_fn = nn.BCELoss()
    return model, optimizer, loss_fn

def embed_LSTM_model(embedding, Vocabulary=100, EMBEDDING_DIM=50, HIDDEN_DIM=50, OUTPUT_DIM=1, lr=0.001,task='a'):
    # embedding: np.array
    # we define our embedding dimension (dimensionality of the output of the first layer)
    # Hidden_dim: dimensionality of the output of the second hidden layer
    # OUTPUT_dim: the outut dimension is the number of classes, 1 for binary classification
    assert embedding.shape[1] == EMBEDDING_DIM
    model = LSTM(vocab_size=Vocabulary,hidden_dim=HIDDEN_DIM, embedding_dim=EMBEDDING_DIM, num_classes=OUTPUT_DIM)
    model.embedding.weight.data.copy_(torch.from_numpy(embedding))
    model.embedding.weight.require_grad = False
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
    # we use the Binary cross-entropy loss with sigmoid (applied to logits)
    # Recall we did not apply any activation to our output layer, we need to make our outputs look like probality.
    if task == 'c':
        w = torch.Tensor([1.0,2.0,8.0])
        loss_fn = nn.NLLLoss(weight=w)
    if task =='b':
#         w = torch.Tensor([9.0,1.0])
        loss_fn = nn.BCELoss(weight=None)
    if task =='a':
        loss_fn = nn.BCELoss()
    return model, optimizer, loss_fn




Libraries imported!


In [96]:
if __name__ == '__main__':
    _, tokenized_corpus = get_word2idx()
    task = 'a'
    output_dim = 3 if task == 'c' else 1
    emsize = 10
    print('Begin to get word2vec ')
    embedding, vocabulary = word2vec_embedding(tokenized_corpus, embed_size=emsize)
    word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
    train_sent_tensor, train_label_tensor = get_model_inputs(train=True, task=task, word2idx=word2idx)
    print('size of training set: ', train_sent_tensor.shape)
    print(embedding.shape)

Getting word2idx with train and test set------
Begin to get word2vec 




---------------Prepare data for task a---------------
---------You are requiring train data!---------
------------Begin to get corpus-----------
------------Begin to tokenize corpus--------------
------------Begin to get vocabulary--------------
size of training set:  torch.Size([13240, 105])
(15291, 10)


In [97]:
print("Training on GRU network with embedding on task",task)
model, optimizer, loss_fn = embed_GRU_model(embedding, Vocabulary=len(word2idx), 
                                            lr=0.018,EMBEDDING_DIM=emsize, HIDDEN_DIM=8,task=task)
print(model)
trained_model=trainer(model, optimizer, loss_fn, train_sent_tensor, train_label_tensor,epoch_num=1, 
        batch_size=32,valid_size=0.1)
generate_testing_result(model=trained_model, word2idx=word2idx, task=task)

Training on GRU network with embedding on task a
GRU(
  (embedding): Embedding(15291, 10)
  (gru): GRU(10, 8, bidirectional=True)
  (bn2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (out): Linear(in_features=32, out_features=1, bias=True)
)
Epoch:   1 | Train accuracy: 82.94% | Valid acc: 78.55%
Epoch:   1 | Train f1_score: 0.80 | Valid f1_score: 0.74
[[7289  658]
 [1375 2594]]
Valid confusion matrix: 
[[796  97]
 [187 244]]
---------------Prepare data for task a---------------
---------You are requiring test data!---------
------------Begin to get corpus-----------
------------Begin to tokenize corpus--------------
------------Begin to get vocabulary--------------
Test result for task a generated!


In [98]:
# print(train_label_tensor.shape)
print("Training on LSTM network with embedding on task",task)
model, optimizer, loss_fn = embed_LSTM_model(embedding, Vocabulary=len(word2idx), 
                                            lr=0.021,EMBEDDING_DIM=emsize, HIDDEN_DIM=5,task=task,OUTPUT_DIM=1)
print(model)
trained_model=trainer(model, optimizer, loss_fn, train_sent_tensor, train_label_tensor,epoch_num=1, 
        batch_size=64,valid_size=0.1)
generate_testing_result(model=trained_model, word2idx=word2idx, task=task)

Training on LSTM network with embedding on task a
LSTM(
  (embedding): Embedding(15291, 10)
  (lstm): LSTM(10, 5, bidirectional=True)
  (bn2): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (out): Linear(in_features=20, out_features=1, bias=True)
)
Epoch:   1 | Train accuracy: 80.15% | Valid acc: 76.96%
Epoch:   1 | Train f1_score: 0.74 | Valid f1_score: 0.67
[[7716  222]
 [2143 1835]]
Valid confusion matrix: 
[[868  34]
 [271 151]]
---------------Prepare data for task a---------------
---------You are requiring test data!---------
------------Begin to get corpus-----------
------------Begin to tokenize corpus--------------
------------Begin to get vocabulary--------------
Test result for task a generated!


In [114]:
if __name__ == '__main__':
    _, tokenized_corpus = get_word2idx()
    task = 'b'
    output_dim = 3 if task == 'c' else 1
    emsize = 10
    print('Begin to get word2vec ')
    embedding, vocabulary = word2vec_embedding(tokenized_corpus, embed_size=emsize)
    word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
    train_sent_tensor, train_label_tensor = get_model_inputs(train=True, task=task, word2idx=word2idx)
    print('size of training set: ', train_sent_tensor.shape)
    print(embedding.shape)

Getting word2idx with train and test set------
Begin to get word2vec 




---------------Prepare data for task b---------------
---------You are requiring train data!---------
------------Begin to get corpus-----------
------------Begin to tokenize corpus--------------
------------Begin to get vocabulary--------------
size of training set:  torch.Size([4400, 103])
(15291, 10)


In [115]:
testx = train_sent_tensor[:1000,:]
testy = train_label_tensor[:1000]
train_sent_tensor=train_sent_tensor[1001:,:]
train_label_tensor=train_label_tensor[1001:]
balance = train_sent_tensor[train_label_tensor==0,:]
d = balance.shape[0]
train_sent_tensor=torch.cat((train_sent_tensor,balance), 0) 
train_sent_tensor=torch.cat((train_sent_tensor,balance), 0) 
train_label_tensor=torch.cat((train_label_tensor,torch.zeros(2*d)),0)
print(train_label_tensor.shape)
print(train_sent_tensor.shape)

torch.Size([4235])
torch.Size([4235, 103])


In [116]:
print("Training on GRU network with embedding on task",task)
model, optimizer, loss_fn = embed_GRU_model(embedding, Vocabulary=len(word2idx), lr=0.002,
                                            EMBEDDING_DIM=emsize, HIDDEN_DIM=5,task=task)
print(model)
trained_model=trainer(model, optimizer, loss_fn, train_sent_tensor, train_label_tensor,
        epoch_num=3, batch_size=64,valid_size=0.01)
pre=trained_model(testx)
acc, f1_score, matrix = myaccuracy(pre,testy)
print('test acc',acc,'test f1', f1_score)
print(matrix)
generate_testing_result(model=trained_model, word2idx=word2idx, task=task)

Training on GRU network with embedding on task b
GRU(
  (embedding): Embedding(15291, 10)
  (gru): GRU(10, 5, bidirectional=True)
  (bn2): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (out): Linear(in_features=20, out_features=1, bias=True)
)
Epoch:   1 | Train accuracy: 70.73% | Valid acc: 48.84%
Epoch:   1 | Train f1_score: 0.43 | Valid f1_score: 0.33
[[  27 1206]
 [  21 2938]]
Valid confusion matrix: 
[[ 0 21]
 [ 1 21]]
Epoch:   2 | Train accuracy: 70.99% | Valid acc: 48.84%
Epoch:   2 | Train f1_score: 0.44 | Valid f1_score: 0.33
[[  33 1200]
 [  16 2943]]
Valid confusion matrix: 
[[ 0 21]
 [ 1 21]]
Epoch:   3 | Train accuracy: 76.79% | Valid acc: 53.49%
Epoch:   3 | Train f1_score: 0.64 | Valid f1_score: 0.42
[[ 355  878]
 [  95 2864]]
Valid confusion matrix: 
[[ 2 19]
 [ 1 21]]
get acc
test acc tensor(0.8400) test f1 0.5555555555555556
[[ 20  86]
 [ 74 820]]
---------------Prepare data for task b---------------
---------You are requiring test 

In [57]:
# print(train_label_tensor.shape)
print("Training on LSTM network with embedding on task",task)
model, optimizer, loss_fn = embed_LSTM_model(embedding, Vocabulary=len(word2idx), 
                                            lr=0.002,EMBEDDING_DIM=emsize, HIDDEN_DIM=5,task=task,OUTPUT_DIM=1)
trained_model=trainer(model, optimizer, loss_fn, train_sent_tensor, train_label_tensor,
        epoch_num=3, batch_size=64,valid_size=0.1)
pre=trained_model(testx)
acc, f1_score, matrix = myaccuracy(pre,testy)
print('test acc',acc,'test f1', f1_score)
print(matrix)
generate_testing_result(model=trained_model, word2idx=word2idx, task=task)

Training on LSTM network with embedding on task b
Epoch:   1 | Train accuracy: 70.22% | Valid acc: 70.52%
Epoch:   1 | Train f1_score: 0.44 | Valid f1_score: 0.42
[[  38 1095]
 [  40 2638]]
Valid confusion matrix: 
[[  1 120]
 [  5 298]]
Epoch:   2 | Train accuracy: 71.06% | Valid acc: 72.17%
Epoch:   2 | Train f1_score: 0.45 | Valid f1_score: 0.45
[[  47 1086]
 [  17 2661]]
Valid confusion matrix: 
[[  4 117]
 [  1 302]]
Epoch:   3 | Train accuracy: 74.15% | Valid acc: 74.76%
Epoch:   3 | Train f1_score: 0.57 | Valid f1_score: 0.57
[[ 214  919]
 [  66 2612]]
Valid confusion matrix: 
[[ 23  98]
 [  9 294]]
get acc
test acc tensor(0.8540) test f1 0.530697524911604
[[ 12  94]
 [ 52 842]]
---------------Prepare data for task b---------------
---------You are requiring test data!---------
------------Begin to get corpus-----------
------------Begin to tokenize corpus--------------
------------Begin to get vocabulary--------------
Test result for task b generated!


In [102]:
if __name__ == '__main__':
    _, tokenized_corpus = get_word2idx()
    task = 'c'
    output_dim = 3 if task == 'c' else 1
    emsize = 10
    print('Begin to get word2vec ')
    embedding, vocabulary = word2vec_embedding(tokenized_corpus, embed_size=emsize)
    word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
    train_sent_tensor, train_label_tensor = get_model_inputs(train=True, task=task, word2idx=word2idx)
    print('size of training set: ', train_sent_tensor.shape)
    print(embedding.shape)

Getting word2idx with train and test set------
Begin to get word2vec 




---------------Prepare data for task c---------------
---------You are requiring train data!---------
------------Begin to get corpus-----------
------------Begin to tokenize corpus--------------
------------Begin to get vocabulary--------------
size of training set:  torch.Size([3876, 103])
(15291, 10)


In [103]:
print(train_label_tensor[train_label_tensor==0].shape)
print(train_label_tensor[train_label_tensor==1].shape)
print(train_label_tensor[train_label_tensor==2].shape)

torch.Size([2407])
torch.Size([1074])
torch.Size([395])


In [104]:

# print(train_label_tensor.shape)
print("Training on GRU network with embedding on task",task)
model, optimizer, loss_fn = embed_GRU_model(embedding, Vocabulary=len(word2idx), 
                                            lr=0.002,EMBEDDING_DIM=emsize, HIDDEN_DIM=20,task=task,OUTPUT_DIM=3)
print(model)
trained_model=trained_model=trainer(model, optimizer, loss_fn, train_sent_tensor, train_label_tensor,
                      epoch_num=10, batch_size=64,valid_size=0.1)
generate_testing_result(model=trained_model, word2idx=word2idx, task=task)

Training on GRU network with embedding on task c
GRU(
  (embedding): Embedding(15291, 10)
  (gru): GRU(10, 20, bidirectional=True)
  (bn2): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (out): Linear(in_features=80, out_features=3, bias=True)
)
Epoch:   1 | Train accuracy: 52.32% | Valid acc: 53.09%
Train f1score: 
0.43321123242582704
Valid f1score: 
0.42838827401460494
Train confusion matrix: 
[[1288  336  538]
 [ 303  429  230]
 [ 117  139  108]]
Valid confusion matrix: 
[[149  38  58]
 [ 35  47  30]
 [  9  12  10]]
Epoch:   2 | Train accuracy: 62.47% | Valid acc: 64.18%
Train f1score: 
0.49494980674900885
Valid f1score: 
0.4951412456008044
Train confusion matrix: 
[[1446  519  197]
 [ 198  671   93]
 [ 121  181   62]]
Valid confusion matrix: 
[[167  51  27]
 [ 20  77  15]
 [ 10  16   5]]
Epoch:   3 | Train accuracy: 63.68% | Valid acc: 68.04%
Train f1score: 
0.4916735503642953
Valid f1score: 
0.5238274675913874
Train confusion matrix: 
[[1496  506

In [112]:
# print(train_label_tensor.shape)
print("Training on LSTM network with embedding on task",task)
model, optimizer, loss_fn = embed_LSTM_model(embedding, Vocabulary=len(word2idx), 
                                            lr=0.001,EMBEDDING_DIM=emsize, HIDDEN_DIM=20,task=task,OUTPUT_DIM=3)
print(model)
trained_model=trainer(model, optimizer, loss_fn, train_sent_tensor, 
        train_label_tensor,epoch_num=3, batch_size=64,valid_size=0.1)
generate_testing_result(model=trained_model, word2idx=word2idx, task=task)

Training on LSTM network with embedding on task c
LSTM(
  (embedding): Embedding(15291, 10)
  (lstm): LSTM(10, 20, bidirectional=True)
  (bn2): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (out): Linear(in_features=80, out_features=3, bias=True)
)
Epoch:   1 | Train accuracy: 51.86% | Valid acc: 53.09%
Train f1score: 
0.4148286202719649
Valid f1score: 
0.41905453569780643
Train confusion matrix: 
[[1164  818  174]
 [ 271  590  112]
 [ 118  186   55]]
Valid confusion matrix: 
[[130  97  24]
 [ 23  71   7]
 [ 13  18   5]]
Epoch:   2 | Train accuracy: 61.87% | Valid acc: 65.98%
Train f1score: 
0.46836744343289527
Valid f1score: 
0.49894710519735197
Train confusion matrix: 
[[1486  534  136]
 [ 256  634   83]
 [ 133  188   38]]
Valid confusion matrix: 
[[174  58  19]
 [ 20  78   3]
 [ 11  21   4]]
Epoch:   3 | Train accuracy: 62.33% | Valid acc: 66.24%
Train f1score: 
0.48136309252865445
Valid f1score: 
0.5285999525392269
Train confusion matrix: 
[[1401