In [None]:
from tqdm import tqdm
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import scipy.sparse as sp
import time
import random
from concurrent.futures import ProcessPoolExecutor as prpExecutor
import sys
import copy

In [None]:
# ============================
# Data File Path
# ============================
# NOTE(review): these are absolute, machine-specific paths; point them at your
# own copy of the data before running the notebook.
TRAIN_DATA_FILE_PATH = "/hdd1/Spoiler_Detection/TVTropes/train_parsed.pickle"
TEST_DATA_FILE_PATH = "/hdd1/Spoiler_Detection/TVTropes/test_parsed.pickle"
PRE_TRAINED_WORD_EMBEDDING_FILE_PATH = "/hdd1/Spoiler_Detection/TVTropes/word_embedding.npy"

# ============================
# Model Hyper Parameter
# ============================
EMBEDDING_DIM = 300          # passed to Model as embedding_dim (LSTM input size)
HIDDEN_STATES = [100, 100]   # passed to Model as hidden_dim (sizes of the two GCN layers)
NUM_HEADS = 3                # not referenced by any other cell visible in this notebook
LEAKY_ALPHA = 0.2            # negative slope of the LeakyReLU applied to attention scores

# ============================
# Training Hyper Parameter
# ============================
EPOCHS = 300
LEARNING_RATE = 0.001
BATCH_SIZE = 32
WEIGHT_DECAY = 1e-5          # Adam weight_decay
DROPOUT_RATE = 0.2
RANDOM_SEED = 26

# ============================
# Set Random Seed
# ============================
# Seed every RNG the notebook draws from. The stdlib `random` module has to be
# seeded as well: make_batch shuffles the training indices with
# random.shuffle, so without this the batch order differs on every run.
random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [None]:
# ============================
# Data Pre-Processing
# ============================
def load_data(train_file_path, test_file_path):
    """Read the pickled train and test splits.

    Each pickle is expected to hold an iterable of strings; every string is
    returned with leading and trailing whitespace stripped.

    NOTE: ``pickle.load`` can execute arbitrary code, so only point this at
    files you trust.

    Args:
        train_file_path: path of the pickled training lines.
        test_file_path: path of the pickled test lines.

    Returns:
        A ``(train, test)`` pair of lists of stripped strings.
    """
    def _read_stripped_lines(path):
        # Unpickle one split and normalise the whitespace of every line.
        with open(path, "rb") as handle:
            raw_lines = pickle.load(handle)
            return [raw_line.strip() for raw_line in raw_lines]

    train_lines = _read_stripped_lines(train_file_path)
    test_lines = _read_stripped_lines(test_file_path)
    return train_lines, test_lines

# ============================
# Data Pre Processing
# ============================
# Read both splits from disk and print the elapsed time in whole seconds.
print("Load Data...")
start = time.time()
train, test = load_data(
    train_file_path=TRAIN_DATA_FILE_PATH,
    test_file_path=TEST_DATA_FILE_PATH,
)
print(int(time.time() - start))

In [None]:
# Peek at the first raw training line. The parsing cells below treat each line
# as tab-separated: field 0 is the sentence, field -2 the integer label,
# field -1 the genres, and every field in between a "start:end:edge" triple.
train[0]

In [None]:
def make_dictionary(train, test):
    """Build the word and edge-label vocabularies from both splits.

    Every line is tab-separated: field 0 is the whitespace-tokenised sentence
    and fields ``1 .. -3`` are ``start:end:label`` edge triples (a label may
    itself contain ``:``).

    Side effect: the module-level ``maximum_length`` is set to the largest
    sentence length (in tokens) found in ``train + test``.

    Args:
        train: list of raw training lines.
        test: list of raw test lines.

    Returns:
        ``(word2id, id2word, edge2id, id2edge, maximum_length)``.
        ``word2id`` / ``id2word`` reserve id 0 for ``"<PAD>"``.
        ``edge2id`` reserves 0 for ``"<NONE>"`` and 1 for ``"<SELF>"``; for
        every other label ``x`` it also holds a reversed label ``x''`` whose
        id is ``edge2id[x] + (number of real labels)``. ``id2edge`` lists only
        the forward labels.
    """
    global maximum_length

    corpus = list(train) + list(test)

    # Longest sentence over both splits, measured in whitespace tokens.
    maximum_length = max(len(entry.split("\t")[0].split()) for entry in corpus)

    word2id, id2word = {"<PAD>": 0}, ["<PAD>"]
    edge2id, id2edge = {"<NONE>": 0, "<SELF>": 1}, ["<NONE>", "<SELF>"]

    for entry in tqdm(corpus):
        fields = entry.split("\t")

        for word in fields[0].split():
            if word in word2id:
                continue
            word2id[word] = len(word2id)
            id2word.append(word)

        for triple in fields[1:-2]:
            parts = triple.split(":")
            # The first two parts are the endpoints; whatever follows the
            # second colon (colons included) is the edge label.
            start, end = parts[0], parts[1]
            edge = ":".join(parts[2:])
            if edge in edge2id:
                continue
            edge2id[edge] = len(edge2id)
            id2edge.append(edge)

    # Register a reversed twin for every real edge label, placed directly
    # after the block of forward ids.
    num_edges = len(edge2id)
    for key in id2edge:
        if key in ("<NONE>", "<SELF>"):
            continue
        edge2id[key + "''"] = edge2id[key] + num_edges - 2

    return word2id, id2word, edge2id, id2edge, maximum_length

# Build the vocabularies over both splits and print the elapsed whole seconds.
print("Make Dictionary...")
start = time.time()
(word2id, id2word, edge2id, id2edge, maximum_length) = make_dictionary(train, test)
print(int(time.time() - start))

In [None]:
def make_input_data_as_index(_data, word2id, edge2id):
    """Replace the edge labels of every raw line with their integer ids.

    Args:
        _data: raw tab-separated lines (sentence, edge triples, label, genres).
        word2id: accepted for symmetry with the other helpers; not used here.
        edge2id: mapping from edge label to integer id.

    Returns:
        One ``[sentence, label, edges]`` list per input line, where ``label``
        is an ``int`` and ``edges`` is a space-separated string of
        ``start:end:edge_id`` triples (empty when the line has no edges).
    """
    def _encode_edge(raw_edge):
        # "start:end:label" -> "start:end:<id>"; the label may contain ':'.
        parts = raw_edge.split(":")
        start, end = parts[0], parts[1]
        edge = ":".join(parts[2:])
        return ":".join([start, end, str(edge2id[edge])])

    indexed = []
    for line in _data:
        fields = line.split("\t")
        sentence = fields[0]
        label = int(fields[-2])
        encoded_edges = " ".join(_encode_edge(raw_edge) for raw_edge in fields[1:-2])
        indexed.append([sentence, label, encoded_edges])

    return indexed

# Convert both splits in place to [sentence, label, indexed-edges] records.
# NOTE: `train` / `test` are rebound here, so this cell must run exactly once
# after the loading cell.
print("Make Input as Index...")
start = time.time()
train, test = (
    make_input_data_as_index(train, word2id, edge2id),
    make_input_data_as_index(test, word2id, edge2id),
)
print(int(time.time() - start))

In [None]:
def make_input_adjacency_matrix(line):
    """Turn one indexed record into ``[sentence, adjacency_matrix, label]``.

    Args:
        line: ``[sentence, label, edges]`` where ``edges`` is a
            space-separated string of ``start:end:edge_id`` triples.

    Returns:
        ``[sentence, adjacency_matrix, label]``: the sentence unchanged, the
        result of ``matrix_to_torch_sparse_tensor`` for the parsed triples
        (sized by the module-level ``maximum_length``), and the label cast to
        ``float``.
    """
    sentence = line[0]
    label = float(line[1])
    # One row of [start, end, edge_id] strings per triple.
    edge_triples = np.asarray([triple.split(":") for triple in line[2].split()])
    adjacency_matrix = matrix_to_torch_sparse_tensor(edge_triples, maximum_length)

    return [sentence, adjacency_matrix, label]

def matrix_to_torch_sparse_tensor(edges, maximum_length):
    """Build a sparse ``maximum_length x maximum_length`` edge-id matrix.

    Args:
        edges: array-like of ``[start, end, edge_id]`` rows (strings or
            numbers convertible to int64). An empty input is accepted and
            yields an all-zero matrix.
        maximum_length: side length of the square matrix.

    Returns:
        A sparse COO tensor with ``edge_id`` stored at ``[start, end]``. Its
        dtype is int64 (taken from the values) because the dense form is later
        used as indices into the edge embeddings.
    """
    edges = np.asarray(edges)
    if edges.size == 0:
        # A sentence without edges arrives as a 1-D empty array; give it the
        # (0, 3) shape the column slicing below expects.
        edges = np.empty((0, 3), dtype=np.int64)

    indices = torch.from_numpy(
        np.vstack((edges[:, 0], edges[:, 1])).astype(np.int64))
    values = torch.from_numpy(edges[:, 2].astype(np.int64))
    shape = torch.Size((maximum_length, maximum_length))

    # torch.sparse_coo_tensor replaces the deprecated legacy constructor
    # torch.sparse.FloatTensor and infers the (integer) dtype from `values`.
    return torch.sparse_coo_tensor(indices, values, shape)

# Build one sparse adjacency matrix per example, serially in the notebook
# process. (A ProcessPoolExecutor used to be created here but was never given
# any work, so it has been removed.)
# NOTE: `train` / `test` are rebound here, so this cell must run exactly once
# after the indexing cell.
print("Make Adjacency Matrix...")
start = time.time()
train = [make_input_adjacency_matrix(line) for line in tqdm(train)]
test = [make_input_adjacency_matrix(line) for line in tqdm(test)]
print(int(time.time() - start))

In [None]:
def load_pre_trained_word_embedding(word_embedding_file_path):
    """Load a ``.npy`` embedding matrix as a float32 torch tensor.

    Args:
        word_embedding_file_path: path of the NumPy file to read.

    Returns:
        A ``torch.Tensor`` of dtype float32 with the same shape as the stored
        array.
    """
    embedding_matrix = np.load(word_embedding_file_path)
    embedding_matrix = embedding_matrix.astype(np.float32)
    return torch.from_numpy(embedding_matrix)

# Read the pre-trained embedding matrix that initialises the model's word
# embedding layer.
print("Load Pre-trained Word Embedding...")
word_embedding = load_pre_trained_word_embedding(
    word_embedding_file_path=PRE_TRAINED_WORD_EMBEDDING_FILE_PATH)

In [None]:
def make_batch(data, batch_size, word2id, maximum_length, is_train=True):
    """Yield mini-batches of padded word ids, adjacency matrices and labels.

    Args:
        data: list of ``[sentence, adjacency_matrix, label]`` records.
        batch_size: number of records per batch (the last batch may be
            smaller).
        word2id: mapping from word to integer id.
        maximum_length: every id sequence is right-padded with 0 up to this
            length.
        is_train: when true the record order is shuffled with
            ``random.shuffle`` before batching.

    Yields:
        ``(sentences, adjacency_matrics, labels)`` tuples of equally long
        lists.
    """
    order = np.arange(len(data))
    if is_train:
        random.shuffle(order)

    for left in range(0, len(data), batch_size):
        sentences, adjacency_matrics, labels = [], [], []

        for j in order[left:left + batch_size]:
            record = data[j]
            word_ids = [word2id[word] for word in record[0].split()]
            # Right-pad with the <PAD> id (0) up to the fixed length.
            padding = [0] * (maximum_length - len(word_ids))
            sentences.append(word_ids + padding)
            adjacency_matrics.append(record[1])
            labels.append(record[2])

        yield sentences, adjacency_matrics, labels

In [None]:
# ============================
# Model
# ============================
class GCNLayer(nn.Module):
    """Attention-weighted neighbour aggregation followed by a linear map.

    Args:
        input_dim: feature size of the incoming node states.
        output_dim: feature size produced per node.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()

        self.input_dim, self.output_dim = input_dim, output_dim

        # The parameter is created from a normal draw and then re-initialised
        # in place with Xavier-normal values.
        initial_weight = torch.randn(self.input_dim, self.output_dim)
        self.weight = nn.Parameter(initial_weight)
        nn.init.xavier_normal_(self.weight)

        self.bias = nn.Parameter(torch.zeros(self.output_dim))

    def forward(self, x, attention_weight):
        """Aggregate neighbour features and project them.

        As called from ``Model.forward`` in this notebook, ``x`` is
        ``B x N x N x input_dim`` and ``attention_weight`` is ``B x N x N``.

        Returns:
            ``sum over dim 2 of (x * attention_weight[..., None])`` multiplied
            by the weight matrix, plus the bias.
        """
        weighted_neighbours = x * attention_weight.unsqueeze(3)
        aggregated = weighted_neighbours.sum(2)
        return torch.matmul(aggregated, self.weight) + self.bias

class Model(nn.Module):
    """BiLSTM sentence encoder followed by two edge-aware attention GCN layers.

    The forward pass embeds the words, runs a bidirectional LSTM, applies two
    rounds of attention-weighted neighbour aggregation (attention scores come
    from the dot product between a neighbour's state and the embedding of the
    connecting edge label), max-pools over the nodes and classifies.

    Args:
        num_words: vocabulary size (stored only; the embedding table is sized
            by ``pre_trained``).
        num_edges: number of edge ids, including ``<NONE>`` (0), ``<SELF>``
            (1) and the reversed labels.
        alpha: negative slope of the LeakyReLU applied to attention scores.
        embedding_dim: word embedding size (LSTM input size).
        hidden_dim: two-element sequence; ``hidden_dim[0]`` is the LSTM hidden
            size per direction and the first GCN output size,
            ``hidden_dim[1]`` the second GCN output size.
        maximum_length: fixed number of nodes (padded sentence length).
        pre_trained: float tensor used to initialise the word embeddings
            (fine-tuned, not frozen).
        dropout_rate: dropout probability.
        num_classes: number of output logits. Defaults to 2 because the
            notebook trains with ``CrossEntropyLoss`` on 0/1 labels and
            predicts with ``argmax`` over dim 1; a single logit supports
            neither.
    """

    def __init__(self, num_words, num_edges, alpha, embedding_dim, hidden_dim,
                 maximum_length, pre_trained, dropout_rate, num_classes=2):
        super(Model, self).__init__()

        self.num_words = num_words
        self.num_edges = num_edges
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.maximum_length = maximum_length
        self.dropout_rate = dropout_rate
        self.alpha = alpha
        self.num_classes = num_classes

        # =============================================
        # Data Preparation
        # =============================================
        self.word_embedding \
        = nn.Embedding.from_pretrained(pre_trained, padding_idx=0, freeze=False)
        # One edge-label embedding table per GCN layer, sized to match the
        # node states it is compared against.
        self.edge_embedding_1 \
        = nn.Embedding(self.num_edges, 2*self.hidden_dim[0], padding_idx=0)
        self.edge_embedding_2 \
        = nn.Embedding(self.num_edges, self.hidden_dim[0], padding_idx=0)

        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim[0],
                            bidirectional=True, batch_first=True)

        self.gcn_layer_1 \
        = GCNLayer(2*self.hidden_dim[0], self.hidden_dim[0])
        self.gcn_layer_2 \
        = GCNLayer(self.hidden_dim[0], self.hidden_dim[1])

        self.max_pooling = nn.MaxPool1d(self.maximum_length)
        self.output_layer = nn.Sequential(
            nn.Dropout(self.dropout_rate),
            nn.Linear(self.hidden_dim[1], self.num_classes)
        )
        self.dropout = nn.Dropout(self.dropout_rate)

        self.leakyrelu = nn.LeakyReLU(self.alpha)

    def _graph_attention_layer(self, node_states, adjacency_matrics,
                               edge_embedding, gcn_layer):
        """Run one attention + GCN round and return ReLU-activated node states."""
        # neighbours[b, i, j, :] is the state of node j, repeated for every i.
        neighbours = node_states.unsqueeze(1).expand(node_states.size(0),
                                                     self.maximum_length,
                                                     self.maximum_length,
                                                     node_states.size(2))  # B X N X N X D
        embedded_edges = edge_embedding(adjacency_matrics)  # B X N X N X D
        scores = self.leakyrelu(torch.sum(neighbours*embedded_edges, dim=3))  # B X N X N

        # Pairs without an edge get a very negative score so that softmax
        # assigns them (numerically) zero attention.
        masked = torch.where(adjacency_matrics > 0, scores,
                             torch.full_like(scores, -9e15))
        attention_weight = self.dropout(torch.softmax(masked, dim=2))

        return torch.relu(gcn_layer(neighbours, attention_weight))

    def forward(self, sentences, adjacency_matrics):
        """Compute class logits for a batch.

        Args:
            sentences: ``B x N`` long tensor of word ids
                (``N == maximum_length``).
            adjacency_matrics: ``B x N x N`` long tensor of forward edge ids
                (0 means no edge).

        Returns:
            ``B x num_classes`` float tensor of logits.
        """
        # Reversed edges: shift each forward id into the block of reversed
        # ids and transpose so the edge points the other way.
        reverse_offset = (self.num_edges - 2) // 2
        adjacency_matrics_t = torch.where(adjacency_matrics > 0,
                                          adjacency_matrics + reverse_offset,
                                          torch.zeros_like(adjacency_matrics))
        adjacency_matrics_t = adjacency_matrics_t.transpose(1, 2)
        # Self loops carry id 1 (<SELF>). The identity is created on the
        # input's device and broadcast over the batch.
        eye = torch.eye(adjacency_matrics.size(1), dtype=torch.long,
                        device=adjacency_matrics.device).unsqueeze(0)
        # NOTE(review): the three parts are summed, so a pair that has both a
        # forward and a reversed edge (or an explicit self loop) ends up with
        # the sum of the ids -- presumably the parsed graphs never contain
        # such pairs; confirm against the data.
        adjacency_matrics = adjacency_matrics \
                          + adjacency_matrics_t \
                          + eye

        embedded_words = self.word_embedding(sentences)  # B X N X D
        # nn.LSTM starts from zero hidden/cell states when none are passed.
        lstm = self.lstm(embedded_words)[0]  # B X N X 2H

        gcn_1 = self._graph_attention_layer(lstm, adjacency_matrics,
                                            self.edge_embedding_1,
                                            self.gcn_layer_1)  # B X N X H
        gcn_1 = self.dropout(gcn_1)

        gcn_2 = self._graph_attention_layer(gcn_1, adjacency_matrics,
                                            self.edge_embedding_2,
                                            self.gcn_layer_2)

        # Max over the node axis. Only the pooled axis is squeezed so that a
        # batch holding a single example keeps its batch dimension.
        sentence_representations = self.max_pooling(gcn_2.transpose(1, 2)).squeeze(2)
        output = self.output_layer(sentence_representations)

        return output

In [None]:
# =============================================
# Model Initialize
# =============================================
print("Model Initializing..")
# CrossEntropyLoss takes (batch, num_classes) logits and integer class targets.
criterion = nn.CrossEntropyLoss()

model = Model(
    num_words=len(word2id),
    num_edges=len(edge2id),
    alpha=LEAKY_ALPHA,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_STATES,
    maximum_length=maximum_length,
    pre_trained=word_embedding,
    dropout_rate=DROPOUT_RATE,
)
model = model.cuda()
optimizer = optim.Adam(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

In [None]:
# =============================================
# Model Evaluation / Training
# =============================================
# Evaluate on the test split every this many epochs.
EVAL_INTERVAL = 1


def _to_cuda_tensors(batch):
    """Convert one ``make_batch`` tuple into CUDA tensors.

    Returns ``(sentences, adjacency_matrics, labels)`` as long tensors; the
    sparse adjacency matrices are densified and stacked along a new batch axis.
    """
    sentences, adjacency_matrics, labels = batch
    input_sentences = torch.tensor(sentences, dtype=torch.long).cuda()
    input_adjacency_matrics = torch.stack(
        [matrix.to_dense() for matrix in adjacency_matrics], dim=0).cuda()
    input_labels = torch.tensor(labels, dtype=torch.long).cuda()
    return input_sentences, input_adjacency_matrics, input_labels


def _evaluate(model, data):
    """Score ``model`` on ``data``.

    Puts the model in eval mode and runs without gradient tracking.

    Returns:
        ``(accuracy, precision, recall, f1)`` where accuracy is a percentage
        and the other three are fractions for the positive class (label 1).
        Any ratio whose denominator is zero is reported as 0.0.
    """
    model.eval()

    count = 0             # predictions equal to the gold label
    correct = 0           # true positives
    positive_answer = 0   # predicted positives
    positive_actual = 0   # gold positives
    with torch.no_grad():
        for batch in make_batch(data, BATCH_SIZE, word2id, maximum_length, False):
            input_sentences, input_adjacency_matrics, input_labels = _to_cuda_tensors(batch)
            predictions = torch.argmax(model(input_sentences, input_adjacency_matrics), dim=1)

            positive_answer += predictions.sum().item()
            positive_actual += (input_labels == 1).sum().item()
            correct += (predictions*input_labels).sum().item()
            count += (predictions == input_labels).sum().item()

    accuracy = 100*float(count)/len(data) if len(data) else 0.0
    precision = float(correct)/positive_answer if positive_answer else 0.0
    recall = float(correct)/positive_actual if positive_actual else 0.0
    if (precision+recall) == 0.0:
        f1 = 0.0
    else:
        f1 = 2*precision*recall/(precision+recall)
    return accuracy, precision, recall, f1


# Baseline: score the untrained model once before training starts.
accuracy, precision, recall, f1 = _evaluate(model, test)
print("[A/P/R/F]: [{:.4f}/{:.4f}/{:.4f}/{:.4f}]".format(accuracy, precision, recall, f1))

best_model = {}
best = np.zeros(3)  # (precision, recall, f1) of the best epoch so far
print("Model Training..\n")
for i in range(EPOCHS):
    model.train()

    # Number of training batches per epoch (ceil division), for the progress line.
    batch_num = (len(train) + BATCH_SIZE - 1) // BATCH_SIZE

    loss = .0
    step = 0
    for batch in make_batch(train, BATCH_SIZE, word2id, maximum_length):
        input_sentences, input_adjacency_matrics, input_labels = _to_cuda_tensors(batch)
        optimizer.zero_grad()
        logits = model(input_sentences, input_adjacency_matrics)
        _loss = criterion(logits, input_labels)
        _loss.backward()
        optimizer.step()
        loss += _loss.item()
        step += 1

        # Write first, then flush, so the progress line shows up immediately.
        sys.stdout.write("\r" + "Epoch: [{}/{}] Batch: [{}/{}]".format(i+1, EPOCHS, step, batch_num))
        sys.stdout.flush()

    if (i+1) % EVAL_INTERVAL == 0:
        accuracy, precision, recall, f1 = _evaluate(model, test)
        print(" Loss: {} [A/P/R/F]: [{:.4f}/{:.4f}/{:.4f}/{:.4f}]".format(loss, accuracy, precision, recall, f1))
        # NOTE(review): the best checkpoint is selected by F1 on `test`; no
        # separate validation split is visible in this notebook.
        if f1 > best[2]:
            best = precision, recall, f1
            best_model = copy.deepcopy(model.state_dict())
            print(" Current Best:)")

    else:
        print(" Loss: {}".format(loss))