In [1]:
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import scipy.sparse as sp
import time
import random
from concurrent.futures import ProcessPoolExecutor as prpExecutor
import sys
import copy

In [31]:
# ============================
# Data File Path
# ============================
TRAIN_DATA_FILE_PATH = "/hdd1/Spoiler_Detection/ACL/INGGEOL/train_10000.json"
VALIDATION_DATA_FILE_PATH = "/hdd1/Spoiler_Detection/ACL/INGGEOL/validation_10000.json"
PRE_TRAINED_WORD_EMBEDDING_FILE_PATH = "../word_embedding_10000.npy"

# ============================
# Model Hyper Parameter
# ============================
EMBEDDING_DIM = 300
HIDDEN_STATES = [50, 50]

# ============================
# Training Hyper Parameter
# ============================
EPOCHS = 300
LEARNING_RATE = 0.00005
BATCH_SIZE = 256
WEIGHT_DECAY = 1e-5
DROPOUT_RATE = 0.5
RANDOM_SEED = 26

# ============================
# Set Random Seed
# ============================
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [13]:
# ============================
# Data Pre-Processing
# ============================
def load_data(train_file_path, validation_file_path):
    with open(train_file_path) as f:
        train = json.load(f)["data"]
    with open(validation_file_path) as f:
        validation = json.load(f)["data"]
        
    return train, validation
    
def make_dictionary(train, validation):
    data = []
    data += train
    data += validation
    
    maximum_length = max([len(line.split("\t")[0].split()) for line in data])

    word2id = {"<PAD>":0}
    id2word = ["<PAD>"]
    edge2id = {}
    id2edge = []
    
    for line in data:
        tokens = line.split("\t")
        for word in tokens[0].split():
            if word not in word2id:
                word2id[word] = len(word2id)
                id2word.append(word)
        for edges in tokens[2:]:
            _tokens = edges.split(":")
            if len(_tokens) != 3:
                start, end = _tokens[0], _tokens[1]
                edge = ":".join(_tokens[2:])
            else:
                start, end, edge = _tokens
            if edge not in edge2id:
                edge2id[edge] = len(edge2id)
                id2edge.append(edge)

    return word2id, id2word, edge2id, id2edge, maximum_length

def make_input_data_as_index(_train, _validation, word2id, edge2id):
    train, validation = [], []
    for line in _train:
        tokens = line.split("\t")
        tokens[0] = [word2id[word] for word in tokens[0].split()]
        _edges = []
        for edges in tokens[2:]:
            _tokens = edges.split(":")
            if len(_tokens) != 3:
                start, end = _tokens[0], _tokens[1]
                edge = ":".join(_tokens[2:])
            else:
                start, end, edge = _tokens
            _edges.append([start, end, edge2id[edge]])
        train.append([tokens[0], tokens[1], _edges])

    for line in _validation:
        tokens = line.split("\t")
        tokens[0] = [word2id[word] for word in tokens[0].split()]
        _edges = []
        for edges in tokens[2:]:
            _tokens = edges.split(":")
            if len(_tokens) != 3:
                start, end = _tokens[0], _tokens[1]
                edge = ":".join(_tokens[2:])
            else:
                start, end, edge = _tokens
            _edges.append([start, end, edge2id[edge]])
        validation.append([tokens[0], tokens[1], _edges])

    return train, validation

def make_input_adjacency_matrix(line):
    words, label, edges = line[0], float(line[1]), line[2]
    adjacency_matrix = make_adjacency_matrix(np.asarray(edges), len(words))
    
    return [words, adjacency_matrix, label]

def normalize_matrix(matrix):
    rowsum = np.asarray(matrix.sum(1))
    row_inv = np.power(np.sqrt(rowsum), -1).flatten()
    row_inv[np.isinf(row_inv)] = 0.
    row_matrix_inv = sp.diags(row_inv)
    matrix = row_matrix_inv.dot(matrix)

    return matrix

def sparse_matrix_to_torch_sparse_tensor(sparse_matrix, maximum_length):
    sparse_matrix = sparse_matrix.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_matrix.row, sparse_matrix.col)).astype(np.int64))
    values = torch.from_numpy(sparse_matrix.data)
    shape = torch.Size((maximum_length, maximum_length))

    return torch.sparse.FloatTensor(indices, values, shape)

def make_adjacency_matrix(edges, num_words):
    adjacency_matrix = sp.coo_matrix(
        (np.ones(len(edges)), (edges[:, 0].astype(np.int32), edges[:, 1].astype(np.int32))),
        shape=(num_words, num_words),
        dtype=np.float32
    )
    
    identity_matrix = sp.coo_matrix(
        (np.ones(len(edges)), (np.arange(len(edges)), np.arange(len(edges)))),
        shape=(num_words, num_words),
        dtype=np.float32
    )
    
    adjacency_matrix = adjacency_matrix + identity_matrix + adjacency_matrix.transpose()
    normalized_adjacency_matrix = normalize_matrix(adjacency_matrix)
    
    return normalized_adjacency_matrix

def load_pre_trained_word_embedding(word_embedding_file_path):
    return torch.from_numpy(np.load(word_embedding_file_path).astype(np.float32))

def make_batch(data, batch_size, is_train=True):
    indices = np.arange(len(data))
    if is_train:
        random.shuffle(indices)
    
    if len(data) % batch_size == 0:
        batch_num = int(len(data)/batch_size)
    else:
        batch_num = int(len(data)/batch_size) + 1
        
    for i in range(batch_num):
        left = i*batch_size
        right = min((i+1)*batch_size, len(data))
        
        sentences = []
        adjacency_matrics = []
        labels = []
        
        for j in indices[left:right]:
            sentences.append(data[j][0])
            adjacency_matrics.append(data[j][1])
            labels.append(data[j][2])
        
        yield sentences, adjacency_matrics, labels

In [4]:
# ============================
# Data Pre Processing
# ============================
print("Load Data...")
train, validation = load_data(TRAIN_DATA_FILE_PATH, VALIDATION_DATA_FILE_PATH)

print("Make Dictionary...")
word2id, id2word, edge2id, id2edge, maximum_length = make_dictionary(train, validation)

print("Make Input as Index...")
train, validation = make_input_data_as_index(train, validation, word2id, edge2id)

print("Make Adjacency Matrix...")
start = time.time()
pool = prpExecutor(max_workers=16)
train = list(pool.map(make_input_adjacency_matrix, train))
validation = list(pool.map(make_input_adjacency_matrix, validation))
del pool
print(int(time.time() - start))

print("Make Sparse Tensor...")
start = time.time()
for line in train+validation:
    line[0] += [0] * (maximum_length - len(line[0]))
    line[1] = sparse_matrix_to_torch_sparse_tensor(line[1], maximum_length)
print(int(time.time() - start))

print("Load Pre-trained Word Embedding...")
word_embedding = load_pre_trained_word_embedding(PRE_TRAINED_WORD_EMBEDDING_FILE_PATH)

Load Data...
Make Dictionary...
Make Input as Index...
Make Adjacency Matrix...




62
Make Sparse Tensor...
35
Load Pre-trained Word Embedding...


In [5]:
print("Load Pre-trained Word Embedding...")
word_embedding = load_pre_trained_word_embedding(PRE_TRAINED_WORD_EMBEDDING_FILE_PATH)

Load Pre-trained Word Embedding...


In [26]:
# ============================
# Model
# ============================
class GCNLayer(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(GCNLayer, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim

        self.weight = nn.Parameter(
            torch.randn(self.input_dim, self.output_dim))
        nn.init.xavier_normal_(self.weight)

        self.bias = nn.Parameter(torch.zeros(self.output_dim))

    def forward(self, x, adj_matrics):
        x = torch.matmul(adj_matrics, x)
        output = torch.matmul(x, self.weight)
        output = output + self.bias

        return output

class Model(nn.Module):
    def __init__(self, num_words, embedding_dim, hidden_dim, maximum_length, pre_trained, dropout_rate):
        super(Model, self).__init__()

        self.num_words = num_words
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.maximum_length = maximum_length
        self.dropout_rate = dropout_rate

        # =============================================
        # Data Preparation
        # =============================================		
        self.word_embedding \
        = nn.Embedding.from_pretrained(pre_trained, padding_idx = 0)
#         self.word_embedding.from_pretrained(pre_trained)
        
        self.gcn_layer_1 \
        = GCNLayer(self.embedding_dim, self.hidden_dim[0])
        self.gcn_layer_2 \
        = GCNLayer(self.hidden_dim[0], self.hidden_dim[1])
        
        self.max_pooling = nn.MaxPool1d(self.maximum_length)
        self.output_layer = nn.Linear(self.hidden_dim[1], 1)

    def forward(self, sentences, adjacency_matrics, batch_size):
        embedded_words = self.word_embedding(sentences)
        gcn_1 = self.gcn_layer_1(embedded_words, adjacency_matrics)
        gcn_1 = F.relu(gcn_1)
        gcn_1 = F.dropout(gcn_1, self.dropout_rate)
        gcn_2 = self.gcn_layer_2(gcn_1, adjacency_matrics)
        gcn_2 = F.relu(gcn_2)
        sentence_representations = self.max_pooling(gcn_2.transpose(1, 2)).squeeze()
        sentence_representations = F.dropout(sentence_representations, self.dropout_rate)
        output = self.output_layer(sentence_representations)
        
        return output

In [36]:
# =============================================
# Model Initialize
# =============================================
print("Model Initializing..")
pos_weight = 20*torch.ones([1]).cuda()
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

model = Model(len(word2id), EMBEDDING_DIM, HIDDEN_STATES, maximum_length, word_embedding, DROPOUT_RATE).cuda()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

Model Initializing..


In [None]:
# =============================================
# Model Training
# =============================================
best_model = {}
best = np.zeros(3)
print("Model Training..\n")
for i in range(EPOCHS):
    model.train()
    start = time.time()
    
    if len(train) % BATCH_SIZE == 0:
        batch_num = int(len(train)/BATCH_SIZE)
    else:
        batch_num = int(len(train)/BATCH_SIZE) + 1
    
    loss = .0
    batches = make_batch(train, BATCH_SIZE)
    step = 0

    for batch in batches:
        sentences, adjacency_matrics, labels = batch
        input_sentences = torch.tensor(sentences, dtype = torch.long).cuda()
        input_adjacency_matrics = torch.stack([matrix.to_dense() for matrix in adjacency_matrics], dim=0).cuda()
        input_labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(dim=1).cuda()
        optimizer.zero_grad()
        logits = model(input_sentences, input_adjacency_matrics, len(sentences))
        _loss = criterion(logits, input_labels).sum()
        _loss.backward()
        optimizer.step()
        loss += _loss.item()
        step+=1
        
        sys.stdout.flush()
        sys.stdout.write("\r" + "Epoch: [{}/{}] Batch: [{}/{}]".format(i+1, EPOCHS, step, batch_num))
    
    if (i+1) % 1 == 0:
        model.eval()
        
        batches = make_batch(validation, BATCH_SIZE)
        if len(validation) % BATCH_SIZE == 0:
            batch_num = int(len(validation)/BATCH_SIZE)
        else:
            batch_num = int(len(validation)/BATCH_SIZE) + 1
        
        step = 0
        count = 0
        correct = 0
        positive_answer = 0
        positive_actual = 0
        for batch in batches:
            sentences, adjacency_matrics, labels = batch
            input_sentences = torch.tensor(sentences, dtype = torch.long).cuda()
            input_adjacency_matrics = torch.stack([matrix.to_dense() for matrix in adjacency_matrics], dim=0).cuda()
            input_labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(dim=1).cuda()
            logits = model(input_sentences, input_adjacency_matrics, len(sentences))

            predicted = (logits > 0.5).float()
            positive_answer += predicted.sum().item()
            positive_actual += (input_labels == 1.0).float().sum().item()
            correct+=(predicted*input_labels).sum().item()
            count+= (predicted==input_labels).sum().item()
            step+=1

        accuracy = 100*float(count)/len(validation)
        if positive_answer == 0:
            precision = 0.0
        else:
            precision = float(correct)/positive_answer
        recall = float(correct)/positive_actual
        if (precision+recall) == 0.0:
            f1 = 0.0
        else:
            f1 = 2*precision*recall/(precision+recall)
        print(" Loss: {} [A/P/R/F]: [{:.4f}/{:.4f}/{:.4f}/{:.4f}]".format(loss, accuracy, precision, recall, f1))
        if f1 > best[2]:
            best = precision, recall, f1
            best_model = copy.deepcopy(model.state_dict())
            print(" Current Best:)")
    
    else:
        print(" Loss: {}".format(loss))

Model Training..

Epoch: [1/300] Batch: [463/463] Loss: 475.4751196503639 [A/P/R/F]: [96.5564/0.0444/0.0182/0.0258]
 Current Best:)
Epoch: [2/300] Batch: [463/463] Loss: 457.76973366737366 [A/P/R/F]: [97.2026/0.0000/0.0000/0.0000]
Epoch: [3/300] Batch: [463/463] Loss: 445.254254758358 [A/P/R/F]: [97.2178/0.0909/0.0121/0.0214]
Epoch: [4/300] Batch: [463/463] Loss: 437.50181555747986 [A/P/R/F]: [96.9745/0.0526/0.0121/0.0197]
Epoch: [5/300] Batch: [463/463] Loss: 430.921693444252 [A/P/R/F]: [96.7769/0.0392/0.0121/0.0185]
Epoch: [6/300] Batch: [463/463] Loss: 426.68244674801826 [A/P/R/F]: [96.4652/0.0588/0.0273/0.0373]
 Current Best:)
Epoch: [7/300] Batch: [463/463] Loss: 423.02322110533714 [A/P/R/F]: [96.0699/0.0807/0.0545/0.0651]
 Current Best:)
Epoch: [8/300] Batch: [463/463] Loss: 418.24900379776955 [A/P/R/F]: [95.8039/0.0858/0.0697/0.0769]
 Current Best:)
Epoch: [9/300] Batch: [463/463] Loss: 417.98925602436066 [A/P/R/F]: [95.3630/0.0808/0.0818/0.0813]
 Current Best:)
Epoch: [10/300] 

In [None]:
count = 0
for line in train:
    if line[2] == 1.0:
        count+=1
print(100*float(count)/len(train))

count = 0
for line in validation:
    if line[2] == 1.0:
        count+=1
print(100*float(count)/len(validation))

In [19]:
torch.save(best_model, "../best_model_without_fine_tune.pt")

In [16]:
model.word_embedding

Embedding(49842, 50, padding_idx=0)

In [35]:
word_embedding[1,:50], model.word_embedding.weight[1,: 50]

(tensor([5.1910e-01, 5.2540e-04, 3.5943e-01, 5.2106e-01, 9.2570e-01, 1.1660e-01,
         7.9218e-01, 2.4372e-01, 6.2472e-01, 4.5988e-01, 3.2842e-01, 5.4276e-01,
         1.8415e-01, 2.4003e-01, 2.2915e-01, 1.8152e-01, 7.1876e-01, 4.1605e-01,
         1.3763e-02, 9.5724e-02, 2.7980e-01, 6.1360e-01, 9.2846e-02, 6.9006e-01,
         2.9711e-01, 5.1656e-01, 9.7445e-01, 8.9844e-02, 1.9295e-01, 6.4581e-01,
         5.9681e-01, 1.5447e-02, 4.5644e-02, 6.5717e-01, 6.6877e-01, 5.5065e-01,
         7.0487e-01, 9.9404e-01, 3.5898e-01, 3.2252e-01, 4.0845e-02, 7.5114e-03,
         5.9773e-01, 6.4211e-02, 8.9571e-02, 5.2875e-01, 3.2387e-01, 9.6272e-01,
         4.9603e-01, 6.7986e-01]),
 tensor([5.1910e-01, 5.2540e-04, 3.5943e-01, 5.2106e-01, 9.2570e-01, 1.1660e-01,
         7.9218e-01, 2.4372e-01, 6.2472e-01, 4.5988e-01, 3.2842e-01, 5.4276e-01,
         1.8415e-01, 2.4003e-01, 2.2915e-01, 1.8152e-01, 7.1876e-01, 4.1605e-01,
         1.3763e-02, 9.5724e-02, 2.7980e-01, 6.1360e-01, 9.2846e-02, 6.900

In [29]:
word_embedding.shape, model.word_embedding

(torch.Size([49842, 300]), Embedding(49842, 300, padding_idx=0))