In [1]:
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import scipy.sparse as sp
import time
import random
from concurrent.futures import ProcessPoolExecutor as prpExecutor
import sys

In [2]:
# ============================
# Data File Path
# ============================
DATA_FILE_PATH = "/hdd1/Spoiler_Detection/ACL/INGGEOL/node_edge_info_10000_new.json"

# ============================
# Model Hyper Parameter
# ============================
EMBEDDING_DIM = 50
HIDDEN_STATES = [50, 50]

# ============================
# Training Hyper Parameter
# ============================
EPOCHS = 100
LEARNING_RATE = 0.00005
BATCH_SIZE = 256
WEIGHT_DECAY = 1e-5
DROPOUT_RATE = 0.5
RANDOM_SEED = 26

In [3]:
# ============================
# Data Pre-Processing
# ============================
def load_data(file_path):
    data = []
    with open(file_path) as f:
        _data = json.load(f)
    for review in _data:
        data += review["text_info"]
    return data

def make_dictionary(data):
    maximum_length = max([len(line.split("\t")[0].split()) for line in data])

    word2id = {"<PAD>":0}
    id2word = ["<PAD>"]
    edge2id = {}
    id2edge = []
    
    for line in data:
        tokens = line.split("\t")
        for word in tokens[0].split():
            if word not in word2id:
                word2id[word] = len(word2id)
                id2word.append(word)
        for edges in tokens[2:]:
            _tokens = edges.split(":")
            if len(_tokens) != 3:
                start, end = _tokens[0], _tokens[1]
                edge = ":".join(_tokens[2:])
            else:
                start, end, edge = _tokens
            if edge not in edge2id:
                edge2id[edge] = len(edge2id)
                id2edge.append(edge)

    return word2id, id2word, edge2id, id2edge, maximum_length

def make_input_data_as_index(_data, word2id, edge2id):
    data = []
    for line in _data:
        tokens = line.split("\t")
        tokens[0] = [word2id[word] for word in tokens[0].split()]
        _edges = []
        for edges in tokens[2:]:
            _tokens = edges.split(":")
            if len(_tokens) != 3:
                start, end = _tokens[0], _tokens[1]
                edge = ":".join(_tokens[2:])
            else:
                start, end, edge = _tokens
            _edges.append([start, end, edge2id[edge]])
        data.append([tokens[0], tokens[1], _edges])
        
    return data

def make_input_adjacency_matrix(line):
    words, label, edges = line[0], float(line[1]), line[2]
    adjacency_matrix = make_adjacency_matrix(np.asarray(edges), len(words))
    
    return [words, adjacency_matrix, label]

def normalize_matrix(matrix):
    rowsum = np.asarray(matrix.sum(1))
    row_inv = np.power(np.sqrt(rowsum), -1).flatten()
    row_inv[np.isinf(row_inv)] = 0.
    row_matrix_inv = sp.diags(row_inv)
    matrix = row_matrix_inv.dot(matrix)

    return matrix

def sparse_matrix_to_torch_sparse_tensor(sparse_matrix, maximum_length):
    sparse_matrix = sparse_matrix.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_matrix.row, sparse_matrix.col)).astype(np.int64))
    values = torch.from_numpy(sparse_matrix.data)
    shape = torch.Size((maximum_length, maximum_length))

    return torch.sparse.FloatTensor(indices, values, shape)

def make_adjacency_matrix(edges, num_words):
    adjacency_matrix = sp.coo_matrix(
        (np.ones(len(edges)), (edges[:, 0].astype(np.int32), edges[:, 1].astype(np.int32))),
        shape=(num_words, num_words),
        dtype=np.float32
    )
    
    identity_matrix = sp.coo_matrix(
        (np.ones(len(edges)), (np.arange(len(edges)), np.arange(len(edges)))),
        shape=(num_words, num_words),
        dtype=np.float32
    )
    
    adjacency_matrix = adjacency_matrix + identity_matrix + adjacency_matrix.transpose()
    normalized_adjacency_matrix = normalize_matrix(adjacency_matrix)
    
    return normalized_adjacency_matrix

def make_batch(data, batch_size, is_train=True):
    indices = np.arange(len(data))
    if is_train:
        random.shuffle(indices)
    
    if len(data) % batch_size == 0:
        batch_num = int(len(data)/batch_size)
    else:
        batch_num = int(len(data)/batch_size) + 1
        
    for i in range(batch_num):
        left = i*batch_size
        right = min((i+1)*batch_size, len(data))
        
        sentences = []
        adjacency_matrics = []
        labels = []
        
        for j in indices[left:right]:
            sentences.append(data[j][0])
            adjacency_matrics.append(data[j][1])
            labels.append(data[j][2])
        
        yield sentences, adjacency_matrics, labels

In [4]:
# ============================
# Data Pre Processing
# ============================
print("Load Data...")
data = load_data(DATA_FILE_PATH)

print("Make Dictionary...")
word2id, id2word, edge2id, id2edge, maximum_length = make_dictionary(data)

print("Make Input as Index...")
data = make_input_data_as_index(data, word2id, edge2id)

print("Make Adjacency Matrix...")
start = time.time()
pool = prpExecutor(max_workers=16)
data = list(pool.map(make_input_adjacency_matrix, data))
del pool
print(int(time.time() - start))

print("Make Sparse Tensor...")
start = time.time()
for line in data:
    line[0] += [0] * (maximum_length - len(line[0]))
    line[1] = sparse_matrix_to_torch_sparse_tensor(line[1], maximum_length)
print(int(time.time() - start))

Load Data...
Make Dictionary...
Make Input as Index...
Make Adjacency Matrix...




310
Make Sparse Tensor...
1525


In [5]:
# ============================
# Model
# ============================
class GCNLayer(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(GCNLayer, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim

        self.weight = nn.Parameter(
            torch.randn(self.input_dim, self.output_dim))
        nn.init.xavier_normal_(self.weight)

        self.bias = nn.Parameter(torch.zeros(self.output_dim))

    def forward(self, x, adj_matrics):
        x = torch.matmul(adj_matrics, x)
        output = torch.matmul(x, self.weight)
        output = output + self.bias

        return output

class Model(nn.Module):
    def __init__(self, num_words, embedding_dim, hidden_dim, maximum_length, dropout_rate):
        super(Model, self).__init__()

        self.num_words = num_words
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.maximum_length = maximum_length
        self.dropout_rate = dropout_rate

        # =============================================
        # Data Preparation
        # =============================================		
        self.word_embedding \
        = nn.Embedding(self.num_words, self.embedding_dim, padding_idx = 0)
        nn.init.xavier_normal_(self.word_embedding.weight)
        self.gcn_layer_1 \
        = GCNLayer(self.embedding_dim, self.hidden_dim[0])
        self.gcn_layer_2 \
        = GCNLayer(self.hidden_dim[0], self.hidden_dim[1])
        
        self.max_pooling = nn.MaxPool1d(self.maximum_length)
        self.output_layer = nn.Linear(self.hidden_dim[1], 1)

    def forward(self, sentences, adjacency_matrics, batch_size):
        embedded_words = self.word_embedding(sentences)
        gcn_1 = self.gcn_layer_1(embedded_words, adjacency_matrics)
        gcn_1 = F.relu(gcn_1)
        gcn_1 = F.dropout(gcn_1, self.dropout_rate)
        gcn_2 = self.gcn_layer_2(gcn_1, adjacency_matrics)
        gcn_2 = F.relu(gcn_2)
        sentence_representations = self.max_pooling(gcn_2.transpose(1, 2)).squeeze()
        sentence_representations = F.dropout(sentence_representations, self.dropout_rate)
        output = self.output_layer(sentence_representations)
        
        return output

In [14]:
# =============================================
# Model Initialize
# =============================================
print("Model Initializing..")
pos_weight = 20*torch.ones([1]).cuda()
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

model = Model(len(word2id), EMBEDDING_DIM, HIDDEN_STATES, maximum_length, DROPOUT_RATE).cuda()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

Model Initializing..


In [None]:
# =============================================
# Model Training
# =============================================
print("Model Training..\n")
for i in range(EPOCHS):
    model.train()
    start = time.time()
    
    if len(data) % BATCH_SIZE == 0:
        batch_num = int(len(data)/BATCH_SIZE)
    else:
        batch_num = int(len(data)/BATCH_SIZE) + 1
    
    loss = .0
    batches = make_batch(data, BATCH_SIZE)
    step = 0
    count = 0
    correct = 0
    positive_answer = 0
    positive_actual = 0
    for batch in batches:
        sentences, adjacency_matrics, labels = batch
        input_sentences = torch.tensor(sentences, dtype = torch.long).cuda()
        input_adjacency_matrics = torch.stack([matrix.to_dense() for matrix in adjacency_matrics], dim=0).cuda()
        input_labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(dim=1).cuda()
        optimizer.zero_grad()
        logits = model(input_sentences, input_adjacency_matrics, len(sentences))
        _loss = criterion(logits, input_labels).sum()
        _loss.backward()
        optimizer.step()
        loss += _loss.item()
        step+=1
        
        predicted = (logits > 0.5).float()
        positive_answer += predicted.sum().item()
        positive_actual += (input_labels == 1.0).float().sum().item()
        correct+=(predicted*input_labels).sum().item()
        count+= (predicted==input_labels).sum().item()
        
        sys.stdout.flush()
        sys.stdout.write("\r" + "Epoch: [{}/{}] Batch: [{}/{}]".format(i+101, EPOCHS, step, batch_num))
    accuracy = 100*float(count)/len(data)
    if positive_answer == 0:
        precision = 0.0
    else:
        precision = float(correct)/positive_answer
    recall = float(correct)/positive_actual
    if (precision+recall) == 0.0:
        f1 = 0.0
    else:
        f1 = 2*precision*recall/(precision+recall)
    print(" Loss: {} [A/P/R/F]: [{:.4f}/{:.4f}/{:.4f}/{:.4f}]".format(loss, accuracy, precision, recall, f1))

Model Training..

Epoch: [101/100] Batch: [514/514] Loss: 80.01352509856224 [A/P/R/F]: [97.7787/0.5491/0.9080/0.6844]
Epoch: [102/100] Batch: [514/514] Loss: 80.67250456660986 [A/P/R/F]: [97.6996/0.5393/0.9106/0.6774]
Epoch: [103/100] Batch: [514/514] Loss: 79.71830304712057 [A/P/R/F]: [97.7703/0.5482/0.9066/0.6832]
Epoch: [104/100] Batch: [514/514] Loss: 80.68383051455021 [A/P/R/F]: [97.7300/0.5431/0.9083/0.6798]
Epoch: [105/100] Batch: [514/514] Loss: 80.73507383465767 [A/P/R/F]: [97.8220/0.5546/0.9089/0.6888]
Epoch: [106/100] Batch: [514/514] Loss: 80.76739838346839 [A/P/R/F]: [97.8159/0.5538/0.9083/0.6881]
Epoch: [107/100] Batch: [514/514] Loss: 80.31451964378357 [A/P/R/F]: [97.8106/0.5531/0.9086/0.6876]
Epoch: [108/100] Batch: [514/514] Loss: 79.03134386986494 [A/P/R/F]: [97.8235/0.5548/0.9077/0.6887]
Epoch: [109/100] Batch: [134/514]