In [15]:
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import scipy.sparse as sp
import time
import random
from concurrent.futures import ProcessPoolExecutor as prpExecutor

In [2]:
# ============================
# Data File Path
# ============================
DATA_FILE_PATH = "/hdd1/Spoiler_Detection/ACL/INGGEOL/node_edge_info_10000_new.json"

# ============================
# Model Hyper Parameter
# ============================
EMBEDDING_DIM = 50
HIDDEN_STATES = [50, 50]

# ============================
# Training Hyper Parameter
# ============================
LEARNING_RATE = 0.001
BATCH_SIZE = 256
WEIGHT_DECAY = 1e-5
DROPOUT_RATE = 0.5
RANDOM_SEED = 26

In [11]:
# ============================
# Data Pre-Processing
# ============================
def load_data(file_path):
    data = []
    with open(file_path) as f:
        _data = json.load(f)
    for review in _data:
        data += review["text_info"]
    return data

def make_dictionary(data):
    maximum_length = max([len(line.split("\t")[0].split()) for line in data])

    word2id = {"<PAD>":0}
    id2word = ["<PAD>"]
    edge2id = {}
    id2edge = []
    
    for line in data:
        tokens = line.split("\t")
        for word in tokens[0].split():
            if word not in word2id:
                word2id[word] = len(word2id)
                id2word.append(word)
        for edges in tokens[2:]:
            _tokens = edges.split(":")
            if len(_tokens) != 3:
                start, end = _tokens[0], _tokens[1]
                edge = ":".join(_tokens[2:])
            else:
                start, end, edge = _tokens
            if edge not in edge2id:
                edge2id[edge] = len(edge2id)
                id2edge.append(edge)

    return word2id, id2word, edge2id, id2edge, maximum_length

def make_input_data_as_index(_data, word2id, edge2id):
    data = []
    for line in _data:
        tokens = line.split("\t")
        tokens[0] = [word2id[word] for word in tokens[0].split()]
        _edges = []
        for edges in tokens[2:]:
            _tokens = edges.split(":")
            if len(_tokens) != 3:
                start, end = _tokens[0], _tokens[1]
                edge = ":".join(_tokens[2:])
            else:
                start, end, edge = _tokens
            _edges.append([start, end, edge2id[edge]])
        data.append([tokens[0], tokens[1], _edges])
        
    return data

def make_input_adjacency_matrix(_data):
    data = []
    for line in _data:
        words, label, edges = line[0], float(line[1]), line[2]
        adjacency_matrix = make_adjacency_matrix(np.asarray(edges), len(words))
        data.append([words, adjacency_matrix, label])
    
    return data

def normalize_matrix(matrix):
    rowsum = np.asarray(matrix.sum(1))
    row_inv = np.power(np.sqrt(rowsum), -1).flatten()
    row_inv[np.isinf(row_inv)] = 0.
    row_matrix_inv = sp.diags(row_inv)
    matrix = row_matrix_inv.dot(matrix)

    return matrix

def sparse_matrix_to_torch_sparse_tensor(sparse_matrix, maximum_length):
    sparse_matrix = sparse_matrix.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_matrix.row, sparse_matrix.col)).astype(np.int64))
    values = torch.from_numpy(sparse_matrix.data)
    shape = torch.Size((maximum_length, maximum_length))

    return torch.sparse.FloatTensor(indices, values, shape)

def make_adjacency_matrix(edges, num_words):
    adjacency_matrix = sp.coo_matrix(
        (np.ones(len(edges)), (edges[:, 0].astype(np.int32), edges[:, 1].astype(np.int32))),
        shape=(num_words, num_words),
        dtype=np.float32
    )
    
    identity_matrix = sp.coo_matrix(
        (np.ones(len(edges)), (np.arange(len(edges)), np.arange(len(edges)))),
        shape=(num_words, num_words),
        dtype=np.float32
    )
    
    adjacency_matrix = adjacency_matrix + identity_matrix + adjacency_matrix.transpose()
    normalized_adjacency_matrix = normalize_matrix(adjacency_matrix)
    
    return normalized_adjacency_matrix

def make_batch(data, batch_size, is_train=True):
    indices = np.arange(len(data))
    if is_train:
        random.shuffle(indices)
    
    if len(data) % batch_size == 0:
        batch_num = int(len(data)/batch_size)
    else:
        batch_num = int(len(data)/batch_size) + 1
        
    for i in range(batch_num):
        left = i*batch_size
        right = min((i+1)*batch_size, len(data))
        
        sentences = []
        adjacency_matrics = []
        labels = []
        
        for j in indices[left:right]:
            sentences.append(data[j][0])
            adjacency_matrics.append(data[j][1])
            labels.append(data[j][2])
        
        yield sentences, adjacency_matrics, labels

In [5]:
# ============================
# Data Pre Processing
# ============================
print("Load Data...")
data = load_data(DATA_FILE_PATH)

print("Make Dictionary...")
word2id, id2word, edge2id, id2edge, maximum_length = make_dictionary(data)

print("Make Input as Index...")
data = make_input_data_as_index(data, word2id, edge2id)

print("Make Adjacency Matrix...")
start = time.time()
pool = prpExecutor(max_workers=16)
data = list(pool.map(make_input_adjacency_matrix, [data]))[0]
print(int(time.time() - start))

print("Make Sparse Tensor...")
start = time.time()
for line in data:
    line[0] += [0] * (maximum_length - len(line[0]))
    line[1] = sparse_matrix_to_torch_sparse_tensor(line[1], maximum_length)
print(int(time.time() - start))

Load Data...
Make Dictionary...
Make Input as Index...
Make Adjacency Matrix...




332
Make Sparse Tensor...
1529


In [26]:
# ============================
# Model
# ============================
class GCNLayer(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(GCNLayer, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim

        self.weight = nn.Parameter(
            torch.randn(self.input_dim, self.output_dim))
        nn.init.xavier_normal_(self.weight)

        self.bias = nn.Parameter(torch.zeros(self.output_dim))

    def forward(self, x, adj_matrics):
        x = torch.matmul(adj_matrics, x)
        output = torch.matmul(x, self.weight)
        output = output + self.bias

        return output

class Model(nn.Module):
    def __init__(self, num_words, embedding_dim, hidden_dim, dropout_rate):
        super(Model, self).__init__()

        self.num_words = num_words
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.dropout_rate = dropout_rate

        # =============================================
        # Data Preparation
        # =============================================		
        self.word_embedding \
        = nn.Embedding(self.num_words, self.embedding_dim, padding_idx = 0)
        nn.init.xavier_normal_(self.word_embedding.weight)
        self.gcn_layer_1 \
        = GCNLayer(self.embedding_dim, self.hidden_dim[0])
        self.gcn_layer_2 \
        = GCNLayer(self.hidden_dim[0], self.hidden_dim[1])

    def forward(self, sentences, adjacency_matrics):
        embedded_words = self.word_embedding(sentences)
        gcn_1 = self.gcn_layer_1(embedded_words, adjacency_matrics)
        gcn_1 = F.relu(gcn_1)
        gcn_1 = F.dropout(gcn_1, self.dropout_rate)
        gcn_2 = self.gcn_layer_2(gcn_1, adjacency_matrics)
        gcn_2 = F.relu(gcn_2)

        return gcn_2

In [27]:
# =============================================
# Model Initialize
# =============================================
print("Model Initializing..")
criterion = nn.LogSigmoid()

model = Model(len(word2id), EMBEDDING_DIM, HIDDEN_STATES, DROPOUT_RATE)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

Model Initializing..


In [29]:
# =============================================
# Model Training
# =============================================
model.train()

batches = make_batch(data[:100], BATCH_SIZE)
for batch in batches:
    sentences, adjacency_matrics, labels = batch
    input_sentences = torch.tensor(sentences, dtype = torch.long)
    input_adjacency_matrics = torch.stack([matrix.to_dense() for matrix in adjacency_matrics], dim=0)
    input_labels = torch.tensor(labels, dtype=torch.float32)
    print(model(input_sentences, input_adjacency_matrics))

torch.Size([100, 174, 174])
torch.Size([100, 174, 50])


[[1, 2, 3, 4, 5, 6, 7],
 tensor(indices=tensor([[0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6],
                        [5, 0, 5, 1, 5, 2, 5, 3, 5, 4, 6, 5, 4, 3, 2, 1, 0, 5]]),
        values=tensor([0.7071, 0.7071, 0.7071, 0.7071, 0.7071, 0.7071, 0.7071,
                       0.7071, 0.7071, 0.7071, 0.3780, 0.3780, 0.3780, 0.3780,
                       0.3780, 0.3780, 0.3780, 1.0000]),
        size=(174, 174), nnz=18, layout=torch.sparse_coo),
 0.0]

In [19]:
for line in data:
    line[0] += [0] * (maximum_length - len(line[0]))

[[1,
  2,
  3,
  4,
  5,
  6,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 tensor(indices=tensor([[0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6],
                        [5, 0, 5, 1, 5, 2, 5, 3, 