In [71]:
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import scipy.sparse as sp
import time
import random
from concurrent.futures import ProcessPoolExecutor as prpExecutor
import sys

In [62]:
# ============================
# Data File Path
# ============================
DATA_FILE_PATH = "/hdd1/Spoiler_Detection/ACL/INGGEOL/node_edge_info_10000_new.json"

# ============================
# Model Hyper Parameter
# ============================
EMBEDDING_DIM = 50
HIDDEN_STATES = [50, 50]

# ============================
# Training Hyper Parameter
# ============================
EPOCHS = 100
LEARNING_RATE = 0.00005
BATCH_SIZE = 256
WEIGHT_DECAY = 1e-5
DROPOUT_RATE = 0.5
RANDOM_SEED = 26

In [11]:
# ============================
# Data Pre-Processing
# ============================
def load_data(file_path):
    data = []
    with open(file_path) as f:
        _data = json.load(f)
    for review in _data:
        data += review["text_info"]
    return data

def make_dictionary(data):
    maximum_length = max([len(line.split("\t")[0].split()) for line in data])

    word2id = {"<PAD>":0}
    id2word = ["<PAD>"]
    edge2id = {}
    id2edge = []
    
    for line in data:
        tokens = line.split("\t")
        for word in tokens[0].split():
            if word not in word2id:
                word2id[word] = len(word2id)
                id2word.append(word)
        for edges in tokens[2:]:
            _tokens = edges.split(":")
            if len(_tokens) != 3:
                start, end = _tokens[0], _tokens[1]
                edge = ":".join(_tokens[2:])
            else:
                start, end, edge = _tokens
            if edge not in edge2id:
                edge2id[edge] = len(edge2id)
                id2edge.append(edge)

    return word2id, id2word, edge2id, id2edge, maximum_length

def make_input_data_as_index(_data, word2id, edge2id):
    data = []
    for line in _data:
        tokens = line.split("\t")
        tokens[0] = [word2id[word] for word in tokens[0].split()]
        _edges = []
        for edges in tokens[2:]:
            _tokens = edges.split(":")
            if len(_tokens) != 3:
                start, end = _tokens[0], _tokens[1]
                edge = ":".join(_tokens[2:])
            else:
                start, end, edge = _tokens
            _edges.append([start, end, edge2id[edge]])
        data.append([tokens[0], tokens[1], _edges])
        
    return data

def make_input_adjacency_matrix(_data):
    data = []
    for line in _data:
        words, label, edges = line[0], float(line[1]), line[2]
        adjacency_matrix = make_adjacency_matrix(np.asarray(edges), len(words))
        data.append([words, adjacency_matrix, label])
    
    return data

def normalize_matrix(matrix):
    rowsum = np.asarray(matrix.sum(1))
    row_inv = np.power(np.sqrt(rowsum), -1).flatten()
    row_inv[np.isinf(row_inv)] = 0.
    row_matrix_inv = sp.diags(row_inv)
    matrix = row_matrix_inv.dot(matrix)

    return matrix

def sparse_matrix_to_torch_sparse_tensor(sparse_matrix, maximum_length):
    sparse_matrix = sparse_matrix.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_matrix.row, sparse_matrix.col)).astype(np.int64))
    values = torch.from_numpy(sparse_matrix.data)
    shape = torch.Size((maximum_length, maximum_length))

    return torch.sparse.FloatTensor(indices, values, shape)

def make_adjacency_matrix(edges, num_words):
    adjacency_matrix = sp.coo_matrix(
        (np.ones(len(edges)), (edges[:, 0].astype(np.int32), edges[:, 1].astype(np.int32))),
        shape=(num_words, num_words),
        dtype=np.float32
    )
    
    identity_matrix = sp.coo_matrix(
        (np.ones(len(edges)), (np.arange(len(edges)), np.arange(len(edges)))),
        shape=(num_words, num_words),
        dtype=np.float32
    )
    
    adjacency_matrix = adjacency_matrix + identity_matrix + adjacency_matrix.transpose()
    normalized_adjacency_matrix = normalize_matrix(adjacency_matrix)
    
    return normalized_adjacency_matrix

def make_batch(data, batch_size, is_train=True):
    indices = np.arange(len(data))
    if is_train:
        random.shuffle(indices)
    
    if len(data) % batch_size == 0:
        batch_num = int(len(data)/batch_size)
    else:
        batch_num = int(len(data)/batch_size) + 1
        
    for i in range(batch_num):
        left = i*batch_size
        right = min((i+1)*batch_size, len(data))
        
        sentences = []
        adjacency_matrics = []
        labels = []
        
        for j in indices[left:right]:
            sentences.append(data[j][0])
            adjacency_matrics.append(data[j][1])
            labels.append(data[j][2])
        
        yield sentences, adjacency_matrics, labels

In [5]:
# ============================
# Data Pre Processing
# ============================
print("Load Data...")
data = load_data(DATA_FILE_PATH)

print("Make Dictionary...")
word2id, id2word, edge2id, id2edge, maximum_length = make_dictionary(data)

print("Make Input as Index...")
data = make_input_data_as_index(data, word2id, edge2id)

print("Make Adjacency Matrix...")
start = time.time()
pool = prpExecutor(max_workers=16)
data = list(pool.map(make_input_adjacency_matrix, [data]))[0]
print(int(time.time() - start))

print("Make Sparse Tensor...")
start = time.time()
for line in data:
    line[0] += [0] * (maximum_length - len(line[0]))
    line[1] = sparse_matrix_to_torch_sparse_tensor(line[1], maximum_length)
print(int(time.time() - start))

Load Data...
Make Dictionary...
Make Input as Index...
Make Adjacency Matrix...




332
Make Sparse Tensor...
1529


Process Process-16:
Process Process-14:
Process Process-15:
Process Process-12:
Process Process-9:
Traceback (most recent call last):
Process Process-7:
Process Process-6:
Process Process-5:
Process Process-10:
Process Process-4:
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Process Process-8:
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Process Process-11:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process Process-3:
Process Process-13:
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Process Process-2:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Process Process-1:
  File

  File "/usr/lib/python3.5/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/usr/lib/python3.5/multiprocessing/queues.py", line 93, in get
    with self._rlock:
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
  File "/usr/lib/python3.5/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/usr/lib/python3.5/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/usr/lib/python3.5/multiprocessing/queues.py", line 93, in get
    with self._rlock:
  File "/usr/lib/python3.5/multiprocessing/queues.py", line 93, in get
    with self._rlock:
KeyboardInterrupt
  File "/usr/lib/python3.5/multiprocessing/queues.py", line 94, in get
    res = self._recv_bytes()
  File "/usr/lib/python3.5/multiprocessing/queues.py", line 93, in get
    with self._rlock:
  File "/usr/lib/python3.5/multiprocessing/synchronize.py", line 96, in __enter__

In [67]:
# ============================
# Model
# ============================
class GCNLayer(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(GCNLayer, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim

        self.weight = nn.Parameter(
            torch.randn(self.input_dim, self.output_dim))
        nn.init.xavier_normal_(self.weight)

        self.bias = nn.Parameter(torch.zeros(self.output_dim))

    def forward(self, x, adj_matrics):
        x = torch.matmul(adj_matrics, x)
        output = torch.matmul(x, self.weight)
        output = output + self.bias

        return output

class Model(nn.Module):
    def __init__(self, num_words, embedding_dim, hidden_dim, maximum_length, dropout_rate):
        super(Model, self).__init__()

        self.num_words = num_words
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.maximum_length = maximum_length
        self.dropout_rate = dropout_rate

        # =============================================
        # Data Preparation
        # =============================================		
        self.word_embedding \
        = nn.Embedding(self.num_words, self.embedding_dim, padding_idx = 0)
        nn.init.xavier_normal_(self.word_embedding.weight)
        self.gcn_layer_1 \
        = GCNLayer(self.embedding_dim, self.hidden_dim[0])
        self.gcn_layer_2 \
        = GCNLayer(self.hidden_dim[0], self.hidden_dim[1])
        
        self.max_pooling = nn.MaxPool1d(self.maximum_length)
        self.output_layer = nn.Linear(self.hidden_dim[1], 1)

    def forward(self, sentences, adjacency_matrics, batch_size):
        embedded_words = self.word_embedding(sentences)
        gcn_1 = self.gcn_layer_1(embedded_words, adjacency_matrics)
        gcn_1 = F.relu(gcn_1)
        gcn_1 = F.dropout(gcn_1, self.dropout_rate)
        gcn_2 = self.gcn_layer_2(gcn_1, adjacency_matrics)
        gcn_2 = F.relu(gcn_2)
        sentence_representations = self.max_pooling(gcn_2.transpose(1, 2)).squeeze()
        sentence_representations = F.dropout(sentence_representations, self.dropout_rate)
        output = self.output_layer(sentence_representations)
        
        return output

In [75]:
# =============================================
# Model Initialize
# =============================================
print("Model Initializing..")
pos_weight = 20*torch.ones([1])
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

model = Model(len(word2id), EMBEDDING_DIM, HIDDEN_STATES, maximum_length, DROPOUT_RATE)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

Model Initializing..


In [76]:
# =============================================
# Model Training
# =============================================
print("Model Training..\n")
for i in range(EPOCHS):
    model.train()
    start = time.time()
    
    if len(data) % BATCH_SIZE == 0:
        batch_num = int(len(data)/BATCH_SIZE)
    else:
        batch_num = int(len(data)/BATCH_SIZE) + 1
    
    loss = .0
    batches = make_batch(data, BATCH_SIZE)
    step = 0
    for batch in batches:
        sentences, adjacency_matrics, labels = batch
        input_sentences = torch.tensor(sentences, dtype = torch.long)
        input_adjacency_matrics = torch.stack([matrix.to_dense() for matrix in adjacency_matrics], dim=0)
        input_labels = torch.tensor(labels, dtype=torch.float32)
        optimizer.zero_grad()
        predicted = model(input_sentences, input_adjacency_matrics, len(sentences))
        _loss = criterion(predicted, input_labels.unsqueeze(dim=1)).sum()
        _loss.backward()
        optimizer.step()
        loss += _loss.item()
        step+=1
        
        sys.stdout.flush()
        sys.stdout.write("\r" + "[{}/{}]".format(step, batch_num))
    print("Loss: {}\n".format(loss))

Model Training..

[0/514]

KeyboardInterrupt: 

In [74]:
print("SEX")
for i in range(10):
    sys.stdout.flush()
    sys.stdout.write("\r" + "{}".format(i))

SEX
9