In [1]:
import os
import json
from glove import Glove
from glove import Corpus
from IPython.core.debugger import set_trace
import re

# Data

In [2]:
# Root data directory; each experiment's files are read from data_home/<experiment_name>.
data_home = "../data4bilstm"

In [3]:
# Experiment name: selects the sub-directory of data_home to read from and is
# embedded in the file name of the saved embeddings.
experiment_name = "webnlg"
# Embedding size handed to train_glove_emb; also embedded in the saved file name.
emb_dim = 300

In [4]:
# Resolve the dataset locations for the selected experiment.
data_dir = os.path.join(data_home, experiment_name)
train_data_path = os.path.join(data_dir, "train_data.json")
valid_data_path = os.path.join(data_dir, "valid_data.json")
test_data_dir = os.path.join(data_dir, "test_data")

# Map each test-set name (file name without the ".json" suffix) to its path.
# Files with any other extension (e.g. OS metadata files) are skipped instead
# of crashing the cell.
test_data_path_dict = {}
for path, _, files in os.walk(test_data_dir):
    for file_name in files:
        file_path = os.path.join(path, file_name)
        file_name, extension = os.path.splitext(file_name)
        if extension != ".json":
            continue
        test_data_path_dict[file_name] = file_path

In [5]:
# Load every split. Each file is opened in a `with` block so the handle is
# closed as soon as its JSON content has been parsed.
with open(train_data_path, "r", encoding = "utf-8") as file_in:
    train_data = json.load(file_in)
with open(valid_data_path, "r", encoding = "utf-8") as file_in:
    valid_data = json.load(file_in)

# One entry per test set, keyed by the test-set name.
test_data_dict = {}
for file_name, path in test_data_path_dict.items():
    with open(path, "r", encoding = "utf-8") as file_in:
        test_data_dict[file_name] = json.load(file_in)

In [6]:
# Pool the train, valid and all test samples into a single list.
all_data = train_data + valid_data
for data in test_data_dict.values():
    all_data += data

# One token list per sample: the "text" field split on single spaces.
corpus = [text.split(" ") for text in (sample["text"] for sample in all_data)]
len(corpus)

6222

# Glove

In [7]:
def train_glove_emb(corpus, window = 10, emb_dim = 100, learning_rate = 0.05, epochs = 10, thr_workers = 6):
    """Build a co-occurrence matrix from `corpus` and fit a Glove model on it.

    Args:
        corpus: the tokenised samples, passed unchanged to `Corpus.fit`
            (in this notebook: a list of token lists).
        window: forwarded to `Corpus.fit` as `window`.
        emb_dim: forwarded to `Glove` as `no_components`.
        learning_rate: forwarded to `Glove` as `learning_rate`.
        epochs: forwarded to `Glove.fit` as `epochs`.
        thr_workers: forwarded to `Glove.fit` as `no_threads`.

    Returns:
        The fitted `Glove` object with the corpus dictionary attached.
    """
    # Co-occurrence statistics and the token dictionary.
    cooccurrences = Corpus()
    cooccurrences.fit(corpus, window = window)
    vocab = cooccurrences.dictionary
    matrix = cooccurrences.matrix
    print('Dict size: %s' % len(vocab))
    print('Collocations: %s' % matrix.nnz)

    # Fit the embeddings, then attach the dictionary so that lookups by word work.
    model = Glove(no_components = emb_dim, learning_rate = learning_rate)
    model.fit(matrix, epochs = epochs, no_threads = thr_workers, verbose = True)
    model.add_dictionary(vocab)
    return model

In [8]:
# Train the embeddings on the pooled corpus with the configured emb_dim.
# The result is bound to `golve` (sic); later cells refer to it by that spelling.
golve = train_glove_emb(corpus, emb_dim = emb_dim)

Dict size: 4126
Collocations: 191141
Performing 10 training epochs with 6 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9


In [9]:
# Save the trained model as ../pretrained_word_emb/glove_<emb_dim>_<experiment_name>.emb
save_path = os.path.join("../pretrained_word_emb", "glove_{}_{}.emb".format(emb_dim, experiment_name))
# Create the target directory first so the save does not fail on a fresh checkout.
os.makedirs(os.path.dirname(save_path), exist_ok = True)
golve.save(save_path)

In [10]:
# Sanity check: nearest neighbours of "university" in the trained space.
# NOTE(review): the recorded output lists 9 entries for number = 10, presumably
# because the query word itself is counted and dropped; confirm against glove-python.
golve.most_similar('university', number = 10)

[('Register', 0.9819727727207307),
 ('U', 0.9798825776793576),
 ('location', 0.9765197346769859),
 ('title', 0.9749676051364277),
 ('Battle', 0.9718289957248922),
 ('Republic', 0.9708853824285265),
 ('publisher', 0.9707570316108278),
 ('municipality', 0.967908142328721),
 ('order', 0.9674162826876535)]

In [11]:
# Shape of the embedding matrix; the recorded output (4126, 300) matches the
# printed dict size and the configured emb_dim.
golve.word_vectors.shape

(4126, 300)

In [12]:
# Quick Start

# # get similar words
# golve.most_similar('Massachusetts', number = 10)

# # emb matrix shape
# golve.word_vectors.shape

# # get id 
# golve.dictionary['Virginia']

# # # look up the word vector of a given term
# # golve.word_vectors[golve.dictionary['university']]

# # save
# save_path = os.path.join(data_home, "pretrained_word_embeddings", "glove_100.emb")
# golve.save(save_path)

# # load
# glove = Glove()
# glove = glove.load(save_path)