In [1]:
# Some preparation: load libraries, define functions

In [2]:
# Print the current working directory via a shell escape.
# NOTE(review): the dataset/ and embedding paths configured later in this
# notebook are relative, so they presumably resolve against this directory -- confirm.
!pwd

/home/Tanjin_He/Research/Codes/DataMiningBasedMaterialsSynthesis/Codes/flexible_input


In [3]:
import os
import numpy as np
from collections import OrderedDict
import materials_entity_recognition as MER



MaterialParser version 3.7
Pubchem lookup is on! Will search for unknown materials name in PubChem DB.


In [4]:
def get_default_parameters():
    """
    Build the default configuration.

    Returns:
        OrderedDict: a fresh mapping from option name to its default value,
        with the options in the order listed below.
    """
    defaults = [
        # Tagging scheme (iob or iobes)
        ('tag_scheme', 'iobes'),
        # Lowercase words
        ('lower', False),
        # Replace digits with 0
        ('zeros', False),
        # Token embedding dimension
        ('word_dim', 100),
        # Token LSTM hidden layer dimension
        ('word_lstm_dim', 100),
        # Use a bidirectional LSTM for words
        ('word_bidirect', True),
        # Matrix of pretrained embeddings (not set by default)
        ('pre_emb', None),
        # Use CRF
        ('crf', True),
        # Dropout on the input (0 = no dropout)
        ('dropout', 0.5),
        # Learning method (SGD, Adadelta, Adam..) together with its learning rate
        ('lr_method', "sgd-lr_.005"),
        # Input ids of tokens
        ('input_vector', True),
        # Input embeddings of tokens
        ('input_matrix', False),
    ]
    return OrderedDict(defaults)

def validate_parameters(parameters):
    """
    Make sure the parameters are valid.

    Args:
        parameters: mapping providing 'word_dim', 'dropout', 'tag_scheme',
            'input_matrix' and 'input_vector'; 'pre_emb' is read only when
            'input_vector' is truthy and must then expose a ``shape``
            attribute whose second entry is the embedding dimension.

    Raises:
        AssertionError: if any of the checks below fails. The message names
            the offending parameter.
    """
    assert parameters['word_dim'] > 0, \
        "'word_dim' must be positive, got %r" % (parameters['word_dim'],)
    assert 0. <= parameters['dropout'] < 1.0, \
        "'dropout' must be in [0, 1), got %r" % (parameters['dropout'],)
    assert parameters['tag_scheme'] in ['iob', 'iobes'], \
        "'tag_scheme' must be 'iob' or 'iobes', got %r" % (parameters['tag_scheme'],)
    # input either vector (ids of words in a sentence) or
    # matrix (embeddings of words in a sentence), never both
    assert not (parameters["input_matrix"] and parameters["input_vector"]), \
        "'input_matrix' and 'input_vector' cannot both be enabled"
    # if the input is a vector of ids, an embedding matrix must be provided
    # and its width must agree with the declared embedding dimension
    if parameters["input_vector"]:
        pre_emb = parameters['pre_emb']
        # Checked explicitly so a missing matrix is reported as a validation
        # failure instead of an AttributeError on ``None.shape``.
        assert pre_emb is not None, \
            "'pre_emb' must be provided when 'input_vector' is enabled"
        assert parameters['word_dim'] == pre_emb.shape[1], \
            "'word_dim' (%r) does not match the embedding width (%r)" % (
                parameters['word_dim'], pre_emb.shape[1])
        
def prepare_dataset(sentences, word_to_id, tag_to_id, lower=False):
    """
    Index a collection of sentences.

    Each sentence is a sequence of token entries whose first element is the
    token text and whose last element is the tag. One dictionary is returned
    per sentence, containing:
        - 'str_words': the original token texts
        - 'words': word ids (tokens missing from word_to_id map to '<UNK>')
        - 'tags': tag ids

    When `lower` is true, tokens are lowercased before the vocabulary lookup;
    'str_words' always keeps the original text.
    """
    def to_word_id(token):
        # Look the (optionally lowercased) token up, falling back to <UNK>.
        key = token.lower() if lower else token
        if key not in word_to_id:
            key = '<UNK>'
        return word_to_id[key]

    return [
        {
            'str_words': [entry[0] for entry in sentence],
            'words': [to_word_id(entry[0]) for entry in sentence],
            'tags': [tag_to_id[entry[-1]] for entry in sentence],
        }
        for sentence in sentences
    ]

# Prepare data and set up parameters

In [5]:
#-----------------------------The main part that needs to be edited-----------------------------------------------
# --- Data loading ---
# Any format can be used to load the data; this example uses the CoNLL format.
# A training/validation/test file consists of many lines, each holding one
# token and its attributes separated by blanks, e.g.: word O Y tag
#   * the first entry is the token text
#   * the last entry is the tag, such as B-Mat (beginning of material),
#     I-Tar (intermediate part of target), O (outside)
#   * the entries in between can carry pre-engineered features,
#     which are not used in this example
# Empty lines separate sentences: the tokens between two empty lines
# form one sentence.

# path to training data
path_train = "dataset/test_data/Rs_TP_step2_750_train.text"
# path to develop/validation data
path_dev = "dataset/test_data/Rs_TP_step2_750_dev.text"
# path to test data
path_test = "dataset/test_data/Rs_TP_step2_750_test.text"

# --- Embedding loading ---
# There are three ways to load embeddings:
# 1. from a file in which each line is a token followed by its embedding,
#    e.g.: word 0.0 0.0 0.0 ... 0.0
#    -> specify emb_path
# 2. from a dict (like word2vec in gensim)
#    -> specify emb_dict
# 3. from a matrix (numpy 2d array) whose row indices are consistent with
#    the vocab provided
#    -> specify params['pre_emb']

# path to embedding file; the first way of loading embeddings is used here
emb_path = "dataset/embedding/embedding_MAT_combine_sg_win5_size100_iter50_noLemma_4.text"
# dict of embeddings; provide it when the second way of loading embeddings is used
emb_dict = None

# --- Vocabulary and model reloading ---
# Provide the vocab if you want to use all words in the embedding source.
# In this example vocab is [] and, because an embedding file is provided,
# the words of the whole dataset (training/validation/test sets) are
# assigned to vocab later on.
# <UNK> is automatically inserted as the first token in vocab.
vocab = []
# set to True when reloading a model for prediction
reload_model = False
# must be specified when reloading a pre-trained model
model_path = None

# --- Parameter settings ---
# There are two ways to feed inputs:
# 1. the words in sentences are automatically converted to
#    a 2d list such as [[w0, w1, w2], [w0, w1, w2]],
#    where each element is the id of a word (index in one-hot vector),
#    and the embedding is then loaded automatically
#    -> specify params['input_vector'] = True
# 2. self-designed features are fed directly (everything, i.e. embeddings
#    and any additional features); the input X passed to the train() and
#    predict() methods should look like
# [
#     [
#         [w0_emb_0, w0_emb_1, w0_emb_2],    # -> one token
#         [w0_emb_0, w0_emb_1, w0_emb_2],    # -> one token
#     ]                                      # -> one sentence
# ]                                          # -> all sentences
#    -> specify params['input_matrix'] = True

# start from the default parameters, then override the relevant entries
params = get_default_parameters()
params.update(
    # "Token embedding dimension"
    word_dim=100,
    # "Input ids of tokens": the first way of feeding inputs is used here
    input_vector=True,
    # "Matrix of pretrained embeddings": set to a numpy 2d array when the
    # third way of loading embeddings is used
    pre_emb=None,
    # "Input embeddings of tokens": set to True when the second way of
    # feeding inputs is used
    input_matrix=False,
)


#---------------no need to edit below unless the format of the data files changes--------------------------
# Data parameters
lower = params['lower']
zeros = params['zeros']
tag_scheme = params['tag_scheme']

# Load sentences
train_sentences = MER.loader.load_sentences(path_train, lower, zeros)
dev_sentences = MER.loader.load_sentences(path_dev, lower, zeros)
test_sentences = MER.loader.load_sentences(path_test, lower, zeros)

# Use selected tagging scheme (IOB / IOBES)
MER.loader.update_tag_scheme(train_sentences, tag_scheme)
MER.loader.update_tag_scheme(dev_sentences, tag_scheme)
MER.loader.update_tag_scheme(test_sentences, tag_scheme)

# Create a dictionary / mapping of words
if vocab:
    # A vocab was supplied by the user: index it directly, making sure
    # <UNK> is present (inserted as the first token when missing).
    if '<UNK>' not in vocab:
        vocab.insert(0, '<UNK>')
    id_to_word = {i: v for i, v in enumerate(vocab)}
    word_to_id = {v: k for k, v in id_to_word.items()}
elif (emb_path or emb_dict):
    # An embedding source is available: map the words of all three splits.
    vocab, word_to_id, id_to_word = MER.loader.word_mapping(
        train_sentences + dev_sentences + test_sentences, lower)
else:
    # No embedding source: map the words of the training set only.
    vocab, word_to_id, id_to_word = MER.loader.word_mapping(train_sentences, lower)

# Create a dictionary and a mapping for tags
dico_tags, tag_to_id, id_to_tag = MER.loader.tag_mapping(train_sentences)

# Get the embedding matrix, unless one was supplied directly in params['pre_emb'].
# The check must be `is None`: a user-supplied numpy 2d array has no
# unambiguous truth value, so `not params['pre_emb']` raises ValueError.
if (emb_path or emb_dict) and (params['pre_emb'] is None):
    params['pre_emb'] = MER.loader.prepare_embedding_matrix(id_to_word, params['word_dim'],
                                                 emb_path=emb_path, emb_dict=emb_dict)

# ensure the parameters are valid
validate_parameters(params)



Found 6601 unique words (137296 in total)
{'O': 84890, 'S-Mat': 2688, 'S-Tar': 824, 'S-Pre': 2221}
Found 4 unique named entity tags
Loading pretrained embeddings from dataset/embedding/embedding_MAT_combine_sg_win5_size100_iter50_noLemma_4.text...
Loaded 16926 pretrained embeddings.
5928 / 6601 (89.8046%) words have been initialized with pretrained embeddings.
5563 found directly, 63 after lowercasing, 59 after lowercasing + zero. 243 after lemma.


In [6]:
# Sanity check: display the shape and the type of the embedding matrix.
print("{} {}".format(params['pre_emb'].shape, type(params['pre_emb'])))

(6601, 100) <class 'numpy.ndarray'>


# training and predicting

In [7]:
# Index data: convert every split into word-id / tag-id sequences
train_data, dev_data, test_data = (
    prepare_dataset(split_sentences, word_to_id, tag_to_id, lower)
    for split_sentences in (train_sentences, dev_sentences, test_sentences)
)

split_sizes = (len(train_data), len(dev_data), len(test_data))
print("%i / %i / %i sentences in train / dev / test." % split_sizes)

# build model
if reload_model:
    # Reload previous model values
    print('Reloading previous model...')
    model = MER.Model_train(model_path=model_path)
    print("Model location: %s" % model.model_path)
    # Build the model from the parameters stored with it, then restore it
    model.build(pre_emb=params['pre_emb'], **model.parameters)
    model.reload()
else:
    # Initialize a new model
    model = MER.Model_train()
    print("Model location: %s" % model.model_path)
    # Save the mappings to disk
    print('Putting the mappings in model and saving in disk...')
    model.save_mappings(id_to_word, id_to_tag)
    # Build the model from the parameters configured above
    model.build(**params)

# Separate the word ids (X) from the tag ids (Y) for each split
train_X = [item['words'] for item in train_data]
train_Y = [item['tags'] for item in train_data]
dev_X = [item['words'] for item in dev_data]
dev_Y = [item['tags'] for item in dev_data]
test_X = [item['words'] for item in test_data]
test_Y = [item['tags'] for item in test_data]

# train model
model.fit(
    input_X=train_X,
    input_Y=train_Y,
    dev_X=dev_X,
    dev_Y=dev_Y,
    dev_sentences=dev_sentences,
    test_X=test_X,
    test_Y=test_Y,
    test_sentences=test_sentences,
    n_epochs=3,
)

# use trained model to predict labels for the test split
label_predictions = model.predict_label([item['words'] for item in test_data])



3477 / 718 / 1067 sentences in train / dev / test.
Reloading previous model...
Model location: models/model_0


