# MLTagger

## Background

This repo is based on the paper [Zero-shot Sequence Labeling: Transferring Knowledge from Sentences to Tokens](https://arxiv.org/pdf/1805.02214.pdf) by Marek Rei and Anders Søgaard. The original code repository can be found [here](https://github.com/marekrei/mltagger).

## Getting Started

To begin, train the model with `experiment.py`, which takes a configuration file containing the necessary hyperparameters and file paths. The script depends on two other files ([model.py](https://github.com/DerekChia/mltagger/blob/master/src/model.py) and [evaluator.py](https://github.com/DerekChia/mltagger/blob/master/src/evaluator.py)), where the MLTModel and MLTEvaluator objects are initialised and updated during runtime. Since we are using Jupyter Notebook, these two files are loaded by running `model.ipynb` and `evaluator.ipynb`.

Reference: [src/experiment.py](https://github.com/DerekChia/mltagger/blob/master/src/experiment.py)

In [1]:
%run model.ipynb
%run evaluator.ipynb

In [2]:
import sys
import collections
import numpy
import random
import math
import os
import gc
import tensorflow as tf

try:
    import ConfigParser as configparser
except:
    import configparser

# Replaced by %run model.ipynb and evaluator.ipynb
# from model import MLTModel
# from evaluator import MLTEvaluator

## Available functions in experiment.py

```Python
def read_input_files(file_paths, max_sentence_length=-1):
    return sentences

def parse_config(config_section, config_path):
    return config

def is_float(value):
    return {True, False}

def create_batches_of_sentence_ids(sentences, batch_equal_size, max_batch_size):
    return batches_of_sentence_ids

def process_sentences(epoch, data, model, is_training, learningrate, config, name):
    return results 

def run_experiment(config_path):
    return None

```

### read_input_files
This function takes the path(s) of the input file(s) and returns the sentences in the following format. Note that the input files are expected to be in TSV format, CoNLL style (one token per line, with a blank line between sentences).

```
    sentence = [['Not', 'c'], ['only', 'c'], ['as', 'c'], ['a', 'c'], ['hobby', 'c'], ['.', 'c']]
```

Return:
```
    sentences = [
    [['Not', 'c'], ['only', 'c'], ['as', 'c'], ['a', 'c'], ['hobby', 'c'], ['.', 'c']],
    [['They', 'c'], ['use', 'c'], ['computers', 'c'], ['for', 'c'], ['their', 'c'], ['works', 'i'], ['.', 'c']]
    ]
```

In [None]:
def read_input_files(file_paths, max_sentence_length=-1):
    """
    Read sentences from one or more whitespace-separated, CoNLL-style files.

    Every non-empty line describes one token (its whitespace-separated
    columns, e.g. word and label); an empty line ends the current sentence.

    Args:
        file_paths: Comma-separated string of file paths. Whitespace around
            each path is ignored and empty entries (e.g. produced by a
            trailing comma) are skipped.
        max_sentence_length: If positive, sentences with more tokens than
            this value are dropped. Zero or a negative value disables the
            filter.

    Returns:
        A list of sentences. Each sentence is a list of tokens and each
        token is the list of columns found on its line, for example
        [['Not', 'c'], ['only', 'c']].
    """
    def is_short_enough(sentence):
        # A non-positive limit means "no limit".
        return max_sentence_length <= 0 or len(sentence) <= max_sentence_length

    sentences = []
    for file_path in file_paths.split(","):
        file_path = file_path.strip()
        if not file_path:
            # Tolerate "a.tsv,,b.tsv" or a trailing comma instead of
            # failing on open("").
            continue
        with open(file_path, "r") as f:
            sentence = []
            for line in f:
                line_parts = line.split()
                if line_parts:
                    # No check on the number of columns: at inference time
                    # the label column may be missing.
                    sentence.append(line_parts)
                elif sentence:
                    # Blank line: the sentence collected so far is complete.
                    if is_short_enough(sentence):
                        sentences.append(sentence)
                    sentence = []
            # Flush the last sentence when the file does not end with a
            # blank line.
            if sentence and is_short_enough(sentence):
                sentences.append(sentence)
    return sentences

### parse_config
This function reads the configuration from `config_path` and returns a dictionary, trying to guess the correct datatype for each config value. `config_section` is the name of the section to read (e.g. `config` for a file that starts with `[config]`).

In [None]:
def parse_config(config_section, config_path):
    """
    Read one section of a configuration file into an ordered dictionary.

    The datatype of each value is guessed from its text:
    missing or blank values become None, "true"/"false" (any case) become
    bool, strings made only of digits become int, anything accepted by
    is_float becomes float, and everything else stays a string.

    Args:
        config_section: Name of the section to read.
        config_path: Path of the configuration file.

    Returns:
        collections.OrderedDict mapping each option name to its converted
        value, in the order returned by the parser.
    """
    # SafeConfigParser was removed in Python 3.12; on Python 3 ConfigParser
    # provides the same behaviour. Python 2 (where the module is imported as
    # ConfigParser) still needs the Safe variant.
    if sys.version_info[0] >= 3:
        parser_class = configparser.ConfigParser
    else:
        parser_class = configparser.SafeConfigParser
    config_parser = parser_class(allow_no_value=True)
    config_parser.read(config_path)

    config = collections.OrderedDict()
    for key, value in config_parser.items(config_section):
        if value is None or len(value.strip()) == 0:
            config[key] = None
        elif value.lower() in ["true", "false"]:
            config[key] = config_parser.getboolean(config_section, key)
        elif value.isdigit():
            config[key] = config_parser.getint(config_section, key)
        elif is_float(value):
            # Signed numbers such as "-1" end up here and become floats.
            config[key] = config_parser.getfloat(config_section, key)
        else:
            config[key] = config_parser.get(config_section, key)
    return config

### is_float
This function checks whether the parameter can be converted to a float (for example the strings `"0.9"` or `"1e-3"`).

In [None]:
def is_float(value):
    """
    Check whether value can be converted with float().

    Args:
        value: Usually a string read from a configuration file.

    Returns:
        True if float(value) succeeds, False otherwise. Values that float()
        rejects with a TypeError (such as None) also give False.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    return True

### create_batches_of_sentence_ids
Based on the input parameters:

- If `batch_equal_size` is `True`, group sentences into batches by IDs based on their length, according to a limit of `max_batch_size`. For example, if there are 64 sentences with length 10 (i.e. 64 sentences each with 10 tokens) and `max_batch_size` is set to `32`, these 64 sentences will be batched together into 2 batches.
    ```
    Example:
    Length of each sentence, [ID of each sentence]
    10, [   [0, ... , 31]
    10,     [32, ... , 63]
    25,     [2553, 2680]
    43,     [142, 305, 1490, 1775, 1973]
    27,     [194, 197, 245, 348, 359]
    XX,     [...]    ]
    ```
- If `batch_equal_size` is `False`, each batch will contain a fixed number of sentences (`max_batch_size`).
    ```
    Example:
    [ID of each sentence]
    [
        [0, 1, 2, 3, 4]
        [5, 6, 7, 8, 9]
        [10, 11, 12, 13, 14]
    ]
    ```

Return: 
- List of lists with sentences ids.

Default value:
- `batch_equal_size` = `True`
- `max_batch_size` = `32`

In [None]:
def create_batches_of_sentence_ids(sentences, batch_equal_size, max_batch_size):
    """
    Group sentence ids (indices into sentences) into batches.

    Args:
        sentences: List of sentences; only len() of each sentence is used.
        batch_equal_size: If true, every batch only contains sentences of
            the same length. Otherwise sentences are batched in their
            original order regardless of length.
        max_batch_size: If positive, the maximum number of sentences in a
            batch. If zero or negative, batches are sized dynamically so
            that each holds roughly abs(max_batch_size) words (always at
            least one sentence).

    Returns:
        A list of lists of sentence ids. Every sentence id appears in
        exactly one batch.
    """
    batches_of_sentence_ids = []
    # Only meaningful when max_batch_size is not positive.
    word_budget = abs(max_batch_size)

    if batch_equal_size:
        # Bucket the ids by sentence length, keeping first-seen order.
        sentence_ids_by_length = collections.OrderedDict()
        for sentence_id, sentence in enumerate(sentences):
            sentence_ids_by_length.setdefault(len(sentence), []).append(sentence_id)

        for sentence_length, sentence_ids in sentence_ids_by_length.items():
            if max_batch_size > 0:
                batch_size = int(max_batch_size)
            else:
                # Never let the step drop to zero, even for sentences that
                # are longer than the whole word budget.
                batch_size = max(1, int(word_budget // max(sentence_length, 1)))
            for start in range(0, len(sentence_ids), batch_size):
                batches_of_sentence_ids.append(sentence_ids[start:start + batch_size])
    else:
        current_batch = []
        longest_in_batch = 0
        for sentence_id, sentence in enumerate(sentences):
            current_batch.append(sentence_id)
            longest_in_batch = max(longest_in_batch, len(sentence))
            if max_batch_size > 0:
                batch_is_full = len(current_batch) >= max_batch_size
            else:
                # Padded size of the batch: sentences * longest sentence.
                batch_is_full = len(current_batch) * longest_in_batch >= word_budget
            if batch_is_full:
                batches_of_sentence_ids.append(current_batch)
                current_batch = []
                longest_in_batch = 0
        # Keep the final, partially filled batch.
        if current_batch:
            batches_of_sentence_ids.append(current_batch)
    return batches_of_sentence_ids

"""Original Code Below"""
#     batches_of_sentence_ids = []
#     if batch_equal_size == True:
#         sentence_ids_by_length = collections.OrderedDict()
#         sentence_length_sum = 0.0
#         for i in range(len(sentences)):
#             length = len(sentences[i])
#             if length not in sentence_ids_by_length:
#                 sentence_ids_by_length[length] = []
#             sentence_ids_by_length[length].append(i)

#         for sentence_length in sentence_ids_by_length:
#             if max_batch_size > 0:
#                 batch_size = max_batch_size
#             else:
#                 batch_size = int((-1 * max_batch_size) / sentence_length)

#             for i in range(0, len(sentence_ids_by_length[sentence_length]), batch_size):
#                 batches_of_sentence_ids.append(sentence_ids_by_length[sentence_length][i:i + batch_size])
#     else:
#         current_batch = []
#         max_sentence_length = 0
#         for i in range(len(sentences)):
#             current_batch.append(i)
#             if len(sentences[i]) > max_sentence_length:
#                 max_sentence_length = len(sentences[i])
#             if (max_batch_size > 0 and len(current_batch) >= max_batch_size) \
#               or (max_batch_size <= 0 and len(current_batch)*max_sentence_length >= (-1 * max_batch_size)):
#                 batches_of_sentence_ids.append(current_batch)
#                 current_batch = []
#                 max_sentence_length = 0
#         if len(current_batch) > 0:
#             batches_of_sentence_ids.append(current_batch)
#     return batches_of_sentence_ids

### process_sentences

This function processes all sentences with the `evaluator` object and returns the evaluation metrics. It calls `process_batch` on each batch of sentences with the configured learning rate.

Return:
- `results`: the evaluation metrics returned by `evaluator.get_results(name)`

Default values:
- `garbage_collection` = `False`
- `batch_equal_size` = `True`
- `max_batch_size` = `32`

In [3]:
def process_sentences(epoch, data, model, is_training, learningrate, config, name):
    """
    Run the model over all sentences in data and return evaluation metrics.

    Args:
        epoch: Zero-based epoch index; only used for the progress message.
        data: List of sentences, as returned by read_input_files.
        model: Object providing process_batch(batch, is_training, learningrate).
        is_training: If true, the batches are visited in random order and the
            flag is forwarded to model.process_batch.
        learningrate: Learning rate forwarded to model.process_batch.
        config: Configuration mapping; reads "batch_equal_size",
            "max_batch_size" and "garbage_collection".
        name: Label passed to evaluator.get_results (e.g. "train", "dev").

    Returns:
        The results object produced by MLTEvaluator.get_results(name) after
        all batches have been processed.
    """
    evaluator = MLTEvaluator(config)

    # Group the sentence ids into batches according to the configuration.
    batches_of_sentence_ids = create_batches_of_sentence_ids(data, config["batch_equal_size"], config["max_batch_size"])

    # The caller shuffles the sentences; this additionally shuffles the order
    # in which the batches are visited.
    if is_training:
        random.shuffle(batches_of_sentence_ids)

    total_batches = len(batches_of_sentence_ids)
    for count, sentence_ids_in_batch in enumerate(batches_of_sentence_ids):
        # Turn the ids into the actual sentences, e.g.
        # [[['Not', 'c'], ['only', 'c']], [['They', 'c'], ['use', 'c']]]
        batch = [data[i] for i in sentence_ids_in_batch]

        print('############### Epoch', epoch + 1,'Batch', count + 1, 'of', total_batches , '###############')
        cost, sentence_scores, token_scores_list = model.process_batch(batch, is_training, learningrate)

        # Accumulate cost and predictions for the final evaluation.
        evaluator.append_data(cost, batch, sentence_scores, token_scores_list)

        # Optional: keep collecting until nothing more is freed.
        while config["garbage_collection"] and gc.collect() > 0:
            pass

    # Evaluate once, after every batch has been seen. Doing this inside the
    # loop re-evaluates everything per batch and leaves `results` undefined
    # when there are no batches.
    results = evaluator.get_results(name)
    for key in results:
        print(key + ": " + str(results[key]))

    return results

### run_experiment

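This function puts together all the supporting functions (above). 
1. Load train, dev and test data using read_input_files
2. Initialize MLTModel with configuration
3. Build the vocabularies, construct the network and initialize the session (optionally preloading word embeddings)
4. Train for the configured number of epochs, keeping the model that scores best on the dev data
5. Save the model (if `save` is set) and evaluate it on each test file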


Dependencies:
- model
- model.build_vocabs
- model.construct_network
- model.initialize_session
- model.preload_word_embeddings
- model.get_parameter_count
- model.get_parameter_count_without_word_embeddings

Default values (config):
- random_seed = 100
- {path_train, path_dev, path_test} = PATH_TO_DATA
- preload_vectors = glove.6B.300d.txt
- model_selector = dev_sentence_f1_score:high
- learningrate = 1.0
- epochs = 200
- stop_if_no_improvement_for_epochs = 7
- learningrate_decay = 0.9
- save = PATH_TO_SAVED_MODEL

In [1]:
def run_experiment(config_path):
    """
    Train, select, save and evaluate a model as described by a config file.

    Steps:
        1. Parse the "config" section of config_path.
        2. Load the train / dev / test data that the config points to.
        3. Build the vocabularies and the network, start the session and
           optionally preload word embeddings.
        4. If training data is available, train for up to config["epochs"]
           epochs. When dev data is available the best epoch according to
           config["model_selector"] ("<metric>:high" or "<metric>:low") is
           checkpointed, training stops early after
           config["stop_if_no_improvement_for_epochs"] epochs without
           improvement, and the best checkpoint is restored at the end.
        5. Save the model if config["save"] is set.
        6. Evaluate on every test file (paths separated by ":").

    Args:
        config_path: Path of the configuration file. Temporary checkpoints
            are written next to it as config_path + ".model*".

    Raises:
        ValueError: If the dev cost becomes NaN or infinite.
    """
    # config is an OrderedDict of already type-converted values
    config = parse_config("config", config_path)

    # Temporary path for storing the best model seen during training
    temp_model_path = config_path + ".model"

    # Seed the random number generators when a seed is configured
    if "random_seed" in config:
        random.seed(config["random_seed"])
        numpy.random.seed(config["random_seed"])

    # Turn the training, dev and test datasets into sentences
    data_train, data_dev, data_test = None, None, None
    if config["path_train"] is not None and len(config["path_train"]) > 0:
        data_train = read_input_files(config["path_train"], config["max_train_sent_length"])
    if config["path_dev"] is not None and len(config["path_dev"]) > 0:
        data_dev = read_input_files(config["path_dev"])

    # There can be several test datasets, separated by ":"
    if config["path_test"] is not None and len(config["path_test"]) > 0:
        data_test = []
        for path_test in config["path_test"].strip().split(":"):
            data_test += read_input_files(path_test)

    # Initialise the model object
    model = MLTModel(config)

    # Build the vocabularies from all datasets
    model.build_vocabs(data_train, data_dev, data_test, config["preload_vectors"])

    # Set up the tensorflow graph
    model.construct_network()

    # Initialise the session and optionally load pretrained embeddings
    model.initialize_session()
    if config["preload_vectors"] is not None:
        model.preload_word_embeddings(config["preload_vectors"])

    print("parameter_count: " + str(model.get_parameter_count()))
    print("parameter_count_without_word_embeddings: " + str(model.get_parameter_count_without_word_embeddings()))

    # The test phase reports an epoch number; give it a defined value even
    # when no training epoch runs (no training data, or epochs == 0).
    epoch = 0

    # Run if there is training data loaded
    if data_train is not None:
        # e.g. "dev_sentence_f1_score:high" -> metric name and direction
        selector_parts = config["model_selector"].split(":")
        model_selector = selector_parts[0]
        model_selector_type = selector_parts[1]
        best_selector_value = 0.0
        best_epoch = -1
        learningrate = config["learningrate"]

        # This is where training begins - iterating through the epochs
        for epoch in range(config["epochs"]):
            print("EPOCH: " + str(epoch))
            print("current_learningrate: " + str(learningrate))

            # Shuffle the sentences; process_sentences shuffles the batches
            random.shuffle(data_train)

            results_train = process_sentences(epoch, data_train, model, is_training=True, learningrate=learningrate, config=config, name="train")

            # Run if there is dev data loaded
            if data_dev is not None:
                results_dev = process_sentences(epoch, data_dev, model, is_training=False, learningrate=0.0, config=config, name="dev")

                if math.isnan(results_dev["dev_cost_sum"]) or math.isinf(results_dev["dev_cost_sum"]):
                    raise ValueError("Cost is NaN or Inf. Exiting.")

                # The first epoch is always the best so far; afterwards the
                # metric has to improve in the configured direction.
                if (epoch == 0 or (model_selector_type == "high" and results_dev[model_selector] > best_selector_value)
                               or (model_selector_type == "low" and results_dev[model_selector] < best_selector_value)):
                    best_epoch = epoch
                    best_selector_value = results_dev[model_selector]
                    model.saver.save(model.session, temp_model_path, latest_filename=os.path.basename(temp_model_path)+".checkpoint")
                print("best_epoch: " + str(best_epoch))

                # Early stopping
                if config["stop_if_no_improvement_for_epochs"] > 0 and (epoch - best_epoch) >= config["stop_if_no_improvement_for_epochs"]:
                    break

                # Decay the learning rate after more than 3 epochs without
                # improvement
                if (epoch - best_epoch) > 3:
                    learningrate *= config["learningrate_decay"]

            while config["garbage_collection"] and gc.collect() > 0:
                pass

        if data_dev is not None and best_epoch >= 0:
            # Load the best model so far and remove the temporary checkpoint
            model.saver.restore(model.session, temp_model_path)
            os.remove(temp_model_path+".checkpoint")
            os.remove(temp_model_path+".data-00000-of-00001")
            os.remove(temp_model_path+".index")
            os.remove(temp_model_path+".meta")

    if config["save"] is not None and len(config["save"]) > 0:
        model.save(config["save"])

    # Evaluate each test file separately; results are named test0, test1, ...
    if config["path_test"] is not None:
        for i, path_test in enumerate(config["path_test"].strip().split(":")):
            data_test = read_input_files(path_test)
            results_test = process_sentences(epoch, data_test, model, is_training=False, learningrate=0.0, config=config, name="test"+str(i))

### main

Executes run_experiment with configuration file

In [None]:
# Script entry point: run the experiment described by config.conf in the
# current working directory.
if "__main__" == __name__:
    run_experiment("config.conf")