FED - Frame Event Detection

[![Github](https://img.shields.io/badge/GitHub-100000?style=for-the-badge&logo=github&logoColor=white)](https://github.com/FORMAS/FED)

[![Docker](https://img.shields.io/badge/docker-%230db7ed.svg?style=for-the-badge&logo=docker&logoColor=white)](https://hub.docker.com/r/andersonsacramento/fed)



# DESCRIPTION
FED is a closed-domain event detection system for sentences in the Portuguese language. It detects events in sentences, i.e., it performs event trigger identification and classification. The event types are based on the typology of the FrameNet project (BAKER, 2017). The models were trained on an enriched TimeBankPT (COSTA; BRANCO, 2012) corpus.


Currently, 5 different trained models are available for execution in this Colab: 0, 5, 25, 50, and 100, which correspond to 214, 137, 31, 13, and 5 event types, respectively.

## How to cite this work

Peer-reviewed accepted paper:

10th Brazilian Conference on Intelligent Systems (BRACIS)

* Sacramento, A.; Souza, M. Joint Event Extraction with Contextualized Word Embeddings for the Portuguese Language.


# Download and locate BERTimbau Base model and FED model files

In [None]:
!pip install gdown



In [None]:
!gdown --id 13d7PKSp6dRLeMeThraA6tZ_oSUm-DRCu --output fed.zip
!unzip fed.zip

Downloading...
From: https://drive.google.com/uc?id=13d7PKSp6dRLeMeThraA6tZ_oSUm-DRCu
To: /content/fed.zip
38.2MB [00:01, 27.8MB/s]
Archive:  fed.zip
   creating: res/
  inflating: res/events_by_pos_types_137.json  
  inflating: res/events_by_pos_types_31.json  
  inflating: res/events_by_pos_types_5.json  
  inflating: res/events_by_pos_types_13.json  
  inflating: res/events_by_pos_types_214.json  
   creating: models/
  inflating: models/edff_0.h5        
  inflating: models/edff_5.h5        
  inflating: models/edc1ff_50.h5     
  inflating: models/edc1ff_100.h5    
  inflating: models/edff_25.h5       


In [None]:
!gdown --id 1qIR2GKpBqB-sOmX0Q5j1EQ6NSugYMCsX --output bertimbau.zip

Downloading...
From: https://drive.google.com/uc?id=1qIR2GKpBqB-sOmX0Q5j1EQ6NSugYMCsX
To: /content/bertimbau.zip
1.21GB [00:08, 138MB/s]


In [None]:
!mv bertimbau.zip models/
!unzip models/bertimbau.zip -d models/
!rm models/bertimbau.zip

Archive:  models/bertimbau.zip
  inflating: models/BERTimbau/bert_model.ckpt.index  
  inflating: models/BERTimbau/bert_config.json  
  inflating: models/BERTimbau/vocab.txt  
  inflating: models/BERTimbau/bert_model.ckpt.meta  
  inflating: models/BERTimbau/bert_model.ckpt.data-00000-of-00001  


# Load FED code

In [None]:
!pip install tensorflow>=2.6.0
!pip install keras-bert>=0.88
!pip install numpy



## load functions

In [None]:
import sys
import os
import numpy as np
import re
import json
import glob

from keras_bert import load_vocabulary, load_trained_model_from_checkpoint, Tokenizer, get_checkpoint_paths
from keras_bert.datasets import get_pretrained, PretrainedList
import tensorflow as tf
from tensorflow.keras.models import load_model


# Directory holding the BERTimbau checkpoint files.
BERTIMBAU_MODEL_PATH = 'models/BERTimbau/'

# Key of the embedding variant requested from extract() by the detectors.
EMBEDDING_ID = 'sum_all_12'

# One entry per selectable model configuration: the trained model file and
# the JSON file describing the event types that model predicts.
RUN_CONFIGS = {
    '100': {
        'model': 'models/edc1ff_100.h5',
        'events-types': 'res/events_by_pos_types_5.json',
    },
    '50': {
        'model': 'models/edc1ff_50.h5',
        'events-types': 'res/events_by_pos_types_13.json',
    },
    '25': {
        'model': 'models/edff_25.h5',
        'events-types': 'res/events_by_pos_types_31.json',
    },
    '5': {
        'model': 'models/edff_5.h5',
        'events-types': 'res/events_by_pos_types_137.json',
    },
    '0': {
        'model': 'models/edff_0.h5',
        'events-types': 'res/events_by_pos_types_214.json',
    },
}

# Configuration key intended as the default selection.
DEFAULT_RUN_CONFIG = '0'



def tokenize_and_compose(text):
    """Tokenize *text* and glue "##" continuation pieces back onto their word.

    Uses the module-level ``tokenizer``. The first and last composed entries
    are dropped (presumably the tokenizer's boundary markers).

    Returns:
        A single string when exactly one word remains, otherwise the list of
        remaining words. ``get_sentence_original_tokens`` compares this
        result against a single token, so the string form is part of the
        contract.
    """
    words = []
    for piece in tokenizer.tokenize(text):
        parts = piece.split("##")
        if len(parts) == 1:
            # No "##" marker: this piece opens a new word.
            words.append(piece)
            continue
        # Continuation piece: append the text after the first "##" marker
        # to the word that is currently being built.
        words[-1] += parts[1]

    inner = words[1:-1]
    return inner[0] if len(inner) == 1 else inner


def compose_token_embeddings(sentence, tokenized_text, embeddings):
    """Merge word-piece embeddings into one embedding per word.

    Pieces containing "##" are treated as continuations of the preceding
    word; their embeddings are added to that word's embedding.

    Args:
        sentence: Unused; kept so existing callers keep working.
        tokenized_text: Word pieces, one per entry of ``embeddings``.
        embeddings: Sequence of per-piece embeddings (anything supporting
            ``+``, e.g. the rows of a NumPy array).

    Returns:
        A list with one summed embedding per word. Slots for which no
        embedding was supplied keep the placeholder value ``0``.

    The input ``embeddings`` is never modified: continuation pieces are
    combined with an out-of-place addition, because an in-place ``+=`` on a
    NumPy row view would silently overwrite the caller's array.
    """
    # Map every word piece to the index of the word it belongs to.
    word_of_piece = []
    word_index = -1
    for piece in tokenized_text:
        if '##' not in piece:
            word_index += 1
        word_of_piece.append(word_index)

    word_embeddings = [0] * len(set(word_of_piece))
    next_word = 0
    for i, embedding in enumerate(embeddings):
        if word_of_piece[i] == next_word:
            # First piece of a new word.
            word_embeddings[next_word] = embedding
            next_word += 1
        else:
            # Continuation piece: build a new sum instead of mutating the
            # object stored for the word (which may alias the input).
            word_embeddings[next_word - 1] = word_embeddings[next_word - 1] + embedding
    return word_embeddings

    

def extract(text, options=None, seq_len=512, output_layer_num=12):
    """Compute per-word BERT features for *text*.

    Uses the module-level ``tokenizer`` and ``model_bert``.

    Args:
        text: Sentence to embed.
        options: Mapping of feature name to a flag; only truthy entries are
            computed. Recognised names are ``'sum_all_12'``,
            ``'sum_last_4'``, ``'concat_last_4'`` and ``'last_hidden'``.
            Defaults to ``{'sum_all_12': True}``.
        seq_len: ``max_len`` passed to ``tokenizer.encode``.
        output_layer_num: Number of layer outputs per position in the model
            prediction; must match how ``model_bert`` was loaded.

    Returns:
        Dict mapping each requested feature name to the list produced by
        ``compose_token_embeddings``. Unrecognised names keep the truthy
        value they were passed with.
    """
    if options is None:
        # Built per call so the default is never a shared mutable object.
        options = {'sum_all_12': True}
    features = {k: v for (k, v) in options.items() if v}

    tokens = tokenizer.tokenize(text)
    indices, segments = tokenizer.encode(first=text, max_len=seq_len)
    predicts = model_bert.predict([np.array([indices]), np.array([segments])])[0]

    # Drop the first and last positions (presumably the boundary markers)
    # and expose the layer axis: (pieces, layers, 768).
    n_pieces = len(tokens) - 2
    predicts = predicts[1:len(tokens) - 1, :].reshape((n_pieces, output_layer_num, 768))
    pieces = tokens[1:-1]

    for k in features:
        if k == 'sum_all_12':
            features[k] = compose_token_embeddings(text, pieces, predicts.sum(axis=1))
        elif k == 'sum_last_4':
            features[k] = compose_token_embeddings(text, pieces, predicts[:, -4:, :].sum(axis=1))
        elif k == 'concat_last_4':
            features[k] = compose_token_embeddings(
                text, pieces, predicts[:, -4:, :].reshape((n_pieces, 768 * 4)))
        elif k == 'last_hidden':
            features[k] = compose_token_embeddings(
                text, pieces, predicts[:, -1:, :].reshape((n_pieces, 768)))
    return features



def get_sentence_original_tokens(sentence, tokens):
        """Cut `sentence` into the substrings that line up with `tokens`.

        The sentence is scanned one character at a time. A span opens at the
        first non-space character and is closed (and paired with the current
        token) as soon as one of the conditions commented below holds.

        Args:
            sentence: The original sentence text.
            tokens: Tokens to align against, indexed positionally. Note that
                `tokenize_and_compose` returns a plain string for one-word
                input; a string passed here is indexed character by
                character.

        Returns:
            The list of sentence substrings collected, in order.

        NOTE(review): `tokens[token_index]` is read on every iteration
        without a bounds check, so any character left after the last token
        has been consumed raises IndexError.
        """
        token_index = 0
        started = False
        sentence_pos_tokens = []
        i = 0
        while i < len(sentence):
                # Open a new span at the first non-space character.
                if sentence[i] != ' ' and not started:
                        start = i
                        started = True
                # Case 1: the current character alone equals the current token.
                if sentence[i] == tokens[token_index] and started:
                        sentence_pos_tokens.append(sentence[i])
                        started = False
                        token_index += 1
                # Case 2: a space is reached, or re-tokenizing the span read so
                # far yields exactly the current token. The span is stored
                # through index i inclusive, so a span closed by a space keeps
                # that space.
                elif i<len(sentence) and (sentence[i] == ' ' or tokenize_and_compose(sentence[start:i+1]) == tokens[token_index] ) and started:
                        sentence_pos_tokens.append(sentence[start:i+1])
                        start = i+1
                        started = False
                        token_index += 1
                i += 1
        return sentence_pos_tokens


def get_text_location(text, arg, start_search_at=0):
    """Locate *arg* inside *text*, ignoring case.

    Args:
        text: Text to search.
        arg: Literal token to look for. It is escaped before being compiled,
            so regex metacharacters such as ``(`` or ``.`` match themselves.
        start_search_at: Offset in *text* where the search begins.

    Returns:
        ``(start, end)`` offsets of the first match, or ``(-1, -1)`` when
        *arg* is empty or not found.

    A word boundary is required only on the sides where *arg* itself begins
    or ends with a word character; punctuation tokens could otherwise never
    match next to whitespace.
    """
    text = text.lower()
    arg = arg.lower()
    if not arg:
        return (-1, -1)

    leading = r'\b' if re.match(r'\w', arg[0]) else ''
    trailing = r'\b' if re.match(r'\w', arg[-1]) else ''
    pattern = re.compile(leading + re.escape(arg) + trailing)

    match = pattern.search(text, start_search_at)
    if match:
        return (match.start(), match.end())
    return (-1, -1)


            
def load_bertimbau_model():
    """Load the BERTimbau checkpoint and its tokenizer into module globals.

    Sets ``model_bert`` (loaded with ``seq_len=512`` and
    ``output_layer_num=12``) and ``tokenizer`` (built from the checkpoint's
    vocabulary file).
    """
    global tokenizer, model_bert

    checkpoint = get_checkpoint_paths(BERTIMBAU_MODEL_PATH)
    model_bert = load_trained_model_from_checkpoint(
        checkpoint.config,
        checkpoint.checkpoint,
        seq_len=512,
        output_layer_num=12,
    )
    tokenizer = Tokenizer(load_vocabulary(checkpoint.vocab))

def load_fed_model():
    """Load the FED model selected by the global ``model_config``.

    Sets the module globals ``events_types`` (via ``load_events_info``) and
    ``model`` (the Keras model file listed in ``RUN_CONFIGS``).

    Returns:
        The loaded model.
    """
    global model, events_types

    events_types = load_events_info()
    weights_path = RUN_CONFIGS[model_config]['model']
    model = load_model(weights_path)
    return model

def load_events_info():
    """Read the event-type table for the global ``model_config``.

    Returns:
        The parsed content of the JSON file listed under ``'events-types'``
        in ``RUN_CONFIGS[model_config]``.
    """
    events_path = RUN_CONFIGS[model_config]['events-types']
    # Explicit encoding: without it the file is decoded with the platform's
    # locale encoding, which is not UTF-8 everywhere.
    with open(events_path, 'r', encoding='utf-8') as read_content:
        return json.load(read_content)




def _text_tokens_for(text):
    """Return the substrings of *text* aligned with its composed tokens."""
    composed = tokenize_and_compose(text)
    if isinstance(composed, str):
        # One-word input comes back as a bare string; left unwrapped it would
        # be aligned character by character and the embedding reshape in the
        # detectors would fail.
        composed = [composed]
    return get_sentence_original_tokens(text, composed)


def _events_from_scores(text, text_tokens, scores):
    """Turn per-token class scores into the list of detected events.

    Args:
        text: The original sentence.
        text_tokens: Sentence substrings, one per scored token.
        scores: Indexable collection whose entry ``pos`` holds the class
            scores of token ``pos``; entries beyond ``len(text_tokens)``
            are ignored.

    Returns:
        A list of dicts with keys ``'text'``, ``'start'``, ``'end'`` and
        ``'event_type'``. A token whose class index is 0 is skipped
        (presumably the "no event" class). A trigger that cannot be located
        in *text* is reported with offsets ``-1`` and an empty ``'text'``.
    """
    positions = [pos for pos, token_scores in enumerate(scores[:len(text_tokens)])
                 if np.argmax(token_scores) > 0]
    output = []
    if not positions:
        return output

    # Lower bound for the offset of the first trigger: the summed length of
    # the tokens that precede it.
    start_at = sum(len(token) for token in text_tokens[:positions[0]])
    for pos in positions:
        loc_start, loc_end = get_text_location(text, text_tokens[pos], start_at)
        if loc_end >= 0:
            # Advance the cursor only on success; storing the -1 of a failed
            # lookup would restart the search at the beginning of the text.
            start_at = loc_end
        event_type = events_types[str(np.argmax(scores[pos]))]
        output.append({'text':  text[loc_start:loc_end],
                       'start': loc_start,
                       'end':   loc_end,
                       'event_type': event_type['name'] })
    return output


def detect_events_c1ff(text, feature_option):
    """Detect events with a model that scores a whole padded sentence.

    Args:
        text: Sentence to analyse.
        feature_option: Name of the embedding variant requested from
            ``extract``.

    Returns:
        The list of event dicts built by ``_events_from_scores``.

    NOTE(review): sentences with more than ``MAX_SEQUENCE_LENGTH`` tokens do
    not fit the padded input and raise when the embedding is copied in.
    """
    MAX_SEQUENCE_LENGTH = 150

    text_tokens = _text_tokens_for(text)
    features = extract(text, {feature_option:True})[feature_option]
    x_pred = np.zeros((1, MAX_SEQUENCE_LENGTH, 768))
    embedding = np.array(features).reshape((len(text_tokens), 768))
    x_pred[0,:embedding.shape[0]] = embedding
    prediction = model.predict(x_pred)
    return _events_from_scores(text, text_tokens, prediction[0])


def detect_events_ff(text, feature_option):
    """Detect events with a model that scores each token independently.

    Args:
        text: Sentence to analyse.
        feature_option: Name of the embedding variant requested from
            ``extract``.

    Returns:
        The list of event dicts built by ``_events_from_scores``.
    """
    text_tokens = _text_tokens_for(text)
    features = extract(text, {feature_option:True})[feature_option]
    embedding = np.array(features).reshape((len(text_tokens), 768))
    if embedding.shape[0] == 0:
        # Nothing to classify; avoid calling predict on an empty batch.
        return []
    # One batched call instead of one predict call per token.
    prediction = model.predict(embedding)
    return _events_from_scores(text, text_tokens, prediction)


def detect_events(text, feature_option):
    """Run the detector that matches the global ``model_config``.

    Configurations ``'100'`` and ``'50'`` use ``detect_events_c1ff``; every
    other configuration uses ``detect_events_ff``.
    """
    detector = detect_events_c1ff if model_config in ('100', '50') else detect_events_ff
    return detector(text, feature_option)
    
    
def detect_from_files(input_path, output_path):
    """Run event detection on every ``*.txt`` file under *input_path*.

    Each line of an input file is treated as one sentence. The detections of
    a file are written as a JSON list (one entry per line) to
    ``<output_path><name>.json``, where ``<name>`` is the input file name
    without its extension.

    Args:
        input_path: Prefix prepended to ``*.txt``; for a directory it must
            end with the path separator.
        output_path: Prefix prepended to the output file name; for a
            directory it must end with the path separator.
    """
    for filepathname in glob.glob(f'{input_path}*.txt'):
        extractions = []
        # The context manager closes the input file; the explicit encoding
        # keeps accented characters intact regardless of the locale.
        with open(filepathname, encoding='utf-8') as infile:
            for line in infile:
                line = line.strip()
                print(line)
                extractions.append(detect_events(line, EMBEDDING_ID))

        # basename/splitext look only at the final extension, so a ".txt"
        # elsewhere in the path cannot truncate the name.
        filename = os.path.splitext(os.path.basename(filepathname))[0]
        with open(f'{output_path}{filename}.json', 'w', encoding='utf-8') as outfile:
            json.dump(extractions, outfile)
        print(f'{filename}')


def detect_events_from(input_path, output_path):
    """Load the models and process every text file under *input_path*."""
    def process_files():
        return detect_from_files(input_path, output_path)

    run_detect_context(process_files)
        

def detect_events_from_sentence(sentence):
    """Load the models and print the events detected in *sentence*.

    Surrounding whitespace is stripped from the sentence first.
    """
    stripped = sentence.strip()

    def report():
        return print(detect_events(stripped, EMBEDDING_ID))

    run_detect_context(report)
        

def run_detect_context(run_detect_func):
    """Load both models and call *run_detect_func* on the chosen device.

    ``/GPU:0`` is used when TensorFlow lists at least one physical GPU,
    otherwise ``/cpu:0``.
    """
    has_gpu = len(tf.config.list_physical_devices('GPU')) > 0
    device_name = '/GPU:0' if has_gpu else '/cpu:0'
    with tf.device(device_name):
        load_bertimbau_model()
        load_fed_model()
        run_detect_func()


# RUN

## Detect Events From Sentence

In [None]:
#@title Input the sentence and select the model

sentence = 'A Petrobras aumentou o preço da gasolina para 2,30 reais, disse o presidente.' #@param {type:"string"}
model_config = '0' #@param ["0", "5", "25", "50", "100"]

# model_config is read as a global by load_fed_model, load_events_info and
# detect_events, so it has to be assigned before the call below.

print(sentence)
print(model_config)
detect_events_from_sentence(sentence)

## Extract Events From Files

In [None]:
# Optional: mount Google Drive under /content/drive so that files stored in
# Drive folders can be processed.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#@title ## Input and Output directory fields

#@markdown The text files in the input directory are expected to have the format:

#@markdown * all text files end with the extension .txt
#@markdown * sentences are separated by newlines


#@markdown ---
#@markdown ### Enter the directories paths:
input_dir = "/content/drive/MyDrive/input-files/" #@param {type:"string"}
output_dir = "/content/drive/MyDrive/output-files/" #@param {type:"string"}
model_config = '0' #@param ["0", "5", "25", "50", "100"]
#@markdown ---

# model_config is read as a global by load_fed_model, load_events_info and
# detect_events, so it has to be assigned before the call below.
# Keep the trailing "/" on both directory paths.
detect_events_from(input_dir, output_dir)