# Introduction to this colab

Welcome, this is a **stable** version that renders the necessary data for the relation extraction phase. Currently, this code extracts: **Document Name, Source Token ID, Source Type, Source Sentence ID, Source POS Tag, Target Token ID, Target Type, Target Sentence ID, Target POS Tag, Dependency Tags, and Relation Type from the structured data**. We did this for the original PET data as well as our own. The files you need to run this colab are "LESCHNEIDER_formatted_relations_combined.json" and "PETv1.1-relations.json".

In [None]:
# @title Installing Dependencies
# Installs spaCy, the medium English pipeline that is later loaded with
# spacy.load("en_core_web_md"), and NLTK (used for POS tagging).
# NOTE(review): none of the versions are pinned, so a fresh runtime may pull
# different releases than the ones this notebook was last run with.
!pip install spacy
!python -m spacy download en_core_web_md
!pip install nltk

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# @title importing libraries and setting file paths
import json
import pandas as pd
import numpy as np
from itertools import product
import spacy
import json
from itertools import product
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Tagger model needed by nltk's pos_tag, which create_df calls on every document
nltk.download('averaged_perceptron_tagger')

# Mount Google Drive so the corpus files below can be read from the Colab runtime
drive.mount('/content/drive')
# The paths
# NOTE: the first constant is spelled LESCHEIDER_PATH (without the "N") and is
# referenced under exactly that name by the cell that calls create_df.
LESCHEIDER_PATH = '/content/drive/MyDrive/THESIS/DATA/LESCHNEIDER DATA/Documents/FORMATTED_RELATIONS/LESCHNEIDER_formatted_relations_combined.json'
PET_PATH = '/content/drive/MyDrive/THESIS/DATA/PET/actual PET data from Patrizio Bellan/PETv1.1-relations.json'

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


These are all the support functions for the dependency tag extraction.

In [None]:
#@title Support Functions
def format_sentences(tokens, sentence_IDs):
    """Rebuild one text string per sentence from parallel token / sentence-ID lists.

    Args:
        tokens: Token strings in reading order. Any '##' inside a token is removed.
        sentence_IDs: Sentence identifier of each token (same length as tokens).

    Returns:
        Dict mapping each sentence ID to its reconstructed text. Tokens are
        separated by a single space, except that no space is inserted before
        the first token of a sentence or before a token listed in
        ``glue_tokens`` (punctuation, and a bare "s" as left by a split "'s").
    """
    # Tokens that attach directly to the preceding text
    glue_tokens = {"'", ",", ".", "s", ";", "?", "!", ":", "-"}

    sentences = {}
    for raw_token, sid in zip(tokens, sentence_IDs):
        piece = raw_token.replace('##', '')
        text_so_far = sentences.get(sid, '')
        # No separator at the very start of a sentence or before a glue token
        separator = '' if (text_so_far == '' or piece in glue_tokens) else ' '
        sentences[sid] = text_so_far + separator + piece

    return sentences

def get_entity_chunks(tokens, ner_tags, sentence_ids):
    """Group BIO-tagged tokens into entity chunks.

    Args:
        tokens: Token strings of one document, in reading order.
        ner_tags: One tag per token. A tag starting with 'B-' opens a chunk,
            a tag starting with 'I-' extends the currently open chunk, and
            'O' closes it. Any other tag is ignored.
        sentence_ids: Sentence identifier of each token.

    Returns:
        List of ``(chunk_text, start_index, sentence_id)`` tuples.
        ``chunk_text`` is the chunk's tokens joined by single spaces,
        ``start_index`` is the position of the chunk's first token counted
        from the first token of its sentence, and ``sentence_id`` is the
        sentence the chunk starts in.
    """
    chunks = []
    current_chunk = []
    start_index = None
    chunk_sentence_id = None  # Sentence in which the currently open chunk started
    last_sentence_id = None  # Sentence ID of the previous token
    sentence_start_index = 0  # Index where the current sentence starts in the tokens list

    for i, (token, tag, sentence_id) in enumerate(zip(tokens, ner_tags, sentence_ids)):
        if sentence_id != last_sentence_id:
            # A chunk never spans sentences: close any open chunk *before*
            # moving on, so it is recorded with the sentence it belongs to
            # rather than with the ID of the sentence that follows it.
            if current_chunk:
                chunks.append((' '.join(current_chunk), start_index, chunk_sentence_id))
                current_chunk = []
                start_index = None
            sentence_start_index = i
            last_sentence_id = sentence_id

        if tag.startswith('B-'):
            if current_chunk:
                # Two entities back to back: finish the previous one first
                chunks.append((' '.join(current_chunk), start_index, chunk_sentence_id))
            current_chunk = [token]
            start_index = i - sentence_start_index  # Position relative to the sentence start
            chunk_sentence_id = sentence_id

        elif tag.startswith('I-') and current_chunk:
            current_chunk.append(token)

        elif tag == 'O' and current_chunk:
            chunks.append((' '.join(current_chunk), start_index, chunk_sentence_id))
            current_chunk = []
            start_index = None

    if current_chunk:  # Document ended while a chunk was still open
        chunks.append((' '.join(current_chunk), start_index, chunk_sentence_id))

    return chunks

def get_root_of_chunk(nlp, chunk_text):
    """Parse ``chunk_text`` on its own and return its root token.

    Args:
        nlp: Callable that turns text into an iterable, indexable sequence of
            tokens exposing a ``head`` attribute (the spaCy pipeline here).
        chunk_text: Text of the entity chunk.

    Returns:
        The first token that is its own head or whose head is not part of the
        parsed chunk; the first token of the parse if no such token exists.
    """
    parsed = nlp(chunk_text)
    root_candidates = (
        tok for tok in parsed
        if tok.head == tok or tok.head not in parsed
    )
    root = next(root_candidates, None)
    if root is not None:
        return root
    return parsed[0]  # No clear root: fall back to the first token

def root_index_lookup(root, chunk, chunk_start_index, sentence, nlp):
    """Find the position of a chunk's root token inside the parsed sentence.

    Only the span the chunk occupies is searched, so that another occurrence
    of the same word elsewhere in the sentence is not picked up.

    Args:
        root: Token-like object whose ``text`` is looked for.
        chunk: Text of the chunk; it is re-parsed with ``nlp`` to count its tokens.
        chunk_start_index: Index of the chunk's first token in the sentence.
            NOTE(review): callers pass the index from the dataset tokenisation;
            this assumes it lines up with the ``nlp`` tokenisation of the sentence.
        sentence: Parsed sentence (indexable sequence of tokens with ``text``).
        nlp: Callable used to tokenise ``chunk``.

    Returns:
        Index of the first token in the chunk span whose text equals
        ``root.text``, or -1 if there is none.
    """
    # Calculate the number of tokens in the chunk
    chunk_length = len(nlp(chunk))

    # Exclusive upper bound of the search. It must cover the chunk's *last*
    # token too (a single-token chunk has exactly one position to inspect),
    # and is clamped so the search never indexes past the sentence.
    end_index = min(chunk_start_index + chunk_length, len(sentence))

    for i in range(chunk_start_index, end_index):
        if sentence[i].text == root.text:
            return i

    return -1  # Return -1 if the root word is not found within the bounds

def check_dependency_path(source_token, target_token):
    """Collect the dependency labels along the tree path between two tokens.

    Both tokens are followed upwards through ``head`` until a token that is
    its own head is reached. The first token on the target's upward chain
    that also lies on the source's chain is taken as the lowest common
    ancestor (LCA).

    Returns:
        The ``dep_`` labels from the source up to and including the LCA,
        followed by the labels from the LCA (included again) down to the
        target. An empty list if the two chains share no token.
    """
    def chain_to_root(token):
        # token, its head, the head's head, ... ending with the self-headed root
        chain = [token]
        while chain[-1].head != chain[-1]:
            chain.append(chain[-1].head)
        return chain

    upward_from_source = chain_to_root(source_token)
    upward_from_target = chain_to_root(target_token)

    on_source_chain = set(upward_from_source)
    lca = next((tok for tok in upward_from_target if tok in on_source_chain), None)
    if lca is None:
        return []

    ascent = upward_from_source[:upward_from_source.index(lca) + 1]
    descent = upward_from_target[:upward_from_target.index(lca) + 1]
    descent.reverse()
    return [tok.dep_ for tok in ascent + descent]

def analyze_chunk_dependency(sentence_text, source_phrase, target_phrase, nlp):
    """Return the dependency labels linking two entity chunks of one sentence.

    Chunks can be long and determiners are rarely informative, so each chunk
    is reduced to its root word first (e.g. "MPON" for "The MPON"); the path
    is then computed between the two root tokens in the parsed sentence.

    Args:
        sentence_text: Text of the sentence containing both chunks.
        source_phrase: ``(chunk_text, start_index)`` of the source chunk.
        target_phrase: ``(chunk_text, start_index)`` of the target chunk.
        nlp: Pipeline used to parse the sentence and the chunks.

    Returns:
        The distinct dependency labels on the path, in order of first
        appearance from source to target, or ``["/"]`` when either root
        cannot be located in the sentence or no path exists.
    """
    doc = nlp(sentence_text)

    source_root_token = get_root_of_chunk(nlp, source_phrase[0])
    target_root_token = get_root_of_chunk(nlp, target_phrase[0])

    def locate_in_sentence(root_token, phrase):
        # Skip the lookup (which re-parses the chunk) when the root's text
        # does not occur in the sentence at all; otherwise do it exactly once.
        if all(token.text != root_token.text for token in doc):
            return None
        index = root_index_lookup(root_token, phrase[0], phrase[1], doc, nlp)
        return doc[index] if index >= 0 else None

    source_token = locate_in_sentence(source_root_token, source_phrase)
    target_token = locate_in_sentence(target_root_token, target_phrase)

    if source_token is None or target_token is None:
        return ["/"]  # Return "/" indicating no tokens found

    deps = check_dependency_path(source_token, target_token)
    if not deps:
        return ["/"]  # Return "/" if no dependencies were found

    # De-duplicate while keeping path order. A plain set of strings iterates
    # in a hash-randomised order, which made the output differ between runs.
    return list(dict.fromkeys(deps))

def encode_dependency_path(dependency_path, label_encoder):
    """Translate dependency labels into the integer codes of ``label_encoder``.

    Args:
        dependency_path: Iterable of dependency label strings.
        label_encoder: Object with a ``transform(list_of_labels)`` method that
            raises ``ValueError`` for labels it does not know.

    Returns:
        ``-1`` (a bare int) when ``dependency_path`` is empty; otherwise a
        list with one code per label, where unknown labels are encoded as -1.
        Labels are lower-cased before being looked up.
    """
    if not dependency_path:
        return -1

    def code_for(tag):
        try:
            return label_encoder.transform([tag.lower()])[0]
        except ValueError:
            # Label the encoder was not fitted on
            return -1

    return [code_for(tag) for tag in dependency_path]

def initialize_label_encoder():
    """Create the LabelEncoder used to turn dependency labels into integers.

    The encoder is fitted on a fixed vocabulary of lower-case dependency
    labels plus the "/" placeholder used for "no dependency". The vocabulary
    is not exhaustive; labels outside it are handled by the caller.

    Returns:
        The fitted ``LabelEncoder``.
    """
    # Not at all exhaustive ('npadvmod' appeared twice in the earlier list;
    # fitting ignores duplicates, so it is listed once here)
    known_tags = [
        'acl', 'advcl', 'advmod', 'amod', 'appos', 'attr', 'aux', 'auxpass',
        'case', 'cc', 'ccomp', 'compound', 'conj', 'csubj', 'csubjpass',
        'dative', 'dep', 'det', 'discourse', 'dislocated', 'dobj', 'expl',
        'fixed', 'flat', 'goeswith', 'iobj', 'intj', 'list', 'mark', 'meta',
        'neg', 'nounmod', 'npmod', 'nsubj', 'nsubjpass', 'nummod', 'oprd',
        'parataxis', 'pcomp', 'pobj', 'poss', 'preconj', 'predet', 'prep',
        'prt', 'punct', 'quantmod', 'relcl', 'root', 'xcomp', 'npadvmod',
        'complm', 'infmod', 'partmod', 'hmod', 'hyph', 'num', 'number',
        'nmod', 'nn', 'possessive', 'rcmod',
        '/',
    ]

    # fit() returns the encoder itself
    return LabelEncoder().fit(known_tags)

def find_neighboring_tags(entity_idx, ner_tags, direction='prev'):
    """Return the nearest 'B-…' or 'O' tag before or after a position.

    Args:
        entity_idx: Index in ``ner_tags`` to start from (itself excluded).
        ner_tags: Sequence of tag strings.
        direction: 'prev' scans towards the start; any other value scans
            towards the end.

    Returns:
        The first tag encountered that is 'O' or starts with 'B-' (other tags
        such as 'I-…' are skipped), or the string 'NONE' if there is none.
    """
    total = len(ner_tags)
    if direction == 'prev':
        # Walk backwards; positions beyond the end of the list are skipped
        positions = range(min(entity_idx - 1, total - 1), -1, -1)
    else:
        # Walk forwards; negative positions are skipped
        positions = range(max(entity_idx + 1, 0), total)

    for pos in positions:
        if ner_tags[pos].startswith('B-') or ner_tags[pos] == 'O':
            return ner_tags[pos]
    return 'NONE'

In [None]:
#@title Main Function

def _find_entity_chunk(document_chunks, sentence_id, token_id, fallback_token):
    """Return ``(chunk_text, chunk_start)`` of the chunk covering a token.

    A chunk covers the token when it lies in ``sentence_id`` and ``token_id``
    falls inside ``[chunk_start, chunk_start + number_of_words)``. When no
    chunk matches, ``(fallback_token, token_id)`` is returned instead.
    """
    for chunk_text, chunk_start, chunk_sentence_id in document_chunks:
        if (chunk_start <= token_id
                and chunk_sentence_id == sentence_id
                and token_id < chunk_start + len(chunk_text.split())):
            return (chunk_text, chunk_start)
    return (fallback_token, token_id)


def create_df(path, nlp_in):
    """Build the entity-pair table for one annotated corpus file.

    The JSON file at ``path`` is read as a list of documents, each providing
    'tokens', 'ner_tags', 'sentence-IDs', 'tokens-IDs', 'document name' and
    'relations'. Every token tagged 'B-…' is treated as an entity, and one row
    is produced for every ordered pair of distinct entities in a document.
    Pairs in the same sentence get the encoded dependency labels between the
    two entity chunks; pairs in different sentences get the encoded "/"
    placeholder. Pairs without an annotated relation are labelled
    "no_relation".

    Side effects: prints progress and summary counts, and changes the global
    pandas display options ``display.max_rows`` and ``display.max_columns``.

    Args:
        path: Path of the JSON corpus file.
        nlp_in: Loaded spaCy pipeline used for dependency parsing.

    Returns:
        ``pandas.DataFrame`` with one row per entity pair.
    """
    with open(path, 'r') as file:
        data = json.load(file)

    nlp = nlp_in
    #nlp = spacy.load("en_core_web_trf")  # takes forever, performs comparibly to en_core_web_md
    label_encoder = initialize_label_encoder() #To translate the paths into numbers for easier processing

    # Encoding of the "/" placeholder, used below to count rows that carry a real dependency path
    no_dependency_encoding = encode_dependency_path(["/"], label_encoder)

    transformed_data = []
    total_documents = len(data)

    for doc_number, document in enumerate(data, start=1):

        # progress monitoring (enumerate instead of data.index(document), which
        # rescans the list with deep dict comparisons on every iteration)
        print(f"\rProcessing progess {round(doc_number/total_documents*100,2)}%...", end= "  ", flush = True)  # Add logging to monitor progress due to slow processing

        tokens = document['tokens']
        ner_tags = document['ner_tags']
        sentence_ids = document['sentence-IDs']
        doc_name = document['document name']
        sentences = format_sentences(tokens, sentence_ids)
        # Generate POS tags for the tokens
        pos_tags = pos_tag(tokens)

        document_chunks = get_entity_chunks(tokens, ner_tags, sentence_ids)

        # One entity per 'B-' tagged token, keyed by (sentence_id, token_id)
        entities = {
            (sentence_id, token_id): {
                'token': tokens[idx],
                'type': ner_tags[idx],
                'sentence_id': sentence_id,
                'pos_tag': pos_tags[idx][1],
                'token_id': token_id,
                'index': idx
            }
            for idx, (token_id, sentence_id) in enumerate(zip(document['tokens-IDs'], sentence_ids))
            if ner_tags[idx].startswith('B-')
        }

        # Full chunk of every entity as (text, start index relative to the sentence).
        # Resolved once per entity instead of once per pair.
        entity_chunks = {
            key: _find_entity_chunk(document_chunks, key[0], key[1], entity['token'])
            for key, entity in entities.items()
        }

        relations_dict = {
            (doc_name, rel['source-head-sentence-ID'], rel['source-head-word-ID'], rel['target-head-sentence-ID'],
             rel['target-head-word-ID']): rel['relation-type']
            for rel in document['relations']
        }

        # Generate all ordered combinations of entities
        for (src_key, source), (tgt_key, target) in product(entities.items(), repeat=2):
            if src_key == tgt_key:  # Explicitly prevent self-comparison
                continue

            src_sentence_id, src_token_id = src_key
            tgt_sentence_id, tgt_token_id = tgt_key

            if src_sentence_id == tgt_sentence_id:
                # Dependencies are only meaningful within one sentence
                results = analyze_chunk_dependency(sentences[src_sentence_id], entity_chunks[src_key],
                                                   entity_chunks[tgt_key], nlp)
            else:
                results = ["/"]
            results = encode_dependency_path(results, label_encoder)

            relation_key = (doc_name, src_sentence_id, src_token_id, tgt_sentence_id, tgt_token_id)
            relation_type = relations_dict.get(relation_key, "no_relation")

            # Get neighboring B-tags or 'O' for source and target
            src_prev_tag = find_neighboring_tags(source['index'], ner_tags, 'prev')
            src_next_tag = find_neighboring_tags(source['index'], ner_tags, 'next')
            tgt_prev_tag = find_neighboring_tags(target['index'], ner_tags, 'prev')
            tgt_next_tag = find_neighboring_tags(target['index'], ner_tags, 'next')

            row = {
                'document_name': doc_name,
                'source_token': source['token'],
                'source_type': source['type'],
                'source_pos_tag': source['pos_tag'],  # Include source POS tag
                'source_sentence_ID': src_sentence_id,
                'source_token_ID': src_token_id,
                'source_prev_tag': src_prev_tag,
                'source_next_tag': src_next_tag,
                'target_token': target['token'],
                'target_type': target['type'],
                'target_pos_tag': target['pos_tag'],  # Include target POS tag
                'target_sentence_ID': tgt_sentence_id,
                'target_token_ID': tgt_token_id,
                'target_prev_tag': tgt_prev_tag,
                'target_next_tag': tgt_next_tag,
                'token_distance': abs(src_token_id - tgt_token_id),
                'sentence_distance': abs(src_sentence_id - tgt_sentence_id),
                'dependency_tags': results,
                'relation_type': relation_type
            }
            transformed_data.append(row)

    df_relations = pd.DataFrame(transformed_data)
    print("\n---------------------------------")
    print("Processing Done!")
    print("---------------------------------\n")
    filtered_df = df_relations[['source_token', 'target_token', 'dependency_tags', 'relation_type']]
    # Keep only rows whose dependency tags differ from the encoded "/" placeholder
    filtered_df = filtered_df[filtered_df['dependency_tags'].apply(lambda x: x != no_dependency_encoding)]
    filtered_df_again = filtered_df[filtered_df['relation_type'] != "no_relation"]
    df_true_relations = df_relations[df_relations['relation_type'] != "no_relation"]

    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 5)# Example: 500 rows

    print(f"total number of elements in dataframe: {len(df_relations)}")
    print(f"total number of non empty dependency rows in dataframe: {len(filtered_df)}")
    print(f"total number of non empty dependency rows WITH RELATION in dataframe: {len(filtered_df_again)}")
    print(f"total number of non empty relation rows in dataframe: {len(df_true_relations)}")
    print("\n")

    return df_relations


In [None]:
# Create a DataFrame
# The spaCy pipeline is loaded once and shared by both corpus runs below.
nlp = spacy.load("en_core_web_md")

#------------
#------------
# LESCHNEIDER
# Build the pair table for the LESCHNEIDER corpus, print a quick sanity check
# (distinct relation labels and the first rows), then write it to Drive as CSV.
df_relations1 = create_df(LESCHEIDER_PATH, nlp)
print("----------TESTING----------")
labels = df_relations1.relation_type
print(f"These are the unique labels \n{np.unique(labels)}\n")
print(f"These are the heads of the table \n{df_relations1.head(5)}")
print("----------TESTING----------")
df_relations1.to_csv('/content/drive/MyDrive/THESIS/CODING/NAM_TESTING/TESTING DATA/RE_TRAINING_DATA/total_relation_entity_pairs_DEPENDENCY_CONTEXT_md_LESCHNEIDER.csv', index=False)  #rename to liking

#------------
#------------
#PET
# Same extraction for the PET corpus; written straight to CSV without the sanity prints.
df_relations2 = create_df(PET_PATH, nlp)
df_relations2.to_csv('/content/drive/MyDrive/THESIS/CODING/NAM_TESTING/TESTING DATA/RE_TRAINING_DATA/total_relation_entity_pairs_DEPENDENCY_CONTEXT_md_PET1.1.csv', index=False)  #rename to liking




Processing progess 100.0%...  
---------------------------------
Processing Done!
---------------------------------

total number of elements in dataframe: 11054
total number of non empty dependency rows in dataframe: 54
total number of non empty dependency rows WITH RELATION in dataframe: 6
total number of non empty relation rows in dataframe: 425


----------TESTING----------
These are the unique labels 
['actor performer' 'actor recipient' 'flow' 'further specification'
 'no_relation' 'same gateway' 'uses']

These are the heads of the table 
               document_name source_token  ... dependency_tags relation_type
0  doc-20.1 - order shipping          the  ...             [0]   no_relation
1  doc-20.1 - order shipping          the  ...             [0]   no_relation
2  doc-20.1 - order shipping          the  ...             [0]   no_relation
3  doc-20.1 - order shipping          the  ...             [0]   no_relation
4  doc-20.1 - order shipping          the  ...             [0]  