# Create Dataset for relation classification

In [1]:
import sys
# sys.path.insert(1, 'C:/Users/fschr/Desktop/Masterarbeit/master-thesis-software-ie/experiments')
sys.path.append("..")
import helper

import pandas as pd
import numpy as np
import json
import gc

from sklearn.model_selection import train_test_split

import re

  from .autonotebook import tqdm as notebook_tqdm


### Read the datasets

In [28]:
# Which split to preprocess: "train" or "test". The notebook is run once per split.
# The test run later reads the allowed subject/object pairs that the train run saved,
# so process "train" first.
split = "test"
if split not in ("train", "test"):
    raise ValueError(f"split must be 'train' or 'test', got {split!r}")

# open the data of sentences (one sentence per line; tokens are split on whitespace later)
with open(f'../../data/subtask3/subtask3_{split}.data.txt', 'r', encoding='utf-8') as file: #../master-thesis-software-ie/data/subtask3/
    sentences = file.readlines()

# open the entity labels (one line of tags per sentence)
with open(f'../../data/subtask3/subtask3_{split}.info.txt', 'r', encoding='utf-8') as file:
    entity_labels = file.readlines()

# open the data of relations (one line per sentence)
with open(f'../../data/subtask3/subtask3_{split}.labels.txt', 'r', encoding='utf-8') as file:
    relations_info = file.readlines()
if split == "test":
    # NOTE(review): presumably the test labels separate fields with a single ';' while
    # helper.get_relations expects ';;' -- confirm against the helper.
    relations_info = [r.replace(";", ";;") for r in relations_info]

# The three files are combined line by line with zip(), which silently stops at the
# shortest input. Fail loudly instead of building a misaligned dataset.
if not (len(sentences) == len(entity_labels) == len(relations_info)):
    raise ValueError(
        "Input files are not aligned: "
        f"{len(sentences)} sentences, {len(entity_labels)} entity label lines, "
        f"{len(relations_info)} relation lines"
    )

### Relation_label to ID and reverse

In [50]:
# Relation types in label-ID order; index 0 ('nil') marks "no relation".
# The inverse-direction variants are currently disabled:
#'Developer_of-1', 'Abbreviation_of-1', 'URL_of-1', 'Citation_of-1', 'Release_of-1', 'Version_of-1',
#'Specification_of-1', 'Extension_of-1', 'PlugIn_of-1', 'AlternativeName_of-1', 'License_of-1'
relation_labels = [
    'nil',
    'Developer_of',
    'Abbreviation_of',
    'URL_of',
    'Citation_of',
    'Release_of',
    'Version_of',
    'Specification_of',
    'Extension_of',
    'PlugIn_of',
    'AlternativeName_of',
    'License_of',
]

# Forward (label -> ID) and reverse (ID -> label) lookups over the same ordering.
relation_label_to_ID = dict(zip(relation_labels, range(len(relation_labels))))
ID_to_relation_label = dict(enumerate(relation_labels))

# Persist the forward mapping next to the data.
with open("../../data/subtask3/label_to_id.json", "w") as label_file:
    json.dump(relation_label_to_ID, label_file)

relation_label_to_ID

{'nil': 0,
 'Developer_of': 1,
 'Abbreviation_of': 2,
 'URL_of': 3,
 'Citation_of': 4,
 'Release_of': 5,
 'Version_of': 6,
 'Specification_of': 7,
 'Extension_of': 8,
 'PlugIn_of': 9,
 'AlternativeName_of': 10,
 'License_of': 11}

# Combine all info for each sentence

In [32]:
from pprint import pprint
# Build one record per sentence holding its text, its entities (keyed by the entity's
# 'begin' value) and its parsed relations. The three inputs are consumed in parallel.
sentence_entities_relations = []
for raw_sentence, raw_tags, raw_relations in zip(sentences, entity_labels, relations_info):
    found_entities = helper.get_entities(raw_sentence.split(), raw_tags.split())
    sentence_entities_relations.append({
        "sentence": raw_sentence.strip(),
        "entities": {entity['begin']: entity for entity in found_entities},
        "relations": helper.get_relations(raw_relations),
    })

### Allowed subject, object

In [33]:
from itertools import chain
from collections import Counter

# Gather what helper.sentence_allowed_subj_obj yields for every sentence (presumably
# (subject label, object label) pairs of the annotated relations -- confirm in helper),
# count the occurrences, and keep the distinct values as the set of allowed pairs.
allowed_subj_obj_all = list(chain.from_iterable(
    helper.sentence_allowed_subj_obj(sent, allow_inverse=False)
    for sent in sentence_entities_relations
))
allowed_subj_obj_counts = Counter(allowed_subj_obj_all)
allowed_subj_obj = set(allowed_subj_obj_counts)
len(allowed_subj_obj)

In [34]:
# The train run stores its allowed pairs; the test run loads them, so both splits use
# the same pair set. NOTE: despite the "_test" suffix, this file is written by the
# train run and read by the test run.
allowed_pairs_path = "data_allowed_sub_obj_test.json"
if split == "train":
    with open(allowed_pairs_path, "w") as pairs_file:
        json.dump(list(allowed_subj_obj), pairs_file)
elif split == "test":
    with open(allowed_pairs_path, "r") as pairs_file:
        # JSON has no tuples: turn each loaded list back into a hashable tuple.
        allowed_subj_obj = set(map(tuple, json.load(pairs_file)))
len(allowed_subj_obj)

40

In [35]:
import random
from itertools import combinations

## Idea: transform sentence based
def sent_to_relation_representations(sent, allowed_subj_obj):
    """Yield one relation candidate per allowed, ordered entity pair of a sentence.

    Every unordered pair of entities in ``sent['entities']`` is visited in both
    directions. A direction is kept only if its (subject label, object label) pair
    is contained in ``allowed_subj_obj``. The gold relation type is looked up by the
    (subject begin, object begin) key built from ``sent['relations']``; pairs
    without a gold relation get the type ``"nil"``.

    Args:
        sent: Sentence record with the keys ``'entities'`` (mapping whose values are
            entity dicts providing at least ``'begin'`` and ``'label'``) and
            ``'relations'`` (iterable of dicts with ``'subject'``, ``'object'`` and
            ``'relation_type'``).
        allowed_subj_obj: Container of allowed (subject label, object label) pairs.

    Yields:
        dict with ``context`` and ``relation`` (as returned by
        ``helper.build_relation_reprentation``), ``label`` (ID of ``relation`` in
        ``relation_label_to_ID``, ``None`` if unknown), ``subject``, ``object``, and
        every key of both entity dicts prefixed with ``subject_`` / ``object_``.
    """
    gold_relations = {
        (rel['subject'], rel['object']): rel['relation_type']
        for rel in sent['relations']
    }
    for first, second in combinations(sent['entities'].values(), r=2):
        # Each unordered pair yields two directed candidates.
        for subj, obj in ((first, second), (second, first)):
            relation_type = gold_relations.get((subj['begin'], obj['begin']), "nil")
            if (subj['label'], obj['label']) not in allowed_subj_obj:
                continue
            context, relation = helper.build_relation_reprentation(
                sent, subj=subj, obj=obj, rel=relation_type)
            candidate = {
                "context": context,
                "relation": relation,
                "label": relation_label_to_ID.get(relation),
                "subject": subj,
                "object": obj,
            }
            # Flatten both entity dicts into prefixed top-level keys.
            candidate.update({f"subject_{key}": value for key, value in subj.items()})
            candidate.update({f"object_{key}": value for key, value in obj.items()})
            yield candidate

In [36]:
# Flatten the corpus: tag every sentence record with its position as "sentence_id" and
# merge the sentence-level fields into each relation candidate generated from it.
relation_corpus = []
for sentence_id, sentence_info in enumerate(sentence_entities_relations):
    sentence_info["sentence_id"] = sentence_id
    relation_corpus.extend(
        {**candidate, **sentence_info}
        for candidate in sent_to_relation_representations(sentence_info, allowed_subj_obj)
    )
len(sentence_entities_relations), len(relation_corpus)

(131, 960)

### Create a dataframe of the dataset

In [37]:
# One row per relation candidate; the dict keys of each record become the columns.
infos_dataset_df = pd.DataFrame(relation_corpus)

In [38]:
# Sanity check: list the distinct relation types that occur in this split.
infos_dataset_df['relation'].unique()

array(['License_of', 'URL_of', 'nil', 'Version_of', 'Developer_of',
       'Citation_of', 'Abbreviation_of', 'AlternativeName_of',
       'Extension_of', 'PlugIn_of', 'Specification_of', 'Release_of'],
      dtype=object)

# Add Additional Features

In [39]:
# Pull 'begin' and 'label' out of the nested subject/object entity dicts into flat columns.
for field in ("begin", "label"):
    for role in ("subject", "object"):
        infos_dataset_df[f"{role}_{field}"] = infos_dataset_df[role].apply(
            lambda entity, field=field: entity[field])

# True when the subject starts before the object in the sentence.
infos_dataset_df["left_to_right"] = (
    infos_dataset_df["subject_begin"] < infos_dataset_df["object_begin"]
)
def get_entity_distance(relation):
    """Return the subject/object distance of one relation candidate.

    Args:
        relation: Row-like object exposing ``left_to_right``, ``subject_end`` and
            ``object_begin`` as attributes.

    Returns:
        ``object_begin - subject_end`` when ``left_to_right`` is truthy, otherwise
        ``subject_end - object_begin``.
    """
    # NOTE(review): the right-to-left branch measures from the object's begin to the
    # subject's end, i.e. it is not the mirror image of the left-to-right gap. The
    # hard-coded train maxima used later were presumably computed with this exact
    # definition -- confirm before changing it.
    if not relation.left_to_right:
        return relation.subject_end - relation.object_begin
    return relation.object_begin - relation.subject_end
# Row-wise application (axis=1): every row is handed to get_entity_distance, which reads
# its left_to_right, subject_end and object_begin fields.
infos_dataset_df["subject_object_distance"] = infos_dataset_df.apply(get_entity_distance, axis=1)

## Add info for binary decision (subject_labels have only one possible relation_type)

In [40]:
# Each subject label is mapped to the one relation type it is expected to take part in.
# Author's note: there is only one exception in the whole dataset (License, Specification_of).
possible_relation_types_for_subjects = {
    'License': 'License_of',
    'URL': 'URL_of',
    'Version': 'Version_of',
    'Developer': 'Developer_of',
    'Citation': 'Citation_of',
    'Abbreviation': 'Abbreviation_of',
    'AlternativeName': 'AlternativeName_of',
    'Extension': 'Extension_of',
    'PlugIn': 'PlugIn_of',
    'Application': 'Specification_of',
    'Release': 'Release_of',
    'ProgrammingEnvironment': 'Specification_of',
}

# 1 if the candidate carries any relation, 0 for 'nil'.
infos_dataset_df["label_binary"] = infos_dataset_df["relation"].ne("nil").astype(int)
# Relation type the binary decision refers to; an unmapped subject label raises KeyError.
infos_dataset_df["target_label_binary"] = infos_dataset_df["subject_label"].apply(
    lambda subject_label: possible_relation_types_for_subjects[subject_label])

### Maximum observed subject-object distance per relation type (based on the train split)

In [41]:
# Largest subject/object distance per relation type (hard-coded; per the section
# heading these values come from the train split).
max_distance_train = {
    'Abbreviation_of': 11,
    'AlternativeName_of': 23,
    'Citation_of': 33,
    'Developer_of': 29,
    'Extension_of': 13,
    'License_of': 51,
    'PlugIn_of': 31,
    'Release_of': 16,
    'Specification_of': 18,
    'URL_of': 46,
    'Version_of': 32,
}

# Attach the threshold that belongs to each candidate's target relation type.
infos_dataset_df["max_distance_train"] = infos_dataset_df["target_label_binary"].map(
    lambda relation_type: max_distance_train[relation_type])

### Check how many False Negatives using max distance

In [42]:
# Candidates lying beyond the train maximum, counted per (gold relation, target type).
# Any row whose relation is not 'nil' would be a false negative of a distance filter.
f = infos_dataset_df["subject_object_distance"] > infos_dataset_df["max_distance_train"]
infos_dataset_df.loc[f, ["relation", "target_label_binary"]].value_counts().sort_index()

relation  target_label_binary
nil       Abbreviation_of         4
          Citation_of            17
          Developer_of            2
          Extension_of            1
          PlugIn_of               2
          Release_of              1
          Specification_of       71
          Version_of             17
Name: count, dtype: int64

## Improbable relation direction

In [49]:
# Subject labels for which the train split contains at most 2 positive samples in
# left-to-right direction ("Extension", "PlugIn" and "Developer" are deliberately excluded).
unprobable_left_to_right = {
    'URL',
    'AlternativeName',
    'Citation',
    'Abbreviation',
    'ProgrammingEnvironment',
    'Application',
    'License',
    'Version',
    'Release',
}

# Flag candidates whose subject label is in the set above AND that run left to right.
infos_dataset_df["unprobable_relation_direction"] = (
    infos_dataset_df["subject_label"].isin(unprobable_left_to_right)
    & infos_dataset_df["left_to_right"]
)

## Number of alternative candidates for each candidate

In [44]:
# Plausibility filter: within the train-derived maximum distance and not in an improbable
# direction. The comparison is inclusive (<=): max_distance_train is a distance that
# actually occurs for positive train samples, so a strict "<" would discard them.
f = infos_dataset_df.subject_object_distance <= infos_dataset_df.max_distance_train
f &= ~infos_dataset_df.unprobable_relation_direction

group_keys = ["subject_begin", "sentence_id"]
count_col = "n_alternative_candidates_by_subject"
count_col_filtered = "n_alternative_candidates_by_subject_filtered"

# Drop results of an earlier run so the cell can be re-executed; joining onto existing
# columns of the same name would raise an overlap error.
infos_dataset_df = infos_dataset_df.drop(columns=[count_col, count_col_filtered], errors="ignore")

# Number of candidates that share subject, sentence and target relation type,
# once over all candidates and once over the plausible ones only.
n_alt_cands = infos_dataset_df.groupby(group_keys).target_label_binary.value_counts().rename(count_col)
n_alt_cands_w_filtering = infos_dataset_df[f].groupby(group_keys).target_label_binary.value_counts().rename(count_col_filtered)

infos_dataset_df = (
    infos_dataset_df
    .set_index(group_keys + ["target_label_binary"])
    .join(n_alt_cands)
    .join(n_alt_cands_w_filtering)
    .reset_index()
)
# Groups without any plausible candidate have no filtered count: treat as 0.
infos_dataset_df[count_col_filtered] = infos_dataset_df[count_col_filtered].fillna(0).astype(int)

## Rank of each candidate by subject-object distance

In [45]:
def add_distance_order(relations):
    """Rank the candidates of one group by ascending subject/object distance.

    Args:
        relations: DataFrame with the columns ``object_begin`` and
            ``subject_object_distance``. It is not modified.

    Returns:
        Series named ``rank_by_candidate_distance``, indexed by ``object_begin``,
        holding 1-based ranks in order of increasing ``subject_object_distance``.
    """
    ordered = relations.sort_values("subject_object_distance")
    ranked = ordered.assign(
        rank_by_candidate_distance=list(range(1, len(ordered) + 1)))
    return ranked.set_index("object_begin")["rank_by_candidate_distance"]
# Plausibility filter: within the train-derived maximum distance and not in an improbable
# direction. The comparison is inclusive (<=): max_distance_train is a distance that
# actually occurs for positive train samples, so a strict "<" would discard them.
f = infos_dataset_df.subject_object_distance <= infos_dataset_df.max_distance_train
f &= ~infos_dataset_df.unprobable_relation_direction
cols = ["object_begin", "subject_object_distance"]
# Rank the candidates of every (sentence, subject) group, once over all candidates and
# once over the plausible ones only.
rank_by_candidate_distance = infos_dataset_df.groupby(["sentence_id", "subject_begin"])[cols].apply(add_distance_order)
rank_by_candidate_distance_filtered = infos_dataset_df[f].groupby(["sentence_id", "subject_begin"])[cols].apply(add_distance_order).rename("rank_by_candidate_distance_filtered")
# Candidates removed by the filter have no filtered rank: mark them with -1.
rank_by_candidate_distance = rank_by_candidate_distance.to_frame().join(rank_by_candidate_distance_filtered).fillna(-1).astype(int)
cols_index = ["sentence_id", "subject_begin", "object_begin"]
# Remove columns left over from an earlier run so the join below does not collide.
for col in ["rank_by_candidate_distance", "rank_by_candidate_distance_filtered"]:
    if col in infos_dataset_df:
        del infos_dataset_df[col]
infos_dataset_df = infos_dataset_df.set_index(cols_index).join(rank_by_candidate_distance).reset_index()

In [46]:
# Distribution of the filtered ranks; -1 marks candidates removed by the filter.
infos_dataset_df.rank_by_candidate_distance_filtered.value_counts().sort_index()

rank_by_candidate_distance_filtered
-1    328
 1    377
 2    155
 3     59
 4     26
 5     10
 6      4
 7      1
Name: count, dtype: int64

## Save test split (if loaded)

In [47]:
# The test split is written as a whole (one JSON record per relation candidate).
if split == "test":
    infos_dataset_df.to_json("../../data/subtask3/test_prepro.json", orient="records")

# Split into train and validation sets

In [25]:
#from sklearn.model_selection import train_test_split
#train, validation = train_test_split(infos_dataset_df, test_size=0.25, random_state=42)
from sklearn.model_selection import GroupShuffleSplit
# Group-aware split: all candidates of one sentence (same sentence_id) end up on the
# same side, so no sentence is shared between train and validation.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=45)
train_index, valid_index = next(
    splitter.split(infos_dataset_df, groups=infos_dataset_df["sentence_id"])
)
# Copy the positional selections so the splits are independent frames.
train_split, valid_split = (
    infos_dataset_df.iloc[positions].copy()
    for positions in (train_index, valid_index)
)
len(train_split), len(valid_split), len(valid_split) / len(infos_dataset_df)

(4450, 1446, 0.24525101763907733)

In [26]:
# Leakage check: sentence ids present in both splits (expected to be the empty set).
set(train_split["sentence_id"]).intersection(valid_split["sentence_id"])

set()

In [48]:
# Only the train run writes the train/validation files (one JSON record per candidate).
if split == "train":
    train_split.to_json("../../data/subtask3/train_prepro.json", orient="records")
    valid_split.to_json("../../data/subtask3/valid_prepro.json", orient="records")