In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

In [2]:
import pandas as pd
import numpy as np
from torch import cuda
import sys


from transformers import DistilBertTokenizer, DataCollatorWithPadding, TextClassificationPipeline
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

sys.path.append("..")
import helper

  from pandas.core import (


In [3]:
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cpu'

## Dataset

### Read the datasets

In [4]:
# open the data of sentences
with open('../../data/subtask3/subtask3_train.data.txt', 'r', encoding='utf-8') as file: 
    sentences = file.readlines()

# open the entity labels
with open('../../data/subtask3/subtask3_train.info.txt', 'r', encoding='utf-8') as file:
    entity_labels = file.readlines()

# open the data of relations
with open('../../data/subtask3/subtask3_train.labels.txt', 'r', encoding='utf-8') as file:
    relations_info = file.readlines()

### Unique entity labels

In [5]:
# Use a set comprehension to collect unique labels
unique_entity_labels = list({label for line in entity_labels for label in line.split()})
# unique_entity_labels

### Unique relation labels

In [6]:
unique_relation_labels = set()

for line in relations_info:
    relations_info_list = line.replace(';;', ' ').split()
    unique_relation_labels.update(relation for relation in relations_info_list if not relation.isdigit())

unique_relation_labels = list(unique_relation_labels)

unique_relation_labels

['URL_of',
 'Extension_of',
 'License_of',
 'Developer_of',
 'Version_of',
 'Specification_of',
 'Citation_of',
 'PlugIn_of',
 'Release_of',
 'Abbreviation_of',
 'AlternativeName_of']

### Relation_label to ID and Updside down

In [7]:
relation_labels = ['nil', 'Developer_of', 'Abbreviation_of', 'URL_of', 'Citation_of', 'Release_of', 'Version_of',
                   'Specification_of', 'Extension_of', 'PlugIn_of', 'AlternativeName_of', 'License_of',
#                    'Developer_of-1', 'Abbreviation_of-1', 'URL_of-1', 'Citation_of-1', 'Release_of-1', 'Version_of-1',
#                    'Specification_of-1', 'Extension_of-1', 'PlugIn_of-1', 'AlternativeName_of-1', 'License_of-1'
                  ]

relation_label_to_ID = {relation_label: ID for ID, relation_label in enumerate(relation_labels)}
ID_to_relation_label = {ID: relation_label for ID, relation_label in enumerate(relation_labels)}
relation_label_to_ID

{'nil': 0,
 'Developer_of': 1,
 'Abbreviation_of': 2,
 'URL_of': 3,
 'Citation_of': 4,
 'Release_of': 5,
 'Version_of': 6,
 'Specification_of': 7,
 'Extension_of': 8,
 'PlugIn_of': 9,
 'AlternativeName_of': 10,
 'License_of': 11}

# Combine all info for each sentence

In [8]:
from pprint import pprint
sentence_entities_relations = []
for idx, (sentence, entity_label_list, relation_info) in enumerate(tuple(zip(sentences, entity_labels, relations_info))):
    token = sentence.split()
    entity_bio_tags = entity_label_list.split()
    ents = helper.get_entities(token, entity_bio_tags)
    ent_dict = {e['begin']:e for e in ents}
    relations = helper.get_relations(relation_info)
    sentence_info = dict(
        sentence=sentence.strip(),
        entities=ent_dict,
        relations=relations
    )
    sentence_entities_relations.append(sentence_info)

### Allowed subject, object

In [9]:
from itertools import chain
from collections import Counter

allowed_subj_obj_all = list(chain(*[helper.sentence_allowed_subj_obj(sent, allow_inverse=False) for sent in sentence_entities_relations]))
allowed_subj_obj_counts = Counter(allowed_subj_obj_all)
allowed_subj_obj = set(allowed_subj_obj_counts.keys())
len(allowed_subj_obj)

40

# Create instances for each candidate relation

In [10]:
import random
from itertools import combinations

## Idea: transform sentence based
def sent_to_relation_representations(sent, allowed_subj_obj=None):
    rel_dict = {(r['subject'], r['object']):r['relation_type'] for r in sent['relations']}
#     rel_dict |= {(r['object'], r['subject']):f"{r['relation_type']}-1" for r in sent['relations']}
    for ent_one, ent_two in combinations(sent['entities'].values(), r=2):
        for e_one, e_two in [ent_one, ent_two], [ent_two, ent_one]:
            combination_key = e_one['begin'], e_two['begin']
            relation_type = rel_dict.get(combination_key, "nil")
            subj_obj = e_one['label'], e_two['label']
            if subj_obj in allowed_subj_obj:
                sample = helper.build_relation_reprentation(sent, subj=e_one, obj=e_two, rel=relation_type)
                yield sample
    #return relation_representations
sent = random.choice(sentence_entities_relations)
relation_corpus = []
for idx, sent in enumerate(sentence_entities_relations):
    sent_samples = list(sent_to_relation_representations(sent, allowed_subj_obj))
#     sent_samples = list(sent_to_relation_representations(sent))
    for sent_sample, target in sent_samples:
        relation_corpus.append((idx, sent_sample, target))
len(sentence_entities_relations), len(relation_corpus)

(1091, 5896)

### Create a dataframe of the dataset

In [11]:
sent_idx, X, y = zip(*relation_corpus)

infos_dataset_df = pd.DataFrame(list(zip(sent_idx, X, y)), columns=['sentence_IDs', 'contexts', 'relations'])

In [12]:
infos_dataset_df['all_infos'] = infos_dataset_df.sentence_IDs.apply(lambda sent_idx: sentence_entities_relations[sent_idx])

In [13]:
infos_dataset_df.shape, len(sentence_entities_relations)

((5896, 4), 1091)

#### Map each relation to label of it

In [14]:
infos_dataset_df['labels'] = infos_dataset_df['relations'].apply(lambda row: relation_label_to_ID.get(row))

In [15]:
infos_dataset_df = infos_dataset_df.sample(len(infos_dataset_df))

In [16]:
infos_dataset_df.head()

Unnamed: 0,sentence_IDs,contexts,relations,all_infos,labels
4371,807,Statistical analysis was performed using Excel...,Developer_of,{'sentence': 'Statistical analysis was perform...,1
991,124,Ensembler is free and open source software lic...,nil,{'sentence': 'Ensembler is free and open sourc...,0
1973,348,MRI and fMRI data were preprocessed by using F...,nil,{'sentence': 'MRI and fMRI data were preproces...,0
3944,709,All statistical analyses were done in Stata 12...,Developer_of,{'sentence': 'All statistical analyses were do...,1
927,108,To compare the performance of SNPdetector with...,nil,{'sentence': 'To compare the performance of SN...,0


In [17]:
infos_dataset_df['all_infos'][0]

{'sentence': '" Project name : DelPhi Project home page : e.g. http://compbio.clemson.edu/delphi.php Operating system ( s ) : Linux , Mac , Windows Programming language : Fortran and C Other requirements : no License : free of charge license is required Any restrictions to use by non - academics : Commercial users should contact Accelrys Inc . "',
 'entities': {4: {'text': 'DelPhi',
   'label': 'Application',
   'intention': 'Deposition',
   'begin': 4,
   'end': 4},
  10: {'text': 'http://compbio.clemson.edu/delphi.php',
   'label': 'URL',
   'intention': None,
   'begin': 10,
   'end': 10},
  17: {'text': 'Linux',
   'label': 'OperatingSystem',
   'intention': 'Usage',
   'begin': 17,
   'end': 17},
  19: {'text': 'Mac',
   'label': 'OperatingSystem',
   'intention': 'Usage',
   'begin': 19,
   'end': 19},
  21: {'text': 'Windows',
   'label': 'OperatingSystem',
   'intention': 'Usage',
   'begin': 21,
   'end': 21},
  25: {'text': 'Fortran',
   'label': 'ProgrammingEnvironment',
   

#### Split the dataset into train and validation

In [18]:
from sklearn.model_selection import train_test_split

train, validation = train_test_split(infos_dataset_df, test_size=0.25, random_state=42)

### Tokenizer

In [19]:
from transformers import BertTokenizerFast, AutoTokenizer

model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [20]:
def prepare_sentence(sample):
    text = sample.contexts
    label = sample.labels
    tokenized = tokenizer(text, truncation=True, padding="longest", max_length=512)
    tokenized["label"] = sample.labels
    return tokenized

train_dataset = train.apply(prepare_sentence, axis=1).to_list()
validation_dataset = validation.apply(prepare_sentence, axis=1).to_list()

In [21]:
len(train_dataset), len(validation_dataset)

(4422, 1474)

### Define the model

In [22]:
from sklearn.metrics import precision_recall_fscore_support
def compute_metrics(eval_preds, label_names):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    labels = [label_names[l] for l in labels]
    predictions = pd.Series([label_names[p] for p in predictions])
    n_predicted = predictions.value_counts().rename("n_predicted")
    metrics = precision_recall_fscore_support(labels, predictions, labels=label_names, zero_division=0)
    metrics = pd.DataFrame(metrics, index=["prec", "recall", "f1", "support"], columns=label_names).T
    f = metrics.index != "nil"
    metrics = metrics[f].copy()
    ## weighted f1
    weights = metrics.support / metrics.support.sum()
    metrics = metrics.join(n_predicted)
    metrics["n_predicted"] = metrics.n_predicted.fillna(0)
    metric_info = dict(
        eval_f1_macro_weighted = (metrics.f1 * weights).sum(),
        eval_support = metrics.support.sum(),
        eval_n_predicted = metrics.n_predicted.fillna(0).sum()
    )
    for label, row in metrics.iterrows():
        for metr, value in row.to_dict().items():
            metric_info[f"eval_{metr}_{label}"] = value
    return metric_info

In [23]:
labels = list(relation_label_to_ID.keys())
labels

['nil',
 'Developer_of',
 'Abbreviation_of',
 'URL_of',
 'Citation_of',
 'Release_of',
 'Version_of',
 'Specification_of',
 'Extension_of',
 'PlugIn_of',
 'AlternativeName_of',
 'License_of']

In [24]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments, AutoModelForTokenClassification
from transformers import BertForSequenceClassification
from functools import partial

training_args = TrainingArguments(
    output_dir='./model_test',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    warmup_ratio=0.1,
    weight_decay=1e-5,
    logging_dir='./logs',
    evaluation_strategy='steps',
    eval_steps=100,
)
training_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(labels))
data_collator = DataCollatorWithPadding(tokenizer,
                                        padding="longest",
                                        max_length=512)

    
    
trainer = Trainer(
    model=training_model,
    args=training_args,
    train_dataset=train_dataset[:556],
    eval_dataset=validation_dataset[:556], #validation_dataset
    data_collator=data_collator,
    compute_metrics=partial(compute_metrics, label_names=labels),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
trainer.train()

Step,Training Loss,Validation Loss,F1 Macro Weighted,Support,N Predicted,Prec Developer Of,Recall Developer Of,F1 Developer Of,Support Developer Of,N Predicted Developer Of,Prec Abbreviation Of,Recall Abbreviation Of,F1 Abbreviation Of,Support Abbreviation Of,N Predicted Abbreviation Of,Prec Url Of,Recall Url Of,F1 Url Of,Support Url Of,N Predicted Url Of,Prec Citation Of,Recall Citation Of,F1 Citation Of,Support Citation Of,N Predicted Citation Of,Prec Release Of,Recall Release Of,F1 Release Of,Support Release Of,N Predicted Release Of,Prec Version Of,Recall Version Of,F1 Version Of,Support Version Of,N Predicted Version Of,Prec Specification Of,Recall Specification Of,F1 Specification Of,Support Specification Of,N Predicted Specification Of,Prec Extension Of,Recall Extension Of,F1 Extension Of,Support Extension Of,N Predicted Extension Of,Prec Plugin Of,Recall Plugin Of,F1 Plugin Of,Support Plugin Of,N Predicted Plugin Of,Prec Alternativename Of,Recall Alternativename Of,F1 Alternativename Of,Support Alternativename Of,N Predicted Alternativename Of,Prec License Of,Recall License Of,F1 License Of,Support License Of,N Predicted License Of
100,No log,1.10762,0.494179,262.0,172.0,0.891892,0.445946,0.594595,74.0,37.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,25.0,0.0,0.681818,0.375,0.483871,40.0,22.0,0.0,0.0,0.0,5.0,0.0,0.672566,0.873563,0.76,87.0,113.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,6.0,0.0
200,No log,0.920535,0.657005,262.0,271.0,0.790123,0.864865,0.825806,74.0,81.0,0.7,1.0,0.823529,7.0,10.0,0.5,0.12,0.193548,25.0,6.0,0.58209,0.975,0.728972,40.0,67.0,0.0,0.0,0.0,5.0,0.0,0.729167,0.804598,0.765027,87.0,96.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,1.0,0.0,0.555556,0.625,0.588235,8.0,9.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,0.0,6.0,0.0
300,No log,0.820807,0.631742,262.0,217.0,0.791045,0.716216,0.751773,74.0,67.0,0.666667,0.571429,0.615385,7.0,6.0,0.5,0.12,0.193548,25.0,6.0,0.702703,0.65,0.675325,40.0,37.0,1.0,0.8,0.888889,5.0,4.0,0.727273,0.735632,0.731429,87.0,88.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,1.0,0.0,0.666667,0.75,0.705882,8.0,9.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,6.0,0.0


TrainOutput(global_step=345, training_loss=0.9497047313745471, metrics={'train_runtime': 2420.9547, 'train_samples_per_second': 1.148, 'train_steps_per_second': 0.143, 'total_flos': 166773937279680.0, 'train_loss': 0.9497047313745471, 'epoch': 4.9640287769784175})

In [43]:
model_pred = BertForSequenceClassification.from_pretrained("model_test/checkpoint-345")
classifier = TextClassificationPipeline(model=model_pred,
                                        tokenizer=tokenizer, device=-1)

## Predict

### Read the test Dataset

In [44]:
# open the data of sentences
with open('../../data/subtask3/subtask3_test.data.txt', 'r', encoding='utf-8') as file:
    test_sentences = file.readlines()

# open the entity labels
with open('../../data/subtask3/subtask3_test.info.txt', 'r', encoding='utf-8') as file:
    test_entity_labels = file.readlines()

In [45]:
test_sentence_entities = []
for idx, (sentence, entity_label_list) in enumerate(tuple(zip(test_sentences, test_entity_labels))):
    token = sentence.split()
    entity_bio_tags = entity_label_list.split()
    ents = helper.get_entities(token, entity_bio_tags)
    ent_dict = {e['begin']:e for e in ents}
    sentence_info = dict(
        sentence=sentence.strip(),
        entities=ent_dict
    )
    test_sentence_entities.append(sentence_info)

In [46]:
def sentence_subject_object(sent, allowed_subj_obj):
    for ent_one, ent_two in combinations(sent['entities'].values(), r=2):
            for e_one, e_two in [ent_one, ent_two], [ent_two, ent_one]:
                subj_obj = e_one['label'], e_two['label']
                if subj_obj in allowed_subj_obj:
                    sample = helper.build_sentence_subj_obj(sent, subj=e_one, obj=e_two)
                    yield sample

In [47]:
sentece_subj_obj_corpus = []
for idx, sent in enumerate(test_sentence_entities):
    sent_samples = list(sentence_subject_object(sent, allowed_subj_obj))
    for sent_sample in sent_samples:
        sentece_subj_obj_corpus.append((idx, sent_sample))
# len(test_sentence_entities), len(sentece_subj_obj_corpus)

### Create the dataframe for tests

In [48]:
sent_idx, X = zip(*sentece_subj_obj_corpus)

test_dataset_df = pd.DataFrame(list(zip(sent_idx, X)), columns=['sentence_IDs', 'contexts'])

test_dataset_df['all_infos'] = test_dataset_df.apply(lambda row: [all_info
                                                       for index, all_info in enumerate(test_sentence_entities)
                                                       if row['sentence_IDs'] == index], axis=1)

In [49]:
test_dataset_df.head(20)

Unnamed: 0,sentence_IDs,contexts,all_infos
0,0,The source code is available under the GNU GPL...,[{'sentence': 'The source code is available un...
1,0,The source code is available under the GNU GPL...,[{'sentence': 'The source code is available un...
2,0,The source code is available under the GNU GPL...,[{'sentence': 'The source code is available un...
3,0,The source code is available under the GNU GPL...,[{'sentence': 'The source code is available un...
4,0,The source code is available under the GNU GPL...,[{'sentence': 'The source code is available un...
5,0,The source code is available under the GNU GPL...,[{'sentence': 'The source code is available un...
6,0,The source code is available under the GNU GPL...,[{'sentence': 'The source code is available un...
7,0,The source code is available under the GNU GPL...,[{'sentence': 'The source code is available un...
8,0,The source code is available under the GNU GPL...,[{'sentence': 'The source code is available un...
9,0,The source code is available under the GNU GPL...,[{'sentence': 'The source code is available un...


## Predict

In [None]:
# corrected version
n_expected_sentences = test_dataset_df.sentence_IDs.max() + 1
previous_sentence_id = None
sentence_predictions = ["" for _ in range(n_expected_sentences)]
current_relations = []
for index, row in test_dataset_df.iterrows():
    current_sentence_id = row['sentence_IDs']
    # Check if we have moved to a new sentence
    if current_sentence_id != previous_sentence_id and previous_sentence_id is not None:
        # Append the formatted line for the previous sentence
        sentence_predictions[previous_sentence_id] = ";".join(current_relations)
        current_relations = []
 
    # Predict the label using the classifier
    predict = classifier(row['contexts'])
    label = ID_to_relation_label.get(int(predict[0]['label'].split('_')[-1]))
    # skip the nil labels
 
    # Split the contexts and process each entity
    # E.g. "satz python [34, 13] [SEP] [Reference: '[34, 13]'], [Enironment: 'python']"
    subj_info, object_info = row['contexts'].split('[SEP] [')[-1].split("'], [")
    subj_label, subj_text = subj_info.split(": '")
    obj_label, obj_text = object_info[:-2].split(": '")
    entities = row["all_infos"][0]["entities"]
    for ent_begin, ent in entities.items():
        if ent['label'] == subj_label and ent['text'] == subj_text:
            subj_begin = ent["begin"]
        if ent['label'] == obj_label and ent['text'] == obj_text:
            obj_begin = ent["begin"]
    relation_repr = f"{label}\t{subj_begin}\t{obj_begin}"
    if label != "nil":
        current_relations.append(relation_repr)
    previous_sentence_id = current_sentence_id
 
# Ensure the last set is appended
sentence_predictions[previous_sentence_id] = ";".join(current_relations)

In [69]:
with open('../../data/subtask3/prediction/prediction_dataset.txt', 'w') as file:
  for prediction in all_predicted_text_classification_dataset:
    file.write(f'{prediction}\n')