In [1]:
import os
# Expose only the GPU with index 7 to CUDA-aware libraries in this process.
# NOTE(review): the device cell further down recorded 'cpu' as its output; on a
# machine with fewer than 8 GPUs this setting presumably hides every device —
# confirm and adjust the index for the target machine.
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

In [3]:
# !pip install evaluate
# !pip install seqeval

In [5]:
# !pip install accelerate -U
# !pip install transformers[torch]

In [6]:
import sys
sys.path.append("..")
import helper

from transformers import DistilBertTokenizer, BertTokenizer
from transformers import DataCollatorWithPadding
from transformers import TFDistilBertForSequenceClassification
from transformers import BertForSequenceClassification
from transformers import TextClassificationPipeline
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from functools import partial

import torch
from torch import cuda
from itertools import chain, combinations
from collections import Counter
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [7]:
# Use the GPU when torch reports one as available; otherwise stay on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

## Dataset

In [8]:
# The three training files are read line by line and consumed in parallel
# further below, so line i of each file refers to the same sentence.

# Sentences, one per line
with open('../../data/subtask3/subtask3_train.data.txt', mode='r', encoding='utf-8') as data_handle:
    sentences = list(data_handle)

# Entity labels, one line of tags per sentence
with open('../../data/subtask3/subtask3_train.info.txt', mode='r', encoding='utf-8') as info_handle:
    entity_labels = list(info_handle)

# Relation annotations, one line per sentence
with open('../../data/subtask3/subtask3_train.labels.txt', mode='r', encoding='utf-8') as labels_handle:
    relations_info = list(labels_handle)

### Unique entity labels

In [9]:
# Distinct entity tags over all lines (set-based, so the list order is arbitrary).
unique_entity_labels = list(set(chain.from_iterable(line.split() for line in entity_labels)))

### Unique relation labels

In [10]:
# Distinct relation names: ';;' separates annotations within a line, and the
# purely numeric tokens of each annotation are skipped.
unique_relation_labels = list({
    token
    for annotation_line in relations_info
    for token in annotation_line.replace(';;', ' ').split()
    if not token.isdigit()
})

### Relation label to ID mapping and vice versa

In [11]:
# Relation classes in class-ID order; 'nil' (ID 0) is the label given to
# candidate pairs without an annotated relation.
relation_labels = [
    'nil',
    'Developer_of',
    'Abbreviation_of',
    'URL_of',
    'Citation_of',
    'Release_of',
    'Version_of',
    'Specification_of',
    'Extension_of',
    'PlugIn_of',
    'AlternativeName_of',
    'License_of',
    # Inverse-direction variants are currently switched off:
    # 'Developer_of-1', 'Abbreviation_of-1', 'URL_of-1', 'Citation_of-1', 'Release_of-1', 'Version_of-1',
    # 'Specification_of-1', 'Extension_of-1', 'PlugIn_of-1', 'AlternativeName_of-1', 'License_of-1'
]

# Forward and reverse lookups between relation names and integer class IDs.
ID_to_relation_label = dict(enumerate(relation_labels))
relation_label_to_ID = {name: class_id for class_id, name in ID_to_relation_label.items()}
relation_label_to_ID

{'nil': 0,
 'Developer_of': 1,
 'Abbreviation_of': 2,
 'URL_of': 3,
 'Citation_of': 4,
 'Release_of': 5,
 'Version_of': 6,
 'Specification_of': 7,
 'Extension_of': 8,
 'PlugIn_of': 9,
 'AlternativeName_of': 10,
 'License_of': 11}

### Combine all info for each sentence

In [12]:
from pprint import pprint
# One record per sentence: the stripped text, its entities keyed by their
# 'begin' value, and the relations parsed from the matching annotation line.
sentence_entities_relations = []
for sentence, entity_label_list, relation_info in zip(sentences, entity_labels, relations_info):
    found_entities = helper.get_entities(sentence.split(), entity_label_list.split())
    sentence_entities_relations.append({
        'sentence': sentence.strip(),
        'entities': {entity['begin']: entity for entity in found_entities},
        'relations': helper.get_relations(relation_info),
    })

### Allowed (subject, object) entity-label pairs

In [13]:
# Gather what helper.sentence_allowed_subj_obj reports for every sentence
# (presumably (subject label, object label) tuples, given how they are matched
# against entity labels later — confirm in helper), then count and deduplicate.
allowed_subj_obj_all = [
    pair
    for sent_info in sentence_entities_relations
    for pair in helper.sentence_allowed_subj_obj(sent_info, allow_inverse=False)
]
allowed_subj_obj_counts = Counter(allowed_subj_obj_all)
allowed_subj_obj = set(allowed_subj_obj_counts)
len(allowed_subj_obj)

40

###  Create instances for each candidate relation

In [14]:
## Idea: turn each sentence into one sample per candidate (subject, object) entity pair
def sent_to_relation_representations(sent, allowed_subj_obj=None):
    """Yield one relation sample per ordered candidate entity pair of a sentence.

    Every unordered pair of entities is visited in both directions
    (first -> second, then second -> first). A pair is labelled with the
    annotated relation type when ``(subject begin, object begin)`` matches an
    entry of ``sent['relations']``; otherwise it is labelled ``"nil"``.

    Args:
        sent: dict with an ``'entities'`` mapping whose values are entity dicts
            carrying ``'begin'`` and ``'label'``, and a ``'relations'`` iterable
            of dicts carrying ``'subject'``, ``'object'`` and ``'relation_type'``.
        allowed_subj_obj: optional container of ``(subject label, object label)``
            tuples; pairs whose labels are not in it are skipped. ``None``
            (the default) disables the filter and keeps every pair.

    Yields:
        Whatever ``helper.build_relation_reprentation`` returns for the pair.
    """
    # Inverse ('-1') relation labels are not generated: the reverse direction of
    # an annotated pair is 'nil' unless it is annotated separately.
    gold_relations = {(r['subject'], r['object']): r['relation_type'] for r in sent['relations']}
    for ent_one, ent_two in combinations(sent['entities'].values(), r=2):
        for subj, obj in ((ent_one, ent_two), (ent_two, ent_one)):
            # The original default of None crashed on the membership test below;
            # treat None as "no restriction" instead.
            if allowed_subj_obj is not None and (subj['label'], obj['label']) not in allowed_subj_obj:
                continue
            relation_type = gold_relations.get((subj['begin'], obj['begin']), "nil")
            yield helper.build_relation_reprentation(sent, subj=subj, obj=obj, rel=relation_type)
# Flatten all sentences into the classification corpus: one
# (sentence index, sample, relation label) triple per candidate entity pair.
# (A leftover `random.choice` debugging draw was removed here: its result was
# overwritten by the loop variable and it raised on an empty corpus.)
relation_corpus = []
for idx, sent in enumerate(sentence_entities_relations):
    for sent_sample, target in sent_to_relation_representations(sent, allowed_subj_obj):
        relation_corpus.append((idx, sent_sample, target))
len(sentence_entities_relations), len(relation_corpus)

(1091, 5896)

### Create a dataframe of the dataset

In [15]:
# Column-wise views of the corpus triples.
sent_idx, X, y = zip(*relation_corpus)

# One row per candidate pair; 'all_infos' links back to the full sentence record.
infos_dataset_df = pd.DataFrame(
    data=list(zip(sent_idx, X, y)),
    columns=['sentence_IDs', 'contexts', 'relations'],
)
infos_dataset_df['all_infos'] = infos_dataset_df['sentence_IDs'].apply(
    lambda sentence_position: sentence_entities_relations[sentence_position]
)

### Map each relation to its label ID

In [16]:
# Encode each relation name as its integer class ID. Names missing from the
# mapping are not rejected here: `.get` simply returns None for them.
infos_dataset_df['labels'] = infos_dataset_df['relations'].apply(relation_label_to_ID.get)

### Split the dataset into train and validation

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for validation with a fixed seed.
# NOTE(review): rows are candidate pairs, so pairs from the same sentence can
# end up in both splits — consider splitting by 'sentence_IDs' instead.
train, validation = train_test_split(
    infos_dataset_df,
    test_size=0.25,
    random_state=42,
)

# Tokenizer

In [18]:
from transformers import BertTokenizerFast, AutoTokenizer

# Checkpoint name used for the tokenizer here and for the model loaded in model_init().
# NOTE(review): the recorded training output mentions distilbert-base-uncased, so
# the saved outputs appear to come from a run with a different model_name — confirm.
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [19]:
def prepare_sentence(sample):
    """Tokenize one dataset row and attach its class ID.

    Args:
        sample: a row exposing ``contexts`` (the text to encode) and ``labels``
            (the integer relation ID).

    Returns:
        The output of the module-level ``tokenizer`` for ``sample.contexts``
        (truncated to at most 512 tokens) with ``sample.labels`` stored under
        the ``"label"`` key.
    """
    # Only a single text is encoded per call; padding across a batch is left
    # to the data collator used at training time.
    tokenized = tokenizer(sample.contexts, truncation=True, padding="longest", max_length=512)
    tokenized["label"] = sample.labels
    return tokenized
# Encode every row of each split; the result is a plain list with one
# tokenizer output (plus "label") per row.
train_dataset = train.apply(prepare_sentence, axis=1).to_list()
validation_dataset = validation.apply(prepare_sentence, axis=1).to_list()

In [20]:
# Sanity check: number of encoded samples in each split.
len(train_dataset), len(validation_dataset)

(4422, 1474)

# Evaluation

In [21]:
def compute_metrics(eval_preds, label_names):
    """Compute per-relation and support-weighted metrics, ignoring the 'nil' class.

    Args:
        eval_preds: pair ``(logits, labels)``; the predicted class is the argmax
            of ``logits`` over the last axis, ``labels`` holds gold class IDs.
        label_names: class names indexed by class ID.

    Returns:
        dict with
        - ``eval_f1_macro_weighted``: per-class F1 weighted by each class's share
          of the total support, over all classes except 'nil';
        - ``eval_support`` / ``eval_n_predicted``: total gold and predicted
          counts over those classes;
        - ``eval_{prec,recall,f1,support,n_predicted}_{label}`` for each of them.
    """
    logits, gold_ids = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # Score on names rather than IDs so the table below is indexed by label.
    gold_names = [label_names[class_id] for class_id in gold_ids]
    predicted_names = pd.Series([label_names[class_id] for class_id in predicted_ids])

    per_class = precision_recall_fscore_support(
        gold_names, predicted_names, labels=label_names, zero_division=0
    )
    table = pd.DataFrame(
        per_class, index=["prec", "recall", "f1", "support"], columns=label_names
    ).T

    # 'nil' is excluded from every reported number.
    table = table[table.index != "nil"].copy()

    # Weight of each class = its share of the gold instances.
    support_share = table["support"] / table["support"].sum()

    # Classes that were never predicted are absent from the counts -> 0.
    prediction_counts = predicted_names.value_counts().rename("n_predicted")
    table = table.join(prediction_counts)
    table["n_predicted"] = table["n_predicted"].fillna(0)

    metric_info = {
        "eval_f1_macro_weighted": (table["f1"] * support_share).sum(),
        "eval_support": table["support"].sum(),
        "eval_n_predicted": table["n_predicted"].sum(),
    }
    for label, row in table.iterrows():
        metric_info.update(
            {f"eval_{metr}_{label}": value for metr, value in row.to_dict().items()}
        )
    return metric_info

In [22]:
# Class names in class-ID order (dicts iterate over their keys in insertion order).
labels = list(relation_label_to_ID)
labels

['nil',
 'Developer_of',
 'Abbreviation_of',
 'URL_of',
 'Citation_of',
 'Release_of',
 'Version_of',
 'Specification_of',
 'Extension_of',
 'PlugIn_of',
 'AlternativeName_of',
 'License_of']

# Fine-tuning

In [25]:
# !pip install transformers optuna dataset

In [None]:
import optuna
from transformers import Trainer, TrainingArguments, BertForTokenClassification
from transformers import DataCollatorForTokenClassification, DataCollatorWithPadding


# Collator that pads the samples of each batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest", max_length=512)
def model_init():
    """Build a fresh BERT sequence classifier with one output per relation label.

    Handed to ``Trainer`` as ``model_init``. The DistilBERT variant tried
    earlier is no longer used here.
    """
    classifier_kwargs = {"num_labels": len(labels), "return_dict": True}
    return BertForSequenceClassification.from_pretrained(model_name, **classifier_kwargs)



def objective(trial):
    """Optuna objective: fine-tune once with sampled hyperparameters.

    Args:
        trial: the Optuna trial used to sample learning rate, train/eval batch
            sizes, number of epochs and weight decay.

    Returns:
        The ``eval_f1_macro_weighted`` value of a final evaluation on the
        validation set.
    """
    # Search space; the order of the suggest_* calls is kept stable.
    sampled_lr = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)
    sampled_train_bs = trial.suggest_categorical('train_batch_size', [4, 8, 16, 32])
    sampled_eval_bs = trial.suggest_categorical('val_batch_size', [4, 8, 16, 32])
    sampled_epochs = trial.suggest_int("num_train_epochs", 3, 8)
    sampled_decay = trial.suggest_float("weight_decay",  1e-5, 0.1, log=True)

    # Fixed settings (warmup, accumulation, clipping) are shared by all trials.
    run_config = TrainingArguments(
        output_dir="./result",
        eval_strategy="epoch",
        learning_rate=sampled_lr,
        per_device_train_batch_size=sampled_train_bs,
        per_device_eval_batch_size=sampled_eval_bs,
        num_train_epochs=sampled_epochs,
        weight_decay=sampled_decay,
        logging_dir='./logs',
        logging_steps=100,
        warmup_ratio=0.1,
        gradient_accumulation_steps=2,
        max_grad_norm=1.0,
    )

    # model_init gives every trial a freshly initialised model.
    runner = Trainer(
        model_init=model_init,
        args=run_config,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        data_collator=data_collator,
        compute_metrics=partial(compute_metrics, label_names=labels),
    )
    runner.train()

    final_scores = runner.evaluate()
    return final_scores['eval_f1_macro_weighted']

# Hyperparameter search: 20 trials, maximizing the value returned by `objective`.
study = optuna.create_study(direction='maximize')
assert len(study.trials) == 0
study.optimize(objective, n_trials=20)

# Report the best trial, unless no trial reached the COMPLETE state.
if any(t.state == optuna.trial.TrialState.COMPLETE for t in study.trials):
    print("Best trial:")
    trial = study.best_trial
    print(trial.values)
    print("Best hyperparameters: {}".format(trial.params))
else:
    print("No trials are completed yet.")

[I 2024-07-21 19:25:20,407] A new study created in memory with name: no-name-c25fc7c6-876f-4367-87f6-3b25205646d7


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
