In [None]:
! pip install sentence-transformers datasets

In [None]:
import logging
import sys
import traceback
from datetime import datetime
import pandas as pd

from datasets import load_dataset, concatenate_datasets
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer, losses, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments


# Project root on the mounted Google Drive; the datasets live in a sub-folder of it.
modelpath = '/content/drive/MyDrive/Doctorado/Beca Doctoral Fede Schmidt/Proyectos/Semantic similarity & arguments identification/'
datapath = modelpath + 'Datasets/'

# Checkpoints that can be loaded, and the matching short labels (same order).
model_names = [
    "sentence-transformers/all-MiniLM-L6-v2",
    f"{modelpath}argueBert_base_similar/",
    f"{modelpath}argueBert_edge/",
]
short_model_names = [
    "sbert-all-MiniLM-L6-v2",
    "argueBert_base_similar",
    "argueBert_edge",
]

# Position (in the two lists above) of the model to work with.
index_selected_model = 2    ## change here.

model_name, short_model_name = (
    model_names[index_selected_model],
    short_model_names[index_selected_model],
)

def build_model(model_name):
    """Create the SentenceTransformer for ``model_name``.

    The stock "sentence-transformers/all-MiniLM-L6-v2" checkpoint is loaded
    directly. Any other name is wrapped in a three-module pipeline:
    transformer, pooling (sized to the transformer's word-embedding
    dimension) and a normalisation layer.

    Args:
        model_name: hub identifier or local path of the checkpoint.

    Returns:
        The assembled SentenceTransformer.
    """
    if model_name != "sentence-transformers/all-MiniLM-L6-v2":
        transformer = models.Transformer(model_name)
        embedding_dim = transformer.get_word_embedding_dimension()
        pooling = models.Pooling(embedding_dim)
        normalizer = models.Normalize()
        st_model = SentenceTransformer(modules=[transformer, pooling, normalizer])
    else:
        st_model = SentenceTransformer(model_name)

    # Re-assign every parameter as a contiguous tensor.
    # NOTE(review): presumably avoids problems when saving non-contiguous weights - confirm.
    for weight in st_model.parameters():
        weight.data = weight.data.contiguous()
    return st_model


# Build the selected model and print its module layout after a blank line.
model = build_model(model_name)

print("", model, sep="\n")

In [None]:
"""
Code based on https://github.com/mabehrendt/argueBERT
"""

import logging
import sys
import traceback
from datetime import datetime
import pandas as pd

from datasets import load_dataset, concatenate_datasets
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer, losses, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments


# Project root on the mounted Google Drive and the datasets folder beneath it.
modelpath = '/content/drive/MyDrive/Doctorado/Beca Doctoral Fede Schmidt/Proyectos/Semantic similarity & arguments identification/'
datapath = f"{modelpath}Datasets/"

# Loadable checkpoints and their short labels, index-aligned.
model_names = ["sentence-transformers/all-MiniLM-L6-v2"] + [
    modelpath + folder for folder in ('argueBert_base_similar/', 'argueBert_edge/')
]
short_model_names = ["sbert-all-MiniLM-L6-v2", 'argueBert_base_similar', 'argueBert_edge']

index_selected_model = 0    ## change here.

model_name = model_names[index_selected_model]
short_model_name = short_model_names[index_selected_model]

# hyperparams of argueBERT for fine-tuning.
train_batch_size, num_epochs = 16, 5
learning_rate, weight_decay = 2e-5, 0.01
optimizer = "adamw_hf"

# When False the trainer is still constructed but trainer.train() is not called.
do_fine_tuning = False ## change here

# build model
def build_model(model_name):
    """Return a SentenceTransformer for the given checkpoint.

    "sentence-transformers/all-MiniLM-L6-v2" is loaded as a ready-made
    SentenceTransformer; every other checkpoint is assembled from a
    transformer module, a pooling module sized to its word-embedding
    dimension, and a normalisation module.

    Args:
        model_name: hub identifier or local folder of the checkpoint.

    Returns:
        The SentenceTransformer, with all parameters stored contiguously.
    """
    is_stock_sbert = model_name == "sentence-transformers/all-MiniLM-L6-v2"
    if is_stock_sbert:
        sbert = SentenceTransformer(model_name)
    else:
        encoder = models.Transformer(model_name)
        pooler = models.Pooling(encoder.get_word_embedding_dimension())
        sbert = SentenceTransformer(modules=[encoder, pooler, models.Normalize()])

    # NOTE(review): presumably a workaround for saving non-contiguous tensors - confirm.
    for tensor in sbert.parameters():
        tensor.data = tensor.data.contiguous()
    return sbert

def _load_split(file_format, file_path, drop_columns=(), renames=None):
    """Load one split file and apply the column clean-up for its dataset family."""
    # A single data file is exposed under the "train" key, whatever split it holds.
    split = load_dataset(file_format, data_files=file_path)["train"]
    if drop_columns:
        split = split.remove_columns(list(drop_columns))
    for old_name, new_name in (renames or {}).items():
        split = split.rename_column(old_name, new_name)
    return split


def load_data(path = 'STSb/'):
    """Load the train / validation / test splits of one similarity dataset.

    The dataset family is recognised by a marker substring of ``path``,
    checked in this order: 'STSb' (parquet files, used as stored), 'BWS'
    (csv; 'id' and 'topic' dropped, 'argument1/2' renamed to 'sentence1/2')
    and 'AFS' (csv; 'sentence_1/2' renamed to 'sentence1/2').

    Args:
        path: dataset folder, relative to the module-level ``datapath``.

    Returns:
        Tuple ``(train_dataset, eval_dataset, test_dataset)``.

    Raises:
        ValueError: if ``path`` contains none of the known markers. The check
            happens before any file is touched.
    """
    if 'STSb' in path:
        file_format = 'parquet'
        file_names = ('train.parquet', 'validation.parquet', 'test.parquet')
        drop_columns, renames = (), {}
    elif 'BWS' in path:
        file_format = 'csv'
        file_names = ('trainsplit.csv', 'devsplit.csv', 'testsplit.csv')
        drop_columns = ('id', 'topic')
        renames = {'argument1': 'sentence1', 'argument2': 'sentence2'}
    elif 'AFS' in path:
        file_format = 'csv'
        file_names = ('trainsplit.csv', 'devsplit.csv', 'testsplit.csv')
        drop_columns = ()
        renames = {'sentence_1': 'sentence1', 'sentence_2': 'sentence2'}
    else:
        raise ValueError("No dataset in given path...")

    dataset_path = datapath + path
    train_dataset, eval_dataset, test_dataset = (
        _load_split(file_format, dataset_path + file_name, drop_columns, renames)
        for file_name in file_names
    )
    return (train_dataset, eval_dataset, test_dataset)


# Timestamped output folder for this run; "/" in the model name is replaced by "-".
output_dir = "output/training_stsbenchmark_{}-{}".format(
    model_name.replace("/", "-"),
    datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
)

# Build the selected model and show its modules.
model = build_model(model_name)
print(model)

# Load the three corpora; each is a (train, validation, test) tuple.
sts_data = load_data('STSb/')
bws_data = load_data('BWS Argument Similarity/')
afs_data = load_data('AFS/')


# Corpora used for fine-tuning and for the in-training evaluation.
# Both splits are derived from this one list, so they cannot get out of sync.
# Valid keys: 'sts', 'bws', 'afs' - any non-empty combination,
# e.g. ['sts'], ['afs'], ['sts', 'bws'], ['sts', 'afs'], ['sts', 'bws', 'afs'].
train_set_keys = ['bws']    ## change here.

if not train_set_keys:
    raise ValueError("train_set_keys must name at least one corpus")

# Each *_data tuple is (train, validation, test).
splits_by_key = {'sts': sts_data, 'bws': bws_data, 'afs': afs_data}

train_dataset = concatenate_datasets([splits_by_key[key][0] for key in train_set_keys])
eval_dataset = concatenate_datasets([splits_by_key[key][1] for key in train_set_keys])

# Testing always covers all three corpora, whatever was selected for training.
test_datasets = [sts_data[2], bws_data[2], afs_data[2]]

# Loss optimised during fine-tuning, bound to the model built above.
train_loss = losses.CosineSimilarityLoss(model=model)

# Evaluator run on the validation split during training, reported next to
# the evaluation loss under the name "res-dev".
dev_evaluator = EmbeddingSimilarityEvaluator(
    name="res-dev",
    main_similarity=SimilarityFunction.COSINE,
    sentences1=eval_dataset["sentence1"],
    sentences2=eval_dataset["sentence2"],
    scores=eval_dataset["score"],
)

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=output_dir,
    # Optional training parameters:
    num_train_epochs=num_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=train_batch_size,
    warmup_ratio=0.1,
    optim=optimizer,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=False,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    # Optional tracking/debugging parameters:
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`;
    # with an unpinned install the old keyword may warn or be rejected.
    eval_strategy="steps",
    eval_steps=100,
    save_safetensors=False,
    # Checkpoints are saved on a step schedule as well, which
    # load_best_model_at_end needs in order to restore the best one.
    load_best_model_at_end=True,
    save_strategy="steps",
    # save_strategy="no",
    logging_steps=100,
)

# Wire model, arguments, data, loss and evaluator into the trainer.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    loss=train_loss,
    evaluator=dev_evaluator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Training is opt-in: it only runs when do_fine_tuning is set to True.
if do_fine_tuning is not False and do_fine_tuning:
    trainer.train()



In [None]:
# Score the current model on every test split; one result row per corpus.
test_set_labels = ('STS', 'BWS', 'AFS')   # same order as test_datasets

results_data = []
for test_dataset_name, test_dataset in zip(test_set_labels, test_datasets):
    test_evaluator = EmbeddingSimilarityEvaluator(
        name="test",
        main_similarity=SimilarityFunction.COSINE,
        sentences1=test_dataset["sentence1"],
        sentences2=test_dataset["sentence2"],
        scores=test_dataset["score"],
    )

    # Tag the metrics with the corpus they were computed on.
    results = test_evaluator(model)
    results.update({'test': test_dataset_name})
    results_data.append(results)


# Collect the testing results in one table and display it.
results_dataframe = pd.DataFrame(results_data)
results_dataframe

In [None]:
# Write the results table to the working directory.
# NOTE(review): the file name is fixed to 'sbert_no_ft' whatever model is selected - confirm intended.
results_dataframe.to_csv("test_results_sbert_no_ft.csv")

In [None]:
# Save the model together with its test results.

# Label of the training corpora, used in the output folder name.
# Options used so far: 'sts', 'sts-bws', 'sts-afs', 'sts-bws-afs', 'bws', 'afs'.
# NOTE(review): set by hand - it must match the corpora chosen for train_dataset
# (the training cell currently selects BWS while this says 'afs'); confirm before saving.
names_train_sets = 'afs'   ## change here based on used training set.
saving_model_name = f"{short_model_name}-{names_train_sets}"

final_dir = f"models/{saving_model_name}/final"
model.save_pretrained(final_dir)

# The results CSV is written into the folder that now holds the saved model.
results_dataframe.to_csv(f"{final_dir}/test_results_{short_model_name}.csv")

In [None]:
# Model folders referenced by earlier runs of this cell:
# /content/models/sbert-all-MiniLM-L6-v2-sts-afs
# /content/models/sbert-all-MiniLM-L6-v2-sts-bws-afs
# /content/models/argueBert_base_similar-sts
# /content/models/argueBert_base_similar-bws

# Zip the `final` folder of the argueBert_base_similar-afs model.
# NOTE(review): the folder is hard-coded rather than derived from saving_model_name -
# confirm it matches the model that was just saved.
!zip -r /content/models/argueBert_base_similar-afs/arguebert-afs.zip /content/models/argueBert_base_similar-afs/final

# Copy the archive to the project's similarity_models folder on Google Drive
# (the destination is quoted because it contains spaces and '&').
!cp /content/models/argueBert_base_similar-afs/arguebert-afs.zip '/content/drive/MyDrive/Doctorado/Beca Doctoral Fede Schmidt/Proyectos/Semantic similarity & arguments identification/similarity_models'

# USAGE

In [None]:
# Checkpoint loaded for the usage demo. Alternatives tried before:
#   "all-MiniLM-L6-v2"
#   '/content/models/all-MiniLM-L6-v2-stsb/final'
#   '/content/models/argueBert_similar-stsb/final'
# NOTE(review): the save cell writes to models/<short_model_name>-<names_train_sets>/final;
# confirm that this '-stsb' folder actually exists.
trained_model_path = '/content/models/argueBert_base_similar-stsb/final'
trained_model = SentenceTransformer(trained_model_path)

trained_model

In [None]:
# 1. Example sentences to compare.
sentences = ["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."]

# 2. Encode the sentences; one embedding per sentence.
embeddings = trained_model.encode(sentences)
print(embeddings.shape)
# Sample output: [3, 384]
# NOTE(review): the embedding width depends on the loaded checkpoint - confirm for this model.

# 3. Pairwise similarity of every sentence with every other one.
similarities = trained_model.similarity(embeddings, embeddings)
print(similarities)
# Sample output:
# tensor([[1.0000, 0.6660, 0.1046],
#         [0.6660, 1.0000, 0.1411],
#         [0.1046, 0.1411, 1.0000]])