In [2]:
import argparse
import glob
import logging
import math
import os
import random
from datetime import datetime

import numpy as np
import torch
from sentence_transformers import SentenceTransformer, LoggingHandler, models, losses, datasets
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

from data_util import load_kor_sts_samples, load_kor_nli_samples

seed = 42
epochs = 10
batch_size = 64
# Fix random seed
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

word_embedding_model = models.Transformer(
    model_name_or_path='klue/roberta-base',
    max_seq_length=512,
    do_lower_case=False,
    )
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Configure logger
logging.basicConfig(
    format='%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, handlers=[LoggingHandler()]
)

# Read the dataset
model_save_path = os.path.join(
    '/home/choi/Git/RAG_con_doc/langchain/FT_model','MRL_MNRL_NLI' +'-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)

# Read the dataset
nli_dataset_path = '/home/choi/Git/ConSRoBERTa/data/ConNLI'
sts_dataset_path = '/home/choi/Git/ConSRoBERTa/data/ConSTS'
logging.info("Read ConNLI train/ConSTS test dataset")
train_files = glob.glob(os.path.join(nli_dataset_path, "*ConNLI_train.tsv"))
dev_file = os.path.join(sts_dataset_path, "ConSTS_test.tsv")
train_samples = []
for train_file in train_files:
    train_samples += load_kor_nli_samples(train_file)
train_dataloader = datasets.NoDuplicatesDataLoader(train_samples, batch_size=batch_size)
dev_samples = load_kor_sts_samples(dev_file)
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, batch_size=batch_size, name='sts-dev')
train_loss = losses.MultipleNegativesRankingLoss(model)
train_loss = losses.MatryoshkaLoss(model, train_loss, [768, 512, 256, 128, 64])

# Configure the training.
warmup_steps = math.ceil(len(train_dataloader) * epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=epochs,
    evaluation_steps=10000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
    checkpoint_path=os.path.join(model_save_path, "checkpoint"),
    checkpoint_save_steps=len(train_dataloader),
    checkpoint_save_total_limit=epochs
    )

# Load the stored model and evaluate its performance on STS benchmark dataset
model = SentenceTransformer(model_save_path)
logging.info("Read ConSTS benchmark test dataset")
test_file = os.path.join(sts_dataset_path, "ConSTS_test.tsv")
test_samples = load_kor_sts_samples(test_file)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)


Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2024-10-17 14:16:30 - Use pytorch device_name: cuda
2024-10-17 14:16:30 - Read ConNLI train/ConSTS test dataset
2024-10-17 14:16:30 - Warmup-steps: 161


  0%|          | 0/1610 [00:00<?, ?it/s]

2024-10-17 14:17:10 - Saving model checkpoint to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-161
2024-10-17 14:17:10 - Save model to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-161


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

2024-10-17 14:17:11 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-dev dataset after epoch 1.0:
2024-10-17 14:17:11 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2024-10-17 14:17:11 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:17:11 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:17:11 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan
2024-10-17 14:17:11 - Save model to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30


  eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
  eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
  eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
  eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
  eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
  eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
  eval_pearson_dot, _ = pearsonr(labels, dot_products)
  eval_spearman_dot, _ = spearmanr(labels, dot_products)


2024-10-17 14:17:50 - Saving model checkpoint to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-322
2024-10-17 14:17:50 - Save model to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-322
2024-10-17 14:17:51 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-dev dataset after epoch 2.0:
2024-10-17 14:17:51 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2024-10-17 14:17:51 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:17:51 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:17:51 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan


  eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
  eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
  eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
  eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
  eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
  eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
  eval_pearson_dot, _ = pearsonr(labels, dot_products)
  eval_spearman_dot, _ = spearmanr(labels, dot_products)


2024-10-17 14:18:30 - Saving model checkpoint to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-483
2024-10-17 14:18:30 - Save model to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-483
2024-10-17 14:18:31 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-dev dataset after epoch 3.0:
2024-10-17 14:18:31 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2024-10-17 14:18:31 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:18:31 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:18:31 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan


  eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
  eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
  eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
  eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
  eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
  eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
  eval_pearson_dot, _ = pearsonr(labels, dot_products)
  eval_spearman_dot, _ = spearmanr(labels, dot_products)


{'loss': 2.2773, 'grad_norm': 5.85750150680542, 'learning_rate': 1.532091097308489e-05, 'epoch': 3.11}
2024-10-17 14:19:10 - Saving model checkpoint to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-644
2024-10-17 14:19:10 - Save model to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-644
2024-10-17 14:19:10 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-dev dataset after epoch 4.0:
2024-10-17 14:19:10 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2024-10-17 14:19:10 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:19:10 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:19:10 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan


  eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
  eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
  eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
  eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
  eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
  eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
  eval_pearson_dot, _ = pearsonr(labels, dot_products)
  eval_spearman_dot, _ = spearmanr(labels, dot_products)


2024-10-17 14:19:49 - Saving model checkpoint to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-805
2024-10-17 14:19:49 - Save model to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-805
2024-10-17 14:19:50 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-dev dataset after epoch 5.0:
2024-10-17 14:19:50 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2024-10-17 14:19:50 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:19:50 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:19:50 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan


  eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
  eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
  eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
  eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
  eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
  eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
  eval_pearson_dot, _ = pearsonr(labels, dot_products)
  eval_spearman_dot, _ = spearmanr(labels, dot_products)


2024-10-17 14:20:29 - Saving model checkpoint to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-966
2024-10-17 14:20:29 - Save model to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-966
2024-10-17 14:20:29 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-dev dataset after epoch 6.0:
2024-10-17 14:20:30 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2024-10-17 14:20:30 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:20:30 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:20:30 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan


  eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
  eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
  eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
  eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
  eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
  eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
  eval_pearson_dot, _ = pearsonr(labels, dot_products)
  eval_spearman_dot, _ = spearmanr(labels, dot_products)


{'loss': 0.0921, 'grad_norm': 1.4522508382797241, 'learning_rate': 8.419599723947551e-06, 'epoch': 6.21}
2024-10-17 14:21:06 - Saving model checkpoint to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-1127
2024-10-17 14:21:06 - Save model to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-1127
2024-10-17 14:21:07 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-dev dataset after epoch 7.0:
2024-10-17 14:21:07 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2024-10-17 14:21:07 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:21:07 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:21:07 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan


  eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
  eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
  eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
  eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
  eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
  eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
  eval_pearson_dot, _ = pearsonr(labels, dot_products)
  eval_spearman_dot, _ = spearmanr(labels, dot_products)


2024-10-17 14:21:44 - Saving model checkpoint to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-1288
2024-10-17 14:21:44 - Save model to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-1288
2024-10-17 14:21:44 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-dev dataset after epoch 8.0:
2024-10-17 14:21:44 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2024-10-17 14:21:44 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:21:44 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:21:44 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan


  eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
  eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
  eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
  eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
  eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
  eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
  eval_pearson_dot, _ = pearsonr(labels, dot_products)
  eval_spearman_dot, _ = spearmanr(labels, dot_products)


2024-10-17 14:22:21 - Saving model checkpoint to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-1449
2024-10-17 14:22:21 - Save model to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-1449
2024-10-17 14:22:21 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-dev dataset after epoch 9.0:
2024-10-17 14:22:22 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2024-10-17 14:22:22 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:22:22 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:22:22 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan


  eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
  eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
  eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
  eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
  eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
  eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
  eval_pearson_dot, _ = pearsonr(labels, dot_products)
  eval_spearman_dot, _ = spearmanr(labels, dot_products)


{'loss': 0.0233, 'grad_norm': 2.373992681503296, 'learning_rate': 1.5182884748102142e-06, 'epoch': 9.32}
2024-10-17 14:22:57 - Saving model checkpoint to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-1610
2024-10-17 14:22:57 - Save model to /home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-1610
2024-10-17 14:22:58 - EmbeddingSimilarityEvaluator: Evaluating the model on the sts-dev dataset after epoch 10.0:
2024-10-17 14:22:58 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2024-10-17 14:22:58 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:22:58 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2024-10-17 14:22:58 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan
{'train_runtime': 387.4178, 'train_samples_per_second': 265.966, 'train_steps_per_second': 4.156, 'train_loss': 0.7442021006382771, 'epoch': 10.0}
2024-10-17 14:22:58 - Use pytorch device_name: cuda
2024-

  eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
  eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
  eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
  eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
  eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
  eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
  eval_pearson_dot, _ = pearsonr(labels, dot_products)
  eval_spearman_dot, _ = spearmanr(labels, dot_products)
  eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
  eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
  eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
  eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
  eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
  eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
  eval_pearson_dot, _ = pearsonr(labels, dot_products)
  eval_spearman_dot, _ = spearmanr(labels

{'sts-test_pearson_cosine': nan,
 'sts-test_spearman_cosine': nan,
 'sts-test_pearson_manhattan': nan,
 'sts-test_spearman_manhattan': nan,
 'sts-test_pearson_euclidean': nan,
 'sts-test_spearman_euclidean': nan,
 'sts-test_pearson_dot': nan,
 'sts-test_spearman_dot': nan,
 'sts-test_pearson_max': nan,
 'sts-test_spearman_max': nan}