In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
!pip install sentence_transformers 
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments, losses
from datasets import Dataset

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.0.1


  from tqdm.autonotebook import tqdm, trange
2024-07-08 08:00:23.185506: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 08:00:23.185632: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 08:00:23.316530: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Prepare datasets

Format of a sample data point
```json
[
    {
        "id": 4169,
        "href": "https://thuvienphapluat.vn/hoi-dap-phap-luat/5CFC0-hd-bao-nhieu-tuoi-moi-duoc-dang-ky-dao-tao-thuong-xuyen-hoc-nghe.html",
        "question": "Thời gian học mỗi buổi đào tạo thường xuyên học nghề tối đa bao nhiêu giờ?",
        "answer": "khi bạn tham gia chương trình đào tạo thường xuyên học nghề thì thời gian học mỗi buổi tối đa là 05 (năm) giờ và thời gian học trong một ngày tối đa là 08 (tám) giờ.",
        "relevant_laws": [
            {
                "name": "Điều 9 Thông tư 43/2015/TT-BLĐTBXH",
                "href": "https://thuvienphapluat.vn/van-ban/lao-dong-tien-luong/thong-tu-43-2015-tt-bldtbxh-dao-tao-thuong-xuyen-297839.aspx",
                "id_Law": "43/2015/TT-BLĐTBXH",
                "id_Chapter": 3,
                "id_Section": 1,
                "id_Article": 9
            }
        ],
        "annotation_list": []
    },
  ...
]
```

In [2]:
import json
import random

# Load the question and law data
with open('/kaggle/input/data-qa-991/question_9.91_train_gui_duy.json', 'r') as qftr:
    question_data_train = json.load(qftr)
        
with open('/kaggle/input/data-qa-991/question_9.91_test_gui_duy.json', 'r') as qfte:
    question_data_test = json.load(qfte)

with open('/kaggle/input/data-qa-991/law_nondup copy 11_modified.json', 'r') as lf:
    law_data = json.load(lf)

In [3]:
# Create a dictionary for quick lookup of laws by their IDs
law_dict = {}
for law in law_data:
    law_dict[law['id']] = law

In [4]:
#The find_hard_negatives function returns 5 random articles from the same law but different from the correct article
def find_hard_negatives(law, correct_chapter_id, correct_section_id, correct_article_id, num_articles=5):
    # Flatten all sections and articles in the law except the correct one
    articles = []
    for chapter in law['content']:
        for section in chapter['content_Chapter']:
            for article in section['content_Section']:
                if not (chapter['id_Chapter'] == correct_chapter_id and 
                        section['id_Section'] == correct_section_id and 
                        article['id_Article'] == correct_article_id):
                    articles.append(article['content_Article'])
    
    if len(articles) < num_articles:
        return articles  # If there are fewer articles than needed, return all of them
    return random.sample(articles, num_articles) if articles else None

#The find_soft_negatives function returns 5 random articles from different laws
def find_soft_negatives(law_dict, correct_law_id, num_articles=5):
    articles = []
    law_ids = list(law_dict.keys())
    if correct_law_id in law_ids:
        law_ids.remove(correct_law_id)

    while len(articles) < num_articles and law_ids:
        random_law_id = random.choice(law_ids)
        random_law = law_dict[random_law_id]
        random_chapter = random.choice(random_law['content'])
        random_section = random.choice(random_chapter['content_Chapter'])
        random_article = random.choice(random_section['content_Section'])
        articles.append(random_article['content_Article'])
        law_ids.remove(random_law_id)  # Remove the used law ID to avoid duplicates

    return articles if articles else None

# Function to process the questions, match the relevant laws and add negative samples used for Contrastive Learning
def process_questions(question_data, law_dict):
    for question in question_data:
        for relevant_law in question['relevant_laws']:
            law_id = relevant_law['id_Law']
            chapter_id = relevant_law['id_Chapter']
            section_id = relevant_law['id_Section']
            article_id = relevant_law['id_Article']

            # Find the corresponding law
            if law_id in law_dict:
                law = law_dict[law_id]
                for chapter in law['content']:
                    if chapter['id_Chapter'] == chapter_id:
                        for section in chapter['content_Chapter']:
                            if section['id_Section'] == section_id:
                                for article in section['content_Section']:
                                    if article['id_Article'] == article_id:
                                        # Add the content of the article to the relevant law in the question
                                        relevant_law['content'] = article['content_Article']

                                        # Add soft negatives
                                        relevant_law['soft_negative'] = find_soft_negatives(law_dict, law_id)

                                        # Add hard negatives
                                        relevant_law['hard_negative'] = find_hard_negatives(law, chapter_id, section_id, article_id)

process_questions(question_data_train, law_dict)
process_questions(question_data_test, law_dict)

In [5]:
# Save the updated question data
with open('updated_question_file_train_5.json', 'w') as f:
    json.dump(question_data_train, f, ensure_ascii=False, indent=4)

with open('updated_question_file_test_5.json', 'w') as f:
    json.dump(question_data_test, f, ensure_ascii=False, indent=4)

In [6]:
#The prepare_triplets function returns triplets used for Contrastive Learning with Triplet Loss
def prepare_triplets(data):
    triplets = {
        'anchor': [],
        'positive': [],
        'negative': []
    }
    for item in data:
        anchor = item['question']
        
        for relevant_law in item["relevant_laws"]:
            positive = relevant_law["content"]
            soft_negative = relevant_law['soft_negative']
            hard_negative = relevant_law['hard_negative']
            if soft_negative:
                for soft in soft_negative:
                    triplets['anchor'].append(anchor)
                    triplets['positive'].append(positive)
                    triplets['negative'].append(soft)
            if hard_negative:
                for hard in hard_negative:
                    triplets['anchor'].append(anchor)
                    triplets['positive'].append(positive)
                    triplets['negative'].append(hard)
                
    return triplets

with open('/kaggle/working/updated_question_file_train_5.json', 'r', encoding = 'utf-8') as f:
    train_data = json.load(f)
with open('/kaggle/working/updated_question_file_test_5.json', 'r', encoding = 'utf-8') as f:
    eval_data = json.load(f)
    
train_triplets = prepare_triplets(train_data)
eval_triplets = prepare_triplets(eval_data)

# Fine-tuning Sentence BERT using Contrastive Learning with Triplet Loss

In [7]:
# Load the pre-trained sBERT model on Vietnamese text
model = SentenceTransformer("keepitreal/vietnamese-sbert")
train_dataset = Dataset.from_dict(train_triplets)
eval_dataset = Dataset.from_dict(eval_triplets)

# Define loss function
loss = losses.TripletLoss(model=model)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/752 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
training_args = SentenceTransformerTrainingArguments(
    output_dir="./results",
    eval_strategy="steps",  
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=2, #5
    logging_steps=27217,
    save_steps=27217,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none",
#     load_best_model_at_end=True,
    
)
trainer = SentenceTransformerTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
)
trainer.train()

Step,Training Loss,Validation Loss
27217,0.406,0.478832


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=27218, training_loss=0.4059453848326475, metrics={'train_runtime': 20830.4703, 'train_samples_per_second': 15.679, 'train_steps_per_second': 1.307, 'total_flos': 0.0, 'train_loss': 0.4059453848326475, 'epoch': 2.0})

In [9]:
# Continue training on the eval_dataset
# training_args.logging_steps = 
trainer = SentenceTransformerTrainer(
    model=model,
    args=training_args,
    train_dataset=eval_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
)
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=6836, training_loss=0.22146508231924736, metrics={'train_runtime': 4983.7428, 'train_samples_per_second': 16.46, 'train_steps_per_second': 1.372, 'total_flos': 0.0, 'train_loss': 0.22146508231924736, 'epoch': 2.0})

In [10]:
trainer.save_model()

# Infer

In [11]:
# Reload the fine-tuned model
model = SentenceTransformer(model_name_or_path="/kaggle/working/results", local_files_only=True)

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
import torch
from tqdm import tqdm
import numpy as np

# We use cosine similarity to calculate the relevance score of the embeddings of a query with an article
def cosine_similarity_sklearn(embedding1, embedding2):
    embedding1 = np.array(embedding1).reshape(1, -1)
    embedding2 = np.array(embedding2).reshape(1, -1)
    return cosine_similarity(embedding1, embedding2)[0][0]

def get_relevance_scores(model, scores):
    sbert_scores = []
    for idx, score in tqdm(scores.iterrows()):
        query = score['query']
        article = score['content']
        embeddings = model.encode([query, article], show_progress_bar=False)
        
        bert_score = float(model.similarity(embeddings[0], embeddings[1]).squeeze(0))
#         print(bert_score)
        sbert_scores.append(bert_score)
        
    scores["sbert_contras_score"] = sbert_scores
    return scores

In [13]:
# Example usage
sentences_pos = ["Việc bán ngôi nhà duy nhất vợ chồng đang ở có thể được thực hiện chỉ cần có sự đồng ý của chồng mà không cần sự đồng ý của vợ, đúng hay sai?", "Giao dịch liên quan đến nhà là nơi ở duy nhất của vợ chồng Việc xác lập, thực hiện, chấm dứt các giao dịch liên quan đến nhà là nơi ở duy nhất của vợ chồng phải có sự thỏa thuận của vợ chồng. Trong trường hợp nhà ở thuộc sở hữu riêng của vợ hoặc chồng thì chủ sở hữu có quyền xác lập, thực hiện, chấm dứt giao dịch liên quan đến tài sản đó nhưng phải bảo đảm chỗ ở cho vợ chồng."]
sentences_neg = ["Việc bán ngôi nhà duy nhất vợ chồng đang ở có thể được thực hiện chỉ cần có sự đồng ý của chồng mà không cần sự đồng ý của vợ, đúng hay sai?", "Chiếm hữu, sử dụng, định đoạt tài sản riêng 1. Vợ, chồng có quyền chiếm hữu, sử dụng, định đoạt tài sản riêng của mình; nhập hoặc không nhập tài sản riêng vào tài sản chung. 2. Trong trường hợp vợ hoặc chồng không thể tự mình quản lý tài sản riêng và cũng không ủy quyền cho người khác quản lý thì bên kia có quyền quản lý tài sản đó. Việc quản lý tài sản phải bảo đảm lợi ích của người có tài sản. 3. Nghĩa vụ riêng về tài sản của mỗi người được thanh toán từ tài sản riêng của người đó. 4. Trong trường hợp vợ, chồng có tài sản riêng mà hoa lợi, lợi tức từ tài sản riêng đó là nguồn sống duy nhất của gia đình thì việc định đoạt tài sản này phải có sự đồng ý của chồng, vợ."]

embeddings_pos = model.encode(sentences_pos)
print(cosine_similarity_sklearn(embeddings_pos[0], embeddings_pos[1]))

embeddings_neg = model.encode(sentences_neg)
print(cosine_similarity_sklearn(embeddings_neg[0], embeddings_neg[1]))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.6212554


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.5437257


In [14]:
# Load other features between a query and an article (Top-100 BM25-score articles)
df = pd.read_csv("/kaggle/input/top100-bm25-qld-alqac2024/vimonot5/bm25_top100_priv_23_wl_vimonot5.csv")
df1 = pd.read_csv("/kaggle/input/top100-bm25-qld-alqac2024/vimonot5/bm25_top100_pub_test_24_wl_vimonot5.csv")
df2 = pd.read_csv("/kaggle/input/top100-bm25-qld-alqac2024/vimonot5/bm25_top100_train_2224_wl_vimonot5.csv")

In [15]:
# Add a new sbert-score column
df["sbert_contras_score"] = None
df = get_relevance_scores(model, df)

df1["sbert_contras_score"] = None
df1 = get_relevance_scores(model, df1)

df2["sbert_contras_score"] = None
df2 = get_relevance_scores(model, df2)

11000it [04:24, 41.63it/s]
20800it [08:22, 41.37it/s]
38100it [14:51, 42.73it/s]


In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def scale_scores_by_query(df, score_column, group_column):
    scaler = MinMaxScaler()
    
    def scale_group(group):
        group[[score_column]] = scaler.fit_transform(group[[score_column]])
        return group
    
    scaled_df = df.groupby(group_column).apply(scale_group).reset_index(drop=True)
    return scaled_df


# Apply Min-Max scaling
df['sbert_contras_score_scaled'] = df['sbert_contras_score']
df= scale_scores_by_query(df, 'sbert_contras_score_scaled', 'question_id')

df1['sbert_contras_score_scaled'] = df1['sbert_contras_score']
df1= scale_scores_by_query(df1, 'sbert_contras_score_scaled', 'question_id')

df2['sbert_contras_score_scaled'] = df2['sbert_contras_score']
df2= scale_scores_by_query(df2, 'sbert_contras_score_scaled', 'question_id')

  scaled_df = df.groupby(group_column).apply(scale_group).reset_index(drop=True)
  scaled_df = df.groupby(group_column).apply(scale_group).reset_index(drop=True)
  scaled_df = df.groupby(group_column).apply(scale_group).reset_index(drop=True)


In [17]:
from sklearn.metrics import precision_score, recall_score

def calculate_f2(precision, recall):
    return 5 * precision * recall / (4 * precision + recall)

def calculate_best_threshold(df, score_column, label_column):
    best_threshold = 0.0
    best_precision = 0.0
    best_recall = 0.0
    best_f2 = 0.0
    
    thresholds = np.arange(0.0,1.0,0.01)
    for threshold in thresholds:
        df['predicted_label'] = (df[score_column] >= threshold).astype(int)
        precision = precision_score(df[label_column], df['predicted_label'])
        recall = recall_score(df[label_column], df['predicted_label'])
        f2 = calculate_f2(precision, recall)
        
        if f2 > best_f2:
            best_threshold = threshold
            best_precision = precision
            best_recall = recall
            best_f2 = f2
            
    print(f"Best Threshold: {best_threshold}")
    print(f"Precision at Best Threshold: {best_precision}")
    print(f"Recall at Best Threshold: {best_recall}")
    print(f"F2 Score at Best Threshold: {best_f2}")
    
    return best_threshold, best_precision, best_recall, best_f2

In [18]:
full_df = pd.concat([df, df1, df2])
calculate_best_threshold(full_df, 'sbert_contras_score_scaled', 'label')

Best Threshold: 0.98
Precision at Best Threshold: 0.3350050150451354
Recall at Best Threshold: 0.4691011235955056
F2 Score at Best Threshold: 0.43433029908972687


(0.98, 0.3350050150451354, 0.4691011235955056, 0.43433029908972687)