In [1]:
from src.Case_Builder import (device,
                              bert_version,
                              bert_model_name, 
                              dataset_name
                              )
from transformers import BertTokenizer, BertModel
import pandas as pd
import torch
import torch.nn.functional as F

In [2]:
# Show which compute device was selected in src.Case_Builder
print(device)

cpu


In [3]:
# Load the pretrained BERT tokenizer and encoder named by `bert_model_name`
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
model = BertModel.from_pretrained(bert_model_name)
# Move the encoder to the configured device; being the last expression of the
# cell, the returned module's repr is what the cell displays
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [4]:
# Read the raw train / validation / test splits of the selected dataset from JSON
data_train = pd.read_json(f'src/dataset/raw/{dataset_name}/train.json')
data_val = pd.read_json(f'src/dataset/raw/{dataset_name}/val.json')
data_test = pd.read_json(f'src/dataset/raw/{dataset_name}/test.json')

In [5]:
# Report the dataset name and the (rows, columns) shape of each loaded split
print(f"""
Dataset name: {dataset_name}
Train Data: {data_train.shape}
Validation Data: {data_val.shape}
Test Data: {data_test.shape}
""")


Dataset name: elife
Train Data: (4346, 8)
Validation Data: (241, 8)
Test Data: (241, 8)



In [6]:
def cosine_similarity(tensor1, tensor2):
    """
    Compute the cosine similarity between two embedding vectors.

    Each argument may be a bare vector of shape [D] or a single-row batch of
    shape [1, D] (e.g. [1, 768] for a BERT pooled output). Bare vectors are
    promoted to a single-row batch so both forms give the same result.

    Note: if an input holds several rows, the row-wise similarities are summed
    into one number, so pass one vector per argument.

    :param tensor1: PyTorch tensor of shape [D] or [1, D]
    :param tensor2: PyTorch tensor of shape [D] or [1, D]
    :return: Cosine similarity as a Python float
    """
    # Normalisation below runs along dim=1, which does not exist on a 1-D
    # tensor; add the missing batch dimension instead of failing.
    if tensor1.dim() == 1:
        tensor1 = tensor1.unsqueeze(0)
    if tensor2.dim() == 1:
        tensor2 = tensor2.unsqueeze(0)

    # Scale both vectors to unit L2 length
    tensor1_normalized = F.normalize(tensor1, p=2, dim=1)
    tensor2_normalized = F.normalize(tensor2, p=2, dim=1)

    # The dot product of unit vectors is their cosine similarity
    cosine_sim = torch.sum(tensor1_normalized * tensor2_normalized)

    return cosine_sim.item()

In [7]:
def _embed_texts(texts):
    """Return the BERT pooled output for each text in `texts`, one row per text."""
    tokens = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # The inputs must live on the same device as the model weights, otherwise
    # the forward pass fails as soon as the model is not on the CPU.
    tokens = tokens.to(next(model.parameters()).device)
    with torch.no_grad():
        return model(**tokens).pooler_output


def get_similarity(row):
    """
    Score every sentence of one record against its reference texts.

    The reference texts are, in order: the title, the abstract joined into a
    single string, and each keyword. Sentences and reference texts are embedded
    with the module-level BERT `model` (pooled output) and compared with cosine
    similarity.

    Relies on module-level state: `tokenizer` and `model` for the embeddings,
    and `tmp` / `idx` for the progress line. `idx` is incremented on every
    call, so callers reset both before each pass.

    :param row: record (e.g. a DataFrame row) providing `title`, `abstract`,
        `keywords` and `sections`, where `sections` is a list of lists of
        sentences.
    :return: one list per section; each item is a dict with the `sentence`
        text and `similarities`, a list with one score per reference text.
    """
    global tmp, idx
    print(f"{idx}|{len(tmp)}|{(idx/len(tmp)*100):.2f}", end='\r')

    idx += 1

    # Flatten the sections so all sentences are embedded in one pass
    sections = row['sections']
    all_sentences = [sentence for section in sections for sentence in section]
    if not all_sentences:
        # Nothing to score; the tokenizer cannot handle an empty batch
        return [[] for section in sections]

    # Build the reference texts. Title and abstract are each ONE reference:
    # calling list.extend() with a string would add every character as its own
    # entry, so they are added as whole strings instead.
    title, abstract, keywords = row['title'], row['abstract'], row['keywords']
    rag_ref = [
        title if isinstance(title, str) else " ".join(title),
        abstract if isinstance(abstract, str) else " ".join(abstract),
    ]
    rag_ref.extend([keywords] if isinstance(keywords, str) else keywords)

    keyword_embeddings = _embed_texts(rag_ref)
    sentence_embeddings = _embed_texts(all_sentences)

    # Pairwise cosine similarity: [sentences, 1, H] against [1, references, H]
    # broadcasts to a [sentences, references] matrix
    similarities = torch.nn.functional.cosine_similarity(
        sentence_embeddings.unsqueeze(1),
        keyword_embeddings.unsqueeze(0),
        dim=2)

    # Regroup the flat similarity rows back into the original section layout
    result_sections = []
    start_idx = 0
    for section in sections:
        section_length = len(section)
        section_similarities = similarities[start_idx : start_idx + section_length]
        start_idx += section_length

        result_section = [
            {
                "sentence": sentence,
                "similarities": section_similarities[i].tolist()
            }
            for i, sentence in enumerate(section)
        ]
        result_sections.append(result_section)

    return result_sections

In [8]:
# Number of leading rows kept from each split for this quick run. Set a value
# to None to keep the whole split (`iloc[:None]` selects every row).
N_TRAIN_ROWS, N_VAL_ROWS, N_TEST_ROWS = 5, 2, 3

# .copy() detaches each subset so that adding columns later does not touch a
# slice of the originally loaded frame
data_train = data_train.iloc[:N_TRAIN_ROWS].copy()
data_val = data_val.iloc[:N_VAL_ROWS].copy()
data_test = data_test.iloc[:N_TEST_ROWS].copy()

In [9]:
# get_similarity reports progress through the globals `tmp` (its length is the
# total) and `idx` (running counter), so both are reset before the pass.
tmp = data_train.copy()
idx = 1
data_train['sentences_similarity'] = data_train.apply(get_similarity, axis=1)

5|5|100.00

In [10]:
# Save the training split, including `sentences_similarity`, as a JSON list of records
data_train.to_json(f'src/dataset/clean/{dataset_name}/{bert_version}_train.json', orient='records')

In [11]:
# get_similarity reports progress through the globals `tmp` (its length is the
# total) and `idx` (running counter), so both are reset before the pass.
tmp = data_val.copy()
idx = 1
data_val['sentences_similarity'] = data_val.apply(get_similarity, axis=1)

2|2|100.00

In [12]:
# Save the validation split, including `sentences_similarity`, as a JSON list of records
data_val.to_json(f'src/dataset/clean/{dataset_name}/{bert_version}_validation.json', orient='records')

In [13]:
# get_similarity reports progress through the globals `tmp` (its length is the
# total) and `idx` (running counter), so both are reset before the pass.
tmp = data_test.copy()
idx = 1
data_test['sentences_similarity'] = data_test.apply(get_similarity, axis=1)

3|3|100.00

In [14]:
# Save the test split, including `sentences_similarity`, as a JSON list of records
data_test.to_json(f'src/dataset/clean/{dataset_name}/{bert_version}_test.json', orient='records')