In [10]:
from pathlib import Path

import pandas as pd
import torch
import torch.nn.functional as F
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel

In [11]:
# Inference is pinned to the CPU. The GPU-aware alternative would be
#   torch.device("cuda" if torch.cuda.is_available() else "cpu")
# but every tensor fed to the model must then live on that same device.
device = torch.device("cpu")
print(device)

cpu


In [12]:
# Load the pretrained BERT checkpoint together with its matching tokenizer.
model_name = "bert-base-cased"

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# .to() hands back the module itself, so keeping it as the cell's last
# expression also echoes the loaded architecture.
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [13]:
# Read the raw train / val / test splits of both corpora (PLOS first, then eLife).
_splits = ('train', 'val', 'test')

data_plos_train, data_plos_val, data_plos_test = (
    pd.read_json(f'src/dataset/raw/plos/{split}.json') for split in _splits
)

data_elife_train, data_elife_val, data_elife_test = (
    pd.read_json(f'src/dataset/raw/elife/{split}.json') for split in _splits
)

In [14]:
def _merge_and_shuffle(elife_frame, plos_frame):
    """Stack the eLife rows on top of the PLOS rows, shuffle with a fixed seed, renumber from 0."""
    stacked = pd.concat([elife_frame, plos_frame])
    shuffled = stacked.sample(frac=1, random_state=42)
    return shuffled.reset_index(drop=True)


data_train = _merge_and_shuffle(data_elife_train, data_plos_train)
data_val = _merge_and_shuffle(data_elife_val, data_plos_val)
data_test = _merge_and_shuffle(data_elife_test, data_plos_test)

In [15]:
# Report (rows, columns) of each merged split, framed by blank lines.
print(
    "\n"
    f"Train Data: {data_train.shape}\n"
    f"Validation Data: {data_val.shape}\n"
    f"Test Data: {data_test.shape}\n"
)


Train Data: (29119, 9)
Validation Data: (1617, 9)
Test Data: (1617, 9)



In [16]:
def cosine_similarity(tensor1, tensor2):
    """
    Return the cosine similarity of two row vectors as a Python float.

    :param tensor1: PyTorch tensor, documented for shape [1, 768]
    :param tensor2: PyTorch tensor, documented for shape [1, 768]
    :return: Cosine similarity value (scalar)
    """
    # Scale each row to unit L2 norm so that a plain dot product yields the cosine.
    unit_a, unit_b = (F.normalize(t, p=2, dim=1) for t in (tensor1, tensor2))

    # Element-wise product summed over every entry == dot product of the unit vectors.
    return (unit_a * unit_b).sum().item()

In [17]:
def _embed_texts(texts, batch_size):
    """Return the BERT pooler embeddings of ``texts`` as one CPU tensor, computed batch by batch."""
    target_device = model.device
    pooled_chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            # Padding is applied per batch, so a single very long sentence only
            # inflates the batch it belongs to.
            encoded = tokenizer(
                texts[start:start + batch_size],
                return_tensors="pt",
                padding=True,
                truncation=True,
            )
            # The tokenizer builds its tensors on the CPU; the model only accepts
            # inputs that live on its own device.
            encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}
            pooled_chunks.append(model(**encoded).pooler_output.cpu())
    return torch.cat(pooled_chunks, dim=0)


def get_similarity(row, batch_size=32):
    """
    Score every sentence of an article against each of the article's keywords.

    Keywords and sentences are embedded with the module-level BERT ``model``
    (pooler output) and compared with cosine similarity.

    Progress is reported through the module-level ``tmp`` (only its length is
    read) and ``idx`` (incremented on every call); the caller must assign both
    before applying this function.

    :param row: mapping / DataFrame row exposing ``keywords`` (a string or a list
        of strings) and ``sections`` (a list of sections, each a list of sentence
        strings) -- layout assumed from how the fields are consumed here
    :param batch_size: maximum number of texts sent through the model at once
    :return: one list per section, each holding
        ``{"sentence": ..., "similarities": [one float per keyword]}`` dicts in
        the original sentence order; ``similarities`` is empty when the row has
        no keywords
    :raises ValueError: if ``batch_size`` is smaller than 1
    """
    global tmp, idx

    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Progress line, rewritten in place via the carriage return.
    total = len(tmp)
    percent = idx / total * 100 if total else 0.0
    print(f"{idx}|{total}|{percent:.2f}", end='\r')

    idx += 1

    # A bare string is treated as a single keyword, as the tokenizer would.
    keywords = row['keywords']
    keywords = [keywords] if isinstance(keywords, str) else list(keywords)

    # Flatten the sections so all sentences can be embedded together.
    sections = [list(section) for section in row['sections']]
    all_sentences = [sentence for section in sections for sentence in section]

    if keywords and all_sentences:
        keyword_embeddings = _embed_texts(keywords, batch_size)
        sentence_embeddings = _embed_texts(all_sentences, batch_size)

        # Broadcast to compare every sentence with every keyword.
        similarities = torch.nn.functional.cosine_similarity(
            sentence_embeddings.unsqueeze(1),  # (n_sentences, 1, hidden_size)
            keyword_embeddings.unsqueeze(0),   # (1, n_keywords, hidden_size)
            dim=2,
        ).tolist()  # n_sentences rows of n_keywords floats
    else:
        # Nothing to compare: skip the model instead of feeding it an empty batch.
        similarities = [[] for _ in all_sentences]

    # Regroup the flat similarity rows back into the original section layout.
    result_sections = []
    offset = 0
    for section in sections:
        section_rows = similarities[offset:offset + len(section)]
        offset += len(section)
        result_sections.append([
            {"sentence": sentence, "similarities": scores}
            for sentence, scores in zip(section, section_rows)
        ])

    return result_sections

In [None]:
# Reset the progress globals read by get_similarity (len(tmp) is the total,
# idx the running count), then score every training article row by row.
tmp = data_train.copy()
idx = 0
data_train['sentences_similarity'] = data_train.apply(get_similarity, axis='columns')

0|29119|0.00

In [None]:
# Reset the progress globals read by get_similarity (len(tmp) is the total,
# idx the running count), then score every validation article row by row.
tmp = data_val.copy()
idx = 0
data_val['sentences_similarity'] = data_val.apply(get_similarity, axis='columns')

In [None]:
# Reset the progress globals read by get_similarity (len(tmp) is the total,
# idx the running count), then score every test article row by row.
tmp = data_test.copy()
idx = 0
data_test['sentences_similarity'] = data_test.apply(get_similarity, axis='columns')

In [None]:
# Persist the enriched splits as JSON records. No earlier cell creates the
# output folder, so make sure it exists first: failing here would throw away
# the whole embedding pass.
clean_dir = Path('src/dataset/clean')
clean_dir.mkdir(parents=True, exist_ok=True)

data_train.to_json(clean_dir / 'train.json', orient='records')
data_val.to_json(clean_dir / 'validation.json', orient='records')
data_test.to_json(clean_dir / 'test.json', orient='records')