In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
from src.Case_Builder import (device,
                              bert_version,
                              bert_model_name,
                              genai_model_name,
                              prompt_strategy_used,
                              dataset_name
                              )

In [2]:
# Load the cleaned train / validation / test splits for the configured
# dataset and BERT version (read in that order, one JSON file per split).
data_train, data_val, data_test = (
    pd.read_json(split_path)
    for split_path in (
        f'src/dataset/clean/{dataset_name}/{bert_version}_train.json',
        f'src/dataset/clean/{dataset_name}/{bert_version}_validation.json',
        f'src/dataset/clean/{dataset_name}/{bert_version}_test.json',
    )
)

In [3]:
# Load the tokenizer and the encoder named by `bert_model_name`.
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
# `.eval()` returns the module itself, so the model can be switched to
# evaluation mode (used here for inference) in the same expression.
model = AutoModel.from_pretrained(bert_model_name).eval()

# Batch embedding işlemi
def get_embeddings(texts, batch_size=256):
    """Embed ``texts`` in batches with the module-level ``tokenizer`` and ``model``.

    Each batch is tokenized (padded, truncated to at most 512 tokens), moved
    to the device the model lives on, and passed through the model; the
    model's ``pooler_output`` is collected as NumPy data on the CPU.

    Args:
        texts: Sliceable sequence of strings (e.g. a list) to embed.
        batch_size: Number of texts per forward pass. Must be >= 1.

    Returns:
        A list with one NumPy array per input text (the rows of each batch's
        ``pooler_output``), in input order. Empty input yields ``[]``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently drop every
        # input; zero already raised ValueError inside range().
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    embeddings = []
    # Inputs must sit on the same device as the model's weights, otherwise
    # the forward pass fails as soon as the model is moved off the CPU.
    target_device = model.device
    with torch.no_grad():  # inference only: skip gradient tracking
        for start in tqdm(range(0, len(texts), batch_size)):
            # Slicing clamps at the end of the sequence, so the final
            # (possibly shorter) batch needs no explicit bound.
            batch_texts = texts[start:start + batch_size]
            tokens = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=512,
            ).to(target_device)
            outputs = model(**tokens)
            # Bring results back to the CPU before converting to NumPy.
            batch_embeddings = outputs.pooler_output.cpu().numpy()
            embeddings.extend(batch_embeddings)
    return embeddings

In [4]:
# Compute an embedding for every title and store it in a new
# `title_embedding` column, processing train, then test, then validation.
for split_frame in (data_train, data_test, data_val):
    split_titles = split_frame['title'].tolist()
    split_frame['title_embedding'] = get_embeddings(split_titles)

100%|██████████| 17/17 [01:02<00:00,  3.65s/it]
100%|██████████| 1/1 [00:03<00:00,  3.61s/it]
100%|██████████| 1/1 [00:02<00:00,  2.97s/it]


In [5]:
# Write each split (now including `title_embedding`) to its JSON file as a
# list of records. These are the same paths the splits were read from, so
# the input files are overwritten.
for split_frame, out_path in (
    (data_train, f'src/dataset/clean/{dataset_name}/{bert_version}_train.json'),
    (data_val, f'src/dataset/clean/{dataset_name}/{bert_version}_validation.json'),
    (data_test, f'src/dataset/clean/{dataset_name}/{bert_version}_test.json'),
):
    split_frame.to_json(out_path, orient='records')

In [6]:
# Preview the first five training rows, including the `title_embedding` column.
data_train.head(n=5)

Unnamed: 0,id,year,title,sections,headings,abstract,summary,keywords,sentences_similarity,title_embedding
0,elife-35500-v1,2018,National and regional seasonal dynamics of all...,[[It is well-established that death rates vary...,"[Introduction, Results, Discussion, Materials ...","[In temperate climates , winter deaths exceed ...","[In the USA , more deaths happen in the winter...",[epidemiology and global health],[[{'sentence': 'It is well-established that de...,"[-0.562906, 0.3944506, 0.99972486, -0.98139304..."
1,elife-48378-v2,2019,Complement and CD4+ T cells drive context-spec...,[[Dysregulated complement activation is increa...,"[Introduction, Results, Discussion, Materials ...",[Whether complement dysregulation directly con...,[Most people have likely experienced the disco...,"[microbiology and infectious disease, immunolo...",[[{'sentence': 'Dysregulated complement activa...,"[-0.70537615, 0.39024797, 0.99982345, -0.98723..."
2,elife-04494-v1,2015,Phenotypic complementation of genetic immunode...,"[[HOIL-1 ( encoded by the RBCK1 gene ) , HOIP ...","[Introduction, Results, Discussion, Materials ...",[Variation in the presentation of hereditary i...,[The immune system protects an individual from...,"[microbiology and infectious disease, immunolo...",[[{'sentence': 'HOIL-1 ( encoded by the RBCK1 ...,"[-0.5257905, 0.23297422, 0.99937505, -0.980458..."
3,elife-12352-v2,2016,Cascade of neural processing orchestrates cogn...,[[Flexible control of cognitive processes is f...,"[Introduction, Results, Discussion, Materials ...",[Rapid and flexible interpretation of conflict...,[The brain adapts to control our behavior in d...,[neuroscience],[[{'sentence': 'Flexible control of cognitive ...,"[-0.7957994, 0.42419413, 0.9999085, -0.98985, ..."
4,elife-05413-v2,2015,Structural dynamics of myosin 5 during process...,[[Myosin 5a moves in a hand-over-hand fashion ...,"[Introduction, Results, Discussion, Materials ...",[Myosin 5a is a dual-headed molecular motor th...,[Cells use motor proteins that to move organel...,[structural biology and molecular biophysics],[[{'sentence': 'Myosin 5a moves in a hand-over...,"[-0.64044166, 0.3628359, 0.9991132, -0.9735480..."
