In [1]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, pipeline
from adapters import AutoAdapterModel

# tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')

# models
model = AutoAdapterModel.from_pretrained('allenai/specter2_base')
model.load_adapter(
    adapter_name_or_path='allenai/specter2_proximity',
    source='hf',
    set_active=True,
    weights_only=True,
)
# allenai/specter2_<proximity|classification|regression|adhoc_query>

# feature_extractor = pipeline(
#     task="feature-extraction",
#     model=model,
#     tokenizer=tokenizer,
#     device="mps",
# )

  from .autonotebook import tqdm as notebook_tqdm
BertAdapterModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 34169.48it/s]
  state_dict = torch.load(weights_file, map_location="cpu")


'[PRX]'

In [2]:
df_sample = pd.read_csv("SciSciNet_Sample_Journals_subset.csv")

focal_papers = []
for idx, row in df_sample.iterrows():
    focal_papers.append(
        {
            'title': row["PaperTitle"],
            'abstract': open(f"abstracts/{row['PaperID']}.txt", 'r').read()
        }
    )

focal_batch = [d['title'] + '[SEP]' + (d.get('abstract') or '') for d in focal_papers]

In [3]:
model.to("mps")
all_embeddings = []

CHUNK_SIZE = 512

with torch.no_grad():
    for i in range(0, len(focal_batch), CHUNK_SIZE):
        sub_batch = focal_batch[i : i + CHUNK_SIZE]
        encoded = tokenizer(
            sub_batch,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        ).to("mps")

        outputs = model(**encoded)

        cls_vec = outputs.last_hidden_state[:, 0, :]
        all_embeddings.append(cls_vec.cpu())

final_embeddings = torch.cat(all_embeddings, dim=0).numpy()

# np.save("focal_abstract_embeddings.npy", final_embeddings)

In [None]:
sample_paper_titles = df_sample["PaperTitle"].tolist()
sample_raw_embeddings = feature_extractor(
    sample_paper_titles,
    batch_size=512,
    padding=True,
    return_tensors=True,
)
sample_final_embeddings = torch.cat(
    [embedding [:, 0, :] for embedding in sample_raw_embeddings]
).numpy()

# np.save("focal_embeddings.npy", sample_final_embeddings)

In [2]:
generated_titles = df_generated["Title"].tolist()
generated_raw_embeddings = feature_extractor(
    generated_titles,
    batch_size=512,
    padding=True,
    return_tensors=True,
)
generated_final_embeddings = torch.cat(
    [embedding[:, 0, :] for embedding in generated_raw_embeddings]
).numpy()

# np.save("generated_embeddings.npy", generated_final_embeddings)

In [None]:
ground_truth_paper_titles = df_ground_truth["PaperTitle"].tolist()
ground_truth_raw_embeddings = feature_extractor(
    ground_truth_paper_titles,
    batch_size=512,
    padding=True,
    return_tensors=True,
)
ground_truth_final_embeddings = torch.cat(
    [embedding [:, 0, :] for embedding in ground_truth_raw_embeddings]
).numpy()

# np.save("ground_truth_embeddings.npy", ground_truth_final_embeddings)