Specter

In [37]:
from transformers import AutoTokenizer, AutoModel

# Pull the pretrained SPECTER checkpoint from the Hugging Face hub:
# first the tokenizer, then the encoder weights.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path='allenai/specter')
model = AutoModel.from_pretrained(pretrained_model_name_or_path='allenai/specter')

# Sample corpus used for the demo: a list of dicts, one per paper, each with a
# 'title' and an 'abstract' string. The abstracts below are shortened with '...'.
papers = [
    {'title': 'Biogeographic Ancestry and Socioeconomic Outcomes in the Americas: a Meta-analysis',
     'abstract': 'Narrative reports suggest that socioeconomic status (SES) outcomes correlate with biogeographical ancestry (BGA) in the Americas. We conducted a meta-analysis to quantify this association... Implications for future studies are discussed.'},

    {'title': 'Learning from critical care management of sheep receiving extracorporeal membrane oxygenation for respiratory failure: using machine learning to predict lung injury as a tool for processing large clinical datasets',
     'abstract': 'Background: Numerous successful therapies developed in preclinical studies... of the severe coronavirus disease 2019 (COVID-19) in humans.'},

    {'title': 'A signal detection theoretic argument against claims of unmeritocratic faculty hiring',
     'abstract': 'To get a faculty job, graduating doctoral students must demonstrate evidence of research ability... difficulty in empirically demonstrating that it is not so.'},

    {'title': 'Machine learning-assisted directed evolution navigates a combinatorial epistatic fitness landscape with algorithmically generated training sets',
     'abstract': 'Engineering proteins with novel functions is challenging because of the immense size of sequence space... This approach provides a general framework for machine learning-assisted protein engineering.'},

    {'title': 'A tale of two chromosomes: a comprehensive comparison of the Y chromosome and its homolog the X chromosome',
     'abstract': 'The human Y chromosome has been historically understudied... shedding light on the unique evolutionary dynamics of both chromosomes.'},

    {'title': 'Improving reproducibility and generalizability in animal research: A role for multiple lab collaboration',
     'abstract': 'Animal research often suffers from poor reproducibility... Potential benefits and challenges of this approach are explored.'},

    {'title': 'Patterns of gene expression associated with autism spectrum disorder in the human brain',
     'abstract': 'Autism spectrum disorder (ASD) is associated with widespread gene expression differences... potential molecular signatures underlying ASD.'},

    {'title': 'Quantifying the social media attention of scientific publications: a large-scale study',
     'abstract': 'The growing use of social media has transformed scholarly communication... Our findings highlight emerging patterns of attention.'},

    {'title': 'Using natural language processing to extract biological pathways from scientific literature',
     'abstract': 'Biological knowledge is increasingly encoded in unstructured text... outperforming several baseline approaches.'},

    {'title': 'Evaluating researcher career trajectories using bibliometric indicators',
     'abstract': 'Bibliometric indicators are widely used for research evaluation... implications for career assessment policies.'},

    {'title': 'The role of collaboration networks in shaping scientific impact',
     'abstract': 'Scientific collaboration has grown dramatically... Our findings support the importance of network structures in scholarly impact.'},

    {'title': 'Genomic determinants of drug resistance in emerging pathogens: a machine learning approach',
     'abstract': 'Drug resistance poses a major threat to global health... demonstrating strong predictive performance.'},

    {'title': 'High-resolution mapping of transcription factor binding using improved ChIP-seq workflows',
     'abstract': 'ChIP-seq is widely used for mapping protein-DNA interactions... improvements to experimental workflows are presented.'},

    {'title': 'Educational data mining: predicting student performance using machine learning models',
     'abstract': 'Educational institutions increasingly adopt data-driven tools... with practical implications for interventions.'},

    {'title': 'Characterizing peer review text to understand reviewer behavior',
     'abstract': 'Peer review is central to scientific quality control... This study provides insights into reviewer behavior patterns.'},

    {'title': 'Assessing reproducibility in computational biology through standardized workflows',
     'abstract': 'Reproducibility remains a major challenge in computational biology... leading to more reliable research outputs.'},

    {'title': 'Mapping the global landscape of preprint adoption in biological sciences',
     'abstract': 'Preprints have become an important tool for rapid dissemination... factors influencing adoption across regions.'},

    {'title': 'Information extraction from biomedical text using deep neural networks',
     'abstract': 'Deep learning has transformed biomedical NLP... achieving state-of-the-art performance.'},

    {'title': 'Modeling scientific collaboration dynamics using temporal network analysis',
     'abstract': 'Understanding how collaborations evolve over time... providing new insights into scientific teamwork.'},

    {'title': 'Predicting grant funding outcomes using machine learning and text analysis',
     'abstract': 'Grant funding is a critical component of scientific progress... highlighting opportunities and limitations.'}
]

import torch  # local import: this cell runs before the notebook's main import cell

# Build one input string per paper: title, the tokenizer's separator token,
# then the abstract (a missing or None abstract becomes an empty string).
title_abs = [d['title'] + tokenizer.sep_token + (d.get('abstract') or '') for d in papers]
# Tokenize as a single padded batch, truncating each sequence to 512 tokens.
inputs = tokenizer(title_abs, padding=True, truncation=True, return_tensors="pt", max_length=512)
# Inference only: disable autograd so no computation graph is built or kept
# alive by the returned tensors.
with torch.no_grad():
    result = model(**inputs)
# Take the first token (position 0) of every sequence as that paper's embedding.
embeddings = result.last_hidden_state[:, 0, :]
print(embeddings)
print(embeddings.shape)


tensor([[-0.4822,  0.3710,  0.0151,  ...,  0.4635, -0.0923,  0.3282],
        [ 0.6337, -0.1935, -0.7914,  ...,  0.5908, -0.1041,  0.5936],
        [-0.7441,  0.7889, -0.1690,  ..., -0.2301,  0.3148,  0.8052],
        ...,
        [-0.7129,  1.4612, -0.2798,  ...,  0.3722,  0.4748,  0.7545],
        [-0.1843,  0.5408, -0.3659,  ...,  1.2115,  0.5760,  0.9252],
        [-0.7109,  1.0945, -0.8484,  ...,  1.2831,  0.3304,  1.3488]],
       grad_fn=<SelectBackward0>)
torch.Size([20, 768])


In [38]:
import json
import numpy as np
import pandas as pd
import umap
import plotly.express as px # NOTE(review): the original note said "not used because the output is JSON", but px.scatter is called in the plotting cell of this notebook
import argparse
from tqdm.auto import tqdm
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import os
import nbformat

In [39]:
# Reduce the SPECTER vectors to two dimensions with a seeded UMAP projection.
reducer = umap.UMAP(
    n_components=2,
    n_neighbors=15,
    min_dist=0.1,
    random_state=42,
)
# UMAP works on NumPy arrays, so move the tensor off the autograd graph / device first.
embedding_matrix = embeddings.detach().cpu().numpy()
embeddings_2d = reducer.fit_transform(embedding_matrix)
# Tabulate the 2-D coordinates for a quick look at the layout.
df = pd.DataFrame(embeddings_2d, columns=['x', 'y'])
print(df)

            x          y
0   15.545433  25.353773
1   14.876294  24.011454
2   14.273252  25.303478
3   15.502002  24.077629
4   14.905021  24.558260
5   14.892072  25.138577
6   15.588358  24.638674
7   14.952589  25.879463
8   15.469921  25.880253
9   15.797227  26.389076
10  16.581196  25.442848
11  14.937137  23.464212
12  14.096848  24.349354
13  16.243593  24.035284
14  16.103600  25.027142
15  16.684971  24.611588
16  16.238796  26.036146
17  15.780494  23.475122
18  14.303536  26.008951
19  14.997232  26.586445



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [40]:
# Partition the 2-D UMAP coordinates into six groups with seeded k-means
# (10 restarts), then read the per-point assignments off the fitted model.
kmeans = KMeans(
    n_clusters=6,
    n_init=10,
    random_state=42,
)
cluster_labels = kmeans.fit(embeddings_2d).labels_

In [43]:
# Build the plotting frame: 2-D UMAP coordinates plus each point's cluster id.
df_vis = pd.DataFrame(embeddings_2d, columns=['x', 'y'])
# Attach the cluster labels (one label per row of embeddings_2d).
df_vis['cluster'] = cluster_labels
# Derive the cluster count from the labels so the title cannot drift out of
# sync with the clustering step.
n_clusters = df_vis['cluster'].nunique()

# -------------------- Visualization --------------------
# NOTE(review): if 'cluster' holds numeric ids, plotly colours them on the
# continuous Viridis scale; cast to str if discrete legend colours are wanted.
fig = px.scatter(
    df_vis,
    x='x', y='y',
    color='cluster',
    title=f'SPECTER Embeddings (UMAP 2D) — {n_clusters} Clusters',
    color_continuous_scale='Viridis',
    width=950, height=650,
)

# Write the file first, then report success together with the actual path.
output_path = "plot.html"
fig.write_html(output_path)
print(f"Saved interactive clustered plot to {output_path}")

Saved interactive clustered plot to
Saved plot.html
